Beispiel #1
0
def grid_bin(data, xvar, yvar, xbins, ybins, zvar=None):
    """
    Bin `data` onto the 2D grid defined by `xbins` and `ybins`.

    `xvar` and `yvar` are discretized at the interior bin edges
    (``xbins[1:-1]`` / ``ybins[1:-1]``); the outermost edges are only used
    to restrict which rows take part in the counting.  When `zvar` is a
    discrete variable, one contingency table is computed per `zvar` value
    and the tables are stacked along a third axis.

    :param data: Data table to bin.
    :param xvar: Variable binned along the contingency rows.
    :param yvar: Variable binned along the contingency columns.
    :param xbins: Bin edges for `xvar`, outer bounds included.
    :param ybins: Bin edges for `yvar`, outer bounds included.
    :param zvar: Optional third variable; may be None.
    :return: ``Tree(xbins, ybins, contingencies, None)``.
    """
    x_disc = Discretizer.create_discretized_var(xvar, xbins[1:-1])
    y_disc = Discretizer.create_discretized_var(yvar, ybins[1:-1])

    x_min, x_max = xbins[0], xbins[-1]
    y_min, y_max = ybins[0], ybins[-1]

    querydomain = [x_disc, y_disc]
    if zvar is not None:
        querydomain = querydomain + [zvar]

    querydomain = Orange.data.Domain(querydomain)

    def interval_filter(var, low, high):
        # Keep rows whose `var` value lies between `low` and `high`.
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterContinuous(
                 var, max=high, min=low,
                 oper=Orange.data.filter.FilterContinuous.Between)]
        )

    def value_filter(var, val):
        # Keep rows whose discrete `var` equals `val`.
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterDiscrete(var, [val])]
        )

    def filters_join(filters):
        # Merge the conditions of several Values filters into a single one.
        return Orange.data.filter.Values(
            reduce(list.__iadd__, (f.conditions for f in filters), [])
        )

    inf_bounds = np.isinf([x_min, x_max, y_min, y_max])
    if not all(inf_bounds):
        # At least one outer bound is finite: drop rows outside the grid.
        range_filters = [interval_filter(xvar, x_min, x_max),
                         interval_filter(yvar, y_min, y_max)]
        range_filter = filters_join(range_filters)
        subset = range_filter(data)
    else:
        # All bounds are infinite: no need to filter the data.
        subset = data

    # `zvar` is optional; only split by its values when it is given and
    # discrete (previously a missing `zvar` raised AttributeError here).
    if zvar is not None and zvar.is_discrete:

        filters = [value_filter(zvar, val) for val in zvar.values]
        contingencies = [
            contingency.get_contingency(
                filter_(subset.from_table(querydomain, subset)),
                col_variable=y_disc, row_variable=x_disc
            )
            for filter_ in filters
        ]
        contingencies = np.dstack(contingencies)
    else:
        contingencies = contingency.get_contingency(
            subset.from_table(querydomain, subset),
            col_variable=y_disc, row_variable=x_disc
        )

    contingencies = np.asarray(contingencies)
    return Tree(xbins, ybins, contingencies, None)
Beispiel #2
0
def burt_table(data, variables):
    """
    Build the Burt table (cross-tabulation of all values) for `variables`.

    Every pair of variables contributes one contingency block; blocks
    below the diagonal are mirrored (transposed) above it.

    :param Orange.data.Table data: Data table.
    :param variables: List of variables (discrete).
    :type variables: list of Orange.data.DiscreteVariable
    :return: An ordered list of (variable, value) pairs and the
        numpy.ndarray holding the table.
    """
    pairs = []
    for variable in variables:
        pairs.extend((variable, val) for val in variable.values)

    sizes = [len(variable.values) for variable in variables]
    # starts[k] is the first row/column that belongs to variables[k]
    starts = np.r_[0, np.cumsum(sizes)]
    burt = np.zeros((len(pairs), len(pairs)))

    for row, row_var in enumerate(variables):
        row_span = slice(starts[row], starts[row] + sizes[row])
        for col in range(row + 1):
            col_span = slice(starts[col], starts[col] + sizes[col])
            block = contingency.get_contingency(data, variables[col], row_var)

            burt[row_span, col_span] += block
            if row != col:
                # mirror the off-diagonal block
                burt[col_span, row_span] += block.T

    return pairs, burt
Beispiel #3
0
 def compute_score(attr):
     """
     Score `attr` by how strongly it differs between the groups.

     Returns 3 for the grouping variable itself, 2 whenever no p-value
     can be computed, and otherwise the p-value of a one-way ANOVA
     (continuous attributes) or a chi-square test (all other attributes).
     """
     if attr is group_var:
         return 3

     if not attr.is_continuous:
         # Chi-square with the given distribution into groups
         # (see degrees of freedom in computation of the p-value)
         if not attr.values or not group_var.values:
             return 2
         table = np.array(
             contingency.get_contingency(data, group_var, attr))
         # drop empty rows first, then empty columns
         table = table[table.sum(axis=1) != 0, :]
         table = table[:, table.sum(axis=0) != 0]
         if min(table.shape) < 2:
             return 2
         row_totals = table.sum(axis=1)
         col_totals = table.sum(axis=0)
         expected = np.outer(row_totals, col_totals) / np.sum(table)
         p = chisquare(table.ravel(), f_exp=expected.ravel(),
                       ddof=n_groups - 1)[1]
     else:
         # One-way ANOVA
         column = data.get_column_view(attr)[0].astype(float)
         samples = []
         for i in range(n_groups):
             members = column[group_col == i]
             members = members[~np.isnan(members)]
             if len(members):
                 samples.append(members)
         p = f_oneway(*samples)[1] if len(samples) > 1 else 2

     return 2 if math.isnan(p) else p
Beispiel #4
0
 def compute_box_data(self):
     """
     Recompute the statistics backing the box plot.

     Does nothing without a selected attribute.  When there is no data,
     or the attribute / group variable is discrete with no values,
     `stats`, `dist` and `conts` are all reset to (the same) empty list.
     Otherwise either per-group contingencies or a single distribution
     are computed, and finally only entries with ``n > 0`` are kept.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     self.is_continuous = attr.is_continuous

     nothing_to_show = (
         dataset is None
         or (not self.is_continuous and not attr.values)
         or (self.group_var and not self.group_var.values)
     )
     if nothing_to_show:
         empty = []
         self.stats = empty
         self.dist = empty
         self.conts = empty
         return

     if not self.group_var:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attr, None)]
         self.label_txts_all = [""]
     else:
         self.dist = []
         self.conts = contingency.get_contingency(
             dataset, attr, self.group_var)
         if self.is_continuous:
             boxes = []
             for index, cont in enumerate(self.conts):
                 boxes.append(BoxData(cont, attr, index, self.group_var))
             self.stats = boxes
         self.label_txts_all = self.group_var.values

     # keep only the labels / stats of non-empty boxes
     self.label_txts = [
         text
         for stat, text in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = list(filter(lambda stat: stat.n > 0, self.stats))
Beispiel #5
0
 def test_mixedtype_metas(self):
     """
     Contingency of column 2 against a meta variable of an extended domain.

     The domain of "zoo" is extended so that its first two attributes
     also appear among the metas; counts are checked before and after
     missing values are written into the table.
     """
     import Orange
     zoo = Orange.data.Table("zoo")
     base = zoo.domain
     extended = Orange.data.Domain(
         base.attributes, base.class_var, base.metas + base.attributes[:2])
     t = Orange.data.Table(extended, zoo)
     row_var = t.domain.metas[1]

     cont = contingency.get_contingency(zoo, 2, row_var)
     assert_dist_equal(cont["1"], [38, 5])
     assert_dist_equal(cont, [[4, 54], [38, 5]])

     # introduce one unknown in the row variable and one in the column
     nan = float("nan")
     zoo[25][row_var] = nan
     zoo[0][2] = nan

     cont = contingency.get_contingency(zoo, 2, row_var)
     assert_dist_equal(cont["1"], [37, 5])
     assert_dist_equal(cont, [[4, 53], [37, 5]])
     np.testing.assert_almost_equal(cont.unknowns, [0, 1])
     self.assertEqual(cont.unknown_rows, 1)
Beispiel #6
0
    def _setup(self):
        """
        Rebuild the plot for the currently selected variable and group.

        Clears the plots and legend, resolves the selected variable and
        (optional) group variable, optionally discretizes the data with
        equal-width bins, and then displays either the contingency (when a
        group variable is set) or the plain distribution.
        """
        for element in (self.plot, self.plot_prob, self._legend):
            element.clear()
        self._legend.hide()

        var_index = self.variable_idx
        self.var = None
        self.cvar = None
        if var_index >= 0:
            self.var = self.varmodel[var_index]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            columns = (self.var, self.cvar) if self.cvar else self.var
            data = self.data[:, columns]
            n_bins = self.bins[self.smoothing_index]
            disc = Orange.preprocess.discretize.EqualWidth(n=n_bins)
            data = Orange.preprocess.Discretize(data, method=disc)
            # continue with the discretized version of the variable
            self.var = data.domain.variables[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
def contingency_table(data, columns, rows):
    """
    Return the contingency of `columns` by `rows` as a Table.

    Each value of `columns` becomes a continuous variable (shown without
    decimals); the values of `rows` are stored as strings in a single
    meta column named after `rows`.
    """
    counts = contingency.get_contingency(data, columns, rows)
    label_var = StringVariable(rows.name)
    row_labels = [[str(label)] for label in rows.values]
    count_vars = []
    for label in columns.values:
        count_vars.append(ContinuousVariable(label, number_of_decimals=0))
    domain = Domain(count_vars, metas=[label_var])
    return Table(domain, counts, metas=row_labels)
Beispiel #8
0
    def test_compute_contingency_invalid(self):
        """
        An out-of-range class index must make get_contingency raise.

        Builds a 20-row table with one continuous column and a class with
        1024 values, checks the shape of a valid contingency, then writes
        the invalid index 1024 into Y and expects IndexError.
        """
        n_values = 1024
        rstate = np.random.RandomState(0xFFFF)
        x_var = data.ContinuousVariable("X")
        value_names = ["C{}".format(k + 1) for k in range(n_values)]
        c_var = data.DiscreteVariable("C", values=value_names)
        domain = data.Domain([x_var], [c_var])

        # draw X before Y so the random stream is consumed in a fixed order
        x_column = rstate.uniform(size=(20, 1)).round(1)
        y_column = rstate.randint(0, n_values, size=(20, 1))
        table = data.Table.from_numpy(domain, x_column, y_column)

        cont = contingency.get_contingency(table, x_var, c_var)
        self.assertEqual(cont.counts.shape[0], n_values)

        table.Y[5] = n_values
        with self.assertRaises(IndexError):
            contingency.get_contingency(table, x_var, c_var)
Beispiel #9
0
    def fit_storage(self, table):
        """
        Fit a naive Bayes model on a table of discrete variables.

        :param table: Data; must be an instance of Storage.
        :return: NaiveBayesModel built from the log conditional
            probabilities, the class probabilities and the domain.
        :raises TypeError: if `table` is not a Storage.
        :raises NotImplementedError: if any variable is not discrete.
        :raises ValueError: if no class value occurs in the data.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not var.is_discrete:
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_cont))
        nclss = (class_freq != 0).sum()
        if not nclss:
            raise ValueError("Data has no defined target values")

        # Laplacian smoothing considers only classes that appear in the data,
        # in part to avoid cases where the probabilities are affected by empty
        # (or completely spurious) classes that appear because of Orange's
        # reuse of variables. See GH-2943.
        # The corresponding elements of class_probs are set to zero only after
        # mock non-zero values are used in computation of log_cont_prob to
        # prevent division by zero.
        class_prob = (class_freq + 1) / (np.sum(class_freq) + nclss)
        log_cont_prob = []
        for c in cont:
            counts = np.array(c)
            column_totals = np.sum(counts, axis=0)[None, :]
            ratio = (counts + 1) / (column_totals + nclss) \
                / class_prob[:, None]
            log_cont_prob.append(np.log(ratio))
        class_prob[class_freq == 0] = 0
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #10
0
    def _setup(self):
        """
        Rebuild the plot for the currently selected variable and group.

        Clears the plots and legend, resolves the selected variable and
        group variable (refilling the probability combo box when a group
        is set), optionally discretizes the data with equal-width bins and
        displays either the contingency or the plain distribution.
        """
        for element in (self.plot, self.plot_prob, self._legend):
            element.clear()
        self._legend.hide()

        var_index = self.variable_idx
        self.var = None
        self.cvar = None
        if var_index >= 0:
            self.var = self.varmodel[var_index]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # group values framed by the "(None)" and "(All)" entries
            combo = self.cb_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            # keep the selection inside the refilled combo box
            highest = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), highest)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            columns = (self.var, self.cvar) if self.cvar else self.var
            data = self.data[:, columns]
            n_bins = self.bins[self.smoothing_index]
            disc = Orange.preprocess.discretize.EqualWidth(n=n_bins)
            data = Orange.preprocess.Discretize(
                data, method=disc, remove_const=False)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Beispiel #11
0
    def fit_storage(self, table):
        """
        Fit a naive Bayes model from the contingencies of `table`.

        :raises TypeError: if `table` is not a Storage.
        :raises NotImplementedError: if any variable is not a
            DiscreteVariable.
        :return: NaiveBayesModel built from all contingencies, the
            diagonal of the class-by-class contingency and the domain.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not isinstance(var, DiscreteVariable):
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        return NaiveBayesModel(cont, np.diag(class_cont), table.domain)
Beispiel #12
0
 def __call__(self, data, attribute):
     """
     Discretize `attribute` at the cut points found by entropy splitting.

     Returns the discretized variable, or None when no cut is found.
     """
     cont = contingency.get_contingency(data, attribute)
     values = cont.values
     counts = cont.counts.T
     cut_ind = np.array(_entropy_discretize_sorted(counts, self.force))
     if len(cut_ind) == 0:
         return None
     # "the midpoint between each successive pair of examples" (FI p.1)
     above = values[cut_ind]
     below = values[cut_ind - 1]
     points = (above + below) / 2.
     return _discretized_var(data, attribute, points)
Beispiel #13
0
 def __init__(self, data, attr1, attr2):
     """
     Compute chi-square statistics for the contingency of two attributes.

     Stores the observed and expected tables, the marginal probabilities
     (`probs_x` over axis 0, `probs_y` over axis 1), the standardized
     residuals, the per-cell and total chi-square, and the p-value `p`.
     """
     self.observed = get_contingency(data, attr1, attr2)
     self.n = np.sum(self.observed)
     self.probs_x = self.observed.sum(axis=0) / self.n
     self.probs_y = self.observed.sum(axis=1) / self.n
     self.expected = np.outer(self.probs_y, self.probs_x) * self.n
     # A zero expected count gives 0/0 here; without the guard below a
     # single empty row or column turned every derived statistic,
     # including the p-value, into NaN.
     with np.errstate(divide="ignore", invalid="ignore"):
         self.residuals = \
             (self.observed - self.expected) / np.sqrt(self.expected)
     self.residuals = np.nan_to_num(self.residuals)
     self.chisqs = self.residuals ** 2
     self.chisq = float(np.sum(self.chisqs))
     self.p = chi2.sf(
         self.chisq, (len(self.probs_x) - 1) * (len(self.probs_y) - 1))
 def compute_box_data(self):
     """
     Return the data for the box plot together with its labels.

     With a split variable: the contingency of the attribute by the split
     variable, and the split variable's values.  Without one: a
     one-element list holding the attribute's distribution, and ``[""]``.
     """
     if not self.split_var:
         dist = distribution.get_distribution(self.dataset, self.attribute)
         return [dist], [""]

     conts = contingency.get_contingency(
         self.dataset, self.attribute, self.split_var)
     return conts, self.split_var.values
Beispiel #15
0
 def __call__(self, data, attribute):
     """
     Discretize `attribute` at the cut points found by entropy splitting.

     Each cut point is the value just before the cut index.  Returns the
     discretized variable, or None when no cut is found.
     """
     from Orange.statistics import contingency as contingency_module
     cont = contingency_module.get_contingency(data, attribute)
     values, I = join_contingency(cont)
     cut_ind = numpy.array(entropy_discretize_sorted(I))
     if len(cut_ind) == 0:
         return None
     return disc._discretized_var(data, attribute, values[cut_ind - 1])
Beispiel #16
0
    def draw_distributions(self):
        """
        Draw per-class value distribution bars on axes of discrete attributes.

        Nothing is drawn unless distributions are enabled, data is present
        and the class is discrete.  The contingencies of all discrete
        variables are computed once and cached in
        ``self.domain_contingencies``.  For every displayed discrete
        attribute, one bar per (attribute value, class value) pair is
        attached to the plot; the bar width is proportional to the count,
        scaled so that the largest count yields a width of 0.5.
        """
        if not (self.show_distributions and self.have_data and self.data_has_discrete_class):
            return
        class_ = self.data_domain.class_var
        class_count = len(class_.values)

        # we create a hash table of possible class values (happens only if we have a discrete class)
        if self.domain_contingencies is None:
            self.domain_contingencies = dict(
                zip(
                    [attr for attr in self.data_domain if isinstance(attr, DiscreteVariable)],
                    get_contingencies(self.raw_data, skipContinuous=True),
                )
            )
            self.domain_contingencies[class_] = get_contingency(self.raw_data, class_, class_)

        max_count = max([cont.max() for cont in self.domain_contingencies.values()] or [1])
        sorted_class_values = get_variable_values_sorted(class_)

        for axis_idx, attr_idx in enumerate(self.attribute_indices):
            attr = self.data_domain[attr_idx]
            # Only discrete attributes have values and a cached contingency.
            # The original test was inverted: it skipped the discrete
            # attributes and then failed on the continuous ones.
            if not isinstance(attr, DiscreteVariable):
                continue

            cont = self.domain_contingencies[attr]
            attr_len = len(attr.values)

            # we create a hash table of variable values and their indices
            sorted_variable_values = get_variable_values_sorted(attr)

            # every bar on this axis has the same height
            height = 0.7 / float(class_count * attr_len)

            # create bar curve
            for j in range(attr_len):
                attribute_value = sorted_variable_values[j]
                value_count = cont[:, attribute_value]
                y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)

                for i in range(class_count):
                    class_value = sorted_class_values[i]

                    color = QColor(self.discrete_palette[i])
                    color.setAlpha(self.alpha_value)

                    width = float(value_count[class_value] * 0.5) / float(max_count)

                    y_low_bottom = y_off + float(class_count * height) / 2.0 - i * height
                    curve = PolygonCurve(
                        QPen(color),
                        QBrush(color),
                        xData=[axis_idx, axis_idx + width, axis_idx + width, axis_idx],
                        yData=[y_low_bottom, y_low_bottom, y_low_bottom - height, y_low_bottom - height],
                        tooltip=attr.name,
                    )
                    curve.attach(self)
    def test_get_contingency(self):
        """
        Check get_contingency on sparse data for three column variables.

        Column 5 must give a Discrete contingency; "c4" and domain[13]
        must give Continuous ones.  Each is indexed by position (0, 2) and
        by value name ("b").
        """
        d = self._construct_sparse()

        # column given by index
        cont = contingency.get_contingency(d, 5)
        self.assertIsInstance(cont, contingency.Discrete)
        np.testing.assert_almost_equal(cont[0], [1, 0, 0])
        np.testing.assert_almost_equal(cont["b"], [0, 1, 1])
        np.testing.assert_almost_equal(cont[2], [1, 0, 0])

        # column given by name
        cont = contingency.get_contingency(d, "c4")
        self.assertIsInstance(cont, contingency.Continuous)
        np.testing.assert_almost_equal(cont[0], [[], []])
        np.testing.assert_almost_equal(cont["b"], [[1], [1]])
        np.testing.assert_almost_equal(cont[2], [[2], [1]])

        # column given by variable
        # (the final assertion used to appear twice verbatim; one copy
        # was removed)
        cont = contingency.get_contingency(d, d.domain[13])
        self.assertIsInstance(cont, contingency.Continuous)
        np.testing.assert_almost_equal(cont[0], [[1.1], [1]])
        np.testing.assert_almost_equal(cont["b"], [[1], [1]])
        np.testing.assert_almost_equal(cont[2], [[], []])
Beispiel #18
0
    def fit_storage(self, table):
        """
        Fit a naive Bayes model on a table of discrete variables.

        Class probabilities and conditional probabilities are smoothed by
        adding one to every count.

        :raises TypeError: if `table` is not a Storage.
        :raises NotImplementedError: if any variable is not discrete.
        :return: NaiveBayesModel built from the log conditional
            probabilities, the class probabilities and the domain.
        """
        if not isinstance(table, Storage):
            raise TypeError("Data is not a subclass of Orange.data.Storage.")
        for var in table.domain.variables:
            if not var.is_discrete:
                raise NotImplementedError(
                    "Only discrete variables are supported.")

        cont = contingency.get_contingencies(table)
        class_cont = contingency.get_contingency(
            table, table.domain.class_var)
        class_freq = np.array(np.diag(class_cont))
        class_prob = (class_freq + 1) / (np.sum(class_freq) + len(class_freq))

        log_cont_prob = []
        for c in cont:
            counts = np.array(c)
            denominator = np.sum(counts, axis=0)[None, :] + c.shape[0]
            ratio = (counts + 1) / denominator / class_prob[:, None]
            log_cont_prob.append(np.log(ratio))
        return NaiveBayesModel(log_cont_prob, class_prob, table.domain)
Beispiel #19
0
    def rank(self):
        """
        Score every pair of attributes and fill the projection table model.

        Continuous attributes are first discretized into 10 equal-width
        bins.  For each pair, ``p_index`` of their contingency is written
        to column 0 of a new model row and the two attribute names to
        columns 1 and 2.  Does nothing while a ranking is already in
        progress (``self.progress`` is set).
        """
        if self.progress:
            return

        disc = Orange.preprocess.EqualWidth(n=10)
        continuous_type = Orange.data.variable.ContinuousVariable

        ndomain = Orange.data.Domain(
            [
                # isinstance (not an exact type match) so that subclasses of
                # ContinuousVariable are discretized as well
                disc(self.data, attr) if isinstance(attr, continuous_type) else attr
                for attr in self.data.domain.attributes
            ],
            self.data.domain.class_vars,
        )

        t = self.data.from_table(ndomain, self.data)

        attrs = t.domain.attributes

        # number of unordered attribute pairs; kept integral
        n_pairs = len(attrs) * (len(attrs) - 1) // 2
        self.progress = gui.ProgressBar(self, n_pairs)
        try:
            row = 0
            for i, attr_i in enumerate(attrs):
                for attr_j in attrs[:i]:
                    ct = np.array(contingency.get_contingency(t, attr_j, attr_i))
                    pindex, _, _ = p_index(ct)

                    cells = (float(pindex), attr_i.name, attr_j.name)
                    for column, value in enumerate(cells):
                        item = QStandardItem()
                        item.setData(value, Qt.DisplayRole)
                        self.projectionTableModel.setItem(row, column, item)

                    self.progress.advance()
                    row += 1
        finally:
            # Always release the progress bar: if it stayed set after an
            # error, every later call would return immediately.
            self.progress.finish()
            self.progress = None
Beispiel #20
0
 def __init__(self, data, attr1, attr2):
     """
     Compute chi-square statistics for the contingency of two attributes.

     If either attribute is discrete and has no values, only ``p`` is
     set (to NaN).  Otherwise the observed and expected tables, marginal
     probabilities, residuals, chi-square values and the p-value are
     stored on the instance.
     """
     attr1 = data.domain[attr1]
     attr2 = data.domain[attr2]
     for attr in (attr1, attr2):
         if attr.is_discrete and not attr.values:
             self.p = np.nan
             return

     observed = self.observed = get_contingency(data, attr1, attr2)
     total = self.n = np.sum(observed)
     col_probs = self.probs_x = observed.sum(axis=0) / total
     row_probs = self.probs_y = observed.sum(axis=1) / total
     expected = self.expected = np.outer(row_probs, col_probs) * total
     self.residuals = (observed - expected) / np.sqrt(expected)
     # cells with a zero expected count yield nan/inf; map them to numbers
     self.residuals = np.nan_to_num(self.residuals)
     self.chisqs = self.residuals ** 2
     self.chisq = float(np.sum(self.chisqs))
     dof = (len(col_probs) - 1) * (len(row_probs) - 1)
     self.p = chi2.sf(self.chisq, dof)
Beispiel #21
0
    def _setup(self):
        """
        Rebuild the plot for the currently selected variable and group.

        Clears the plots and legend, resolves the selected variable and
        group variable (refilling the probability combo box when a group
        is set), optionally discretizes the data with equal-width bins and
        displays either the contingency or the plain distribution.
        """
        for element in (self.plot, self.plot_prob, self._legend):
            element.clear()
        self._legend.hide()

        var_index = self.variable_idx
        self.var = None
        self.cvar = None
        if var_index >= 0:
            self.var = self.varmodel[var_index]
        if self.groupvar_idx > 0:
            self.cvar = self.groupvarmodel[self.groupvar_idx]
            # group values framed by the "(None)" and "(All)" entries
            combo = self.controls.show_prob
            combo.clear()
            combo.addItem("(None)")
            combo.addItems(self.cvar.values)
            combo.addItem("(All)")
            # keep the selection inside the refilled combo box
            highest = len(self.cvar.values) + 1
            self.show_prob = min(max(self.show_prob, 0), highest)

        data = self.data
        self._setup_smoothing()
        if self.var is None:
            return

        if self.disc_cont:
            columns = [self.var]
            if self.cvar:
                columns.append(self.cvar)
            data = Orange.data.Table(Orange.data.Domain(columns), data)
            disc = EqualWidth(n=self.bins[self.smoothing_index])
            discretizer = Discretize(method=disc, remove_const=False)
            data = discretizer(data)
            self.var = data.domain[0]

        self.set_left_axis_name()
        self.enable_disable_rel_freq()
        self.controls.cumulative_distr.setDisabled(not self.var.is_continuous)
        if not self.cvar:
            self.distributions = distribution.get_distribution(data, self.var)
            self.display_distribution()
        else:
            self.contingencies = contingency.get_contingency(
                data, self.var, self.cvar)
            self.display_contingency()
        self.plot.autoRange()
Beispiel #22
0
 def _setup(self):
     """
     Rebuild the plot for the currently selected variable and group.

     Displays the contingency when a group variable is selected and the
     plain distribution otherwise; only the axis name and the relative
     frequency control are updated when no variable is selected.
     """
     self.plot.clear()

     var_index = self.variable_idx
     self.var = None
     self.cvar = None
     if var_index >= 0:
         self.var = self.varmodel[var_index]
     if self.groupvar_idx > 0:
         self.cvar = self.groupvarmodel[self.groupvar_idx]

     self.set_left_axis_name()
     self.enable_disable_rel_freq()
     if self.var is None:
         return

     if not self.cvar:
         self.distributions = distribution.get_distribution(
             self.data, self.var)
         self.display_distribution()
     else:
         self.contingencies = contingency.get_contingency(
             self.data, self.var, self.cvar)
         self.display_contingency()
Beispiel #23
0
    def update_XY(self):
        """
        Recompute the correspondence analysis and refill both axis combos.

        Uses a plain contingency table for exactly two selected variables
        and a Burt table otherwise.  Returns the number of row-factor
        components, or None when no variables are selected.
        """
        for combo in (self.axis_x_cb, self.axis_y_cb):
            combo.clear()

        ca_vars = self.selected_vars()
        if len(ca_vars) == 0:
            return

        if len(ca_vars) == 2:
            # the two variables are passed in reversed order
            ctable = contingency.get_contingency(
                self.data, ca_vars[1], ca_vars[0])
        else:
            _, ctable = burt_table(self.data, ca_vars)

        self.ca = correspondence(ctable, )
        rfs = self.ca.row_factors.shape[1]
        axes = [str(k + 1) for k in range(rfs)]
        for combo in (self.axis_x_cb, self.axis_y_cb):
            combo.addItems(axes)
        return rfs
Beispiel #24
0
 def __init__(self, data, attr1, attr2):
     """
     Compute chi-square statistics for the contingency of two attributes.

     If either attribute is discrete and has no values, only ``p`` is
     set (to NaN).  Otherwise the observed and expected tables, marginal
     probabilities, residuals, chi-square values and the p-value are
     stored on the instance.
     """
     attr1 = data.domain[attr1]
     attr2 = data.domain[attr2]
     for attr in (attr1, attr2):
         if attr.is_discrete and not attr.values:
             self.p = np.nan
             return

     observed = self.observed = get_contingency(data, attr1, attr2)
     total = self.n = np.sum(observed)
     # pylint: disable=unexpected-keyword-arg
     col_probs = self.probs_x = observed.sum(axis=0) / total
     row_probs = self.probs_y = observed.sum(axis=1) / total
     expected = self.expected = np.outer(row_probs, col_probs) * total
     # a zero expected count divides by zero; silence the warnings and
     # map the resulting nan/inf to numbers right after
     with np.errstate(divide="ignore", invalid="ignore"):
         self.residuals = (observed - expected) / np.sqrt(expected)
     self.residuals = np.nan_to_num(self.residuals)
     self.chisqs = self.residuals ** 2
     self.chisq = float(np.sum(self.chisqs))
     dof = (len(col_probs) - 1) * (len(row_probs) - 1)
     self.p = chi2.sf(self.chisq, dof)
Beispiel #25
0
    def _setup(self):
        """
        Set up the plot for the selected variable.

        Shows the contingency against the group variable when that
        variable is discrete, and the plain distribution otherwise.
        """
        self.plot.clear()

        var_index = self.variable_idx
        var = self.varmodel[var_index] if var_index >= 0 else None

        group_index = self.groupvar_idx
        cvar = self.groupvarmodel[group_index] if group_index >= 0 else None

        if var is None:
            return

        if not is_discrete(cvar):
            dist = distribution.get_distribution(self.data, var)
            self.set_distribution(dist, var)
        else:
            cont = contingency.get_contingency(self.data, var, cvar)
            self.set_contingency(cont, var, cvar)
Beispiel #26
0
    def _update_CA(self):
        """
        Recompute the correspondence analysis and refresh the view.

        Uses a plain contingency table for exactly two selected variables
        and a Burt table otherwise; does nothing when no variables are
        selected.  Afterwards both axis combo boxes are refilled and the
        plot and info are updated.
        """
        ca_vars = self.selected_vars()
        if len(ca_vars) == 0:
            return

        if len(ca_vars) == 2:
            # the two variables are passed in reversed order
            ctable = contingency.get_contingency(
                self.data, ca_vars[1], ca_vars[0])
        else:
            _, ctable = burt_table(self.data, ca_vars)

        self.ca = correspondence(ctable, )
        n_components = self.ca.row_factors.shape[1]
        axes = [str(k + 1) for k in range(n_components)]
        for combo in (self.axis_x_cb, self.axis_y_cb):
            combo.clear()
            combo.addItems(axes)
        # NOTE(review): assigning the components to themselves looks like a
        # no-op; presumably it re-triggers attribute setters - confirm.
        self.component_x, self.component_y = self.component_x, self.component_y

        self._setup_plot()
        self._update_info()
Beispiel #27
0
 def compute_box_data(self):
     """
     Recompute the statistics backing the box plot.

     Does nothing without a selected attribute.  When there is no data,
     or the attribute / group variable is discrete with no values,
     `stats`, `dist` and `conts` are all reset to (the same) empty list.
     With a group variable, groups without any instances are left out of
     the stats and labels.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     self.is_continuous = attr.is_continuous

     nothing_to_show = (
         dataset is None
         or (not self.is_continuous and not attr.values)
         or (self.group_var and not self.group_var.values)
     )
     if nothing_to_show:
         empty = []
         self.stats = empty
         self.dist = empty
         self.conts = empty
         return

     if not self.group_var:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attr, None)]
         self.label_txts_all = [""]
     else:
         self.dist = []
         self.conts = contingency.get_contingency(
             dataset, attr, self.group_var)
         if self.is_continuous:
             # keep only the groups whose counts (cont[1]) are non-zero
             populated = [(index, cont)
                          for index, cont in enumerate(self.conts)
                          if np.sum(cont[1])]
             self.stats = [BoxData(cont, attr, index, self.group_var)
                           for index, cont in populated]
             self.label_txts_all = [self.group_var.values[index]
                                    for index, _ in populated]
         else:
             labels = []
             for value, cont in zip(self.group_var.values, self.conts):
                 if np.sum(cont) > 0:
                     labels.append(value)
             self.label_txts_all = labels

     # keep only the labels / stats of non-empty boxes
     self.label_txts = [
         text
         for stat, text in zip(self.stats, self.label_txts_all)
         if stat.n > 0
     ]
     self.stats = list(filter(lambda stat: stat.n > 0, self.stats))
Beispiel #28
0
 def compute_box_data(self):
     """
     Refresh `stats`, `dist`, `conts` and the box labels.

     Returns immediately when no attribute is selected.  Without data, or
     when the attribute / group variable is discrete and has no values,
     the three containers are reset to one shared empty list.  Groups
     with no instances are omitted when a group variable is used.
     """
     attr = self.attribute
     if not attr:
         return
     dataset = self.dataset
     self.is_continuous = attr.is_continuous

     if (dataset is None
             or (not self.is_continuous and not attr.values)
             or (self.group_var and not self.group_var.values)):
         blank = []
         self.stats = blank
         self.dist = blank
         self.conts = blank
         return

     if self.group_var:
         self.dist = []
         self.conts = contingency.get_contingency(
             dataset, attr, self.group_var)
         if not self.is_continuous:
             self.label_txts_all = [
                 value
                 for value, cont in zip(self.group_var.values, self.conts)
                 if np.sum(cont) > 0
             ]
         else:
             boxes = []
             names = []
             for index, cont in enumerate(self.conts):
                 # skip groups whose counts (cont[1]) sum to zero
                 if not np.sum(cont[1]):
                     continue
                 boxes.append(BoxData(cont, attr, index, self.group_var))
                 names.append(self.group_var.values[index])
             self.stats = boxes
             self.label_txts_all = names
     else:
         self.dist = distribution.get_distribution(dataset, attr)
         self.conts = []
         if self.is_continuous:
             self.stats = [BoxData(self.dist, attr, None)]
         self.label_txts_all = [""]

     # drop labels and stats that belong to empty boxes
     labelled = zip(self.stats, self.label_txts_all)
     self.label_txts = [text for stat, text in labelled if stat.n > 0]
     self.stats = [stat for stat in self.stats if stat.n > 0]
Beispiel #29
0
    def updateGraph(self, *args):
        for item in self.canvas.items():
            self.canvas.removeItem(item)
        if self.data is None or len(self.data) == 0 or \
                self.attrX is None or self.attrY is None:
            return
        data = self.data[:, [self.attrX, self.attrY]]
        valsX = []
        valsY = []
        contX = get_contingency(data, self.attrX, self.attrX)
        contY = get_contingency(data, self.attrY, self.attrY)
        # compute contingency of x and y attributes
        for entry in contX:
            sum_ = 0
            try:
                for val in entry:
                    sum_ += val
            except:
                pass
            valsX.append(sum_)

        for entry in contY:
            sum_ = 0
            try:
                for val in entry:
                    sum_ += val
            except:
                pass
            valsY.append(sum_)

        contXY, _ = get_conditional_distribution(
            data, [data.domain[self.attrX], data.domain[self.attrY]])
        # compute probabilities
        probs = {}
        for i in range(len(valsX)):
            valx = valsX[i]
            for j in range(len(valsY)):
                valy = valsY[j]
                try:
                    actualProb = contXY['%s-%s' %
                                        (data.domain[self.attrX].values[i],
                                         data.domain[self.attrY].values[j])]
                    # for val in contXY['%s-%s' %(i, j)]: actualProb += val
                except:
                    actualProb = 0
                probs['%s-%s' % (data.domain[self.attrX].values[i],
                                 data.domain[self.attrY].values[j])] = ((
                                     data.domain[self.attrX].values[i],
                                     valx), (data.domain[self.attrY].values[j],
                                             valy), actualProb, len(data))

        #get text width of Y labels
        max_ylabel_w = 0
        for j in range(len(valsY)):
            xl = CanvasText(self.canvas,
                            "",
                            0,
                            0,
                            html_text=getHtmlCompatibleString(
                                data.domain[self.attrY].values[j]),
                            show=False)
            max_ylabel_w = max(int(xl.boundingRect().width()), max_ylabel_w)
        max_ylabel_w = min(max_ylabel_w, 200)  #upper limit for label widths
        # get text width of Y attribute name
        text = CanvasText(self.canvas,
                          data.domain[self.attrY].name,
                          x=0,
                          y=0,
                          bold=1,
                          show=0,
                          vertical=True)
        xOff = int(text.boundingRect().height() + max_ylabel_w)
        yOff = 55
        sqareSize = min(self.canvasView.width() - xOff - 35,
                        self.canvasView.height() - yOff - 50)
        sqareSize = max(sqareSize, 10)
        self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                     self.canvasView.height())

        # print graph name
        name = "<b>P(%s, %s) &#8800; P(%s)&times;P(%s)</b>" % (
            self.attrX, self.attrY, self.attrX, self.attrY)
        CanvasText(self.canvas,
                   "",
                   xOff + sqareSize / 2,
                   20,
                   Qt.AlignCenter,
                   html_text=name)
        CanvasText(self.canvas,
                   "N = " + str(len(data)),
                   xOff + sqareSize / 2,
                   38,
                   Qt.AlignCenter,
                   bold=0)

        ######################
        # compute chi-square
        chisquare = 0.0
        for i in range(len(valsX)):
            for j in range(len(valsY)):
                ((xAttr, xVal), (yAttr, yVal), actual,
                 sum_) = probs['%s-%s' % (data.domain[self.attrX].values[i],
                                          data.domain[self.attrY].values[j])]
                expected = float(xVal * yVal) / float(sum_)
                if expected == 0: continue
                pearson2 = (actual - expected) * (actual - expected) / expected
                chisquare += pearson2

        ######################
        # draw rectangles
        currX = xOff
        max_xlabel_h = 0
        normX, normY = sum(valsX), sum(valsY)
        self.areas = []
        for i in range(len(valsX)):
            if valsX[i] == 0: continue
            currY = yOff
            width = int(float(sqareSize * valsX[i]) / float(normX))

            for j in range(len(valsY) - 1, -1,
                           -1):  # this way we sort y values correctly
                ((xAttr, xVal), (yAttr, yVal), actual,
                 sum_) = probs['%s-%s' % (data.domain[self.attrX].values[i],
                                          data.domain[self.attrY].values[j])]
                if valsY[j] == 0: continue
                height = int(float(sqareSize * valsY[j]) / float(normY))

                # create rectangle
                selected = len(self.areas) in self.selection
                rect = CanvasRectangle(self.canvas,
                                       currX + 2,
                                       currY + 2,
                                       width - 4,
                                       height - 4,
                                       z=-10,
                                       onclick=self.select_area)
                rect.value_pair = i, j
                self.areas.append(rect)
                self.addRectIndependencePearson(
                    rect,
                    currX + 2,
                    currY + 2,
                    width - 4,
                    height - 4,
                    (xAttr, xVal),
                    (yAttr, yVal),
                    actual,
                    sum_,
                    width=1 + 3 * selected,  # Ugly! This is needed since
                    # resize redraws the graph! When this is handled by resizing
                    # just the viewer, update_selection will take care of this
                )

                expected = float(xVal * yVal) / float(sum_)
                pearson = (actual - expected) / sqrt(expected)
                tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(x)): <b>%d (%.2f%%)</b><hr>
                                <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(y)): <b>%d (%.2f%%)</b><hr>
                                <b>Number Of Instances (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b>
                                <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                    self.attrX, getHtmlCompatibleString(xAttr), xVal,
                    100.0 * float(xVal) / float(sum_), self.attrY,
                    getHtmlCompatibleString(yAttr), yVal,
                    100.0 * float(yVal) / float(sum_), expected,
                    100.0 * float(xVal * yVal) / float(sum_ * sum_), actual,
                    100.0 * float(actual) / float(sum_), chisquare, pearson)
                rect.setToolTip(tooltipText)

                currY += height
                if currX == xOff:
                    CanvasText(self.canvas,
                               "",
                               xOff,
                               currY - height / 2,
                               Qt.AlignRight | Qt.AlignVCenter,
                               html_text=getHtmlCompatibleString(
                                   data.domain[self.attrY].values[j]))

            xl = CanvasText(self.canvas,
                            "",
                            currX + width / 2,
                            yOff + sqareSize,
                            Qt.AlignHCenter | Qt.AlignTop,
                            html_text=getHtmlCompatibleString(
                                data.domain[self.attrX].values[i]))
            max_xlabel_h = max(int(xl.boundingRect().height()), max_xlabel_h)

            currX += width

        # show attribute names
        CanvasText(self.canvas,
                   self.attrY,
                   0,
                   yOff + sqareSize / 2,
                   Qt.AlignLeft | Qt.AlignVCenter,
                   bold=1,
                   vertical=True)
        CanvasText(self.canvas,
                   self.attrX,
                   xOff + sqareSize / 2,
                   yOff + sqareSize + max_xlabel_h,
                   Qt.AlignHCenter | Qt.AlignTop,
                   bold=1)
Beispiel #30
0
    def updateGraph(self, *args):
        """Redraw the sieve diagram for the selected X and Y attributes.

        The canvas is cleared first.  Nothing more is drawn when there is no
        data, when either attribute is unset, or when the canvas is too small.
        Otherwise one rectangle is drawn per (X value, Y value) pair with
        non-zero marginal counts; its width and height are proportional to
        the marginal counts of the two values, and its tooltip reports the
        expected and actual counts, the overall chi-square and the cell's
        standardized Pearson residual.

        ``*args`` is accepted and ignored.
        """
        for item in self.canvas.items():
            self.canvas.removeItem(item)    # remove all canvas items
        if not self.data: return
        if not self.attrX or not self.attrY: return

        data = self.getConditionalData()
        if not data or len(data) == 0: return

        def row_totals(cont):
            # Sum every row of a contingency.  A row that fails while being
            # read keeps the partial sum accumulated before the failure.
            totals = []
            for entry in cont:
                total = 0
                try:
                    for val in entry:
                        total += val
                except Exception:
                    pass
                totals.append(total)
            return totals

        contX = get_contingency(data, self.attrX, self.attrX)   # distribution of X attribute
        contY = get_contingency(data, self.attrY, self.attrY)   # distribution of Y attribute
        valsX = row_totals(contX)
        valsY = row_totals(contY)

        attr_x = data.domain[self.attrX]
        attr_y = data.domain[self.attrY]
        contXY = self.getConditionalDistributions(data, [attr_x, attr_y])

        # Per-cell statistics, keyed by the (i, j) index pair so that value
        # names containing '-' cannot make two different cells share a key.
        n_instances = len(data)
        probs = {}
        for i, valx in enumerate(valsX):
            for j, valy in enumerate(valsY):
                try:
                    actualProb = contXY['%s-%s' % (attr_x.values[i], attr_y.values[j])]
                except Exception:
                    # combination not present in the joint distribution
                    actualProb = 0
                probs[i, j] = ((attr_x.values[i], valx), (attr_y.values[j], valy), actualProb, n_instances)

        # get text width of Y attribute name
        text = OWCanvasText(self.canvas, attr_y.name, x=0, y=0, bold=1, show=0, vertical=True)
        xOff = int(text.boundingRect().height() + 40)
        yOff = 50
        sqareSize = min(self.canvasView.width() - xOff - 35, self.canvasView.height() - yOff - 30)
        if sqareSize < 0: return    # canvas is too small to draw rectangles
        self.canvasView.setSceneRect(0, 0, self.canvasView.width(), self.canvasView.height())

        # print graph name
        if self.attrCondition == "(None)":
            name = "<b>P(%s, %s) &#8800; P(%s)&times;P(%s)</b>" % (self.attrX, self.attrY, self.attrX, self.attrY)
        else:
            condValue = getHtmlCompatibleString(self.attrConditionValue)
            name = "<b>P(%s, %s | %s = %s) &#8800; P(%s | %s = %s)&times;P(%s | %s = %s)</b>" % (
                self.attrX, self.attrY, self.attrCondition, condValue,
                self.attrX, self.attrCondition, condValue,
                self.attrY, self.attrCondition, condValue)
        OWCanvasText(self.canvas, "", xOff + sqareSize/2, 20, Qt.AlignCenter, htmlText=name)
        OWCanvasText(self.canvas, "N = " + str(len(data)), xOff + sqareSize/2, 38, Qt.AlignCenter, bold=0)

        ######################
        # compute chi-square
        chisquare = 0.0
        for i in range(len(valsX)):
            for j in range(len(valsY)):
                (_, xVal), (_, yVal), actual, sum_ = probs[i, j]
                expected = float(xVal*yVal)/float(sum_)
                if expected == 0: continue
                chisquare += (actual - expected)*(actual - expected) / expected

        ######################
        # draw rectangles
        currX = xOff
        max_ylabel_w = 0

        normX, normY = sum(valsX), sum(valsY)
        for i in range(len(valsX)):
            if valsX[i] == 0: continue
            currY = yOff
            width = int(float(sqareSize * valsX[i])/float(normX))

            for j in range(len(valsY)-1, -1, -1):   # this way we sort y values correctly
                if valsY[j] == 0: continue
                (xAttr, xVal), (yAttr, yVal), actual, sum_ = probs[i, j]
                height = int(float(sqareSize * valsY[j])/float(normY))

                # create rectangle
                rect = OWCanvasRectangle(self.canvas, currX+2, currY+2, width-4, height-4, z=-10)
                self.addRectIndependencePearson(rect, currX+2, currY+2, width-4, height-4, (xAttr, xVal), (yAttr, yVal), actual, sum_)

                # both marginals are non-zero here, so expected > 0
                expected = float(xVal*yVal)/float(sum_)
                pearson = (actual - expected) / sqrt(expected)
                tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(x)): <b>%d (%.2f%%)</b><hr>
                                <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(y)): <b>%d (%.2f%%)</b><hr>
                                <b>Number Of Examples (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b>
                                <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                    self.attrX, getHtmlCompatibleString(xAttr), xVal, 100.0*float(xVal)/float(sum_),
                    self.attrY, getHtmlCompatibleString(yAttr), yVal, 100.0*float(yVal)/float(sum_),
                    expected, 100.0*float(xVal*yVal)/float(sum_*sum_),
                    actual, 100.0*float(actual)/float(sum_),
                    chisquare, pearson)
                rect.setToolTip(tooltipText)

                currY += height
                if currX == xOff:
                    # Y value labels are drawn only next to the leftmost column
                    xl = OWCanvasText(self.canvas, "", xOff - 10, currY - height/2, Qt.AlignRight | Qt.AlignVCenter, htmlText=getHtmlCompatibleString(attr_y.values[j]))
                    max_ylabel_w = max(int(xl.boundingRect().width()), max_ylabel_w)

            OWCanvasText(self.canvas, "", currX + width/2, yOff + sqareSize + 5, Qt.AlignCenter, htmlText=getHtmlCompatibleString(attr_x.values[i]))
            currX += width

        # show attribute names
        OWCanvasText(self.canvas, self.attrY, max(xOff-20-max_ylabel_w, 20), yOff + sqareSize/2, Qt.AlignRight | Qt.AlignVCenter, bold=1, vertical=True)
        OWCanvasText(self.canvas, self.attrX, xOff + sqareSize/2, yOff + sqareSize + 15, Qt.AlignCenter, bold=1)
Beispiel #31
0
    def updateGraph(self, *args):
        """Redraw the sieve diagram for the selected X and Y attributes.

        The canvas is cleared first; nothing more is drawn when there is no
        data or when either attribute is unset.  Otherwise one clickable
        rectangle is drawn per (X value, Y value) pair with non-zero marginal
        counts and is appended to ``self.areas``.  Each rectangle's width and
        height are proportional to the marginal counts of its two values, and
        its tooltip reports the expected and actual counts, the overall
        chi-square and the cell's standardized Pearson residual.

        ``*args`` is accepted and ignored.
        """
        for item in self.canvas.items():
            self.canvas.removeItem(item)
        if self.data is None or len(self.data) == 0 or \
                self.attrX is None or self.attrY is None:
            return
        data = self.data[:, [self.attrX, self.attrY]]

        def row_totals(cont):
            # Sum every row of a contingency.  A row that fails while being
            # read keeps the partial sum accumulated before the failure.
            totals = []
            for entry in cont:
                total = 0
                try:
                    for val in entry:
                        total += val
                except Exception:
                    pass
                totals.append(total)
            return totals

        contX = get_contingency(data, self.attrX, self.attrX)
        contY = get_contingency(data, self.attrY, self.attrY)
        # marginal counts of the x and y attribute values
        valsX = row_totals(contX)
        valsY = row_totals(contY)

        attr_x = data.domain[self.attrX]
        attr_y = data.domain[self.attrY]
        contXY, _ = get_conditional_distribution(data, [attr_x, attr_y])

        # Per-cell statistics, keyed by the (i, j) index pair so that value
        # names containing '-' cannot make two different cells share a key.
        n_instances = len(data)
        probs = {}
        for i, valx in enumerate(valsX):
            for j, valy in enumerate(valsY):
                try:
                    actualProb = contXY['%s-%s' % (attr_x.values[i], attr_y.values[j])]
                except Exception:
                    # combination not present in the joint distribution
                    actualProb = 0
                probs[i, j] = ((attr_x.values[i], valx), (attr_y.values[j], valy), actualProb, n_instances)

        # get text width of Y labels
        max_ylabel_w = 0
        for j in range(len(valsY)):
            xl = CanvasText(self.canvas, "", 0, 0, html_text=getHtmlCompatibleString(attr_y.values[j]), show=False)
            max_ylabel_w = max(int(xl.boundingRect().width()), max_ylabel_w)
        max_ylabel_w = min(max_ylabel_w, 200)  # upper limit for label widths
        # get text width of Y attribute name
        text = CanvasText(self.canvas, attr_y.name, x=0, y=0, bold=1, show=0, vertical=True)
        xOff = int(text.boundingRect().height() + max_ylabel_w)
        yOff = 55
        sqareSize = min(self.canvasView.width() - xOff - 35, self.canvasView.height() - yOff - 50)
        sqareSize = max(sqareSize, 10)
        self.canvasView.setSceneRect(0, 0, self.canvasView.width(), self.canvasView.height())

        # print graph name
        name = "<b>P(%s, %s) &#8800; P(%s)&times;P(%s)</b>" % (self.attrX, self.attrY, self.attrX, self.attrY)
        CanvasText(self.canvas, "", xOff + sqareSize / 2, 20, Qt.AlignCenter, html_text=name)
        CanvasText(self.canvas, "N = " + str(len(data)), xOff + sqareSize / 2, 38, Qt.AlignCenter, bold=0)

        ######################
        # compute chi-square
        chisquare = 0.0
        for i in range(len(valsX)):
            for j in range(len(valsY)):
                (_, xVal), (_, yVal), actual, sum_ = probs[i, j]
                expected = float(xVal*yVal)/float(sum_)
                if expected == 0: continue
                chisquare += (actual - expected)*(actual - expected) / expected

        ######################
        # draw rectangles
        currX = xOff
        max_xlabel_h = 0
        normX, normY = sum(valsX), sum(valsY)
        self.areas = []
        for i in range(len(valsX)):
            if valsX[i] == 0: continue
            currY = yOff
            width = int(float(sqareSize * valsX[i])/float(normX))

            for j in range(len(valsY)-1, -1, -1):   # this way we sort y values correctly
                if valsY[j] == 0: continue
                (xAttr, xVal), (yAttr, yVal), actual, sum_ = probs[i, j]
                height = int(float(sqareSize * valsY[j])/float(normY))

                # create rectangle
                selected = len(self.areas) in self.selection
                rect = CanvasRectangle(
                    self.canvas, currX+2, currY+2, width-4, height-4, z=-10,
                    onclick=self.select_area)
                rect.value_pair = i, j
                self.areas.append(rect)
                self.addRectIndependencePearson(
                    rect, currX+2, currY+2, width-4, height-4,
                    (xAttr, xVal), (yAttr, yVal), actual, sum_,
                    width=1 + 3 * selected,  # Ugly! This is needed since
                    # resize redraws the graph! When this is handled by resizing
                    # just the viewer, update_selection will take care of this
                    )

                # both marginals are non-zero here, so expected > 0
                expected = float(xVal*yVal)/float(sum_)
                pearson = (actual - expected) / sqrt(expected)
                tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(x)): <b>%d (%.2f%%)</b><hr>
                                <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of instances (p(y)): <b>%d (%.2f%%)</b><hr>
                                <b>Number Of Instances (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b>
                                <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                    self.attrX, getHtmlCompatibleString(xAttr), xVal, 100.0*float(xVal)/float(sum_),
                    self.attrY, getHtmlCompatibleString(yAttr), yVal, 100.0*float(yVal)/float(sum_),
                    expected, 100.0*float(xVal*yVal)/float(sum_*sum_),
                    actual, 100.0*float(actual)/float(sum_),
                    chisquare, pearson)
                rect.setToolTip(tooltipText)

                currY += height
                if currX == xOff:
                    # Y value labels are drawn only next to the leftmost column
                    CanvasText(self.canvas, "", xOff, currY - height / 2, Qt.AlignRight | Qt.AlignVCenter, html_text=getHtmlCompatibleString(attr_y.values[j]))

            xl = CanvasText(self.canvas, "", currX + width / 2, yOff + sqareSize, Qt.AlignHCenter | Qt.AlignTop, html_text=getHtmlCompatibleString(attr_x.values[i]))
            max_xlabel_h = max(int(xl.boundingRect().height()), max_xlabel_h)

            currX += width

        # show attribute names
        CanvasText(self.canvas, self.attrY, 0, yOff + sqareSize / 2, Qt.AlignLeft | Qt.AlignVCenter, bold=1, vertical=True)
        CanvasText(self.canvas, self.attrX, xOff + sqareSize / 2, yOff + sqareSize + max_xlabel_h, Qt.AlignHCenter | Qt.AlignTop, bold=1)
Beispiel #32
0
 def fit_storage(self, table):
     """Build a ``BayesStorageClassifier`` from ``table``.

     The classifier receives the contingencies computed for the table, the
     class frequencies (the diagonal of the contingency computed for the
     class variable) and the table's domain.
     """
     domain = table.domain
     cont = contingency.get_contingencies(table)
     class_contingency = contingency.get_contingency(table, domain.class_var)
     class_freq = np.diag(class_contingency)
     return BayesStorageClassifier(cont, class_freq, domain)
Beispiel #33
0
    # NOTE(review): this is a fragment; cattrs, dattrs, cdata, ddata, N,
    # Ns, times1, times2 and dataset are all defined outside the visible code.
    print(cattrs)
    print(dattrs)

    # Time p_index over every unordered pair of attributes in cattrs.
    t0 = time.time()
    for i in range(len(cattrs)):
        for j in range(i):
            print(p_index(cdata, cattrs[i], cattrs[j]))

    t1 = time.time() - t0
    print('t1', t1)

    # Time p_index_ct over the contingency table of every unordered pair of
    # attributes in dattrs.
    t0 = time.time()
    for i in range(len(dattrs)):
        for j in range(i):
            ct = np.array(
                contingency.get_contingency(ddata, dattrs[j], dattrs[i]))
            print(p_index_ct(ct))

    t2 = time.time() - t0
    print('t2', t2)

    # Record this run's size and both timings.
    Ns.append(N)
    times1.append(t1)
    times2.append(t2)

    # Plot both timing series against the recorded sizes.
    fig, ax = plt.subplots()
    ax.plot(Ns, times1, '-', Ns, times2, '-')
    ax.set_xlabel('Number of examples')
    ax.set_ylabel('Time in seconds')
    ax.set_title('Rank Scatterplots - {}'.format(dataset))
    plt.xlim(min(Ns), max(Ns))
Beispiel #34
0
    def updateGraph(self, *args):
        """Redraw the sieve diagram for the selected X and Y attributes.

        The canvas is cleared first.  Nothing more is drawn when there is no
        data, when either attribute is unset, or when the canvas is too small.
        Otherwise one rectangle is drawn per (X value, Y value) pair with
        non-zero marginal counts; its width and height are proportional to
        the marginal counts of the two values, and its tooltip reports the
        expected and actual counts, the overall chi-square and the cell's
        standardized Pearson residual.

        ``*args`` is accepted and ignored.
        """
        for item in self.canvas.items():
            self.canvas.removeItem(item)  # remove all canvas items
        if not self.data: return
        if not self.attrX or not self.attrY: return

        data = self.getConditionalData()
        if not data or len(data) == 0: return

        def row_totals(cont):
            # Sum every row of a contingency.  A row that fails while being
            # read keeps the partial sum accumulated before the failure.
            totals = []
            for entry in cont:
                total = 0
                try:
                    for val in entry:
                        total += val
                except Exception:
                    pass
                totals.append(total)
            return totals

        # distributions of the X and Y attributes
        contX = get_contingency(data, self.attrX, self.attrX)
        contY = get_contingency(data, self.attrY, self.attrY)
        valsX = row_totals(contX)
        valsY = row_totals(contY)

        attr_x = data.domain[self.attrX]
        attr_y = data.domain[self.attrY]
        contXY = self.getConditionalDistributions(data, [attr_x, attr_y])

        # Per-cell statistics, keyed by the (i, j) index pair so that value
        # names containing '-' cannot make two different cells share a key.
        n_instances = len(data)
        probs = {}
        for i, valx in enumerate(valsX):
            for j, valy in enumerate(valsY):
                try:
                    actualProb = contXY['%s-%s' % (attr_x.values[i],
                                                   attr_y.values[j])]
                except Exception:
                    # combination not present in the joint distribution
                    actualProb = 0
                probs[i, j] = ((attr_x.values[i], valx),
                               (attr_y.values[j], valy),
                               actualProb, n_instances)

        # get text width of Y attribute name
        text = OWCanvasText(self.canvas,
                            attr_y.name,
                            x=0,
                            y=0,
                            bold=1,
                            show=0,
                            vertical=True)
        xOff = int(text.boundingRect().height() + 40)
        yOff = 50
        sqareSize = min(self.canvasView.width() - xOff - 35,
                        self.canvasView.height() - yOff - 30)
        if sqareSize < 0: return  # canvas is too small to draw rectangles
        self.canvasView.setSceneRect(0, 0, self.canvasView.width(),
                                     self.canvasView.height())

        # print graph name
        if self.attrCondition == "(None)":
            name = "<b>P(%s, %s) &#8800; P(%s)&times;P(%s)</b>" % (
                self.attrX, self.attrY, self.attrX, self.attrY)
        else:
            condValue = getHtmlCompatibleString(self.attrConditionValue)
            name = "<b>P(%s, %s | %s = %s) &#8800; P(%s | %s = %s)&times;P(%s | %s = %s)</b>" % (
                self.attrX, self.attrY, self.attrCondition, condValue,
                self.attrX, self.attrCondition, condValue,
                self.attrY, self.attrCondition, condValue)
        OWCanvasText(self.canvas,
                     "",
                     xOff + sqareSize / 2,
                     20,
                     Qt.AlignCenter,
                     htmlText=name)
        OWCanvasText(self.canvas,
                     "N = " + str(len(data)),
                     xOff + sqareSize / 2,
                     38,
                     Qt.AlignCenter,
                     bold=0)

        ######################
        # compute chi-square
        chisquare = 0.0
        for i in range(len(valsX)):
            for j in range(len(valsY)):
                (_, xVal), (_, yVal), actual, sum_ = probs[i, j]
                expected = float(xVal * yVal) / float(sum_)
                if expected == 0: continue
                chisquare += (actual - expected) * (actual - expected) / expected

        ######################
        # draw rectangles
        currX = xOff
        max_ylabel_w = 0

        normX, normY = sum(valsX), sum(valsY)
        for i in range(len(valsX)):
            if valsX[i] == 0: continue
            currY = yOff
            width = int(float(sqareSize * valsX[i]) / float(normX))

            for j in range(len(valsY) - 1, -1,
                           -1):  # this way we sort y values correctly
                if valsY[j] == 0: continue
                (xAttr, xVal), (yAttr, yVal), actual, sum_ = probs[i, j]
                height = int(float(sqareSize * valsY[j]) / float(normY))

                # create rectangle
                rect = OWCanvasRectangle(self.canvas,
                                         currX + 2,
                                         currY + 2,
                                         width - 4,
                                         height - 4,
                                         z=-10)
                self.addRectIndependencePearson(rect, currX + 2, currY + 2,
                                                width - 4, height - 4,
                                                (xAttr, xVal), (yAttr, yVal),
                                                actual, sum_)

                # both marginals are non-zero here, so expected > 0
                expected = float(xVal * yVal) / float(sum_)
                pearson = (actual - expected) / sqrt(expected)
                tooltipText = """<b>X Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(x)): <b>%d (%.2f%%)</b><hr>
                                <b>Y Attribute: %s</b><br>Value: <b>%s</b><br>Number of examples (p(y)): <b>%d (%.2f%%)</b><hr>
                                <b>Number Of Examples (Probabilities):</b><br>Expected (p(x)p(y)): <b>%.1f (%.2f%%)</b><br>Actual (p(x,y)): <b>%d (%.2f%%)</b>
                                <hr><b>Statistics:</b><br>Chi-square: <b>%.2f</b><br>Standardized Pearson residual: <b>%.2f</b>""" % (
                    self.attrX, getHtmlCompatibleString(xAttr), xVal,
                    100.0 * float(xVal) / float(sum_), self.attrY,
                    getHtmlCompatibleString(yAttr), yVal,
                    100.0 * float(yVal) / float(sum_), expected,
                    100.0 * float(xVal * yVal) / float(sum_ * sum_), actual,
                    100.0 * float(actual) / float(sum_), chisquare, pearson)
                rect.setToolTip(tooltipText)

                currY += height
                if currX == xOff:
                    # Y value labels are drawn only next to the leftmost column
                    xl = OWCanvasText(self.canvas,
                                      "",
                                      xOff - 10,
                                      currY - height / 2,
                                      Qt.AlignRight | Qt.AlignVCenter,
                                      htmlText=getHtmlCompatibleString(
                                          attr_y.values[j]))
                    max_ylabel_w = max(int(xl.boundingRect().width()),
                                       max_ylabel_w)

            OWCanvasText(self.canvas,
                         "",
                         currX + width / 2,
                         yOff + sqareSize + 5,
                         Qt.AlignCenter,
                         htmlText=getHtmlCompatibleString(attr_x.values[i]))
            currX += width

        # show attribute names
        OWCanvasText(self.canvas,
                     self.attrY,
                     max(xOff - 20 - max_ylabel_w, 20),
                     yOff + sqareSize / 2,
                     Qt.AlignRight | Qt.AlignVCenter,
                     bold=1,
                     vertical=True)
        OWCanvasText(self.canvas,
                     self.attrX,
                     xOff + sqareSize / 2,
                     yOff + sqareSize + 15,
                     Qt.AlignCenter,
                     bold=1)
Beispiel #35
0
    def draw_distributions(self):
        """Draw distributions with discrete attributes

        Does nothing unless ``show_distributions`` is set, data is present
        and the domain has a discrete class.  For every displayed discrete
        attribute, one bar is attached per (attribute value, class value)
        pair; the bar starts at the attribute's axis index and its length is
        the pair's count scaled so that the largest count over all cached
        contingencies spans half an axis unit.

        The per-attribute contingencies are computed once and cached in
        ``self.domain_contingencies``.
        """
        if not (self.show_distributions and self.data is not None
                and self.domain.has_discrete_class):
            return
        class_ = self.domain.class_var
        class_count = len(class_.values)

        # we create a hash table of possible class values (happens only if we have a discrete class)
        if self.domain_contingencies is None:
            self.domain_contingencies = dict(
                zip([attr for attr in self.domain if attr.is_discrete],
                    get_contingencies(self.data, skipContinuous=True)))
            self.domain_contingencies[class_] = get_contingency(
                self.data, class_, class_)

        max_count = max([
            cont.max() for cont in self.domain_contingencies.values()
        ] or [1])
        sorted_class_values = get_variable_values_sorted(self.domain.class_var)

        for axis_idx, attr_idx in enumerate(self.attribute_indices):
            attr = self.domain[attr_idx]
            # Only discrete attributes have a cached contingency and a list
            # of values; skip everything else.  (The previous check skipped
            # the discrete attributes instead, i.e. it was inverted.)
            if not attr.is_discrete:
                continue

            contingency = self.domain_contingencies[attr]
            attr_len = len(attr.values)

            # we create a hash table of variable values and their indices
            sorted_variable_values = get_variable_values_sorted(attr)

            # bar thickness is the same for every value of this attribute
            height = 0.7 / float(class_count * attr_len)

            # create bar curve
            for j in range(attr_len):
                attribute_value = sorted_variable_values[j]
                value_count = contingency[:, attribute_value]
                y_off = float(1.0 + 2.0 * j) / float(2 * attr_len)

                for i in range(class_count):
                    class_value = sorted_class_values[i]

                    color = QColor(*self.colors[i])
                    color.setAlpha(self.alpha_value)

                    width = float(
                        value_count[class_value] * 0.5) / float(max_count)

                    y_low_bottom = y_off + float(
                        class_count * height) / 2.0 - i * height
                    curve = PolygonCurve(QPen(color),
                                         QBrush(color),
                                         xData=[
                                             axis_idx, axis_idx + width,
                                             axis_idx + width, axis_idx
                                         ],
                                         yData=[
                                             y_low_bottom, y_low_bottom,
                                             y_low_bottom - height,
                                             y_low_bottom - height
                                         ],
                                         tooltip=attr.name)
                    curve.attach(self)
Beispiel #36
0
def grid_bin(data, xvar, yvar, xbins, ybins, zvar=None):
    """Compute contingencies of `data` over a 2D grid of x/y bins.

    `xvar` and `yvar` are discretized using the inner edges of `xbins`
    and `ybins` (`bins[1:-1]`); the outer edges (`bins[0]`, `bins[-1]`)
    define the value range to which the rows are restricted before
    counting.

    Args:
        data: The table to bin (presumably an `Orange.data.Table`;
            it is passed to Orange filters and to `from_table`).
        xvar: Variable binned along the contingency rows.
        yvar: Variable binned along the contingency columns.
        xbins: Sequence of bin edges for `xvar`, outer edges included.
        ybins: Sequence of bin edges for `yvar`, outer edges included.
        zvar: Optional extra variable. When `is_discrete(zvar)` holds,
            one contingency is computed per value of `zvar` and the
            results are stacked along a third axis.

    Returns:
        `Tree(xbins, ybins, contingencies, None)` where `contingencies`
        is a numpy array.
    """
    x_disc = Discretizer.create_discretized_var(data, xvar, xbins[1:-1])
    y_disc = Discretizer.create_discretized_var(data, yvar, ybins[1:-1])

    x_min, x_max = xbins[0], xbins[-1]
    y_min, y_max = ybins[0], ybins[-1]

    query_vars = [x_disc, y_disc]
    if zvar is not None:
        query_vars.append(zvar)
    querydomain = Orange.data.Domain(query_vars)

    def interval_filter(var, low, high):
        # Keep rows whose `var` value lies between `low` and `high`.
        return Orange.data.filter.Values([
            Orange.data.filter.FilterContinuous(
                var,
                max=high,
                min=low,
                oper=Orange.data.filter.FilterContinuous.Between)
        ])

    def value_filter(var, val):
        # Keep rows whose discrete `var` equals `val`.
        return Orange.data.filter.Values(
            [Orange.data.filter.FilterDiscrete(var, [val])])

    def filters_join(filters):
        # Merge several `Values` filters into one holding all conditions.
        return Orange.data.filter.Values(
            [cond for f in filters for cond in f.conditions])

    inf_bounds = np.isinf([x_min, x_max, y_min, y_max])
    if all(inf_bounds):
        # Every bound is infinite: no need to filter the data.
        subset = data
    else:
        range_filter = filters_join([
            interval_filter(xvar, x_min, x_max),
            interval_filter(yvar, y_min, y_max),
        ])
        subset = range_filter(data)

    # The domain conversion does not depend on the z value, so do it once
    # instead of once per value of `zvar`.
    binned = subset.from_table(querydomain, subset)

    if is_discrete(zvar):
        # One contingency per z value, stacked along the third axis.
        contingencies = np.dstack([
            contingency.get_contingency(
                value_filter(zvar, val)(binned),
                col_variable=y_disc,
                row_variable=x_disc)
            for val in zvar.values
        ])
    else:
        contingencies = contingency.get_contingency(
            binned, col_variable=y_disc, row_variable=x_disc)

    return Tree(xbins, ybins, np.asarray(contingencies), None)