Example #1
0
    def test_make_proxy_disc(self):
        """Check master, equality, hash and pickling of discrete proxies."""
        abc = DiscreteVariable("abc", values="abc", ordered=True)
        abc1 = abc.make_proxy()
        abc2 = abc1.make_proxy()
        # A variable is its own master; a proxy, and a proxy of a proxy,
        # both report the original variable as master.
        self.assertIs(abc.master, abc)
        self.assertIs(abc1.master, abc)
        self.assertIs(abc2.master, abc)
        # The variable and its proxies compare equal and hash alike.
        self.assertEqual(abc, abc1)
        self.assertEqual(abc, abc2)
        self.assertEqual(abc1, abc2)
        self.assertEqual(hash(abc), hash(abc1))
        self.assertEqual(hash(abc1), hash(abc2))

        # A separately constructed variable with the same name, values and
        # ordering is expected to compare unequal to the first one.
        abcx = DiscreteVariable("abc", values="abc", ordered=True)
        self.assertNotEqual(abc, abcx)

        # A proxy pickled on its own is expected to come back attached to
        # the existing master object.
        abc1p = pickle.loads(pickle.dumps(abc1))
        self.assertIs(abc1p.master, abc)
        self.assertEqual(abc1p, abc)

        # Pickling the master together with its proxies must keep all of
        # them bound to one common master after the round trip.
        abcp, abc1p, abc2p = pickle.loads(pickle.dumps((abc, abc1, abc2)))
        # NOTE(review): the next assertion compares an attribute with
        # itself, so it can only fail if accessing `.master` raises --
        # confirm whether `abcp` was meant as the second argument.
        self.assertIs(abcp.master, abcp.master)
        self.assertIs(abc1p.master, abcp.master)
        self.assertIs(abc2p.master, abcp.master)
        self.assertEqual(abcp, abc1p)
        self.assertEqual(abcp, abc2p)
        self.assertEqual(abc1p, abc2p)
Example #2
0
    def test_colors(self):
        """Check lazy creation, assignment and resizing of variable colors."""
        var = DiscreteVariable.make("a", values=["F", "M"])
        # No colors are stored until the `colors` property is first read.
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        # After the first read the same array object is returned again.
        self.assertIs(var._colors, var.colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        # Assigned colors are exposed read-only as well.
        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertFalse(var.colors.flags.writeable)
        # Writing into the array directly must fail ...
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]
        # ... while `set_color` changes a single row.
        var.set_color(0, [42, 41, 40])
        np.testing.assert_almost_equal(var.colors, [[42, 41, 40], [3, 4, 5]])

        # Hex strings under attributes["colors"] are read as RGB triplets.
        var = DiscreteVariable.make("x", values=["A", "B"])
        var.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])

        # Test ncolors adapts to nvalues
        var = DiscreteVariable.make('foo', values=['d', 'r'])
        self.assertEqual(len(var.colors), 2)
        var.add_value('e')
        self.assertEqual(len(var.colors), 3)
        # A color set explicitly must survive a later `add_value`.
        user_defined = (0, 0, 0)
        var.set_color(2, user_defined)
        var.add_value('k')
        self.assertEqual(len(var.colors), 4)
        np.testing.assert_array_equal(var.colors[2], user_defined)
Example #3
0
    def test_repr(self):
        """repr() lists name and values, plus `ordered` once it is set."""
        binary = DiscreteVariable.make("a", values=["F", "M"])
        expected = "DiscreteVariable(name='a', values=['F', 'M'])"
        self.assertEqual(repr(binary), expected)

        # Turning on `ordered` must show up in the representation.
        binary.ordered = True
        expected = "DiscreteVariable(name='a', values=['F', 'M'], ordered=True)"
        self.assertEqual(repr(binary), expected)

        # All seven values are spelled out in full.
        digits = DiscreteVariable.make("a", values="1234567")
        expected = (
            "DiscreteVariable(name='a', values=['1', '2', '3', '4', '5', '6', '7'])")
        self.assertEqual(repr(digits), expected)
Example #4
0
    def test_copy_descriptor_discrete(self):
        """copy_descriptor keeps type, values, ordering and copies attributes."""
        source = DiscreteVariable("foo", values=list("abc"), ordered=True)
        source.attributes = {"bar": 42, "baz": 13}
        clone = copy_descriptor(source)
        self.assertIsInstance(clone, DiscreteVariable)
        self.assertEqual(clone.name, "foo")
        self.assertEqual(list(clone.values), list("abc"))
        self.assertTrue(clone.ordered)
        # Attributes must be equal but held in a separate dict object.
        self.assertEqual(clone.attributes, source.attributes)
        self.assertIsNot(clone.attributes, source.attributes)

        # A second argument renames the copy; ordering is still carried over.
        source = DiscreteVariable("foo", values=list("abc"), ordered=False)
        clone = copy_descriptor(source, "cux")
        self.assertEqual(clone.name, "cux")
        self.assertFalse(clone.ordered)
Example #5
0
 def test_unpickle(self):
     """Unpickling must not alter the values of an existing variable."""
     d1 = DiscreteVariable("A", values=["two", "one"])
     s = pickle.dumps(d1)
     d2 = DiscreteVariable.make("A", values=["one", "two", "three"])
     # Snapshot d2's values so that any change caused by unpickling d1
     # can be detected below.
     d2_values = tuple(d2.values)
     d1c = pickle.loads(s)
     # See: gh-3238
     # The unpickle reconstruction picks an existing variable (d2), on which
     # __setstate__ or __dict__.update is called
     self.assertSequenceEqual(d2.values, d2_values)
     self.assertSequenceEqual(d1c.values, d1.values)
     # Second scenario: pickle d2, clear the caches, create a new "A" with
     # fewer values and unpickle; the pickled value order is expected back.
     s = pickle.dumps(d2)
     DiscreteVariable._clear_all_caches()  # [comment redacted]
     d1 = DiscreteVariable("A", values=["one", "two"])
     d2 = pickle.loads(s)
     self.assertSequenceEqual(d2.values, ["two", "one", "three"])
    def _create_corpus(self):
        """Assemble a Corpus from the loaded text documents.

        Name, path and content of every document become string metas.  The
        document category is used as the class variable only when more than
        one distinct category exists.  Returns None when there are no
        documents.
        """
        meta_names = ["name", "path", "content"]
        text_categories = list(set(t.category for t in self._text_data))
        category_var = DiscreteVariable.make(
            "category", values=list(set(text_categories)))

        rows = [[doc.name, doc.path, doc.content] for doc in self._text_data]
        codes = [category_var.to_val(doc.category) for doc in self._text_data]

        if len(text_categories) > 1:
            class_vars = category_var
            class_column = np.array(codes)
        else:
            # With at most one category there is nothing to predict: use an
            # empty class part with one (zero-width) row per document.
            class_vars = []
            class_column = np.empty((len(rows), 0))

        meta_vars = [StringVariable.make(name) for name in meta_names]
        domain = Domain([], class_vars, meta_vars)
        domain["name"].attributes["title"] = True

        metas = np.array(rows, dtype=object)
        if not len(metas):
            return None
        return Corpus(domain,
                      Y=class_column,
                      metas=metas,
                      text_features=[domain.metas[2]])
Example #7
0
    def create_discretized_var(cls, var, points):
        """Create a discrete variable that bins `var` at the given cut points.

        With cut points, one value label is produced per interval between
        -inf, the points and +inf; without any, the variable has the single
        value "single_value".  The result carries `cls(var, points)` as its
        compute_value, `var` as `source_variable` and an SQL translator in
        `to_sql`.
        """
        cut_points = list(points)
        if not cut_points:
            values = ["single_value"]
            to_sql = SingleValueSql(values[0])
        else:
            lower_bounds = [-np.inf] + cut_points
            upper_bounds = cut_points + [np.inf]
            values = [
                cls._fmt_interval(low, high, var.number_of_decimals)
                for low, high in zip(lower_bounds, upper_bounds)]
            to_sql = BinSql(var, cut_points)

        discretized = DiscreteVariable(
            name=var.name, values=values, compute_value=cls(var, points))
        discretized.source_variable = var
        discretized.to_sql = to_sql
        return discretized
Example #8
0
    def extend_attributes(self, X, feature_names, feature_values=None,
                          compute_values=None, var_attrs=None, sparse=False):
        """
        Add feature columns to the corpus and rebuild its domain.

        A feature whose entry in `feature_values` is not None becomes a
        DiscreteVariable with those values; every other feature becomes a
        ContinuousVariable.

        Args:
            X (numpy.ndarray or scipy.sparse.csr_matrix): Columns to append.
            feature_names (list): Names of the new features.
            feature_values (list): Per-feature list of possible values, or
                None for a continuous feature.
            compute_values (list): Per-feature compute value, or None.
            var_attrs (dict): Entries added to each new variable's attributes.
            sparse (bool): Value stored in each new variable's `sparse` flag.
        """
        # Attach the new columns; the result is sparse as soon as either
        # side is sparse.
        if self.X.size == 0:
            stacked = X
        elif sp.issparse(self.X) or sp.issparse(X):
            stacked = sp.hstack((self.X, X)).tocsr()
        else:
            stacked = np.hstack((self.X, X))
        self.X = stacked

        if compute_values is None:
            compute_values = [None] * X.shape[1]
        if feature_values is None:
            feature_values = [None] * X.shape[1]

        attributes = self.domain.attributes
        update_attrs = isinstance(var_attrs, dict)
        for name, values, compute_value in zip(
                feature_names, feature_values, compute_values):
            if values is None:
                var = ContinuousVariable(name, compute_value=compute_value)
            else:
                var = DiscreteVariable(
                    name, values=values, compute_value=compute_value)
            # Set after construction rather than through the constructor so
            # this also works with Orange < 3.8.0.
            var.sparse = sparse
            if compute_value is not None:
                # Let the compute value know which variable it produces.
                compute_value.variable = var
            if update_attrs:
                var.attributes.update(var_attrs)
            attributes += (var, )

        self.domain = Domain(
            attributes=attributes,
            class_vars=self.domain.class_vars,
            metas=self.domain.metas
        )
Example #9
0
    def test_repr(self):
        """repr() shows name and values, then optional flags once set."""
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'])")
        # `base_value` appears in the representation once it is assigned.
        var.base_value = 1
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'], base_value=1)")
        # `ordered` is listed before `base_value`.
        var.ordered = True
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'], "
            "ordered=True, base_value=1)")

        # With seven values only the first five are shown, followed by "...".
        var = DiscreteVariable.make("a", values="1234567")
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['1', '2', '3', '4', '5', ...])")
Example #10
0
    def test_colors(self):
        """Check lazy creation and assignment of variable colors."""
        var = DiscreteVariable.make("a", values=["F", "M"])
        # No colors are stored until the `colors` property is first read.
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        # After the first read the same array object is returned again.
        self.assertIs(var._colors, var.colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        # Assigned colors are exposed read-only as well.
        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertFalse(var.colors.flags.writeable)
        # Writing into the array directly must fail ...
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]
        # ... while `set_color` changes a single row.
        var.set_color(0, [42, 41, 40])
        np.testing.assert_almost_equal(var.colors, [[42, 41, 40], [3, 4, 5]])

        # Hex strings under attributes["colors"] are read as RGB triplets.
        var = DiscreteVariable.make("x", values=["A", "B"])
        var.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])
Example #11
0
    def create_discretized_var(cls, var, points):
        """Create a discrete variable that bins `var` at the given cut points.

        Args:
            cls: Discretizer class; `cls(var, points)` becomes the new
                variable's compute_value and `cls._fmt_interval` formats
                the interval labels.
            var: Source variable; its `name`, `number_of_decimals` and
                `to_sql()` are used.
            points: Iterable of cut points (a list or a NumPy array).

        Returns:
            A DiscreteVariable named "D_" + var.name.  With cut points it
            has one value per interval between -inf, the points and +inf;
            without any it has the single value "single_value".  Its
            `to_sql` attribute is a callable returning an SQL expression.
        """
        lpoints = list(points)
        # Test the list copy rather than `points` itself: the truth value of
        # a NumPy array with more than one element raises ValueError.
        if lpoints:
            values = [
                cls._fmt_interval(low, high, var.number_of_decimals)
                for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]

            def discretized_attribute():
                return 'bin(%s, ARRAY%s)' % (var.to_sql(), str(lpoints))
        else:
            values = ["single_value"]

            def discretized_attribute():
                return "'%s'" % values[0]

        dvar = DiscreteVariable(name="D_" + var.name, values=values)
        dvar.compute_value = cls(var, points)
        dvar.source_variable = var
        dvar.to_sql = discretized_attribute
        return dvar
Example #12
0
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        """Pick a variable type for a database column from its type code.

        The type code is `field_metadata[0]`.  Float-like columns become
        continuous, date/time columns become time variables, booleans become
        discrete with values 'false'/'true'.  Integer and character columns
        become discrete only when `inspect_table` is given and distinct
        values are found; otherwise integers are continuous and everything
        else is a string variable.
        """
        type_code = field_metadata[0]

        floatish_codes = (700, 701, 1700)  # real, float8, numeric
        int_codes = (20, 21, 23)  # bigint, int, smallint
        char_codes = (25, 1042, 1043,)  # text, char, varchar
        boolean_codes = (16,)  # bool
        date_codes = (1082, 1114, 1184, )  # date, timestamp, timestamptz
        # time, timestamp, timestamptz, timetz
        time_codes = (1083, 1114, 1184, 1266,)

        if type_code in floatish_codes:
            return ContinuousVariable.make(field_name)

        holds_time = type_code in time_codes
        holds_date = type_code in date_codes
        if holds_time or holds_date:
            time_var = TimeVariable.make(field_name)
            time_var.have_date |= holds_date
            time_var.have_time |= holds_time
            return time_var

        if type_code in int_codes:
            distinct = (self.get_distinct_values(field_name, inspect_table)
                        if inspect_table else None)
            if distinct:
                return DiscreteVariable.make(field_name, distinct)
            return ContinuousVariable.make(field_name)

        if type_code in boolean_codes:
            return DiscreteVariable.make(field_name, ['false', 'true'])

        if type_code in char_codes and inspect_table:
            # remove trailing spaces
            distinct = [
                value.rstrip() for value in
                self.get_distinct_values(field_name, inspect_table)]
            if distinct:
                return DiscreteVariable.make(field_name, distinct)

        return StringVariable.make(field_name)
Example #13
0
    def test_to_val(self):
        """to_val maps indices and labels to 0, "?" to NaN, unknowns raise."""
        var = DiscreteVariable(name="Feature 0", values=["F", "M"])

        # An int index, the label itself and a float index all map to 0.
        for raw in (0, "F", 0.):
            self.assertEqual(var.to_val(raw), 0)
        self.assertTrue(math.isnan(var.to_val("?")))

        # TODO: with self.assertRaises(ValueError): var.to_val(2)
        self.assertRaises(ValueError, var.to_val, "G")
Example #14
0
    def test_find_compatible_ordered(self):
        """Check `_find_compatible` lookups for ordered discrete variables."""
        abc = DiscreteVariable("abc", values="abc", ordered=True)

        find_comp = DiscreteVariable._find_compatible

        # Without ordered=True the ordered variable must not be returned.
        self.assertIsNone(find_comp("abc"))
        self.assertIsNone(find_comp("abc", list("abc")))
        # With ordered=True it is found for no values, for leading parts of
        # its value list, and for its value list with one more value added.
        self.assertIs(find_comp("abc", ordered=True), abc)
        self.assertIs(find_comp("abc", ["a"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b", "c"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b", "c", "d"], ordered=True), abc)

        # A different value order yields a distinct ordered variable.
        abd = DiscreteVariable.make(
            "abc", values=["a", "d", "b"], ordered=True)
        self.assertIsNot(abc, abd)

        # An unordered variable with the same values is distinct as well.
        abc_un = DiscreteVariable.make("abc", values=["a", "b", "c"])
        self.assertIsNot(abc_un, abc)

        # Each lookup now resolves to the variable matching its ordering.
        self.assertIs(
            find_comp("abc", values=["a", "d", "b"], ordered=True), abd)
        self.assertIs(find_comp("abc", values=["a", "b", "c"]), abc_un)
Example #15
0
    def __init__(self, filename):
        """Read a VCF file and expose its calls as arrays and variables.

        Args:
            filename: Path of the VCF file to read.

        Sets:
            samples: array of the reader's sample names.
            gq: float array of per-record, per-sample GQ values with NaN
                replaced by `np.nan_to_num`.
            gt: boolean array, True where the GT string is not "0/0".
            records: list of all parsed records.
            variables: one DiscreteVariable per record, named
                "<CHROM>-<POS>", with values "0"/"1" and CHROM, POS, REF
                and ALT stored as string attributes.
        """
        # Parse everything while the file is open so that the handle is
        # closed deterministically instead of being left to the garbage
        # collector.
        with open(filename, "r") as handle:
            reader = vcf.Reader(handle)
            records = list(reader)
            samples = reader.samples

        self.samples = np.array(samples)

        self.gq = np.array([[s.data.GQ for s in r.samples] for r in records],
                           dtype="f")
        self.gq = np.nan_to_num(self.gq)

        gt = np.array([[s.data.GT for s in r.samples] for r in records])
        self.gt = gt != "0/0"
        self.records = records

        self.variables = [
            DiscreteVariable("%s-%s" % (r.CHROM, r.POS), values=["0", "1"])
            for r in self.records
        ]

        for v, r in zip(self.variables, records):
            v.attributes["CHROM"] = str(r.CHROM)
            v.attributes["POS"] = str(r.POS)
            v.attributes["REF"] = str(r.REF)
            v.attributes["ALT"] = "".join(str(s) for s in r.ALT)
Example #16
0
    def test_select_data_discrete(self):
        """
        Test `select_data` on data with a discrete (two-valued) class.
        """
        w = self.widget

        # test with data set for logistic regression - class discrete
        domain = Domain([ContinuousVariable('a'),
                         ContinuousVariable('b')],
                        DiscreteVariable('c', values=['a', 'b']))
        data = Table.from_numpy(domain, [[1, 2], [1, 2]], [0, 1])

        self.send_signal(w.Inputs.data, data)
        # The selection keeps every row, both attributes and both class
        # values, in the same order as in the input.
        self.assertEqual(len(w.select_data()), len(data))
        self.assertEqual(len(w.select_data().domain.attributes), 2)
        self.assertEqual(len(w.select_data().domain.class_var.values), 2)
        self.assertEqual(w.select_data().domain.class_var.values[1],
                         data.domain.class_var.values[1])
        self.assertEqual(w.select_data().domain.class_var.values[0],
                         data.domain.class_var.values[0])
        # The attributes are the ones chosen as x and y in the widget.
        self.assertEqual(w.select_data().domain.attributes[0].name, w.attr_x)
        self.assertEqual(w.select_data().domain.attributes[1].name, w.attr_y)
        # The first class value is the widget's target class.
        self.assertEqual(w.select_data().domain.class_var.values[0],
                         w.target_class)
Example #17
0
 def test_metadata(self):
     """
     Widget should interpret meta data which are continuous or discrete in
     the same way as features or target. However still one variable should
     be target or feature.
     """
     # A table with no attributes and no class: one continuous and one
     # discrete variable, both stored as metas.
     table = Table(
         Domain(
             [],
             [],
             [
                 ContinuousVariable("a"),
                 DiscreteVariable("b", values=["y", "n"])
             ],
         ),
         list(zip([42.48, 16.84, 15.23, 23.8], "yynn")),
     )
     # Wrap Discretize so the real implementation still runs while the
     # call itself can be observed.
     with patch("Orange.widgets.visualize.owsieve.Discretize",
                wraps=Discretize) as disc:
         self.send_signal(self.widget.Inputs.data, table)
         self.assertTrue(disc.called)
     # Both metas must be present and discrete in the widget's data.
     metas = self.widget.discrete_data.domain.metas
     self.assertEqual(len(metas), 2)
     self.assertTrue(all(attr.is_discrete for attr in metas))
Example #18
0
 def test_XY_large(self):
     """X and Y of a table above AUTO_DL_LIMIT need an explicit download."""
     from Orange.data.sql.table import AUTO_DL_LIMIT as DLL
     # Three columns of 0/1 values, 100 rows more than the limit.
     mat = np.random.randint(0, 2, (DLL + 100, 3))
     conn, table_name = self.create_sql_table(mat)
     # The type hint declares `col2` as the discrete class variable.
     sql_table = SqlTable(conn,
                          table_name,
                          type_hints=Domain([],
                                            DiscreteVariable(name='col2',
                                                             values=('0',
                                                                     '1',
                                                                     '2'))))
     # Accessing X or Y before downloading must fail ...
     self.assertRaises(ValueError, lambda: sql_table.X)
     self.assertRaises(ValueError, lambda: sql_table.Y)
     # ... and so must a row-limited download without partial=True.
     with self.assertRaises(ValueError):
         sql_table.download_data(DLL + 10)
     # Download partial data
     sql_table.download_data(DLL + 10, partial=True)
     assert_almost_equal(sql_table.X, mat[:DLL + 10, :2])
     assert_almost_equal(sql_table.Y.flatten()[:DLL + 10], mat[:DLL + 10,
                                                               2])
     # Download all data
     sql_table.download_data()
     assert_almost_equal(sql_table.X, mat[:, :2])
     assert_almost_equal(sql_table.Y.flatten(), mat[:, 2])
Example #19
0
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): Pairs whose first item is the name of a
            field to include as a meta variable; the second item is not
            used in this function.

    Returns:
        corpus: The output Corpus, with the included fields as metas and a
            discrete 'section' class variable.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            # The date field is the only one stored as a time variable; it
            # is also handed to the record conversion below.
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    # Falsy class values are left out of the value list.  Sort the labels:
    # set iteration order for strings is not stable between interpreter
    # runs, which would make the value order of 'section' irreproducible.
    section_values = sorted(map(str, set(filter(None, class_values))))
    class_vars = [DiscreteVariable('section', values=section_values)]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    # Encode the class values and shape them as a single column.
    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
Example #20
0
    def send_data(self):
        """Send the data annotated with cluster labels, and the centroids.

        The clustering is looked up for the selected k.  When there is no
        data or no usable clustering (missing, or a string stored in its
        place), None is sent on both outputs.
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            self.Outputs.annotated_data.send(None)
            self.Outputs.centroids.send(None)
            return

        domain = self.data.domain
        cluster_names = ["C%d" % number for number in range(1, km.k + 1)]
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"), values=cluster_names)
        clust_ids = km(self.data)
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))

        if km.silhouette_samples is None:
            self.Warning.no_silhouettes()
            scores = np.nan
        else:
            self.Warning.no_silhouettes.clear()
            # arctan / pi + 0.5 maps the scores into the interval (0, 1).
            scores = np.arctan(km.silhouette_samples) / np.pi + 0.5

        # Add both columns as metas, then fill them in place.
        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var, silhouette_var]))
        annotated.get_column_view(cluster_var)[0][:] = clust_ids.X.ravel()
        annotated.get_column_view(silhouette_var)[0][:] = scores

        centroids = Table(Domain(km.pre_domain.attributes), km.centroids)

        self.Outputs.annotated_data.send(annotated)
        self.Outputs.centroids.send(centroids)
Example #21
0
    def set_marking_items(self, items):
        """Store the marking items and refill the mark-input combo box.

        The combo always gets an "ID" entry, followed by one entry for every
        variable name shared by `items` and the graph items whose variable
        is exactly a StringVariable on both sides.  The radio button is
        enabled only when `items` is non-empty.
        """
        self.markInputCombo.clear()
        self.markInputRadioButton.setEnabled(False)
        self.markInputItems = items

        self.warning()

        if items is None:
            return

        if self.graph is None or self.graph.items() is None:
            self.warning(
                'No graph provided or no items attached to the graph.')
            return

        domain = self.graph.items().domain

        if len(items) == 0:
            return

        item_names = set(
            x.name
            for x in chain(items.domain.variables, items.domain.metas))
        graph_names = set(
            x.name for x in chain(domain.variables, domain.metas))
        shared_names = item_names & graph_names

        self.markInputCombo.addItem(
            gui.attributeIconDict[gui.vartype(DiscreteVariable())], "ID")

        for name in shared_names:
            graph_var = domain[name]
            marking_var = items.domain[name]

            if type(graph_var) == type(marking_var) == StringVariable:
                icon = gui.attributeIconDict[gui.vartype(graph_var)]
                self.markInputCombo.addItem(icon, graph_var.name)

        self.markInputRadioButton.setEnabled(True)
    def test_result_shape_numpy(self):
        """
        Check the shapes of predictions made on plain numpy arrays.
        """
        iris = Table('iris')
        # Two-class variant: first 100 rows with a binary class variable.
        binary_class = DiscreteVariable("iris", values=["a", "b"])
        iris_bin = Table(
            Domain(iris.domain.attributes, binary_class),
            iris.X[:100], iris.Y[:100])

        for learner in all_learners():
            with self.subTest(learner.__name__):
                # These two learners wrap a base learner.
                wraps_base = learner in (ThresholdLearner, CalibratedLearner)
                args = [LogisticRegressionLearner()] if wraps_base else []
                # ThresholdLearner is trained on the two-class data.
                data = iris_bin if learner is ThresholdLearner else iris
                model = learner(*args)(data)
                transformed_iris = model.data_to_model_domain(data)

                predictions = model(transformed_iris.X[0:5])
                self.assertTupleEqual((5, ), predictions.shape)

                probabilities = model(transformed_iris.X[0:1], model.Probs)
                n_classes = len(data.domain.class_var.values)
                self.assertTupleEqual((1, n_classes), probabilities.shape)
Example #23
0
 def __call__(self, data: Table, attribute):
     """Discretize a time column into intervals of `self.width` units.

     Cut points are generated between the column's minimum and maximum;
     the first and the last generated point are dropped.  Raises
     TooManyIntervals when `_time_range` returns None.  The result is a
     DiscreteVariable with the source variable's name and sparsity and a
     Discretizer over the cut points as compute_value.
     """
     formats = [
         "%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%y %b %d %H:%M",
         "%H:%M:%S"
     ]
     fmt = formats[self.unit]
     column, _ = data.get_column_view(attribute)
     ticks = []
     if column.size:
         lowest, highest = ut.nanmin(column), ut.nanmax(column)
         # A NaN minimum leaves the list of ticks empty.
         if not np.isnan(lowest):
             start = utc_from_timestamp(lowest).timetuple()
             end = utc_from_timestamp(highest).timetuple()
             ticks = _time_range(start, end, self.unit, self.width, 0, 100)
             if ticks is None:
                 raise TooManyIntervals
     # Pad each tick with three zeros to build a struct_time, then drop the
     # outermost two ticks.
     inner_ticks = [time.struct_time(t + (0, 0, 0)) for t in ticks][1:-1]
     points = np.array([calendar.timegm(t) for t in inner_ticks])
     labels = _simplified_time_intervals(
         [time.strftime(fmt, t) for t in inner_ticks])
     var = data.domain[attribute]
     return DiscreteVariable(name=var.name,
                             values=labels,
                             compute_value=Discretizer(var, points),
                             sparse=var.sparse)
    def test_domain_union(self):
        """domain_union merges each domain part, keeping first-seen order."""
        X1, X2, X3 = [ContinuousVariable(name) for name in ("X1", "X2", "X3")]
        D1, D2, D3 = [
            DiscreteVariable(name, values=["a", "b"])
            for name in ("D1", "D2", "D3")
        ]
        S1, S2 = [StringVariable(name) for name in ("S1", "S2")]
        first = Domain([X1, X2], [D1], [S1])

        # Disjoint domains: parts are concatenated.
        union = domain_union(first, Domain([X3], [D2], [S2]))
        self.assertSequenceEqual(union.attributes, [X1, X2, X3])
        self.assertSequenceEqual(union.class_vars, [D1, D2])
        self.assertSequenceEqual(union.metas, [S1, S2])

        # Overlapping domains: shared variables appear only once.
        overlapping = Domain([X3, X2], [D2, D1, D3], [S2, S1])
        union = domain_union(first, overlapping)
        self.assertSequenceEqual(union.attributes, [X1, X2, X3])
        self.assertSequenceEqual(union.class_vars, [D1, D2, D3])
        self.assertSequenceEqual(union.metas, [S1, S2])

        # Union of a domain with itself leaves every part unchanged.
        union = domain_union(first, first)
        self.assertSequenceEqual(union.attributes, first.attributes)
        self.assertSequenceEqual(union.class_vars, first.class_vars)
        self.assertSequenceEqual(union.metas, first.metas)
Example #25
0
    def test_vizrank_class_nan(self):
        """
        When class values are nan, vizrank should be disabled. It should behave like
        the class column is missing.
        GH-2757
        """
        def assert_vizrank_enabled(data, is_enabled):
            # Send the data and compare the button state with the expectation.
            self.send_signal(self.widget.Inputs.data, data)
            self.assertEqual(is_enabled, self.widget.vizrank_button.isEnabled())

        # data1: every 30th iris row with its class values intact.
        data1 = Table("iris")[::30]
        # data2: all class values NaN and a class variable with no values.
        data2 = Table("iris")[::30].copy()
        with data2.unlocked():
            data2.Y[:] = np.nan
        domain = Domain(
            attributes=data2.domain.attributes[:4], class_vars=DiscreteVariable("iris", values=()))
        data2 = Table(domain, data2.X, Y=data2.Y)
        # data3: all class values NaN but the original class variable kept.
        data3 = Table("iris")[::30].copy()
        with data3.unlocked():
            data3.Y[:] = np.nan

        # Alternate valid and NaN-class data so that the button is checked
        # both when it turns off and when it turns back on.
        for data, is_enabled in zip([data1, data2, data1, data3, data1],
                                    [True, False, True, False, True]):
            assert_vizrank_enabled(data, is_enabled)
    def _create_corpus(self):
        """Assemble a Corpus from the loaded text documents.

        Name, path and content of every document are NFC-normalized and
        stored as string metas.  The document category is used as the class
        variable only when more than one distinct category exists.  Returns
        None when there are no documents.
        """
        meta_names = ["name", "path", "content"]
        rows = []
        category_codes = []
        text_categories = list(set(t.category for t in self._text_data))
        category_var = DiscreteVariable.make(
            "category", values=list(set(text_categories)))
        for document in self._text_data:
            # Some characters are stored in decomposed form (č as the
            # letter c followed by a separate caron); NFC normalization
            # rewrites them in precomposed form (č as the single code
            # point 0x10D).
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            name = normalize('NFC', document.name)
            path = normalize('NFC', document.path)
            content = normalize('NFC', document.content)
            rows.append([name, path, content])
            category_codes.append(category_var.to_val(document.category))

        has_classes = len(text_categories) > 1
        if has_classes:
            class_column = np.array(category_codes)
        else:
            # With at most one category there is nothing to predict: use an
            # empty class part with one (zero-width) row per document.
            category_var = []
            class_column = np.empty((len(rows), 0))

        meta_vars = [StringVariable.make(name) for name in meta_names]
        domain = Domain([], category_var, meta_vars)
        domain["name"].attributes["title"] = True

        metas = np.array(rows, dtype=object)
        if not len(metas):
            return None
        return Corpus(domain,
                      Y=class_column,
                      metas=metas,
                      text_features=[domain.metas[2]])
Example #27
0
    def _create_variable(self):
        """Return the discrete variable described by the active rules.

        A rule is kept when its label, its pattern or its match count is
        truthy.  Variables are cached under the attribute, class name, kept
        labels and patterns and the two matching options, so an unchanged
        configuration returns the same variable object.
        """
        rules = self.active_rules
        # Transposition + stripping
        keep_flags = [
            label or pattern or count
            for (label, pattern), count in zip(rules, self.match_counts)]
        patterns = tuple(
            pattern
            for (_, pattern), keep in zip(rules, keep_flags) if keep)
        names = tuple(
            label
            for label, keep in zip(self.class_labels(), keep_flags) if keep)
        transformer = self.TRANSFORMERS[type(self.attribute)]

        var_key = (self.attribute, self.class_name, names,
                   patterns, self.case_sensitive, self.match_beginning)
        if var_key not in self.cached_variables:
            compute_value = transformer(
                self.attribute, patterns,
                self.case_sensitive, self.match_beginning)
            self.cached_variables[var_key] = DiscreteVariable(
                self.class_name, names, compute_value=compute_value)
        return self.cached_variables[var_key]
Example #28
0
    def stratify_data(self, data: Table, state: TaskState) -> Optional[Table]:
        """Return the widget's data extended with cohort meta columns.

        When stratifying on the Cox risk score, a model is fitted on `data`
        and two computed metas are added: a continuous risk score and a
        discrete 'Cohorts' variable with values 'Low risk' / 'High risk'.
        For any other setting no columns are added.

        NOTE(review): the model is fitted on the `data` argument while the
        output domain and the transformed table come from `self.data` --
        confirm that callers always pass the same table.
        """
        progress_steps = iter(np.linspace(0, 100, len(data)))

        def callback():
            # Advance the progress by one step; calls made after the steps
            # run out are ignored.
            try:
                state.set_progress_value(next(progress_steps))
            except StopIteration:
                pass

        cohort_vars = ()
        if self.stratify_on == StratifyOn.CoxRiskScore:
            cox_model = self.learner(data)
            _, risk_score_label = self.stratify_on_options[self.stratify_on]
            compute_score = partial(cox_risk_score, cox_model, data.domain)
            risk_score_var = ContinuousVariable(
                risk_score_label, compute_value=compute_score)
            compute_group = partial(
                stratify, risk_score_var, self.splitting_criteria,
                data.domain, callback)
            risk_group_var = DiscreteVariable(
                'Cohorts',
                values=['Low risk', 'High risk'],
                compute_value=compute_group,
            )
            cohort_vars = (risk_score_var, risk_group_var)

        source_domain = self.data.domain
        domain = Domain(
            source_domain.attributes,
            source_domain.class_vars,
            source_domain.metas + cohort_vars,
        )
        return self.data.transform(domain)
    def _send_data(self):
        """Send the data annotated with cluster labels ordered by size.

        Cluster indices are renumbered so that 0 is the most frequent
        cluster; the labels are stored in a new meta column.  When Graph is
        available, the graph with the annotated table as items is sent too.
        Nothing is sent when either the partition or the data is missing.
        """
        if self.partition is None or self.data is None:
            return
        domain = self.data.domain

        # Rank clusters by how often they occur, most frequent first.
        frequencies = np.bincount(self.partition)
        by_size = np.argsort(frequencies)[::-1]
        rank_of = {old: rank for rank, old in enumerate(by_size)}
        new_partition = [rank_of.get(label) for label in self.partition]

        n_clusters = len(np.unique(new_partition))
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"),
            values=["C%d" % number for number in range(1, n_clusters + 1)]
        )

        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var]))
        annotated.get_column_view(cluster_var)[0][:] = new_partition
        self.Outputs.annotated_data.send(annotated)

        if Graph is not None:
            graph = Graph(self.graph)
            graph.set_items(annotated)
            self.Outputs.graph.send(graph)
Example #30
0
 def test_SimpleTree_to_string_regression(self):
     """to_string() of a regression SimpleTree prints the expected tree."""
     attributes = [DiscreteVariable(name='d1', values='ef'),
                   ContinuousVariable(name='c1')]
     domain = Domain(attributes, ContinuousVariable(name='cls'))
     rows = [
         ['e', 1, 10],
         ['e', 1, 20],
         ['e', 2, 20],
         ['f', 2, 30],
         ["e", 3, 10],
         ['f', 3, 30],
     ]
     data = Table.from_list(domain, rows)
     model = SimpleTreeReg(min_instances=1)(data)
     expected = (
         '\n'
         'd1 (20: 6.0)\n'
         ': e\n'
         '   c1 (15: 4.0)\n'
         '   : <=2.5\n'
         '      c1 (16.6667: 3.0)\n'
         '      : <=1.5 --> (15: 2.0)\n'
         '      : >1.5 --> (20: 1.0)\n'
         '   : >2.5 --> (10: 1.0)\n'
         ': f --> (30: 2.0)'
     )
     self.assertEqual(model.to_string(), expected)
Example #31
0
    def send_corpus(self):
        """Send the corpus extended with a 'Duplicates Cluster' variable.

        The new variable is appended to the attributes, class variables or
        metas depending on ``self.cluster_role`` and its column is filled
        with ``self.clustering_mask``.  Without a clustering mask, ``None``
        is sent instead.
        """
        mask = self.clustering_mask
        if mask is None:
            self.send(IO.corpus, None)
            return

        labels = [str(Cluster(value)) for value in set(mask.flatten())]
        cluster_var = DiscreteVariable('Duplicates Cluster', values=labels)

        source = self.corpus
        attributes = source.domain.attributes
        class_vars = source.domain.class_vars
        meta_vars = source.domain.metas

        # Append the variable to the domain part selected by cluster_role.
        role = self.cluster_role
        if role == self.AttributeRole:
            attributes = attributes + (cluster_var,)
        elif role == self.ClassRole:
            class_vars = class_vars + (cluster_var,)
        elif role == self.MetaRole:
            meta_vars = meta_vars + (cluster_var,)

        extended_domain = Domain(attributes, class_vars, meta_vars)
        extended = source.from_table(extended_domain, source)
        extended.get_column_view(cluster_var)[0][:] = mask
        self.send(IO.corpus, extended)
Example #32
0
    def test_mapper_inplace(self):
        """Mappers rewrite the chosen column in place for dense and sparse input."""
        untouched = list(range(7))

        def sample():
            # Fresh two-column array: column 0 is mapped, column 1 must stay as is.
            return np.array([[0, 0, 2, 1, 0, 1, np.nan], untouched]).T

        source_var = DiscreteVariable("a", values=tuple("abc"))
        target_var = DiscreteVariable("a", values=tuple("dca"))
        remap = target_var.get_mapper_from(source_var)

        dense = sample()
        remap(dense, 0)
        np.testing.assert_array_equal(
            dense,
            np.array([[2, 2, 1, np.nan, 2, np.nan, np.nan], untouched]).T)

        # Inputs this mapper is expected to reject with ValueError.
        csr_input = sp.csr_matrix(dense)
        with self.assertRaises(ValueError):
            remap(csr_input, 0)
        with self.assertRaises(ValueError):
            remap([1, 2, 3], 0)
        with self.assertRaises(ValueError):
            remap(1, 0)

        target_var = DiscreteVariable("a", values=tuple("acd"))
        remap = target_var.get_mapper_from(source_var)
        expected = np.array([[0, 0, 1, np.nan, 0, np.nan, np.nan], untouched]).T

        dense = sample()
        remap(dense, 0)
        np.testing.assert_array_equal(dense, expected)

        for to_sparse in (sp.csr_matrix, sp.csc_matrix):
            sparse = to_sparse(sample())
            remap(sparse, 0)
            np.testing.assert_array_equal(sparse.todense(), expected)
    def test_str(self):
        """str(Instance) lists attributes, then classes after ' | ', then metas in braces."""

        def gender():
            return DiscreteVariable("g", values="MF")

        def letter():
            return DiscreteVariable("y", values="ABC")

        def check(domain, values, expected):
            self.assertEqual(str(Instance(domain, values)), expected)

        check(self.create_domain(["x", gender()]),
              [42, 0],
              "[42.000, M]")

        check(self.create_domain(["x", gender()], [letter()]),
              [42, "M", "B"],
              "[42.000, M | B]")

        check(self.create_domain(["x", gender()], [letter()], self.metas),
              [42, "M", "B", "X", 43, "Foo"],
              "[42.000, M | B] {X, 43.000, Foo}")

        check(self.create_domain([], [letter()], self.metas),
              ["B", "X", 43, "Foo"],
              "[ | B] {X, 43.000, Foo}")

        check(self.create_domain([], [], self.metas),
              ["X", 43, "Foo"],
              "[] {X, 43.000, Foo}")

        # Attributes only: the values are simply 0 .. len(attributes) - 1.
        domain = self.create_domain(self.attributes)
        count = len(self.attributes)
        inst = Instance(domain, range(count))
        three_decimals = ", ".join(map("{:.3f}".format, range(count)))
        self.assertEqual(str(inst), "[{}]".format(three_decimals))

        # With zero decimals the same instance prints bare integers.
        for attr in domain:
            attr.number_of_decimals = 0
        no_decimals = ", ".join(map("{}".format, range(count)))
        self.assertEqual(str(inst), "[{}]".format(no_decimals))
item_summary_df[item_summary_df.total_perc <= 0.5].shape


# In[13]:


item_summary_df[item_summary_df.total_perc <= 0.5]


# # Construct Orange Table 

# In[16]:


# Build an Orange table from the item data frame: every column becomes a
# discrete variable with the two values '0' and '1'.
input_assoc_rules = grocery_df
domain_grocery = Domain([
    DiscreteVariable.make(name=item, values=['0', '1'])
    for item in input_assoc_rules.columns
])
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same ndarray on old and new pandas alike.
data_gro_1 = Orange.data.Table.from_numpy(
    domain=domain_grocery, X=input_assoc_rules.values, Y=None)


# # Prune Dataset for frequently purchased items

# In[2]:


def prune_dataset(input_df, length_trans = 2, total_sales_perc = 0.5, start_item = None, end_item = None):
    if 'total_items' in input_df.columns:
        del(input_df['total_items'])
    item_count = input_df.sum().sort_values(ascending = False).reset_index()
    total_items = sum(input_df.sum().sort_values(ascending = False))
    item_count.rename(columns={item_count.columns[0]:'item_name',item_count.columns[1]:'item_count'}, inplace=True)
    if not start_item and not end_item: 
Example #35
0
 def test_val_from_str(self):
     """to_val turns None into NaN and returns the index 1 unchanged."""
     gender = DiscreteVariable.make("a", values=["F", "M"])
     missing = gender.to_val(None)
     self.assertTrue(math.isnan(missing))
     index = gender.to_val(1)
     self.assertEqual(index, 1)
Example #36
0
 def test_make(self):
     """DiscreteVariable.make returns a DiscreteVariable with the given name and values."""
     made = DiscreteVariable.make("a", values=["F", "M"])
     self.assertIsInstance(made, DiscreteVariable)
     name, values = made.name, made.values
     self.assertEqual(name, "a")
     self.assertEqual(values, ["F", "M"])
Example #37
0
 def setUp(self):
     """Clear the DiscreteVariable cache before every test."""
     DiscreteVariable._clear_cache()
Example #38
0
 def test_no_duplicated_values(self):
     """Adding a value that is already present leaves values and index unchanged."""
     expected = ["a", "b", "c"]
     var = DiscreteVariable("foo", values=["a", "b", "c"])
     var.add_value("b")
     current_values = list(var.values)
     self.assertEqual(current_values, expected)
     indexed_values = list(var._value_index)
     self.assertEqual(indexed_values, expected)
Example #39
0
# Pairs a variable with an array of values for it.
VarDataPair = namedtuple('VarDataPair', ['variable', 'data'])

# Continuous variable variations: five float values; the *_missing variant
# has np.nan in place of one of them.
continuous_full = VarDataPair(
    ContinuousVariable('continuous_full'),
    np.array([0, 1, 2, 3, 4], dtype=float),
)
continuous_missing = VarDataPair(
    ContinuousVariable('continuous_missing'),
    np.array([0, 1, 2, np.nan, 4], dtype=float),
)

# Unordered discrete variable variations (three values: 'r', 'g', 'b');
# the *_missing variant has np.nan in place of one entry.
rgb_full = VarDataPair(
    DiscreteVariable('rgb_full', values=('r', 'g', 'b')),
    np.array([0, 1, 1, 1, 2], dtype=float),
)
rgb_missing = VarDataPair(
    DiscreteVariable('rgb_missing', values=('r', 'g', 'b')),
    np.array([0, 1, 1, np.nan, 2], dtype=float),
)

# Ordered discrete variable variations (values '2', '3', '4', ordered=True)
ints_full = VarDataPair(
    DiscreteVariable('ints_full', values=('2', '3', '4'), ordered=True),
    np.array([0, 1, 1, 1, 2], dtype=float),
)
ints_missing = VarDataPair(
    DiscreteVariable('ints_missing', values=('2', '3', '4'), ordered=True),
    np.array([0, 1, 1, np.nan, 2], dtype=float),
Example #40
0
 def test_no_nonstringvalues(self):
     """Non-string values are rejected with TypeError at construction and in add_value."""
     with self.assertRaises(TypeError):
         DiscreteVariable("foo", values=["a", 42])

     var = DiscreteVariable("foo", values=["a", "b", "c"])
     with self.assertRaises(TypeError):
         var.add_value(42)
Example #41
0
 def test_val_from_str(self):
     """to_val turns None into NaN and returns the index 1 unchanged."""
     gender = DiscreteVariable.make("a", values=["F", "M"])
     missing = gender.to_val(None)
     self.assertTrue(math.isnan(missing))
     index = gender.to_val(1)
     self.assertEqual(index, 1)
Example #42
0
 def test_make(self):
     """DiscreteVariable.make returns a DiscreteVariable with the given name and values."""
     made = DiscreteVariable.make("a", values=["F", "M"])
     self.assertIsInstance(made, DiscreteVariable)
     name, values = made.name, made.values
     self.assertEqual(name, "a")
     self.assertEqual(values, ["F", "M"])
Example #43
0
    def test_have_date_have_time_in_construct(self):
        """Constructing with have_date=1 sets have_date and leaves have_time unset."""
        date_only = TimeVariable('time', have_date=1)
        has_date, has_time = date_only.have_date, date_only.have_time
        self.assertTrue(has_date)
        self.assertFalse(has_time)


# NOTE(review): create_pickling_tests is defined outside this view; presumably
# it builds a test case class named by its first argument, with one pickling
# test per (name, factory) pair -- confirm against its definition.
PickleContinuousVariable = create_pickling_tests(
    "PickleContinuousVariable",
    ("with_name", lambda: ContinuousVariable(name="Feature 0")),
)

# Discrete variables: name only, with values, ordered, and with a base value.
PickleDiscreteVariable = create_pickling_tests(
    "PickleDiscreteVariable",
    ("with_name", lambda: DiscreteVariable(name="Feature 0")),
    ("with_str_value",
     lambda: DiscreteVariable(name="Feature 0", values=["F", "M"])),
    ("ordered", lambda: DiscreteVariable(
        name="Feature 0", values=["F", "M"], ordered=True)),
    ("with_base_value", lambda: DiscreteVariable(
        name="Feature 0", values=["F", "M"], base_value=0)))

# String variables: name only.
PickleStringVariable = create_pickling_tests(
    "PickleStringVariable",
    ("with_name", lambda: StringVariable(name="Feature 0")))


class VariableTestMakeProxy(unittest.TestCase):
    def setUp(self):
        Variable._clear_all_caches()
Example #44
0
 def test_value_from_discrete_substring(self):
     """The transformation's lookup table for self.arr / self.patterns is [0, 1, 2, 0, 3]."""
     source_var = DiscreteVariable("x", values=self.arr)
     transformation = ValueFromDiscreteSubstring(source_var, self.patterns)
     expected_table = [0, 1, 2, 0, 3]
     np.testing.assert_equal(transformation.lookup_table, expected_table)
import warnings
from unittest import TestCase
from unittest.mock import Mock
from Orange.data import Domain, DiscreteVariable
from Orange.data import ContinuousVariable
from Orange.util import OrangeDeprecationWarning
from Orange.widgets.settings import DomainContextHandler, ContextSetting
from Orange.widgets.utils import vartype

# Results of vartype() for a default-constructed continuous and discrete
# variable; the tests below use them as the expected type of each variable.
Continuous = vartype(ContinuousVariable())
Discrete = vartype(DiscreteVariable())


class TestDomainContextHandler(TestCase):
    def setUp(self):
        self.domain = Domain(
            attributes=[
                ContinuousVariable("c1"),
                DiscreteVariable("d1", values="abc"),
                DiscreteVariable("d2", values="def"),
            ],
            class_vars=[DiscreteVariable("d3", values="ghi")],
            metas=[
                ContinuousVariable("c2"),
                DiscreteVariable("d4", values="jkl")
            ],
        )
        self.args = (
            self.domain,
            {
                "c1": Continuous,
Example #46
0
 def setUp(self):
     """Clear the DiscreteVariable cache and build a one-row table for the tests."""
     DiscreteVariable._clear_cache()
     self.data = Table([[1, 2, 3]])
Example #47
0
 def setUp(self):
     """Clear the DiscreteVariable cache before every test."""
     DiscreteVariable._clear_cache()
    def test_set_data(self):
        """
        Feed the widget valid data, no data and three unusable tables, and
        check the combo boxes, the selected attributes and the shown errors.
        """
        widget = self.widget
        data_input = widget.Inputs.data
        iris_domain = self.iris.domain
        class_values = iris_domain.class_var.values
        n_continuous = sum(
            isinstance(attr, ContinuousVariable)
            for attr in iris_domain.attributes)

        def assert_combos_filled():
            # One entry per continuous attribute / per class value.
            self.assertEqual(widget.cbx.count(), n_continuous)
            self.assertEqual(widget.cby.count(), n_continuous)
            self.assertEqual(widget.target_class_combobox.count(),
                             len(class_values))

        def assert_selected(x_index, y_index, class_index):
            # Combo texts first, then the widget settings behind them.
            x_name = iris_domain[x_index].name
            y_name = iris_domain[y_index].name
            target = class_values[class_index]
            self.assertEqual(widget.cbx.currentText(), x_name)
            self.assertEqual(widget.cby.currentText(), y_name)
            self.assertEqual(widget.target_class_combobox.currentText(),
                             target)
            self.assertEqual(widget.attr_x, x_name)
            self.assertEqual(widget.attr_y, y_name)
            self.assertEqual(widget.target_class, target)

        def assert_combos_empty():
            self.assertEqual(widget.cbx.count(), 0)
            self.assertEqual(widget.cby.count(), 0)
            self.assertEqual(widget.target_class_combobox.count(), 0)

        # iris is fully usable, so everything is populated with defaults
        self.send_signal(data_input, self.iris)
        assert_combos_filled()
        assert_selected(0, 1, 0)

        # change the shown attributes and the target class
        widget.attr_x = iris_domain[1].name
        widget.attr_y = iris_domain[2].name
        widget.target_class = class_values[1]
        assert_selected(1, 2, 1)

        # removing the data empties all combo boxes
        self.send_signal(data_input, None)
        assert_combos_empty()

        # sending the data again restores the defaults
        self.send_signal(data_input, self.iris)
        assert_combos_filled()
        assert_selected(0, 1, 0)

        # data without a class variable
        no_class_domain = Domain([ContinuousVariable("x"),
                                  ContinuousVariable("y")])
        table_no_class = Table(no_class_domain, [[1, 2], [2, 3]])
        self.send_signal(data_input, table_no_class)
        assert_combos_empty()
        self.assertTrue(widget.Error.no_class.is_shown())

        # data whose class variable has a single value
        one_class_domain = Domain(
            [ContinuousVariable("x"), ContinuousVariable("y")],
            DiscreteVariable("a", values=["k"]))
        table_one_class = Table(one_class_domain, [[1, 2], [2, 3]], [0, 0])
        self.send_signal(data_input, table_one_class)
        assert_combos_empty()
        self.assertTrue(widget.Error.no_class.is_shown())

        # data with too few continuous attributes
        few_cont_domain = Domain(
            [ContinuousVariable("x"),
             DiscreteVariable("y", values=["a", "b"])],
            ContinuousVariable("a"))
        table_no_enough_cont = Table(
            few_cont_domain, [[1, 0], [2, 1]], [0, 0])
        self.send_signal(data_input, table_no_enough_cont)
        assert_combos_empty()
        self.assertTrue(widget.Error.to_few_features.is_shown())
Example #49
0
    def test_colors_diff_domain(self):
        """
        Test whether the color selection for values is correct.

        Each case sends two constant predictors whose class variables share
        a name but differ in values and/or colors, then checks the result of
        the widget's _get_colors().
        """
        # pylint: disable=protected-access
        self.send_signal(self.widget.Inputs.data, self.iris)

        # case 1: two domains, the values of one are a subset of the other's
        idom = self.iris.domain
        dom1 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values))
        dom2 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values[:2]))
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)

        predictor_iris1 = ConstantLearner()(iris1)
        predictor_iris2 = ConstantLearner()(iris2)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris1)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris2, 1)
        colors = self.widget._get_colors()
        np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)

        # case 2: same subset relation, but with the color order reversed
        idom = self.iris.domain
        colors = idom.class_var.colors[::-1]
        dom1 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values))
        dom2 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values[:2]))
        dom1.class_var.colors = colors
        dom2.class_var.colors = colors[:2]
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)

        predictor_iris1 = ConstantLearner()(iris1)
        predictor_iris2 = ConstantLearner()(iris2)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris1)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris2, 1)
        colors = self.widget._get_colors()
        np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)

        # case 3: same values but mismatching colors - default colors expected
        idom = self.iris.domain
        dom1 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values))
        dom2 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values))
        dom1.class_var.colors = dom1.class_var.colors[::-1]
        iris1 = self.iris.transform(dom1)
        iris2 = self.iris.transform(dom2)

        predictor_iris1 = ConstantLearner()(iris1)
        predictor_iris2 = ConstantLearner()(iris2)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris1)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris2, 1)
        colors = self.widget._get_colors()
        np.testing.assert_array_equal(colors, ColorPaletteGenerator.palette(3))

        # case 4: two domains with different value order but matching colors
        idom = self.iris.domain
        # this way we know that default colors are not used
        colors = ColorPaletteGenerator.palette(5)[2:]
        dom1 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values))
        dom2 = Domain(
            idom.attributes,
            DiscreteVariable(idom.class_var.name, idom.class_var.values[::-1]))
        dom1.class_var.colors = colors
        dom2.class_var.colors = colors[::-1]  # colors reversed the same way as the values
        iris1 = self.iris[:100].transform(dom1)
        iris2 = self.iris[:100].transform(dom2)

        predictor_iris1 = ConstantLearner()(iris1)
        predictor_iris2 = ConstantLearner()(iris2)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris1)
        self.send_signal(self.widget.Inputs.predictors, predictor_iris2, 1)
        colors = self.widget._get_colors()
        np.testing.assert_array_equal(colors, iris1.domain.class_var.colors)