Example #1
21
    def set_data(self, data):
        """Receive a new input table and refresh the widget state.

        An ``SqlTable`` whose ``approx_len()`` is below 4000 is converted to
        a ``Table`` directly; a larger one is remembered in ``self.sql_data``
        and only a downloaded sample of it is used.  Input with no rows or
        an empty domain is treated as ``None``.  The method returns early
        when the new data has the same checksum as the stored data.
        """
        # Reset the message and the sampling state left by earlier input.
        self.information(1)
        self.__timer.stop()
        self.sampling.setVisible(False)
        self.sql_data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < 4000:
                data = Table(data)
            else:
                self.information(1, "Large SQL table (showing a sample)")
                self.sql_data = data
                sample = data.sample_time(0.8, no_cache=True)
                sample.download_data(2000, partial=True)
                data = Table(sample)
                self.sampling.setVisible(True)
                if self.auto_sample:
                    self.__timer.start()

        if data is not None:
            if len(data) == 0 or len(data.domain) == 0:
                data = None

        if self.data and data and self.data.checksum() == data.checksum():
            return

        self.closeContext()
        previous = self.data
        # Only the truthiness of this value is used below.
        same_domain = (
            previous and data
            and data.domain.checksum() == previous.domain.checksum()
        )
        self.data = data
        self.data_metas_X = self.move_primitive_metas_to_X(data)

        if not same_domain:
            self.init_attr_values()
        self.vizrank._initialize()

        # The ranking button needs a class variable and at least two attributes.
        can_rank = (
            self.data is not None
            and self.data.domain.class_var is not None
            and len(self.data.domain.attributes) > 1
        )
        self.vizrank_button.setEnabled(can_rank)
        self.openContext(self.data)
Example #2
18
    def commit(self):
        """Send the projected data, the component table and the projector.

        When ``self._pca`` is ``None`` both table outputs are sent as
        ``None``; the projector is sent in either case.
        """
        projected = loadings = None
        pca = self._pca
        if pca is not None:
            # The full projection is computed once and cached on ``self``;
            # each commit only keeps the first ``ncomponents`` columns.
            if self._transformed is None:
                self._transformed = pca(self.data)
            full = self._transformed

            source_domain = self.data.domain
            kept = Domain(
                full.domain.attributes[:self.ncomponents],
                source_domain.class_vars,
                source_domain.metas
            )
            projected = full.from_table(kept, full)

            # One row per component, labelled PC1, PC2, ... in a string meta.
            loadings_domain = Domain(pca.orig_domain.attributes,
                                     metas=[StringVariable(name='component')])
            labels = numpy.array(
                [['PC{}'.format(number)
                  for number in range(1, self.ncomponents + 1)]],
                dtype=object).T
            loadings = Table(loadings_domain,
                             pca.components_[:self.ncomponents],
                             metas=labels)
            loadings.name = 'components'

        self._pca_projector.component = self.ncomponents
        self.send("Transformed data", projected)
        self.send("Components", loadings)
        self.send("PCA", self._pca_projector)
Example #3
1
    def extend_corpus(self, metadata, Y):
        """
        Append documents to corpus.

        Args:
            metadata (numpy.ndarray): Meta data
            Y (numpy.ndarray): Class variables

        Raises:
            ValueError: if the shape of X contains no zero dimension,
                i.e. X is not empty.
        """
        if np.prod(self.X.shape) != 0:
            # The two literals are concatenated, so the separator must be
            # part of the first one.
            raise ValueError("Extending corpus only works when X is empty, "
                             "while the shape of X is {}".format(self.X.shape))

        self.metas = np.vstack((self.metas, metadata))

        # Register class values not seen before, then encode Y with them.
        cv = self.domain.class_var
        for val in set(filter(None, Y)):
            if val not in cv.values:
                cv.add_value(val)
        new_Y = np.array([cv.to_val(i) for i in Y])[:, None]
        self._Y = np.vstack((self._Y, new_Y))

        # X and W get one (empty) row per document now held in metas.
        self.X = self.W = np.zeros((self.metas.shape[0], 0))
        Table._init_ids(self)

        self._tokens = None     # invalidate tokens
Example #4
0
    def __init__(self, X=None, Y=None, metas=None, domain=None, text_features=None):
        """
        Args:
            X (numpy.ndarray): attributes
            Y (numpy.ndarray): class variables
            metas (numpy.ndarray): meta attributes; e.g. text
            domain (Orange.data.domain): the domain for this Corpus
            text_features (list): meta attributes that are used for
                text mining. Infer them if None.
        """
        n_doc = _check_arrays(X, Y, metas)

        def or_empty(array):
            # A missing array becomes an (n_doc, 0) array of zeros.
            return np.zeros((n_doc, 0)) if array is None else array

        self.X = or_empty(X)
        self.Y = or_empty(Y)
        self.metas = or_empty(metas)
        self.W = np.zeros((n_doc, 0))
        self.domain = domain
        self.text_features = None    # list of text features for mining

        # Text features can only be set up once a domain is known.
        if domain is not None:
            if text_features is None:
                self._infer_text_features()
            else:
                self.set_text_features(text_features)

        Table._init_ids(self)
Example #5
0
 def send_features(self):
     """Send the two selected attributes as a table, or None if neither is set.

     The table has a single string meta column named "feature" and two
     rows holding ``attr_x`` and ``attr_y``.
     """
     if not (self.attr_x or self.attr_y):
         self.Outputs.features.send(None)
         return

     domain = Domain([], metas=(StringVariable(name="feature"),))
     table = Table(domain, [[self.attr_x], [self.attr_y]])
     table.name = "Features"
     self.Outputs.features.send(table)
Example #6
0
    def __call__(self, data):
        """
        Apply randomization of the given data. Returns a new
        data table.

        Parameters
        ----------
        data : Orange.data.Table
            A data table to be randomized.

        Returns
        -------
        data : Orange.data.Table
            Randomized data table.

        Raises
        ------
        TypeError
            If ``rand_type`` is none of the three supported kinds.
        """
        shuffled = Table(data)
        shuffled.ensure_copy()

        # Pick the part of the copy selected by rand_type.
        kind = self.rand_type
        if kind == Randomize.RandomizeClasses:
            target = shuffled.Y
        elif kind == Randomize.RandomizeAttributes:
            target = shuffled.X
        elif kind == Randomize.RandomizeMetas:
            target = shuffled.metas
        else:
            raise TypeError('Unsupported type')

        self.randomize(target)
        return shuffled
Example #7
0
    def set_data(self, data):
        """Receive a new input table and refresh the widget state.

        An ``SqlTable`` whose ``approx_len()`` is below 4000 is converted to
        a ``Table`` directly; a larger one is replaced by a downloaded
        sample.  Input with no rows or an empty domain is treated as
        ``None``.  The method returns early when the new data has the same
        checksum as the stored data.
        """
        self.information(1)
        if isinstance(data, SqlTable):
            if data.approx_len() < 4000:
                data = Table(data)
            else:
                self.information(1, "Data has been sampled")
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(2000, partial=True)
                data = Table(sample)

        if data is not None:
            if len(data) == 0 or len(data.domain) == 0:
                data = None

        if self.data and data and self.data.checksum() == data.checksum():
            return

        self.closeContext()
        previous = self.data
        # Only the truthiness of this value is used below.
        same_domain = (
            previous and data
            and data.domain.checksum() == previous.domain.checksum()
        )
        self.data = data
        self.data_metas_X = self.move_primitive_metas_to_X(data)

        # TODO: adapt scatter plot to work on SqlTables (avoid use of X and Y)
        if isinstance(self.data, SqlTable):
            self.data.download_data()

        if not same_domain:
            self.init_attr_values()
        self.vizrank._initialize()

        # The ranking button needs a class variable and at least two attributes.
        can_rank = (
            self.data is not None
            and self.data.domain.class_var is not None
            and len(self.data.domain.attributes) > 1
        )
        self.vizrank_button.setEnabled(can_rank)
        self.openContext(self.data)
Example #8
0
    def set_test_data(self, data):
        """
        Set the input separate testing dataset.

        Data without a class variable is rejected with error 1.  An
        ``SqlTable`` at or above ``AUTO_DL_LIMIT`` rows (by ``approx_len()``)
        is replaced by a downloaded sample.
        """
        self.error(1)
        self.information(1)
        if data and not data.domain.class_var:
            self.error(1, "Test data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(1, "Test data has been sampled")
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(sample)

        self.warning(4)
        test_missing = data is not None and np.isnan(data.Y).any()
        self.test_data_missing_vals = test_missing
        # Warning 4 covers missing class values in either input.
        if self.train_data_missing_vals or test_missing:
            message = self._get_missing_data_warning(
                self.train_data_missing_vals, test_missing)
            self.warning(4, message)
            if data:
                data = RemoveNaNClasses(data)

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()
Example #9
0
    def commit(self):
        """Send the projected data, component table, projector and preprocessor.

        When ``self._pca`` is ``None`` the tables and the preprocessor are
        sent as ``None``; the projector is sent in either case.
        """
        projected = loadings = preprocessor = None
        pca = self._pca
        if pca is not None:
            # Compute the full transform (MAX_COMPONENTS components) only once.
            if self._transformed is None:
                self._transformed = pca(self.data)
            full = self._transformed

            source_domain = self.data.domain
            kept = Domain(
                full.domain.attributes[:self.ncomponents],
                source_domain.class_vars,
                source_domain.metas
            )
            projected = full.from_table(kept, full)

            # prevent caching new features by defining compute_value
            loading_vars = [
                ContinuousVariable(attr.name, compute_value=lambda _: None)
                for attr in pca.orig_domain.attributes
            ]
            loadings_domain = Domain(loading_vars,
                                     metas=[StringVariable(name='component')])
            # One row per component, labelled PC1, PC2, ... in a string meta.
            labels = numpy.array(
                [['PC{}'.format(number)
                  for number in range(1, self.ncomponents + 1)]],
                dtype=object).T
            loadings = Table(loadings_domain,
                             pca.components_[:self.ncomponents],
                             metas=labels)
            loadings.name = 'components'

            preprocessor = ApplyDomain(kept, "PCA")

        self._pca_projector.component = self.ncomponents
        self.Outputs.transformed_data.send(projected)
        self.Outputs.components.send(loadings)
        self.Outputs.pca.send(self._pca_projector)
        self.Outputs.preprocessor.send(preprocessor)
Example #10
0
    def __call__(self, data):
        """Fit this learner on `data` and return the resulting model.

        A single ``Instance`` is wrapped into a one-row ``Table`` first, and
        the data is passed through ``self.preprocess`` before fitting.

        Raises:
            ValueError: if ``check_learner_adequacy`` rejects the domain.
            TypeError: if the preprocessed data has several class variables
                and the learner does not support multiclass.
        """
        original_domain = data.domain
        if not self.check_learner_adequacy(original_domain):
            raise ValueError(self.learner_adequacy_err_msg)

        if isinstance(data, Instance):
            data = Table(data.domain, [data])
        data = self.preprocess(data)

        multiple_targets = len(data.domain.class_vars) > 1
        if multiple_targets and not self.supports_multiclass:
            raise TypeError("%s doesn't support multiple class variables" %
                            self.__class__.__name__)

        self.domain = data.domain

        # Subclasses that override `fit` get plain arrays; the others are
        # handed the whole table through `fit_storage`.
        if type(self).fit is not Learner.fit:
            X = data.X
            Y = data.Y
            W = data.W if data.has_weights() else None
            model = self.fit(X, Y, W)
        else:
            model = self.fit_storage(data)

        model.domain = data.domain
        model.supports_multiclass = self.supports_multiclass
        model.name = self.name
        model.original_domain = original_domain
        return model
Example #11
0
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Data without a class variable is rejected with error 0.  An
        ``SqlTable`` at or above ``AUTO_DL_LIMIT`` rows (by ``approx_len()``)
        is replaced by a downloaded sample.
        """
        self.error(0)
        self.information(0)
        if data and not data.domain.class_var:
            self.error(0, "Train data input requires a class variable")
            data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information(0, "Train data has been sampled")
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(sample)

        self.warning(4)
        train_missing = data is not None and np.isnan(data.Y).any()
        self.train_data_missing_vals = train_missing
        # Warning 4 covers missing class values in either input.
        if train_missing or self.test_data_missing_vals:
            message = self._get_missing_data_warning(
                train_missing, self.test_data_missing_vals)
            self.warning(4, message)
            if data:
                data = RemoveNaNClasses(data)

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()
Example #12
0
 def test_constant_data(self):
     """Attributes that are all 1.0 raise a warning and produce no output."""
     constant = Table("iris")[::5]
     constant.X[:, :] = 1.0
     self.send_signal("Data", constant)

     warning = self.widget.Warning.trivial_components
     self.assertTrue(warning.is_shown())
     for output_name in ("Transformed Data", "Components"):
         self.assertIsNone(self.get_output(output_name))
Example #13
0
 def prepare_data():
     """Return iris with its class replaced by a 15-valued variable "iris5".

     The class column holds the values 0..14 repeated ten times.
     """
     iris = Table("iris")
     codes = list(range(15))
     labels = list(map(str, codes))
     target = DiscreteVariable("iris5", values=labels)
     domain = Domain(attributes=iris.domain.attributes, class_vars=[target])
     relabelled = iris.transform(domain)
     relabelled.Y = np.array(codes * 10, dtype=float)
     return relabelled
Example #14
0
    def test_format_combo(self):
        """The file-type combo offers fewer formats for sparse data.

        The counts must stay consistent when switching between dense and
        sparse input, with or without a ``None`` signal in between.
        """
        widget = self.widget
        combo = widget.controls.filetype

        widget.save_file = Mock()

        dense = Table("iris")
        sparse = Table("iris")
        sparse.is_sparse = Mock(return_value=True)

        def count_after(*signals):
            # Send the signals in order and report the combo size afterwards.
            for table in signals:
                self.send_signal(widget.Inputs.data, table)
            return combo.count()

        n_dense = count_after(dense)
        n_sparse = count_after(sparse)
        self.assertGreater(n_dense, n_sparse)

        self.assertEqual(count_after(sparse), n_sparse)
        self.assertEqual(count_after(dense), n_dense)
        self.assertEqual(count_after(None, dense), n_dense)
        self.assertEqual(count_after(None, sparse), n_sparse)
Example #15
0
    def set_data(self, data):
        """Receive a new input table, validate it and refit.

        An ``SqlTable`` at or above ``AUTO_DL_LIMIT`` rows (by
        ``approx_len()``) is replaced by a downloaded sample.  A table with
        no attributes or no rows shows an error, clears the outputs and
        leaves ``self.data`` as ``None``.
        """
        self.closeContext()
        self.clear_messages()
        self.clear()
        self.information()
        self.data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.information("Data has been sampled")
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(2000, partial=True)
                data = Table(sample)

        if isinstance(data, Table):
            # The missing-features check takes precedence over missing rows.
            problem = None
            if len(data.domain.attributes) == 0:
                problem = self.Error.no_features
            elif len(data) == 0:
                problem = self.Error.no_instances
            if problem is not None:
                problem()
                self.clear_outputs()
                return

        self.openContext(data)
        self._init_projector()

        self.data = data
        self.fit()
Example #16
0
    def setUp(self):
        """Build the continuous, discrete and mixed tables used by the tests."""
        cont_rows = [[1, 3, 2],
                     [-1, 5, 0],
                     [1, 1, 1],
                     [7, 2, 3]]
        self.cont_data = Table.from_list(self.cont_domain, cont_rows)

        cont_rows2 = [[2, 1, 3],
                      [1, 2, 2]]
        self.cont_data2 = Table.from_list(self.cont_domain, cont_rows2)

        disc_rows = [[0, 0, 0],
                     [0, 1, 1],
                     [1, 3, 1]]
        self.disc_data = Table.from_list(self.disc_domain, disc_rows)

        # Same as disc_rows with the middle row duplicated.
        disc_rows4 = [[0, 0, 0],
                      [0, 1, 1],
                      [0, 1, 1],
                      [1, 3, 1]]
        self.disc_data4 = Table.from_list(self.disc_domain, disc_rows4)

        # First three continuous rows side by side with the discrete rows.
        mixed_x = np.hstack((self.cont_data.X[:3], self.disc_data.X))
        mixed = Table.from_numpy(self.domain, mixed_x)
        self.mixed_data = mixed
        self.data = mixed
Example #17
0
    def test_inputs_check_sql(self):
        """Test if check_sql_input is called when data is sent to a widget."""
        widget = self.widget
        data_input = widget.Inputs.data

        plain = Table()
        self.send_signal(data_input, plain)
        self.assertIs(widget.pop_called_with(), plain)

        converted = object()
        with patch("Orange.widgets.utils.sql.Table",
                   MagicMock(return_value=converted)) as table_cls:
            sql_table = SqlTable(None, None, MagicMock())

            # Just below the limit: the table is converted and passed on.
            sql_table.approx_len = MagicMock(return_value=AUTO_DL_LIMIT - 1)
            self.send_signal(data_input, sql_table)
            table_cls.assert_called_once_with(sql_table)
            self.assertIs(widget.pop_called_with(), converted)
            table_cls.reset_mock()

            # Just above the limit: no conversion, an error is shown.
            sql_table.approx_len = MagicMock(return_value=AUTO_DL_LIMIT + 1)
            self.send_signal(data_input, sql_table)
            table_cls.assert_not_called()
            self.assertIs(widget.pop_called_with(), None)
            self.assertTrue(widget.Error.download_sql_data.is_shown())
            table_cls.reset_mock()

            # Removing the input clears the error.
            self.send_signal(data_input, None)
            table_cls.assert_not_called()
            self.assertIs(widget.pop_called_with(), None)
            self.assertFalse(widget.Error.download_sql_data.is_shown())
Example #18
0
    def test_data_with_similarity(self):
        """`_data_with_similarity` adds the distances of the picked rows as a meta."""
        widget = self.widget
        picked = np.array([5, 10, 15, 100])

        def fresh_distances():
            # Distance of row i is 1000 + i.
            return np.arange(1000, 1150).astype(float)

        # Data without metas: the distance is the only meta column.
        iris = Table("iris")
        widget.data = iris
        widget.distances = fresh_distances()
        result = widget._data_with_similarity(picked)
        self.assertEqual(result.metas.shape, (4, 1))
        np.testing.assert_almost_equal(
            result.metas.flatten(), picked + 1000)
        np.testing.assert_almost_equal(result.X, iris.X[picked])

        # Data with two metas of its own: the distance becomes the third.
        iris_domain = iris.domain
        with_metas_domain = Domain(
            [iris_domain[2]], iris_domain.class_var, metas=iris_domain[:2])
        with_metas = iris.transform(with_metas_domain)
        widget.data = with_metas
        widget.distances = fresh_distances()
        result = widget._data_with_similarity(picked)
        self.assertEqual(len(result.domain.metas), 3)
        self.assertEqual(result.metas.shape, (4, 3))
        np.testing.assert_almost_equal(
            result.get_column_view("distance")[0], picked + 1000)
        np.testing.assert_almost_equal(result.X, with_metas.X[picked])
Example #19
0
 def test_constant_data(self):
     """Attributes that are all 1.0 raise a warning and produce no output."""
     widget = self.widget
     constant = Table("iris")[::5]
     constant.X[:, :] = 1.0
     self.send_signal(widget.Inputs.data, constant)

     self.assertTrue(widget.Warning.trivial_components.is_shown())
     for output in (widget.Outputs.transformed_data,
                    widget.Outputs.components):
         self.assertIsNone(self.get_output(output))
    def test_varying_between_combined(self):
        """`varying_between` gives the same result for dense and sparse X.

        Rows are grouped by the first meta column ("A"/"B"); the expected
        result lists the attributes and metas named below.
        """
        X = np.array([[0, 0, 0, 0, 0, 1],
                      [0, 0, 1, 1, 0, 1],
                      [0, 0, 0, 2, np.nan, np.nan],
                      [0, 1, 0, 0, 0, 0],
                      [0, 1, 0, 2, 0, 0],
                      [0, 1, 0, 0, np.nan, 0]])

        # Same values as X, as strings, preceded by the group id column.
        M = np.array([["A", 0, 0, 0, 0, 0, 1],
                      ["A", 0, 0, 1, 1, 0, 1],
                      ["A", 0, 0, 0, 2, np.nan, np.nan],
                      ["B", 0, 1, 0, 0, 0, 0],
                      ["B", 0, 1, 0, 2, 0, 0],
                      ["B", 0, 1, 0, 0, np.nan, 0]], dtype=str)

        variables = [ContinuousVariable(name="F%d" % j) for j in range(X.shape[1])]
        metas = [StringVariable(name="M%d" % j) for j in range(M.shape[1])]
        domain = Domain(attributes=variables, metas=metas)

        expected = [variables[2], variables[3],
                    metas[3], metas[4], metas[5], metas[6]]

        data = Table.from_numpy(X=X, domain=domain, metas=M)
        self.assertEqual(varying_between(data, idvar=data.domain.metas[0]),
                         expected)

        # scipy.sparse uses matrix; this filter can be removed when it's fixed.
        # catch_warnings restores the previous filters on exit, so the
        # "ignore" rule does not leak into tests that run afterwards.
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", ".*the matrix subclass.*", PendingDeprecationWarning)
            data = Table.from_numpy(X=sp.csr_matrix(X), domain=domain, metas=M)
            self.assertEqual(varying_between(data, idvar=data.domain.metas[0]),
                             expected)
Example #21
0
    def test_do_not_recluster_on_same_data(self):
        """Do not recluster data points when targets or metas change."""
        widget = self.widget

        def make_table(x, y, metas):
            return Table.from_numpy(
                domain=Domain.from_numpy(X=x, Y=y, metas=metas),
                X=x, Y=y, metas=metas,
            )

        def send_and_wait(table):
            self.send_signal(widget.Inputs.data, table)
            self.commit_and_wait()

        # Prepare some dummy data
        features = np.eye(5)
        narrow_y, wide_y = np.ones((5, 1)), np.ones((5, 2))
        narrow_meta, wide_meta = np.ones((5, 1)), np.ones((5, 2))

        original = make_table(features, narrow_y, narrow_meta)
        # X is same, should not cause update
        same_x = make_table(features, wide_y, wide_meta)
        # X is different, should cause update
        changed_x = original.copy()
        changed_x.X[:, 0] = 1

        with patch.object(widget, 'commit') as commit:
            send_and_wait(original)
            baseline = commit.call_count

            # Sending data with same X should not recompute the clustering
            send_and_wait(same_x)
            self.assertEqual(baseline, commit.call_count)

            # Sending data with different X should recompute the clustering
            send_and_wait(changed_x)
            self.assertEqual(baseline + 1, commit.call_count)
Example #22
0
    def check_data(self):
        """Validate ``self.data`` in place, sampling large SQL tables.

        An ``SqlTable`` whose ``approx_len()`` is below 4000 is converted to
        a ``Table`` directly; a larger one is remembered in ``self.sql_data``
        and replaced by a downloaded sample.  ``self.data`` is reset to
        ``None`` when ``has_continuous_attributes(True, True)`` is false
        (with a warning), or when it has no rows or an empty domain.
        """
        self.clear_messages()
        self.__timer.stop()
        self.sampling.setVisible(False)
        self.sql_data = None

        if isinstance(self.data, SqlTable):
            source = self.data
            if source.approx_len() < 4000:
                self.data = Table(source)
            else:
                self.Information.sampled_sql()
                self.sql_data = source
                sample = source.sample_time(0.8, no_cache=True)
                sample.download_data(2000, partial=True)
                self.data = Table(sample)
                self.sampling.setVisible(True)
                if self.auto_sample:
                    self.__timer.start()

        if (self.data is not None
                and not self.data.domain.has_continuous_attributes(True, True)):
            self.Warning.no_continuous_vars()
            self.data = None

        if self.data is not None:
            if len(self.data) == 0 or len(self.data.domain) == 0:
                self.data = None
Example #23
0
    def test_wrong_input(self):
        """Unusable input leaves the widget without data and shows an error."""
        widget = self.widget

        def send(table):
            # Keep the table on the test case, as the other tests do.
            self.data = table
            self.send_signal(widget.Inputs.data, table)

        # no data
        send(None)
        self.assertIsNone(widget.data)

        # <2 rows
        send(Table(self.domain, [[1, 2, 3, 4, 5, 'STG1']]))
        self.assertIsNone(widget.data)
        self.assertTrue(widget.Error.not_enough_rows.is_shown())

        # no attributes
        send(Table(self.empty_domain, [['STG1']] * 2))
        self.assertIsNone(widget.data)
        self.assertTrue(widget.Error.no_attributes.is_shown())

        # constant data
        send(Table(self.domain, [[1, 2, 3, 4, 5, 'STG1']] * 2))
        self.assertIsNone(widget.data)
        self.assertTrue(widget.Error.constant_data.is_shown())

        # correct input
        send(Table(self.domain, [[1, 2, 3, 4, 5, 'STG1'],
                                 [5, 4, 3, 2, 1, 'STG1']]))
        self.assertIsNotNone(widget.data)
        for error in (widget.Error.not_enough_rows,
                      widget.Error.no_attributes,
                      widget.Error.constant_data):
            self.assertFalse(error.is_shown())
Example #24
0
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        stale_messages = (
            self.Information.data_sampled,
            self.Error.train_data_empty,
            self.Error.class_required,
            self.Error.too_many_classes,
            self.Error.no_class_values,
            self.Error.only_one_class_var_value,
        )
        for message in stale_messages:
            message.clear()

        if data is not None and not len(data):
            self.Error.train_data_empty()
            data = None
        if data:
            domain = data.domain
            # Every condition is evaluated up front; only the first one
            # that holds is reported.
            checks = [
                (not domain.class_vars,
                 self.Error.class_required),
                (len(domain.class_vars) > 1,
                 self.Error.too_many_classes),
                (np.isnan(data.Y).all(),
                 self.Error.no_class_values),
                (domain.has_discrete_class
                 and len(domain.class_var.values) == 1,
                 self.Error.only_one_class_var_value),
            ]
            for failed, report in checks:
                if failed:
                    report()
                    data = None
                    break

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(sample)

        train_missing = data is not None and np.isnan(data.Y).any()
        self.train_data_missing_vals = train_missing
        # The warning covers missing class values in either input.
        if train_missing or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = HasClass()(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_scorers()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestLearners.FeatureFold
        self._invalidate()
Example #25
0
    def commit(self):
        """Send the data, the selected features and the correlation table.

        Without data (or without ``cont_data``) the stored data is passed
        through and the other two outputs are sent as ``None``.  Otherwise
        the correlation table is built from the rank model: columns
        "Correlation" and "FDR", metas "Feature 1" and "Feature 2".
        """
        outputs = self.Outputs
        if self.data is None or self.cont_data is None:
            outputs.data.send(self.data)
            outputs.features.send(None)
            outputs.correlations.send(None)
            return

        domain = Domain(
            [ContinuousVariable("Correlation"), ContinuousVariable("FDR")],
            metas=[StringVariable("Feature 1"), StringVariable("Feature 2")])
        model = self.vizrank.rank_model

        def cell(row, role):
            # Value stored under `role` in the first column of `row`.
            return model.data(model.index(row, 0), role)

        value_roles = (Qt.DisplayRole, CorrelationRank.PValRole)
        values = np.array([[float(cell(row, role)) for role in value_roles]
                           for row in range(model.rowCount())])
        # The second column is passed through FDR and overwritten.
        values[:, 1] = FDR(list(values[:, 1]))
        # pylint: disable=protected-access
        names = np.array([[attr.name
                           for attr in cell(row, CorrelationRank._AttrRole)]
                          for row in range(model.rowCount())], dtype=object)
        corr_table = Table(domain, values, metas=names)
        corr_table.name = "Correlations"

        outputs.data.send(self.data)
        # data has been imputed; send original attributes
        selected = [self.data.domain[name] for name, _ in self.selection]
        outputs.features.send(AttributeList(selected))
        outputs.correlations.send(corr_table)
Example #26
0
    def set_test_data(self, data):
        """
        Set the input separate testing dataset.

        Data without a class variable is rejected with the
        ``class_required_test`` error.  An ``SqlTable`` at or above
        ``AUTO_DL_LIMIT`` rows (by ``approx_len()``) is replaced by a
        downloaded sample.  Results are invalidated only when the current
        resampling method is ``TestOnTest``.
        """
        self.Information.test_data_sampled.clear()
        if data and not data.domain.class_var:
            # Raise the same message the `else` branch clears; raising
            # `class_required` here would leave an error this method
            # never removes.
            self.Error.class_required_test()
            data = None
        else:
            self.Error.class_required_test.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.test_data_sampled()
                data_sample = data.sample_time(1, no_cache=True)
                data_sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(data_sample)

        self.test_data_missing_vals = data is not None and np.isnan(data.Y).any()
        # The warning covers missing class values in either input.
        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)
        else:
            self.Warning.missing_data.clear()

        self.test_data = data
        if self.resampling == OWTestLearners.TestOnTest:
            self._invalidate()
Example #27
0
class SVMTest(unittest.TestCase):
    """Sanity checks for the SVM-based learners on small datasets."""

    def setUp(self):
        # Every test gets a freshly loaded and shuffled ionosphere table.
        ionosphere = Table('ionosphere')
        self.data = ionosphere
        self.data.shuffle()

    def test_SVM(self):
        """Cross-validated accuracy of the default SVM exceeds 0.9."""
        learner = SVMLearner()
        results = CrossValidation(self.data, [learner], k=2)
        accuracy = CA(results)[0]
        self.assertGreater(accuracy, 0.9)

    def test_LinearSVM(self):
        """Cross-validated accuracy of the linear SVM lies in (0.8, 0.9)."""
        learner = LinearSVMLearner()
        results = CrossValidation(self.data, [learner], k=2)
        accuracy = CA(results)[0]
        self.assertTrue(0.8 < accuracy < 0.9)

    def test_NuSVM(self):
        """Cross-validated accuracy of Nu-SVM (nu=0.01) exceeds 0.9."""
        learner = NuSVMLearner(nu=0.01)
        results = CrossValidation(self.data, [learner], k=2)
        accuracy = CA(results)[0]
        self.assertGreater(accuracy, 0.9)

    def test_SVR(self):
        """SVR fits a random linear target with RMSE below 0.15."""
        n_rows, n_cols = 200, 5
        features = np.random.rand(n_rows, n_cols)
        weights = np.random.rand(n_cols)
        target = features.dot(weights)
        regression_data = Table(features, target)
        learner = SVRLearner(kernel='rbf', gamma=0.1)
        results = CrossValidation(regression_data, [learner], k=2)
        self.assertLess(RMSE(results)[0], 0.15)

    def test_NuSVR(self):
        """Nu-SVR fits a random linear target with RMSE below 0.1."""
        n_rows, n_cols = 200, 5
        features = np.random.rand(n_rows, n_cols)
        weights = np.random.rand(n_cols)
        target = features.dot(weights)
        regression_data = Table(features, target)
        learner = NuSVRLearner(kernel='rbf', gamma=0.1)
        results = CrossValidation(regression_data, [learner], k=2)
        self.assertLess(RMSE(results)[0], 0.1)

    def test_OneClassSVM(self):
        """One-class SVM separates two tight blobs from uniform noise."""
        np.random.seed(42)
        domain = Domain((ContinuousVariable("c1"), ContinuousVariable("c2")))
        inliers = 0.3 * np.random.randn(40, 2)
        outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
        # Rows are ordered: blob around +2, blob around -2, then the noise.
        table = Table(domain, np.r_[inliers + 2, inliers - 2, outliers])
        true_in = len(inliers) * 2
        true_out = len(outliers)

        nu = 0.2
        model = OneClassSVMLearner(nu=nu)(table)
        predicted = model(table)
        pred_out_total = np.sum(predicted == -1)
        pred_in_among_in = np.sum(predicted[:true_in] == 1)
        pred_out_among_out = np.sum(predicted[- true_out:] == -1)

        self.assertTrue(all(np.absolute(predicted) == 1))
        self.assertTrue(pred_out_total <= len(table) * nu)
        self.assertTrue(np.absolute(pred_out_total - true_out) < 2)
        self.assertTrue(np.absolute(pred_in_among_in - true_in) < 4)
        self.assertTrue(np.absolute(pred_out_among_out - true_out) < 3)
Example #28
0
    def test_varying_between_combined(self):
        """`varying_between` reports varying attributes and metas alike,
        for both dense and sparse X."""
        X = np.array([[0, 0, 0, 0, 0, 1,],
                      [0, 0, 1, 1, 0, 1,],
                      [0, 0, 0, 2, np.nan, np.nan,],
                      [0, 1, 0, 0, 0, 0,],
                      [0, 1, 0, 2, 0, 0,],
                      [0, 1, 0, 0, np.nan, 0,]])

        # Same values as X, prefixed with the group id used as `idvar`.
        M = np.array([["A", 0, 0, 0, 0, 0, 1,],
                      ["A", 0, 0, 1, 1, 0, 1,],
                      ["A", 0, 0, 0, 2, np.nan, np.nan,],
                      ["B", 0, 1, 0, 0, 0, 0,],
                      ["B", 0, 1, 0, 2, 0, 0,],
                      ["B", 0, 1, 0, 0, np.nan, 0,]], dtype=str)

        n_features, n_metas = X.shape[1], M.shape[1]
        variables = [ContinuousVariable(name="F%d" % j)
                     for j in range(n_features)]
        metas = [StringVariable(name="M%d" % j) for j in range(n_metas)]
        domain = Domain(attributes=variables, metas=metas)

        expected = [variables[2], variables[3],
                    metas[3], metas[4], metas[5], metas[6]]

        for features in (X, sp.csr_matrix(X)):
            data = Table.from_numpy(X=features, domain=domain, metas=M)
            found = varying_between(data, idvar=data.domain.metas[0])
            self.assertEqual(found, expected)
Example #29
0
    def set_train_data(self, data):
        """
        Set the input training dataset.

        Data without a class variable is discarded with an error. An
        ``SqlTable`` is either downloaded whole or, when its approximate
        length reaches ``AUTO_DL_LIMIT``, replaced by a downloaded sample.
        Rows with missing class values are removed before the data is stored,
        the context is reopened and the results are invalidated.
        """
        self.Information.data_sampled.clear()
        if data and not data.domain.class_var:
            self.Error.class_required()
            data = None
        else:
            self.Error.class_required.clear()

        if isinstance(data, SqlTable):
            if data.approx_len() >= AUTO_DL_LIMIT:
                # Too large to fetch whole: work on a time-limited sample.
                self.Information.data_sampled()
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(sample)
            else:
                data = Table(data)

        self.train_data_missing_vals = data is not None and np.isnan(data.Y).any()
        any_missing = self.train_data_missing_vals or self.test_data_missing_vals
        if not any_missing:
            self.Warning.missing_data.clear()
        else:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = RemoveNaNClasses(data)

        self.data = data
        self.closeContext()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain.class_var)
        self._invalidate()
    def _send_output_signals(self, embeddings):
        """Split the input rows by embedding success and send both outputs.

        Rows whose entry in `embeddings` is ``None`` go to the skipped-images
        output; the remaining rows, extended with their embeddings, go to the
        embeddings output. An output with no rows receives ``None``.

        `embeddings` is indexed with a boolean mask, so it is presumably a
        numpy array aligned with ``self._input_data`` — TODO confirm.
        """
        skipped_mask = np.array([item is None for item in embeddings])

        skipped_table = None
        if np.any(skipped_mask):
            skipped_table = Table(self._input_data[skipped_mask])
            skipped_table.ids = self._input_data.ids[skipped_mask]
        self.send(_Output.SKIPPED_IMAGES, skipped_table)

        embedded_mask = np.logical_not(skipped_mask)

        embedded_table = None
        if np.any(embedded_mask):
            embedded_rows = self._input_data[embedded_mask]
            vectors = np.stack(embeddings[embedded_mask])
            embedded_table = self._construct_output_data_table(
                embedded_rows, vectors)
            # Keep the original row ids on the output.
            embedded_table.ids = self._input_data.ids[embedded_mask]
        self.send(_Output.EMBEDDINGS, embedded_table)
Example #31
0
 def test_clone_context(self):
     """Cloning a new context for the iris domain must not raise."""
     ctx = self.handler.new_context()
     domain = Table('iris').domain
     encoded_attrs, encoded_metas = self.handler.encode_domain(domain)
     self.handler.clone_context(ctx, domain, encoded_attrs, encoded_metas)
 def setUp(self):
     """Create the regression widget and load the iris and housing tables."""
     widget = self.create_widget(OWUnivariateRegression)
     self.widget = widget  # type: OWUnivariateRegression
     self.data = Table("iris")
     self.data_housing = Table("housing")
Example #33
0
 def test_data_attributes(self):
     """The widget accepts a table whose `attributes` mix value types."""
     iris = Table("iris")
     iris.attributes = {"att 1": 1, "att 2": True, "att 3": 3}
     data_input = self.widget.Inputs.data
     self.send_signal(data_input, iris)
Example #34
0
 def test_empty_data(self):
     """The widget accepts a table built from the iris domain alone."""
     iris_domain = Table("iris").domain
     data_input = self.widget.Inputs.data
     self.send_signal(data_input, Table.from_domain(iris_domain))
Example #35
0
 def test_data(self):
     """The widget accepts the plain iris table."""
     iris = Table("iris")
     data_input = self.widget.Inputs.data
     self.send_signal(data_input, iris)
 def setUp(self) -> None:
     """Attach a table model of brown-selected to ``self.view``.

     ``self.view`` is not created here; presumably the parent ``setUp``
     provides it.
     """
     super().setUp()
     brown = Table("brown-selected")
     self.model = RichTableModel(brown)
     self.view.setModel(self.model)
Example #37
0
    def apply(self):
        """Transform the input data into the template domain and send it.

        With missing data or template, or when the transformation raises,
        ``None`` is sent; a raised exception is shown as a widget error.
        """
        self.clear_messages()
        data = None
        can_transform = self.data and self.template_domain is not None
        if can_transform:
            try:
                data = self.data.transform(self.template_domain)
            except Exception as ex:  # pylint: disable=broad-except
                self.Error.error(ex)

        self.transformed_info = describe_data(data)
        self.Outputs.transformed_data.send(data)
        self.set_template_label_text()
        self.set_output_label_text(data)

    def send_report(self):
        """Report the input data, the template domain and the output summary,
        each only when it is available."""
        data = self.data
        if data:
            self.report_data("Data", data)

        template = self.template_domain
        if template is not None:
            self.report_domain("Template data", template)

        info = self.transformed_info
        if info:
            self.report_items("Transformed data", info)


# Manual preview: feed iris as data and its discretized version as template.
if __name__ == "__main__":  # pragma: no cover
    from Orange.preprocess import Discretize

    table = Table("iris")
    WidgetPreview(OWTransform).run(set_data=table,
                                   set_template_data=Discretize()(table))
Example #38
0
 def setUp(self):
     """Load the iris table used by the tests."""
     iris = Table("iris")
     self.data = iris
Example #39
0
    def setUp(self) -> None:
        """Create the Group By widget and prepare iris plus the sample data."""
        widget = self.create_widget(OWGroupBy)
        self.widget = widget
        self.iris = Table("iris")
        self.data = create_sample_data()
Example #40
0
 def setUpClass(cls):
     """Load the ionosphere test dataset once and shuffle it."""
     dataset_path = test_filename('datasets/ionosphere.tab')
     cls.data = Table(dataset_path)
     cls.data.shuffle()
Example #41
0
class TestOWDBSCAN(WidgetTest):
    """Widget tests for OWDBSCAN: outputs, parameter changes, sparse input."""

    def setUp(self):
        """Create a fresh widget and load iris for every test."""
        self.widget = self.create_widget(OWDBSCAN)
        self.iris = Table("iris")

    def tearDown(self):
        """Unregister and delete the widget created in `setUp`."""
        self.widgets.remove(self.widget)
        self.widget.onDeleteWidget()
        self.widget = None

    def test_cluster(self):
        """Output keeps the input shape and adds two meta columns."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNotNone(output)
        self.assertEqual(len(self.iris), len(output))
        self.assertTupleEqual(self.iris.X.shape, output.X.shape)
        self.assertTupleEqual(self.iris.Y.shape, output.Y.shape)
        self.assertEqual(2, output.metas.shape[1])

        self.assertEqual("Cluster", str(output.domain.metas[0]))
        self.assertEqual("DBSCAN Core", str(output.domain.metas[1]))

    def test_unique_domain(self):
        """A clashing "Cluster" name in the input yields "Cluster (1)"."""
        w = self.widget
        data = possible_duplicate_table("Cluster")
        self.send_signal(w.Inputs.data, data)
        output = self.get_output(w.Outputs.annotated_data)
        self.assertEqual(output.domain.metas[0].name, "Cluster (1)")

    def test_bad_input(self):
        """A single instance triggers the not-enough-instances error."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris[:1])
        self.assertTrue(w.Error.not_enough_instances.is_shown())

        self.send_signal(w.Inputs.data, self.iris[:2])
        self.assertFalse(w.Error.not_enough_instances.is_shown())

        self.send_signal(w.Inputs.data, self.iris)
        self.assertFalse(w.Error.not_enough_instances.is_shown())

    def test_data_none(self):
        """No input data gives no output."""
        w = self.widget

        self.send_signal(w.Inputs.data, None)

        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNone(output)

    def test_change_eps(self):
        """Changing eps changes the clustering; without data output is None."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        # change parameters
        self.widget.controls.eps.valueChanged.emit(0.5)
        output1 = self.get_output(w.Outputs.annotated_data)
        self.widget.controls.eps.valueChanged.emit(1)
        output2 = self.get_output(w.Outputs.annotated_data)

        # on this data higher eps has greater sum of clusters - less nan
        # values
        self.assertGreater(np.nansum(output2.metas[:, 0]),
                           np.nansum(output1.metas[:, 0]))

        # try when no data
        self.send_signal(w.Inputs.data, None)
        self.widget.controls.eps.valueChanged.emit(0.5)
        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNone(output)

    def test_change_min_samples(self):
        """Changing min_samples changes the clustering; without data output
        is None."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        # change parameters
        self.widget.controls.min_samples.valueChanged.emit(5)
        output1 = self.get_output(w.Outputs.annotated_data)
        self.widget.controls.min_samples.valueChanged.emit(1)
        output2 = self.get_output(w.Outputs.annotated_data)

        # on this data lower min_samples has greater sum of clusters - less nan
        # values
        self.assertGreater(np.nansum(output2.metas[:, 0]),
                           np.nansum(output1.metas[:, 0]))

        # try when no data
        self.send_signal(w.Inputs.data, None)
        self.widget.controls.min_samples.valueChanged.emit(3)
        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNone(output)

    def test_change_metric_idx(self):
        """Switching the metric changes the clustering; switching without
        data must not crash."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        # change parameters
        cbox = self.widget.controls.metric_idx
        simulate.combobox_activate_index(cbox, 0)  # Euclidean
        output1 = self.get_output(w.Outputs.annotated_data)
        simulate.combobox_activate_index(cbox, 1)  # Manhattan
        output2 = self.get_output(w.Outputs.annotated_data)

        # Manhattan has more nan clusters
        self.assertGreater(np.nansum(output1.metas[:, 0]),
                           np.nansum(output2.metas[:, 0]))

        # try when no data
        self.send_signal(w.Inputs.data, None)
        cbox = self.widget.controls.metric_idx
        simulate.combobox_activate_index(cbox, 0)  # Euclidean

    def test_sparse_csr_data(self):
        """Same expectations as `test_cluster`, with X stored as CSR."""
        with self.iris.unlocked():
            self.iris.X = csr_matrix(self.iris.X)

        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNotNone(output)
        self.assertEqual(len(self.iris), len(output))
        self.assertTupleEqual(self.iris.X.shape, output.X.shape)
        self.assertTupleEqual(self.iris.Y.shape, output.Y.shape)
        self.assertEqual(2, output.metas.shape[1])

        self.assertEqual("Cluster", str(output.domain.metas[0]))
        self.assertEqual("DBSCAN Core", str(output.domain.metas[1]))

    def test_sparse_csc_data(self):
        """Same expectations as `test_cluster`, with X stored as CSC."""
        with self.iris.unlocked():
            self.iris.X = csc_matrix(self.iris.X)

        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)

        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNotNone(output)
        self.assertEqual(len(self.iris), len(output))
        self.assertTupleEqual(self.iris.X.shape, output.X.shape)
        self.assertTupleEqual(self.iris.Y.shape, output.Y.shape)
        self.assertEqual(2, output.metas.shape[1])

        self.assertEqual("Cluster", str(output.domain.metas[0]))
        self.assertEqual("DBSCAN Core", str(output.domain.metas[1]))

    def test_get_kth_distances(self):
        """`get_kth_distances` returns one distance per row, in descending
        order, for both a metric name and an Orange distance."""
        dists = get_kth_distances(self.iris, "euclidean", k=5)
        self.assertEqual(len(self.iris), len(dists))
        # dists must be sorted
        np.testing.assert_array_equal(dists, np.sort(dists)[::-1])

        # test with different distance - e.g. Orange distance
        dists = get_kth_distances(self.iris, Euclidean, k=5)
        self.assertEqual(len(self.iris), len(dists))
        # dists must be sorted
        np.testing.assert_array_equal(dists, np.sort(dists)[::-1])

    def test_metric_changed(self):
        """Selecting the metric at index 2 still produces a full output."""
        w = self.widget

        self.send_signal(w.Inputs.data, self.iris)
        cbox = w.controls.metric_idx
        simulate.combobox_activate_index(cbox, 2)

        output = self.get_output(w.Outputs.annotated_data)
        self.assertIsNotNone(output)
        self.assertEqual(len(self.iris), len(output))
        self.assertTupleEqual(self.iris.X.shape, output.X.shape)
        self.assertTupleEqual(self.iris.Y.shape, output.Y.shape)

    def test_large_data(self):
        """
        The widget works on a larger table (iris repeated ten times).

        NOTE(review): the original note said data with "less than 1000
        instances" is subsampled in the k-values computation; given the
        input built here, "more than" was presumably meant - confirm
        against the widget's subsampling limit.
        """
        w = self.widget

        data = Table(self.iris.domain, np.repeat(self.iris.X, 10, axis=0),
                     np.repeat(self.iris.Y, 10, axis=0))

        self.send_signal(w.Inputs.data, data)
        output = self.get_output(w.Outputs.annotated_data)

        self.assertEqual(len(data), len(output))
        self.assertTupleEqual(data.X.shape, output.X.shape)
        self.assertTupleEqual(data.Y.shape, output.Y.shape)
        self.assertEqual(2, output.metas.shape[1])

    def test_titanic(self):
        """
        Titanic is a data-set with many 0 in k-nearest neighbours and thus some
        manipulation is required to set cut-point.
        This test checks whether widget works on those type of data.
        """
        w = self.widget
        data = Table("titanic")
        self.send_signal(w.Inputs.data, data)

    def test_data_retain_ids(self):
        """Output rows carry the same ids as the input rows."""
        self.send_signal(self.widget.Inputs.data, self.iris)
        output = self.get_output(self.widget.Outputs.annotated_data)
        np.testing.assert_array_equal(self.iris.ids, output.ids)

    def test_missing_data(self):
        """Rows with missing values still get a "Cluster" entry."""
        w = self.widget
        with self.iris.unlocked():
            self.iris[1:5, 1] = np.nan
        self.send_signal(w.Inputs.data, self.iris)
        output = self.get_output(w.Outputs.annotated_data)
        self.assertTupleEqual((150, 1), output[:, "Cluster"].metas.shape)

    def test_normalize_data(self):
        """Widget clusters match a direct DBSCAN run, with normalization off
        (raw data) and on (continuized, normalized, imputed data)."""
        # not normalized
        self.widget.controls.normalize.setChecked(False)

        data = Table("heart_disease")
        self.send_signal(self.widget.Inputs.data, data)

        kwargs = {
            "eps": self.widget.eps,
            "min_samples": self.widget.min_samples,
            "metric": "euclidean"
        }
        clusters = DBSCAN(**kwargs)(data)

        output = self.get_output(self.widget.Outputs.annotated_data)
        output_clusters = output.metas[:, 0].copy()
        # the comparison below expects -1 where the widget output has nan
        output_clusters[np.isnan(output_clusters)] = -1
        np.testing.assert_array_equal(output_clusters, clusters)

        # normalized
        self.widget.controls.normalize.setChecked(True)

        kwargs = {
            "eps": self.widget.eps,
            "min_samples": self.widget.min_samples,
            "metric": "euclidean"
        }
        for pp in (Continuize(), Normalize(), SklImpute()):
            data = pp(data)
        clusters = DBSCAN(**kwargs)(data)

        output = self.get_output(self.widget.Outputs.annotated_data)
        output_clusters = output.metas[:, 0].copy()
        output_clusters[np.isnan(output_clusters)] = -1
        np.testing.assert_array_equal(output_clusters, clusters)

    def test_normalize_changed(self):
        """Every metric can be selected with normalization on and off."""
        self.send_signal(self.widget.Inputs.data, self.iris)
        simulate.combobox_run_through_all(self.widget.controls.metric_idx)
        self.widget.controls.normalize.setChecked(False)
        simulate.combobox_run_through_all(self.widget.controls.metric_idx)
Example #42
0
        self.clear()
        self.data = None
        self.shutdown()
        super().onDeleteWidget()

    @classmethod
    def migrate_settings(cls, settings, version):
        """Upgrade stored `settings` in place from an older `version`.

        Before version 3 the selection was stored under "selection_indices";
        it is copied to "selection". Before version 4 a "max_iter" entry may
        exist; it is dropped.
        """
        if version < 3 and "selection_indices" in settings:
            settings["selection"] = settings["selection_indices"]
        if version < 4:
            settings.pop("max_iter", None)

    @classmethod
    def migrate_context(cls, context, version):
        """Upgrade a stored `context` in place from an older `version`.

        Before version 3 the attr_* entries were nested under "graph" in
        ``context.values``; they are copied to the top level.
        """
        if version >= 3:
            return
        values = context.values
        for key in ("attr_color", "attr_size", "attr_shape", "attr_label"):
            values[key] = values["graph"][key]


# Manual preview: load the table named on the command line (iris when no
# argument is given) and pass ten randomly chosen rows as the subset.
if __name__ == "__main__":
    import sys
    data = Table(sys.argv[1] if len(sys.argv) > 1 else "iris")
    WidgetPreview(OWtSNE).run(
        set_data=data,
        set_subset_data=data[np.random.choice(len(data), 10)],
    )
Example #43
0
 def test_string_variables(self):
     """The widget accepts the zoo table without crashing."""
     zoo = Table("zoo")
     self.send_signal(self.widget.Inputs.data, zoo)
Example #44
0
 def setUp(self):
     """Create the DBSCAN widget and load iris."""
     widget = self.create_widget(OWDBSCAN)
     self.widget = widget
     self.iris = Table("iris")
Example #45
0
    def data_table(cls, data, headers=None):
        """
        Return Orange.data.Table given rows of `headers` (iterable of iterable)
        and rows of `data` (iterable of iterable).

        Basically, the idea of subclasses is to produce those two iterables,
        however they might.

        If `headers` is not provided, the header rows are extracted from `data`,
        assuming they precede it.

        Columns are routed by their flags: ignored columns are skipped, meta
        and string columns become metas, weight columns become weights, class
        columns become class variables and everything else attributes.
        Duplicated column names are made unique by appending ``_1``, ``_2``...

        Raises:
            ValueError: a column declared continuous holds a value that
                cannot be converted to float.
        """
        if not headers:
            headers, data = cls.parse_headers(data)

        # Consider various header types (single-row, two-row, three-row, none)
        if len(headers) == 3:
            names, types, flags = map(list, headers)
        else:
            if len(headers) == 1:
                HEADER1_FLAG_SEP = '#'
                # First row format either:
                #   1) delimited column names
                #   2) -||- with type and flags prepended, separated by #,
                #      e.g. d#sex,c#age,cC#IQ
                _flags, names = zip(*[
                    i.split(HEADER1_FLAG_SEP, 1) if HEADER1_FLAG_SEP in i else
                    ('', i) for i in headers[0]
                ])
                names = list(names)
            elif len(headers) == 2:
                names, _flags = map(list, headers)
            else:
                # Use heuristics for everything
                names, _flags = [], []
            # In the combined form upper-case letters encode the type and
            # lower-case letters the flags.
            types = [
                ''.join(filter(str.isupper, flag)).lower() for flag in _flags
            ]
            flags = [Flags.join(filter(str.islower, flag)) for flag in _flags]

        # Determine maximum row length
        rowlen = max(map(len, (names, types, flags)))

        strip = False

        def _equal_length(lst):
            # Truncate or pad `lst` (in place when padding) to `rowlen`.
            nonlocal strip
            if len(lst) > rowlen > 0:
                lst = lst[:rowlen]
                strip = True
            elif len(lst) < rowlen:
                lst.extend([''] * (rowlen - len(lst)))
            return lst

        # Ensure all data is of equal width in a column-contiguous array
        data = [
            _equal_length([s.strip() for s in row]) for row in data if any(row)
        ]
        data = np.array(data, dtype=object, order='F')

        if strip:
            warnings.warn("Columns with no headers were removed.")

        # Data may actually be longer than headers were
        try:
            rowlen = data.shape[1]
        except IndexError:
            pass
        else:
            for lst in (names, types, flags):
                _equal_length(lst)

        NAMEGEN = namegen('Feature ', 1)
        Xcols, attrs = [], []
        Mcols, metas = [], []
        Ycols, clses = [], []
        Wcols = []

        # Rename variables if necessary
        # Reusing across files still works if both files have same duplicates
        name_counts = Counter(names)
        del name_counts[""]
        if len(name_counts) != len(names) and name_counts:
            uses = {
                name: 0
                for name, count in name_counts.items() if count > 1
            }
            for i, name in enumerate(names):
                if name in uses:
                    uses[name] += 1
                    names[i] = "{}_{}".format(name, uses[name])

        namask = np.empty(data.shape[0], dtype=bool)
        # Iterate through the columns
        for col in range(rowlen):
            flag = Flags(Flags.split(flags[col]))
            if flag.i:
                continue

            type_flag = types and types[col].strip()
            try:
                orig_values = data[:, col]
            except IndexError:
                orig_values = np.array([], dtype=object)

            namask = isnastr(orig_values, out=namask)

            coltype_kwargs = {}
            valuemap = None
            values = orig_values

            if type_flag in StringVariable.TYPE_HEADERS:
                coltype = StringVariable
                values = orig_values
            elif type_flag in ContinuousVariable.TYPE_HEADERS:
                coltype = ContinuousVariable
                values = np.empty(data.shape[0], dtype=float)
                try:
                    np.copyto(values,
                              orig_values,
                              casting="unsafe",
                              where=~namask)
                    values[namask] = np.nan
                except ValueError:
                    # Locate the offending row to report it to the user.
                    for row, num in enumerate(orig_values):
                        if not isnastr(num):
                            try:
                                float(num)
                            except ValueError:
                                break
                    raise ValueError('Non-continuous value in (1-based) '
                                     'line {}, column {}'.format(
                                         row + len(headers) + 1, col + 1))

            elif type_flag in TimeVariable.TYPE_HEADERS:
                coltype = TimeVariable
                values = np.where(namask, "", orig_values)
            elif (type_flag in DiscreteVariable.TYPE_HEADERS
                  or _RE_DISCRETE_LIST.match(type_flag)):
                coltype = DiscreteVariable
                orig_values = values = np.where(namask, "", orig_values)
                if _RE_DISCRETE_LIST.match(type_flag):
                    valuemap = Flags.split(type_flag)
                    coltype_kwargs.update(ordered=True)
                else:
                    valuemap = sorted(set(orig_values) - {""})
            else:
                # No known type specified, use heuristics
                valuemap, values, coltype = guess_data_type(
                    orig_values, namask)

            if flag.m or coltype is StringVariable:
                append_to = (Mcols, metas)
            elif flag.w:
                append_to = (Wcols, None)
            elif flag.c:
                append_to = (Ycols, clses)
            else:
                append_to = (Xcols, attrs)

            cols, domain_vars = append_to

            # Weight columns have no domain variable; only values are kept.
            if domain_vars is not None:
                var_name = names and names[col]
                if not var_name:
                    var_name = next(NAMEGEN)

                values, var = sanitize_variable(valuemap,
                                                values,
                                                orig_values,
                                                coltype,
                                                coltype_kwargs,
                                                name=var_name)
                var.attributes.update(flag.attributes)
                domain_vars.append(var)

            if isinstance(values, np.ndarray) and not values.flags.owndata:
                values = values.copy()  # might view `data` (string columns)
            cols.append(values)

            try:
                # allow gc to reclaim memory used by string values
                data[:, col] = None
            except IndexError:
                pass

        domain = Domain(attrs, clses, metas)

        if not data.size:
            return Table.from_domain(domain, 0)

        # `np.float64` rather than the `np.float_` alias, which NumPy 2.0
        # removed; both name the same dtype on older versions.
        X = Y = M = W = None
        if Xcols:
            X = np.c_[tuple(Xcols)]
            assert X.dtype == np.float64
        else:
            X = np.empty((data.shape[0], 0), dtype=np.float64)
        if Ycols:
            Y = np.c_[tuple(Ycols)]
            assert Y.dtype == np.float64
        if Mcols:
            M = np.c_[tuple(Mcols)].astype(object)
        if Wcols:
            W = np.c_[tuple(Wcols)].astype(float)

        table = Table.from_numpy(domain, X, Y, M, W)
        return table
 def setUp(self):
     """Create the Louvain widget with auto-commit disabled and load iris."""
     settings = {'auto_commit': False}
     self.widget = self.create_widget(
         OWLouvainClustering, stored_settings=settings
     )
     self.iris = Table('iris')
Example #47
0
 def init(self):
     """Load iris and flag that input and output domains are the same."""
     iris = Table("iris")
     self.data = iris
     self.same_input_output_domain = True
Example #48
0
 def setUp(self):
     """Create the color widget and load iris."""
     widget = self.create_widget(owcolor.OWColor)
     self.widget = widget
     self.iris = Table("iris")
Example #49
0
 def setUp(self):
     """Create the lookalike widget and load the zoo-with-images table."""
     widget = self.create_widget(OWLookalike)
     self.widget = widget
     self.zoo = Table("zoo-with-images")
Example #50
0
 def setUpClass(cls):
     """Load the iris and housing tables once for the whole test class."""
     cls.iris, cls.housing = Table("iris"), Table("housing")
Example #51
0
        else:
            self.invalidate(unconditional=True)

    def send_report(self):
        """Report the number of clusters, the optimization settings, the data
        and, when the number of clusters is optimized, the silhouette table."""
        # False positives (Setting is not recognized as int)
        # pylint: disable=invalid-sequence-index
        k_clusters = self.k
        if self.optimize_k and self.selected_row() is not None:
            # The selected row is an offset from the lower bound of k.
            k_clusters = self.k_from + self.selected_row()

        label = self.INIT_METHODS[self.smart_init][0]
        # Lower-case only the first letter: the label continues a sentence.
        label = label[0].lower() + label[1:]
        optimization = "{}, {} re-runs limited to {} steps".format(
            label, self.n_init, self.max_iterations)
        self.report_items((
            ("Number of clusters", k_clusters),
            ("Optimization", optimization)))

        if self.data is None:
            return
        self.report_data("Data", self.data)
        if self.optimize_k:
            self.report_table(
                "Silhouette scores for different numbers of clusters",
                self.table_view)

    def onDeleteWidget(self):
        """Cancel via ``self.cancel()`` before the base-class cleanup runs."""
        self.cancel()
        super().onDeleteWidget()


# Manual preview of the k-means widget on the heart_disease table.
if __name__ == "__main__":  # pragma: no cover
    WidgetPreview(OWKMeans).run(Table("heart_disease"))
Example #52
0
        arrow1 = pg.ArrowItem(
            parent=self, angle=angle_1, brush=color, pen=pg.mkPen(color)
        )
        arrow1.setPos(np.cos(angle - dangle), np.sin(angle - dangle))
        arrow2 = pg.ArrowItem(
            parent=self, angle=angle_2, brush=color, pen=pg.mkPen(color)
        )
        arrow2.setPos(np.cos(angle + dangle), np.sin(angle + dangle))
        arc_x = np.fromfunction(
            lambda i: np.cos((angle - dangle) + (2 * dangle) * i / 120.),
            (121,), dtype=int
        )
        arc_y = np.fromfunction(
            lambda i: np.sin((angle - dangle) + (2 * dangle) * i / 120.),
            (121,), dtype=int
        )
        pg.PlotCurveItem(
            parent=self, x=arc_x, y=arc_y, pen=pg.mkPen(color), antialias=False
        )

    def paint(self, painter, option, widget):
        pass

    def boundingRect(self):
        """Return a default-constructed ``QRectF``."""
        empty_rect = QRectF()
        return empty_rect


# Manual preview: brown-selected as data, every tenth row as the subset.
if __name__ == "__main__":  # pragma: no cover
    data = Table("brown-selected")
    WidgetPreview(OWRadviz).run(set_data=data, set_subset_data=data[::10])
Example #53
0
        return SampleRandomN(n,
                             self.stratified,
                             random_state=self.random_state)(table)


class SampleBootstrap(Reprable):
    """Callable that draws bootstrap sample indices.

    Args:
        size: number of rows to draw from; also the number of draws
        random_state: seed handed to ``np.random.RandomState``
    """

    def __init__(self, size=0, random_state=None):
        self.size = size
        self.random_state = random_state

    def __call__(self, table=None):
        """Bootstrap indices

        Args:
            table: Not used (but part of the signature)
        Returns:
            tuple (out_of_sample, sample) indices
        """
        # pylint: disable=no-member
        rgen = np.random.RandomState(self.random_state)
        sample = rgen.randint(0, self.size, self.size)
        sample.sort()  # not needed for the code below, just for the user
        # Use the builtin `bool`: the `np.bool` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        not_drawn = np.ones((self.size, ), dtype=bool)
        not_drawn[sample] = False
        remaining = np.flatnonzero(not_drawn)
        return remaining, sample


# Script entry point: hand the "iris" table to a WidgetPreview of
# OWDataSampler. Excluded from coverage measurement.
if __name__ == "__main__":  # pragma: no cover
    WidgetPreview(OWDataSampler).run(Table("iris"))
Example #54
0
    def send_data(self):
        """Send the annotated data table and the centroid table.

        The clustering is looked up in ``self.clusterings`` under k, which is
        ``self.k`` or, when ``self.optimize_k`` is set, ``self.k_from`` plus
        the selected row. When there is no data, or the looked-up entry is
        missing or is a string, ``None`` is sent on both outputs instead.
        """
        if not self.optimize_k:
            k = self.k
        else:
            row = self.selected_row()
            k = None if row is None else self.k_from + row

        km = self.clusterings.get(k)
        if self.data is None or km is None or isinstance(km, str):
            for output in (self.Outputs.annotated_data, self.Outputs.centroids):
                output.send(None)
            return

        n_clusters = km.k
        labels = km.labels
        domain = self.data.domain

        # Two new meta variables: cluster label (C1, C2, ...) and silhouette.
        cluster_var = DiscreteVariable(
            get_unique_names(domain, "Cluster"),
            values=["C%d" % number for number in range(1, n_clusters + 1)]
        )
        silhouette_var = ContinuousVariable(
            get_unique_names(domain, "Silhouette"))

        if len(self.data) > SILHOUETTE_MAX_SAMPLES:
            # Above the limit: raise the warning and fill silhouettes with NaN.
            self.Warning.no_silhouettes()
            scores = np.nan
            clust_scores = np.full((n_clusters, 1), np.nan)
        else:
            self.Warning.no_silhouettes.clear()
            scores = self.samples_scores(labels)
            memberships = (labels == index for index in range(n_clusters))
            # Mean silhouette per cluster; clusters without members get 0.
            clust_scores = np.atleast_2d([
                np.mean(scores[members]) if members.any() else 0.
                for members in memberships
            ]).T

        annotated = self.data.transform(
            add_columns(domain, metas=[cluster_var, silhouette_var]))
        for variable, values in ((cluster_var, labels),
                                 (silhouette_var, scores)):
            annotated.get_column_view(variable)[0][:] = values

        # Where an attribute is computed by ReplaceUnknowns from a variable of
        # the input domain, use that source variable in the centroid domain.
        centroid_attributes = []
        for attr in km.domain.attributes:
            compute = attr.compute_value
            from_input = (isinstance(compute, ReplaceUnknowns)
                          and compute.variable in domain.attributes)
            centroid_attributes.append(compute.variable if from_input else attr)

        centroid_domain = add_columns(
            Domain(centroid_attributes, [], domain.metas),
            metas=[cluster_var, silhouette_var])
        # Metas of the centroid table: NaN for the original metas, then the
        # cluster index and the per-cluster silhouette score.
        centroid_metas = np.hstack((
            np.full((n_clusters, len(domain.metas)), np.nan),
            np.arange(n_clusters).reshape(n_clusters, 1),
            clust_scores,
        ))
        centroids = Table(centroid_domain, km.centroids, None, centroid_metas)
        centroids.name = (
            "centroids" if self.data.name == Table.name
            else f"{self.data.name} centroids")

        self.Outputs.annotated_data.send(annotated)
        self.Outputs.centroids.send(centroids)
Example #55
0
 def setUp(self):
     """Create the OWEditDomain widget and load the iris table."""
     self.widget = self.create_widget(OWEditDomain)
     self.iris = Table("iris")
 # NOTE(review): no @classmethod decorator is visible on this method; unittest
 # expects setUpClass to be a classmethod - confirm it was not dropped.
 def setUpClass(cls):
     """Run the parent setup, then store subsampled tables on ``cls``.

     ``cls.iris`` holds every 5th row of "iris" and ``cls.titanic`` every
     10th row of "titanic".
     """
     super().setUpClass()
     cls.iris = Table("iris")[::5]
     cls.titanic = Table("titanic")[::10]
Example #57
0
 def test_continuous(self):
     """Send the "housing" table to the widget's data input and commit.

     The test contains no explicit assertions.
     """
     table = Table("housing")
     self.send_signal(self.widget.Inputs.data, table)
     self.widget.unconditional_commit()
Example #58
0
    def get_learner_parameters(self):
        """Return an ``OrderedDict`` describing the current learner settings.

        Keys, in order: 'Loss function', 'Penalty', 'Learning rate' and
        'Number of iterations'. The first three values are display strings
        assembled from the selected option's label plus its parameters.
        """
        # Loss: epsilon is appended for every loss except SqLoss.
        loss = self.LOSS_FUNCTIONS[self.loss_function]
        if self.loss_function != self.SqLoss:
            loss = loss + ", ε={}".format(self.epsilon)

        # Penalty: the L1 : L2 ratio is appended only for ElasticNet.
        penalty = self.PENALTIES[self.penalty_type]
        if self.penalty_type == self.ElasticNet:
            penalty = penalty + ": L1 : L2 = {} : {}".format(
                self.l1_ratio, 1.0 - self.l1_ratio)
        penalty = penalty + ', α={}'.format(self.alpha)

        # Learning rate: power_t is appended only for InvScaling.
        rate = self.LEARNING_RATES[self.learning_rate]
        rate = rate + ", η<sub>0</sub>={}".format(self.eta0)
        if self.learning_rate == self.InvScaling:
            rate = rate + ", power_t={}".format(self.power_t)

        return OrderedDict([
            ('Loss function', loss),
            ('Penalty', penalty),
            ('Learning rate', rate),
            ('Number of iterations', self.n_iter),
        ])


# Script entry point: show an OWSGDRegression widget fed with the "housing"
# table inside a QApplication, and save the widget settings after the event
# loop returns.
if __name__ == "__main__":
    import sys
    from PyQt4.QtGui import QApplication

    a = QApplication(sys.argv)
    ow = OWSGDRegression()
    d = Table('housing')
    ow.set_data(d)
    ow.show()
    # Blocks until the event loop exits.
    a.exec_()
    ow.saveSettings()
Example #59
0
                            preprocessors=self.preprocessors,
                            algorithm=self.algorithms[self.algorithm_index],
                            loss=self.losses[self.loss_index].lower())

    @Inputs.learner
    def set_base_learner(self, learner):
        """Set the base estimator from the learner input.

        A learner without weight support raises ``Error.no_weight_support``,
        leaves ``self.base_estimator`` as ``None`` and marks the label as
        INVALID. Otherwise the given learner is used, falling back to
        ``self.DEFAULT_BASE_ESTIMATOR`` when none is given. Finally
        ``self.apply()`` is called if ``self.auto_apply`` is set.
        """
        self.Error.no_weight_support.clear()
        usable = not learner or learner.supports_weights
        if usable:
            self.base_estimator = learner or self.DEFAULT_BASE_ESTIMATOR
            label = "Base estimator: %s" % self.base_estimator.name.title()
        else:
            # Report the problem and leave the widget without a base estimator.
            self.Error.no_weight_support()
            self.base_estimator = None
            label = "Base estimator: INVALID"
        self.base_label.setText(label)
        if self.auto_apply:
            self.apply()

    def get_learner_parameters(self):
        """Return the current settings as a tuple of (label, value) pairs.

        The selected algorithm and loss names are capitalized for display.
        """
        algorithm = self.algorithms[self.algorithm_index].capitalize()
        loss = self.losses[self.loss_index].capitalize()
        return (
            ("Base estimator", self.base_estimator),
            ("Number of estimators", self.n_estimators),
            ("Algorithm (classification)", algorithm),
            ("Loss (regression)", loss),
        )


# Script entry point: hand the "iris" table to a WidgetPreview of OWAdaBoost.
# Excluded from coverage measurement.
if __name__ == "__main__":  # pragma: no cover
    WidgetPreview(OWAdaBoost).run(Table("iris"))
Example #60
0
 def setUp(self):
     Variable._clear_all_caches()  # pylint: disable=protected-access
     random.seed(42)
     self.zoo = Table("zoo")