Beispiel #1
0
    def test_unique(self):
        """Check that `unique` on a CSR matrix agrees with `np.unique`.

        Every item of ``self.data`` is wrapped in a ``csr_matrix`` and the
        result is compared with numpy's, first without and then with counts.
        """
        for dense in self.data:
            sparse = csr_matrix(dense)

            # Values only
            actual_values = unique(sparse, return_counts=False)
            expected_values = np.unique(dense, return_counts=False)
            np.testing.assert_array_equal(actual_values, expected_values)

            # Values and counts, compared component by component
            actual = unique(sparse, return_counts=True)
            expected = np.unique(dense, return_counts=True)
            for got, want in zip(actual, expected):
                np.testing.assert_array_equal(got, want)
Beispiel #2
0
    def test_unique(self):
        """Sparse `unique` must match `np.unique` computed on the dense input."""
        for dense in self.data:
            as_csr = csr_matrix(dense)

            ours = unique(as_csr, return_counts=False)
            reference = np.unique(dense, return_counts=False)
            np.testing.assert_array_equal(ours, reference)

            # With counts both functions return several arrays; check each
            # pair separately.
            pairs = zip(unique(as_csr, return_counts=True),
                        np.unique(dense, return_counts=True))
            for ours, reference in pairs:
                np.testing.assert_array_equal(ours, reference)
Beispiel #3
0
    def test_sparse_explicit_zeros(self):
        """Assigning zeros to off-diagonal cells must not change `unique`."""
        # The matrix is built as `lil_matrix` to avoid the sparse warning
        # for matrix construction, and converted to CSR afterwards.
        with_zeros = lil_matrix(np.eye(3))
        for cell in ((0, 1), (1, 0)):
            with_zeros[cell] = 0
        with_zeros = with_zeros.tocsr()

        # Reference: the plain identity matrix
        identity = csr_matrix(np.eye(3))

        reference = unique(identity, return_counts=True)
        result = unique(with_zeros, return_counts=True)
        np.testing.assert_array_equal(reference, result)
Beispiel #4
0
    def test_sparse_explicit_zeros(self):
        """`unique` must treat a matrix with zeros assigned like the identity."""
        # `lil_matrix` is used for the assignments to avoid the sparse
        # warning for matrix construction.
        modified = lil_matrix(np.eye(3))
        modified[0, 1] = 0
        modified[1, 0] = 0
        modified = modified.tocsr()

        # Test against identity matrix
        identity = csr_matrix(np.eye(3))

        from_identity = unique(identity, return_counts=True)
        from_modified = unique(modified, return_counts=True)
        np.testing.assert_array_equal(from_identity, from_modified)
Beispiel #5
0
    def test_returns_counts(self, array):
        """The second item returned with ``return_counts=True`` holds the counts."""
        rows = [
            [-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.],
        ]
        x = array(rows)
        # Counts of -1, 0, 1, 2, 3, 5, 6, 7, then one entry for each NaN
        expected = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

        counts = unique(x, return_counts=True)[1]
        np.testing.assert_equal(counts, expected)
Beispiel #6
0
    def _init_feature_marker_values(self):
        """Compute one marker value per attribute and store them as an array.

        For each attribute a feature value is chosen: when coefficients are
        available, the most frequent value of a discrete column or the
        `nanmean` of a continuous one. It is replaced by the value of the
        first instance in ``self.instances`` when that value is known. The
        feature value is then mapped to a marker value through
        ``self.points`` (discrete) or the original coefficient (continuous).
        Attributes without a feature value get 0.
        """
        self.feature_marker_values = []
        target = self.target_class_index
        if self.instances:
            instances = Table(self.domain, self.instances)
        else:
            instances = None

        markers = []
        for idx, attr in enumerate(self.domain.attributes):
            feature_val = None
            if len(self.log_reg_coeffs):
                if attr.is_discrete:
                    # Most frequent value of the column
                    uniques, counts = unique(self.data.X[:, idx],
                                             return_counts=True)
                    feature_val = np.nan_to_num(uniques[np.argmax(counts)])
                else:
                    feature_val = nanmean(self.data.X[:, idx])

            # If data is provided on a separate signal, use the first data
            # instance to position the points instead of the mean
            if instances and attr in instances.domain \
                    and not np.isnan(instances[0][attr]):
                feature_val = instances[0][attr]

            if feature_val is None:
                marker = 0
            elif attr.is_discrete:
                marker = self.points[idx][target][int(feature_val)]
            else:
                marker = self.log_reg_coeffs_orig[idx][target][0] * feature_val
            markers.append(marker)
        self.feature_marker_values = np.asarray(markers)
Beispiel #7
0
    def test_returns_unique_values(self, array):
        """Without counts, `unique` returns the distinct values of the input."""
        rows = [
            [-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.],
        ]
        x = array(rows)
        # Both NaNs of the input are expected in the result
        expected = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]

        values = unique(x, return_counts=False)
        np.testing.assert_equal(values, expected)
Beispiel #8
0
    def _init_feature_marker_values(self):
        """Fill ``self.feature_marker_values`` with one value per attribute.

        The feature value of an attribute is the most frequent value
        (discrete) or the `nanmean` (continuous) of its data column, provided
        ``self.log_reg_coeffs`` is non-empty. The first row of
        ``self.instances`` overrides it when it has a non-NaN value for the
        attribute. The marker value is looked up in ``self.points`` for
        discrete attributes, or computed as coefficient times feature value
        otherwise. It stays 0 when no feature value exists.
        """
        self.feature_marker_values = []
        class_idx = self.target_class_index
        instances = None
        if self.instances:
            instances = Table(self.domain, self.instances)

        collected = []
        for col, attr in enumerate(self.domain.attributes):
            marker = 0
            chosen = None
            if len(self.log_reg_coeffs):
                if attr.is_discrete:
                    vals, freqs = unique(self.data.X[:, col],
                                         return_counts=True)
                    chosen = np.nan_to_num(vals[np.argmax(freqs)])
                else:
                    chosen = nanmean(self.data.X[:, col])

            # A separately provided instance takes precedence over the
            # statistic computed from the data column
            in_domain = instances and attr in instances.domain
            if in_domain and not np.isnan(instances[0][attr]):
                chosen = instances[0][attr]

            if chosen is not None:
                if attr.is_discrete:
                    marker = self.points[col][class_idx][int(chosen)]
                else:
                    marker = \
                        self.log_reg_coeffs_orig[col][class_idx][0] * chosen
            collected.append(marker)
        self.feature_marker_values = np.asarray(collected)
Beispiel #9
0
    def set_train_data(self, data):
        """
        Set the input training dataset.

        The data is validated first; the first check that fails is reported
        through ``Error.train_data_error`` and the input is then treated as
        missing. An ``SqlTable`` is converted to a ``Table``: directly when
        ``approx_len()`` is below ``AUTO_DL_LIMIT``, otherwise from a
        partially downloaded sample. Finally the widget state (warnings,
        context, scorers, controls) is refreshed.

        Parameters
        ----------
        data : Optional[Orange.data.Table]
        """
        self.cancel()
        self.Information.data_sampled.clear()
        self.Error.train_data_error.clear()

        if data is not None:
            # All conditions are computed up front; only the first one that
            # holds is reported.
            checks = [
                ("Train dataset is empty.", len(data) == 0),
                ("Train data input requires a target variable.",
                 not data.domain.class_vars),
                ("Too many target variables.",
                 len(data.domain.class_vars) > 1),
                ("Target variable has no values.", np.isnan(data.Y).all()),
                ("Target variable has only one value.",
                 data.domain.has_discrete_class and len(unique(data.Y)) < 2),
                ("Data has no features to learn from.", data.X.shape[1] == 0),
            ]
            first_failure = next(
                (message for message, failed in checks if failed), None)
            if first_failure is not None:
                self.Error.train_data_error(first_failure)
                data = None

        if isinstance(data, SqlTable):
            if data.approx_len() < AUTO_DL_LIMIT:
                data = Table(data)
            else:
                self.Information.data_sampled()
                sample = data.sample_time(1, no_cache=True)
                sample.download_data(AUTO_DL_LIMIT, partial=True)
                data = Table(sample)

        if data is None:
            self.train_data_missing_vals = False
        else:
            self.train_data_missing_vals = np.isnan(data.Y).any()

        if self.train_data_missing_vals or self.test_data_missing_vals:
            self.Warning.missing_data(self._which_missing_data())
            if data:
                data = HasClass()(data)
        else:
            self.Warning.missing_data.clear()

        self.data = data
        self.closeContext()
        self._update_scorers()
        self._update_controls()
        if data is not None:
            self._update_class_selection()
            self.openContext(data.domain)
            if self.fold_feature_selected and bool(self.feature_model):
                self.resampling = OWTestAndScore.FeatureFold
        self._invalidate()
Beispiel #10
0
    def test_returns_counts(self, array):
        """Counts returned by `unique` must match the hand-computed ones."""
        x = array([
            [-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.],
        ])
        # One count per value -1, 0, 1, 2, 3, 5, 6, 7 and one per NaN
        expected = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

        result = unique(x, return_counts=True)
        np.testing.assert_equal(result[1], expected)
Beispiel #11
0
    def test_returns_unique_values(self, array):
        """`unique` without counts must list every distinct value."""
        x = array([
            [-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., np.nan],
            [-1., 0., 0., 1., 7., 6.],
        ])
        # The two NaNs of the input are both expected in the output
        expected = [-1, 0, 1, 2, 3, 5, 6, 7, np.nan, np.nan]

        result = unique(x, return_counts=False)
        np.testing.assert_equal(result, expected)
Beispiel #12
0
    def get_domain(self, domain, data):
        """Build a new domain and data columns from the edits in the widget.

        The rows of the model are paired, in order, with the attributes,
        class variables and metas of `domain`. Rows placed in ``Place.skip``
        are dropped; every other column is converted to the requested
        variable type and collected under its new place.

        Parameters
        ----------
        domain : old domain
        data : source data

        Returns
        -------
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        """
        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        # Original variables, each tagged with the place it came from
        originals = chain(
            [(at, Place.feature) for at in domain.attributes],
            [(cl, Place.class_var) for cl in domain.class_vars],
            [(mt, Place.meta) for mt in domain.metas])

        for (name, tpe, place, _, _), (orig_var, orig_plc) in \
                zip(variables, originals):
            if place == Place.skip:
                continue

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)
            if tpe == type(orig_var):
                if name != orig_var.name:
                    # change the name so that all_vars will get the correct name
                    orig_var.name = name
                var = orig_var
            elif tpe == DiscreteVariable:
                values = [str(val) for val in unique(col_data)
                          if not self._is_missing(val)]
                var = tpe(name, values)
                # Replace every value by its index among the distinct values
                indices = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                col_data = self._to_column(indices, is_sparse)
            elif tpe == StringVariable and type(orig_var) == DiscreteVariable:
                var = tpe(name)
                texts = [
                    orig_var.repr_val(x) if not np.isnan(x) else ""
                    for x in self._iter_vals(col_data)
                ]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(texts, False, dtype=object)
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        if len(feats):
            X = self._merge(feats)
        else:
            X = np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        return domain, [X, Y, m]
Beispiel #13
0
    def test_unique_explicit_zeros(self):
        """`unique` must give equal results once zeros are assigned explicitly."""
        reference = csr_matrix(np.eye(3))
        modified = csr_matrix(np.eye(3))

        # Assign zeros to two off-diagonal cells, ignoring the
        # SparseEfficiencyWarning while doing so
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=sp.sparse.SparseEfficiencyWarning)
            for cell in ((0, 1), (1, 0)):
                modified[cell] = 0

        # Compare first the values alone, then values together with counts
        for with_counts in (False, True):
            np.testing.assert_array_equal(
                unique(reference, return_counts=with_counts),
                unique(modified, return_counts=with_counts),
            )
Beispiel #14
0
    def test_unique_explicit_zeros(self):
        """Zeros written into a CSR matrix must not alter what `unique` returns."""
        untouched = csr_matrix(np.eye(3))
        zeroed = csr_matrix(np.eye(3))

        # set some off-diagonal cells to explicit zeros
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore", category=sp.sparse.SparseEfficiencyWarning)
            zeroed[0, 1] = 0
            zeroed[1, 0] = 0

        values_a = unique(untouched, return_counts=False)
        values_b = unique(zeroed, return_counts=False)
        np.testing.assert_array_equal(values_a, values_b)

        counted_a = unique(untouched, return_counts=True)
        counted_b = unique(zeroed, return_counts=True)
        np.testing.assert_array_equal(counted_a, counted_b)
    def test_returns_counts(self, array):
        """With ``return_counts=True`` both the values and their counts are checked."""
        x = array([
            [-1., 1., 0., 2., 3., np.nan],
            [0., 0., 0., 3., 5., 42.],
            [-1., 0., 0., 1., 7., 6.],
        ])
        expected_values = [-1, 0, 1, 2, 3, 5, 6, 7, 42, np.nan]
        expected_counts = [2, 6, 2, 1, 2, 1, 1, 1, 1, 1]

        result = unique(x, return_counts=True)
        vals, counts = result

        np.testing.assert_equal(vals, expected_values)
        np.testing.assert_equal(counts, expected_counts)
Beispiel #16
0
 def check_data(self):
     """Validate the current data for the current learner.

     Sets ``self.valid_data`` and returns it. When both data and learner
     are present, the first failing check is reported through
     ``Error.data_error`` (or ``Error.sparse_not_supported`` for sparse
     data the widget does not support).
     """
     self.valid_data = False
     self.Error.sparse_not_supported.clear()
     # Nothing to validate without both inputs
     if self.data is None or self.learner is None:
         return self.valid_data

     self.Error.data_error.clear()
     if not self.learner.check_learner_adequacy(self.data.domain):
         self.Error.data_error(self.learner.learner_adequacy_err_msg)
     elif not len(self.data):
         self.Error.data_error("Dataset is empty.")
     elif len(ut.unique(self.data.Y)) < 2:
         self.Error.data_error("Data contains a single target value.")
     elif self.data.X.size == 0:
         self.Error.data_error("Data has no features to learn from.")
     elif self.data.is_sparse() and not self.supports_sparse:
         self.Error.sparse_not_supported()
     else:
         self.valid_data = True
     return self.valid_data
Beispiel #17
0
 def _init_feature_marker_values(self):
     """Rebuild ``self.feature_marker_values``, one entry per attribute.

     For each attribute a feature value is chosen: when coefficients are
     available, the most frequent value of a discrete column or the `mean`
     of a continuous one. It is replaced by the value of the first row of
     ``self.instances`` when that value is not NaN. The marker value is
     then looked up in ``self.points`` (discrete) or computed from the
     original coefficient (continuous). It is 0 when no feature value exists.
     """
     self.feature_marker_values = []
     target = self.target_class_index
     if self.instances:
         instances = Table(self.domain, self.instances)
     else:
         instances = None

     for idx, attr in enumerate(self.domain.attributes):
         feature_val = None
         if len(self.log_reg_coeffs):
             if attr.is_discrete:
                 # Most frequent value of the column
                 uniques, counts = unique(self.data.X[:, idx],
                                          return_counts=True)
                 feature_val = np.nan_to_num(uniques[np.argmax(counts)])
             else:
                 feature_val = mean(self.data.X[:, idx])

         # The first provided instance overrides the computed statistic
         if instances and attr in instances.domain \
                 and not np.isnan(instances[0][attr]):
             feature_val = instances[0][attr]

         if feature_val is None:
             marker = 0
         elif attr.is_discrete:
             marker = self.points[idx][target][int(feature_val)]
         else:
             marker = self.log_reg_coeffs_orig[idx][target][0] * feature_val
         self.feature_marker_values.append(marker)
Beispiel #18
0
    def check_data(self):
        """Validate the current data for the current learner.

        The learner's class hierarchy is searched for the first class that
        defines ``incompatibility_reason`` or the deprecated
        ``check_learner_adequacy``. Whichever is found first decides how
        incompatibility is determined. Further checks cover empty data, a
        single target value, missing features and unsupported sparse data.

        Returns ``self.valid_data``, which is True only if every check passes.
        """
        self.valid_data = False
        self.Error.sparse_not_supported.clear()
        # Nothing to validate without both inputs
        if self.data is None or self.learner is None:
            return self.valid_data

        self.Error.data_error.clear()

        reason = None
        for klass in type(self.learner).mro():
            members = vars(klass)
            if 'incompatibility_reason' in members:
                # pylint: disable=assignment-from-none
                reason = self.learner.incompatibility_reason(self.data.domain)
                break
            if 'check_learner_adequacy' in members:
                warnings.warn(
                    "check_learner_adequacy is deprecated and will be removed "
                    "in upcoming releases. Learners should instead implement "
                    "the incompatibility_reason method.",
                    OrangeDeprecationWarning)
                adequate = self.learner.check_learner_adequacy(
                    self.data.domain)
                if not adequate:
                    reason = self.learner.learner_adequacy_err_msg
                break

        if reason is not None:
            self.Error.data_error(reason)
        elif not len(self.data):
            self.Error.data_error("Dataset is empty.")
        elif len(ut.unique(self.data.Y)) < 2:
            self.Error.data_error("Data contains a single target value.")
        elif self.data.X.size == 0:
            self.Error.data_error("Data has no features to learn from.")
        elif self.data.is_sparse() and not self.supports_sparse:
            self.Error.sparse_not_supported()
        else:
            self.valid_data = True

        return self.valid_data
Beispiel #19
0
    def get_domain(self, domain, data):
        """Create domain (and dataset) from changes made in the widget.

        The rows of the model are paired, in order, with the attributes,
        class variables and metas of `domain`. If no row differs from its
        original variable in name, type or place, the input domain and the
        arrays of `data` are returned unchanged. Otherwise every row not
        placed in ``Place.skip`` is converted to the requested variable type
        and collected under its new place.

        Parameters
        ----------
        domain : old domain
        data : source data

        Returns
        -------
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck

        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        def numbers_are_round(var, col_data):
            # True if `var` is continuous and every non-NaN value is whole;
            # NaNs are ignored, so the column may still contain them.
            if type(var) == ContinuousVariable:
                data = np.asarray(col_data.data)  # Works for dense and sparse
                data = data[~np.isnan(data)]
                return (data == data.astype(int)).all()
            return False

        # Exit early with original domain if the user didn't actually change anything
        if all((name == orig_var.name and tpe == type(orig_var) and place == orig_plc)
               for (name, tpe, place, _, _), (orig_var, orig_plc) in
               zip(variables,
                   chain(((at, Place.feature) for at in domain.attributes),
                         ((cl, Place.class_var) for cl in domain.class_vars),
                         ((mt, Place.meta) for mt in domain.metas)))):
            return domain, [data.X, data.Y, data.metas]

        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) in \
                zip(variables,
                    chain([(at, Place.feature) for at in domain.attributes],
                          [(cl, Place.class_var) for cl in domain.class_vars],
                          [(mt, Place.meta) for mt in domain.metas])):
            if place == Place.skip:
                continue

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            if name == orig_var.name and tpe == type(orig_var):
                var = orig_var
            elif tpe == type(orig_var):
                # change the name so that all_vars will get the correct name
                orig_var.name = name
                var = orig_var
            elif tpe == DiscreteVariable:
                values = list(
                    str(i) for i in unique(col_data)
                    if not self._is_missing(i))
                round_numbers = numbers_are_round(orig_var, col_data)
                # Indices must be looked up before the labels are rewritten
                # below, because the lookup uses the unrounded strings.
                col_data = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                if round_numbers:
                    values = [str(int(float(v))) for v in values]
                var = tpe(name, values)
                col_data = self._to_column(col_data, is_sparse)
            elif tpe == StringVariable:
                var = tpe(name)
                if type(orig_var) == DiscreteVariable:
                    col_data = [
                        orig_var.repr_val(x) if not np.isnan(x) else ""
                        for x in self._iter_vals(col_data)
                    ]
                elif type(orig_var) == ContinuousVariable:
                    round_numbers = numbers_are_round(orig_var, col_data)
                    # Test for NaN first: numbers_are_round ignores NaNs, so
                    # a column of whole numbers with missing values reaches
                    # this point with round_numbers set, and int(nan) raises
                    # ValueError.
                    col_data = [
                        "" if np.isnan(x) else
                        str(int(x)) if round_numbers else
                        orig_var.repr_val(x)
                        for x in self._iter_vals(col_data)
                    ]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(
                    orig_var) == DiscreteVariable:
                var = tpe(name)
                if may_be_numeric:
                    # Parse the discrete labels as numbers
                    col_data = [
                        np.nan if self._is_missing(x) else float(
                            orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        return domain, [X, Y, m]
Beispiel #20
0
    def get_domain(self, domain, data):
        """Build the edited domain together with the matching data columns.

        The rows of the model are paired, in order, with the attributes,
        class variables and metas of `domain`. When nothing was changed the
        input domain and the arrays of `data` are returned as they are.
        Otherwise each row not placed in ``Place.skip`` is converted to the
        requested variable type.

        Parameters
        ----------
        domain : old domain
        data : source data

        Returns
        -------
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck

        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        def numbers_are_round(var, col_data):
            # Only continuous variables are inspected; NaNs are ignored
            if type(var) == ContinuousVariable:
                numbers = np.asarray(col_data.data)  # Works for dense and sparse
                numbers = numbers[~np.isnan(numbers)]
                return (numbers == numbers.astype(int)).all()
            return False

        def originals():
            # Original variables, each tagged with the place it came from
            return chain(((at, Place.feature) for at in domain.attributes),
                         ((cl, Place.class_var) for cl in domain.class_vars),
                         ((mt, Place.meta) for mt in domain.metas))

        # Exit early with original domain if the user didn't actually change anything
        unchanged = all(
            name == orig_var.name and tpe == type(orig_var) and place == orig_plc
            for (name, tpe, place, _, _), (orig_var, orig_plc)
            in zip(variables, originals()))
        if unchanged:
            return domain, [data.X, data.Y, data.metas]

        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) \
                in zip(variables, originals()):
            if place == Place.skip:
                continue

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            if tpe == type(orig_var):
                if name != orig_var.name:
                    # change the name so that all_vars will get the correct name
                    orig_var.name = name
                var = orig_var
            elif tpe == DiscreteVariable:
                values = [str(i) for i in unique(col_data)
                          if not self._is_missing(i)]
                round_numbers = numbers_are_round(orig_var, col_data)
                # Indices are looked up before the labels are rewritten below
                indices = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                if round_numbers:
                    values = [str(int(float(v))) for v in values]
                var = tpe(name, values)
                col_data = self._to_column(indices, is_sparse)
            elif tpe == StringVariable:
                var = tpe.make(name)
                if type(orig_var) == DiscreteVariable:
                    col_data = [
                        orig_var.repr_val(x) if not np.isnan(x) else ""
                        for x in self._iter_vals(col_data)
                    ]
                elif type(orig_var) == ContinuousVariable:
                    round_numbers = numbers_are_round(orig_var, col_data)
                    texts = []
                    for x in self._iter_vals(col_data):
                        if np.isnan(x):
                            texts.append('')
                        elif round_numbers:
                            texts.append(str(int(x)))
                        else:
                            texts.append(orig_var.repr_val(x))
                    col_data = texts
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
                var = tpe.make(name)
                if may_be_numeric:
                    # Parse the discrete labels as numbers
                    col_data = [
                        np.nan if self._is_missing(x)
                        else float(orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        if len(feats):
            X = self._merge(feats)
        else:
            X = np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        return domain, [X, Y, m]
Beispiel #21
0
    def get_domain(self, domain, data):
        """Create domain (and dataset) from changes made in the widget.

        The rows of the model are paired, in order, with the attributes,
        class variables and metas of `domain`. Rows placed in ``Place.skip``
        are dropped; every other column is converted to the requested
        variable type and collected under its new place.

        Parameters
        ----------
        domain : old domain
        data : source data

        Returns
        -------
        (new_domain, [attribute_columns, class_var_columns, meta_columns])
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck

        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        for (name, tpe, place, _, _), (orig_var, orig_plc) in \
                zip(variables,
                    chain([(at, Place.feature) for at in domain.attributes],
                          [(cl, Place.class_var) for cl in domain.class_vars],
                          [(mt, Place.meta) for mt in domain.metas])):
            if place == Place.skip:
                continue

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            # Continuous column whose non-NaN values are all whole numbers;
            # NaNs are skipped, so the column may still contain them.
            cont_ints = type(orig_var) == ContinuousVariable and \
                        all(x.is_integer() for x in self._iter_vals(col_data) if not np.isnan(x))
            # Discrete variable whose labels all consist of decimal digits
            disc_ints = type(orig_var) == DiscreteVariable and \
                        all(x.isdecimal() for x in orig_var.values)

            if name == orig_var.name and tpe == type(orig_var):
                var = orig_var
            elif tpe == type(orig_var):
                # change the name so that all_vars will get the correct name
                orig_var.name = name
                var = orig_var
            elif tpe == DiscreteVariable:
                values = list(str(i) for i in unique(col_data) if not self._is_missing(i))
                # Indices must be looked up before the labels are rewritten
                # below, because the lookup uses the unrounded strings.
                col_data = [np.nan if self._is_missing(x) else values.index(str(x))
                            for x in self._iter_vals(col_data)]
                if cont_ints:
                    values = [str(int(float(v))) for v in values]
                var = tpe(name, values)
                col_data = self._to_column(col_data, is_sparse)
            elif tpe == StringVariable:
                var = tpe(name)
                if type(orig_var) == DiscreteVariable:
                    col_data = [orig_var.repr_val(x) if not np.isnan(x) else ""
                                for x in self._iter_vals(col_data)]
                elif type(orig_var) == ContinuousVariable:
                    # Test for NaN first: cont_ints is computed over non-NaN
                    # values only, so it can be set while the column holds
                    # NaNs, and int(nan) raises ValueError.
                    col_data = ["" if np.isnan(x) else
                                str(int(x)) if cont_ints else
                                orig_var.repr_val(x)
                                for x in self._iter_vals(col_data)]
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(orig_var) == DiscreteVariable:
                var = tpe(name)
                if disc_ints:
                    # Parse the discrete labels as numbers
                    col_data = [np.nan if self._is_missing(x) else float(orig_var.values[int(x)])
                                for x in self._iter_vals(col_data)]
                col_data = self._to_column(col_data, is_sparse)
            else:
                var = tpe(name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        X = self._merge(feats) if len(feats) else np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        return domain, [X, Y, m]
Beispiel #22
0
    def get_domain(self, domain, data, deduplicate=False):
        """
        Create domain (and dataset) from changes made in the widget.

        The rows of the model are paired, in order, with the attributes,
        class variables and metas of `domain`. When nothing was changed the
        input domain and the arrays of `data` are returned as they are.
        Otherwise each row not placed in ``Place.skip`` is converted to the
        requested variable type.

        Args:
            domain (Domain): original domain
            data (Table): original data
            deduplicate (bool): if True, variable names are deduplicated and
               the result contains an additional list with names of renamed
               variables

        Returns:
            (new_domain, [attribute_columns, class_var_columns, meta_columns])
            or
            (new_domain, [attribute_columns, class_var_columns, meta_columns], renamed)
        """
        # Allow type-checking with type() instead of isinstance() for exact comparison
        # pylint: disable=unidiomatic-typecheck
        variables = self.model().variables
        places = [[], [], []]  # attributes, class_vars, metas
        cols = [[], [], []]  # Xcols, Ycols, Mcols

        def numbers_are_round(var, col_data):
            # Only continuous variables are inspected; NaNs are ignored
            if type(var) == ContinuousVariable:
                numbers = np.asarray(col_data.data)  # Works for dense and sparse
                numbers = numbers[~np.isnan(numbers)]
                return (numbers == numbers.astype(int)).all()
            return False

        def originals():
            # Original variables, each tagged with the place it came from
            return chain(((at, Place.feature) for at in domain.attributes),
                         ((cl, Place.class_var) for cl in domain.class_vars),
                         ((mt, Place.meta) for mt in domain.metas))

        # Exit early with original domain if the user didn't actually change anything
        unchanged = all(
            name == orig_var.name and tpe == type(orig_var)
            and place == orig_plc
            for (name, tpe, place, _, _), (orig_var, orig_plc)
            in zip(variables, originals()))
        if unchanged:
            untouched = (domain, [data.X, data.Y, data.metas])
            return untouched + ([],) if deduplicate else untouched

        # Names of the kept rows, made unique on request
        relevant_names = [row[0] for row in variables if row[2] != Place.skip]
        if deduplicate:
            relevant_names = get_unique_names_duplicates(relevant_names)
        renamed_iter = iter(relevant_names)
        renamed = []

        for (name, tpe, place, _, may_be_numeric), (orig_var, orig_plc) \
                in zip(variables, originals()):
            if place == Place.skip:
                continue

            new_name = next(renamed_iter)
            if new_name != name and name not in renamed:
                renamed.append(name)

            col_data = self._get_column(data, orig_var, orig_plc)
            is_sparse = sp.issparse(col_data)

            if tpe == type(orig_var):
                # Same type: reuse the variable, or a renamed copy of it
                if new_name == orig_var.name:
                    var = orig_var
                else:
                    var = orig_var.copy(name=new_name)
            elif tpe == DiscreteVariable:
                values = natural_sorted(
                    [str(i) for i in unique(col_data)
                     if not self._is_missing(i)])
                round_numbers = numbers_are_round(orig_var, col_data)
                # Indices are looked up before the labels are rewritten below
                indices = [
                    np.nan if self._is_missing(x) else values.index(str(x))
                    for x in self._iter_vals(col_data)
                ]
                if round_numbers:
                    values = [str(int(float(v))) for v in values]
                var = tpe(new_name, values)
                col_data = self._to_column(indices, is_sparse)
            elif tpe == StringVariable:
                var = tpe.make(new_name)
                if type(orig_var) in [DiscreteVariable, TimeVariable]:
                    col_data = [
                        orig_var.repr_val(x) if not np.isnan(x) else ""
                        for x in self._iter_vals(col_data)
                    ]
                elif type(orig_var) == ContinuousVariable:
                    round_numbers = numbers_are_round(orig_var, col_data)
                    texts = []
                    for x in self._iter_vals(col_data):
                        if np.isnan(x):
                            texts.append('')
                        elif round_numbers:
                            texts.append(str(int(x)))
                        else:
                            texts.append(orig_var.repr_val(x))
                    col_data = texts
                # don't obey sparsity for StringVariable since they are
                # in metas which are transformed to dense below
                col_data = self._to_column(col_data, False, dtype=object)
            elif tpe == ContinuousVariable and type(
                    orig_var) == DiscreteVariable:
                var = tpe.make(new_name)
                if may_be_numeric:
                    # Parse the discrete labels as numbers
                    col_data = [
                        np.nan if self._is_missing(x)
                        else float(orig_var.values[int(x)])
                        for x in self._iter_vals(col_data)
                    ]
                col_data = self._to_column(col_data, is_sparse)
            else:
                var = tpe(new_name)
            places[place].append(var)
            cols[place].append(col_data)

        # merge columns for X, Y and metas
        feats = cols[Place.feature]
        if feats:
            X = self._merge(feats)
        else:
            X = np.empty((len(data), 0))
        Y = self._merge(cols[Place.class_var], force_dense=True)
        m = self._merge(cols[Place.meta], force_dense=True)
        domain = Domain(*places)
        result = (domain, [X, Y, m])
        return result + (renamed,) if deduplicate else result