Example #1
0
 def test_empty():
     """One-hot encoding an empty sequence yields an empty array."""
     # With no explicit dimension, the result has zero columns as well.
     np.testing.assert_array_equal(one_hot([]), np.zeros((0, 0)))
     # An explicit `dim` fixes the column count even when there are no rows.
     np.testing.assert_array_equal(one_hot([], dim=2), np.zeros((0, 2)))
Example #2
0
        def one_hot_probs(value):
            """Build degenerate (0/1) probability estimates from predicted values.

            Fix: `one_hot` only emits columns up to the largest value that
            actually occurs, so the previous calls could produce arrays
            narrower than the class variable's cardinality and fail on
            assignment; an explicit `dim` pins the width.
            """
            if not multitarget:
                # One column per declared class value, even if some values
                # never occur among the predictions.
                return one_hot(value, dim=len(self.domain.class_var.values))

            max_card = max(len(c.values) for c in self.domain.class_vars)
            probs = np.zeros(value.shape + (max_card,), float)
            for i in range(len(self.domain.class_vars)):
                # dim=max_card guarantees the encoded slice matches the
                # pre-allocated width of `probs[:, i, :]`.
                probs[:, i, :] = one_hot(value[:, i], dim=max_card)
            return probs
Example #3
0
    def _get_bin_distributions(self, bin_indices):
        """Compute the distribution of instances within bins.

        Parameters
        ----------
        bin_indices : np.ndarray
            An array with same shape as `x` but containing the bin index of the
            instance.

        Returns
        -------
        np.ndarray
            A 2d array; the first dimension represents different bins, the
            second - the counts of different target values.

        """
        if self.target_var and self.target_var.is_discrete:
            y = self.y
            # TODO This probably also isn't the best handling of sparse data...
            if sp.issparse(y):
                y = np.squeeze(np.array(y.todense()))

            # Fix: y can contain missing values; filter them out together with
            # their corresponding bin assignments, otherwise NaNs corrupt the
            # one-hot encoding below.
            y_nan_mask = np.isnan(y)
            y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

            # Fix: encode with an explicit dimension so the result always has
            # one column per declared target value, even when the largest
            # values never appear in `y`.
            y = one_hot(y, dim=len(self.target_var.values))

            bins = np.arange(self.n_bins)[:, np.newaxis]
            mask = bin_indices == bins
            distributions = np.zeros((self.n_bins, y.shape[1]))
            for bin_idx in range(self.n_bins):
                distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
        else:
            distributions, _ = ut.bincount(bin_indices.astype(np.int64))
            # To keep things consistent across different variable types, we
            # want to return a 2d array where the first dim represent different
            # bins, and the second the distributions.
            distributions = distributions[:, np.newaxis]

        return distributions
Example #4
0
 def test_dim(self):
     """A `dim` larger than strictly needed pads with zero columns."""
     # Rows of a 4x4 identity matrix selected by the encoded values.
     expected = np.eye(4)[[0, 1, 2, 1]]
     np.testing.assert_array_equal(one_hot(self.values, dim=4), expected)
Example #5
0
 def test_one_hot(self):
     """Each value maps to the matching row of an identity matrix."""
     expected = np.eye(3)[[0, 1, 2, 1]]
     np.testing.assert_array_equal(one_hot(self.values), expected)
Example #6
0
    def _get_bin_distributions(self, bin_indices):
        """Compute the distribution of instances within bins.

        Parameters
        ----------
        bin_indices : np.ndarray
            An array with same shape as `x` but containing the bin index of the
            instance.

        Returns
        -------
        np.ndarray
            A 2d array; the first dimension represents different bins, the
            second - the counts of different target values.

        """
        if self.target_var and self.target_var.is_discrete:
            y = self.y
            # TODO This probably also isn't the best handling of sparse data...
            if sp.issparse(y):
                y = np.squeeze(np.array(y.todense()))

            # Since y can contain missing values, we need to filter them out as
            # well as their corresponding `x` values
            y_nan_mask = np.isnan(y)
            y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

            # Encode with an explicit dimension so the matrix always has one
            # column per declared target value, even when the largest values
            # never appear in `y`; this replaces the previous manual
            # zero-column padding via np.hstack.
            y = one_hot(y, dim=len(self.target_var.values))

            bins = np.arange(self.n_bins)[:, np.newaxis]
            mask = bin_indices == bins
            distributions = np.zeros((self.n_bins, y.shape[1]))
            for bin_idx in range(self.n_bins):
                distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
        else:
            distributions, _ = ut.bincount(bin_indices.astype(np.int64))
            # To keep things consistent across different variable types, we
            # want to return a 2d array where the first dim represent different
            # bins, and the second the distributions.
            distributions = distributions[:, np.newaxis]

        return distributions
Example #7
0
    def _get_bin_distributions(self, bin_indices):
        """Count target values per bin.

        Parameters
        ----------
        bin_indices : np.ndarray
            Same shape as `x`; the bin index assigned to each instance.

        Returns
        -------
        np.ndarray
            2d array: rows are bins, columns are counts of target values.

        """
        if not (self.target_var and self.target_var.is_discrete):
            counts, _ = ut.bincount(bin_indices.astype(np.int64))
            # Keep the result 2d (bins x 1) for consistency with the
            # discrete-target branch.
            return counts[:, np.newaxis]

        y = self.y
        # TODO This probably also isn't the best handling of sparse data...
        if sp.issparse(y):
            y = np.squeeze(np.array(y.todense()))

        # Drop missing target values along with their bin assignments.
        keep = ~np.isnan(y)
        y, bin_indices = y[keep], bin_indices[keep]

        encoded = one_hot(y)
        # one_hot only emits columns up to the largest observed value; pad
        # with zero columns so there is one column per declared target value.
        n_values = len(self.target_var.values)
        if encoded.shape[1] != n_values:
            padding = np.zeros((encoded.shape[0], n_values - encoded.shape[1]))
            encoded = np.hstack((encoded, padding))

        row_masks = bin_indices == np.arange(self.n_bins)[:, np.newaxis]
        distributions = np.zeros((self.n_bins, encoded.shape[1]))
        for idx, row_mask in enumerate(row_masks):
            distributions[idx] = encoded[row_mask].sum(axis=0)

        return distributions
Example #8
0
    def __call__(self, data, ret=Value):
        """Predict `data` and return values, probabilities, or both.

        Parameters
        ----------
        data : np.ndarray, scipy.sparse.csr_matrix, Table, Instance, list or tuple
            The instance(s) to predict.
        ret : int
            Model.Value, Model.Probs, or Model.ValueProbs.

        Returns
        -------
        Values, probabilities, or a (values, probabilities) pair, depending on
        `ret`. For single-instance input the leading axis is stripped.

        Raises
        ------
        ValueError
            For an invalid `ret` or when distributions are requested for
            continuous class variables.
        TypeError
            For unrecognized `data` or an unexpected prediction shape.
        DomainTransformationError
            When transforming to the original domain yields no defined values.
        """
        def fix_dim(x):
            # Strip the artificial leading axis added for 1-d/single input.
            return x[0] if one_d else x

        if not 0 <= ret <= 2:
            raise ValueError("invalid value of argument 'ret'")
        if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
            raise ValueError("cannot predict continuous distributions")

        # Call the predictor
        one_d = False
        if isinstance(data, np.ndarray):
            one_d = data.ndim == 1
            prediction = self.predict(np.atleast_2d(data))
        elif isinstance(data, scipy.sparse.csr.csr_matrix):
            prediction = self.predict(data)
        elif isinstance(data, (Table, Instance)):
            if isinstance(data, Instance):
                data = Table(data.domain, [data])
                one_d = True
            if data.domain != self.domain:
                # Route through the original domain only when that can produce
                # defined values; otherwise fail loudly below.
                if self.original_domain.attributes != data.domain.attributes \
                        and data.X.size \
                        and not np.isnan(data.X).all():
                    data = data.transform(self.original_domain)
                    if np.isnan(data.X).all():
                        raise DomainTransformationError(
                            "domain transformation produced no defined values")
                data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        elif isinstance(data, (list, tuple)):
            if not isinstance(data[0], (list, tuple)):
                data = [data]
                one_d = True
            data = Table.from_list(self.original_domain, data)
            data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        else:
            raise TypeError("Unrecognized argument (instance of '{}')"
                            .format(type(data).__name__))

        # Parse the result into value and probs
        multitarget = len(self.domain.class_vars) > 1
        if isinstance(prediction, tuple):
            value, probs = prediction
        elif prediction.ndim == 1 + multitarget:
            value, probs = prediction, None
        elif prediction.ndim == 2 + multitarget:
            value, probs = None, prediction
        else:
            # Fix: the dimensionality was previously passed as a second
            # TypeError argument and never interpolated into the message.
            raise TypeError("model returned a %i-dimensional array"
                            % prediction.ndim)

        # Ensure that we have what we need to return
        if ret != Model.Probs and value is None:
            value = np.argmax(probs, axis=-1)
        if ret != Model.Value and probs is None:
            if multitarget:
                max_card = max(len(c.values)
                               for c in self.domain.class_vars)
                probs = np.zeros(value.shape + (max_card,), float)
                for i in range(len(self.domain.class_vars)):
                    # Fix: dim=max_card keeps the encoded width equal to the
                    # slice width even when the largest class value is never
                    # predicted.
                    probs[:, i, :] = one_hot(value[:, i], dim=max_card)
            else:
                # Fix: same width guarantee for the single-target case.
                probs = one_hot(value, dim=len(self.domain.class_var.values))
            if ret == Model.ValueProbs:
                return fix_dim(value), fix_dim(probs)
            else:
                return fix_dim(probs)

        # Return what we need to
        if ret == Model.Probs:
            return fix_dim(probs)
        if isinstance(data, Instance) and not multitarget:
            value = Value(self.domain.class_var, value[0])
        if ret == Model.Value:
            return fix_dim(value)
        else:  # ret == Model.ValueProbs
            return fix_dim(value), fix_dim(probs)
Example #9
0
    def __call__(self, data, ret=Value):
        """Predict `data` and return values, probabilities, or both.

        Parameters
        ----------
        data : np.ndarray, scipy.sparse.csr_matrix, Table, Instance, list or tuple
            The instance(s) to predict.
        ret : int
            Model.Value, Model.Probs, or Model.ValueProbs.

        Raises
        ------
        ValueError
            For an invalid `ret` or when distributions are requested for
            continuous class variables.
        TypeError
            For unrecognized `data` or an unexpected prediction shape.
        """
        if not 0 <= ret <= 2:
            raise ValueError("invalid value of argument 'ret'")
        if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
            raise ValueError("cannot predict continuous distributions")

        # Call the predictor
        if isinstance(data, np.ndarray):
            prediction = self.predict(np.atleast_2d(data))
        elif isinstance(data, scipy.sparse.csr.csr_matrix):
            prediction = self.predict(data)
        elif isinstance(data, (Table, Instance)):
            if isinstance(data, Instance):
                data = Table(data.domain, [data])
            if data.domain != self.domain:
                data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        elif isinstance(data, (list, tuple)):
            if not isinstance(data[0], (list, tuple)):
                data = [data]
            data = Table(self.original_domain, data)
            data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        else:
            raise TypeError("Unrecognized argument (instance of '{}')".format(
                type(data).__name__))

        # Parse the result into value and probs
        multitarget = len(self.domain.class_vars) > 1
        if isinstance(prediction, tuple):
            value, probs = prediction
        elif prediction.ndim == 1 + multitarget:
            value, probs = prediction, None
        elif prediction.ndim == 2 + multitarget:
            value, probs = None, prediction
        else:
            # Fix: the dimensionality was previously passed as a second
            # TypeError argument and never interpolated into the message.
            raise TypeError("model returned a %i-dimensional array"
                            % prediction.ndim)

        # Ensure that we have what we need to return
        if ret != Model.Probs and value is None:
            value = np.argmax(probs, axis=-1)
        if ret != Model.Value and probs is None:
            if multitarget:
                max_card = max(len(c.values) for c in self.domain.class_vars)
                probs = np.zeros(value.shape + (max_card, ), float)
                for i in range(len(self.domain.class_vars)):
                    # Fix: dim=max_card keeps the encoded width equal to the
                    # slice width even when the largest class value is never
                    # predicted.
                    probs[:, i, :] = one_hot(value[:, i], dim=max_card)
            else:
                # Fix: same width guarantee for the single-target case.
                probs = one_hot(value, dim=len(self.domain.class_var.values))
            if ret == Model.ValueProbs:
                return value, probs
            else:
                return probs

        # Return what we need to
        if ret == Model.Probs:
            return probs
        if isinstance(data, Instance) and not multitarget:
            value = Value(self.domain.class_var, value[0])
        if ret == Model.Value:
            return value
        else:  # ret == Model.ValueProbs
            return value, probs
Example #10
0
    def __call__(self, data, ret=Value):
        """Predict `data` and return values, probabilities, or both.

        Parameters
        ----------
        data : np.ndarray, scipy.sparse.csr_matrix, Table, Instance, list or tuple
            The instance(s) to predict.
        ret : int
            Model.Value, Model.Probs, or Model.ValueProbs.

        Raises
        ------
        ValueError
            For an invalid `ret` or when distributions are requested for
            continuous class variables.
        TypeError
            For unrecognized `data` or an unexpected prediction shape.
        """
        if not 0 <= ret <= 2:
            raise ValueError("invalid value of argument 'ret'")
        if ret > 0 and any(v.is_continuous for v in self.domain.class_vars):
            raise ValueError("cannot predict continuous distributions")

        # Call the predictor
        if isinstance(data, np.ndarray):
            prediction = self.predict(np.atleast_2d(data))
        elif isinstance(data, scipy.sparse.csr.csr_matrix):
            prediction = self.predict(data)
        elif isinstance(data, Instance):
            if data.domain != self.domain:
                data = Instance(self.domain, data)
            data = Table(data.domain, [data])
            prediction = self.predict_storage(data)
        elif isinstance(data, Table):
            if data.domain != self.domain:
                data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        elif isinstance(data, (list, tuple)):
            if not isinstance(data[0], (list, tuple)):
                data = [data]
            data = Table(self.original_domain, data)
            data = data.transform(self.domain)
            prediction = self.predict_storage(data)
        else:
            raise TypeError("Unrecognized argument (instance of '{}')"
                            .format(type(data).__name__))

        # Parse the result into value and probs
        multitarget = len(self.domain.class_vars) > 1
        if isinstance(prediction, tuple):
            value, probs = prediction
        elif prediction.ndim == 1 + multitarget:
            value, probs = prediction, None
        elif prediction.ndim == 2 + multitarget:
            value, probs = None, prediction
        else:
            # Fix: the dimensionality was previously passed as a second
            # TypeError argument and never interpolated into the message.
            raise TypeError("model returned a %i-dimensional array"
                            % prediction.ndim)

        # Ensure that we have what we need to return
        if ret != Model.Probs and value is None:
            value = np.argmax(probs, axis=-1)
        if ret != Model.Value and probs is None:
            if multitarget:
                max_card = max(len(c.values)
                               for c in self.domain.class_vars)
                probs = np.zeros(value.shape + (max_card,), float)
                for i in range(len(self.domain.class_vars)):
                    # Fix: dim=max_card keeps the encoded width equal to the
                    # slice width even when the largest class value is never
                    # predicted.
                    probs[:, i, :] = one_hot(value[:, i], dim=max_card)
            else:
                # Fix: same width guarantee for the single-target case.
                probs = one_hot(value, dim=len(self.domain.class_var.values))
            if ret == Model.ValueProbs:
                return value, probs
            else:
                return probs

        # Return what we need to
        if ret == Model.Probs:
            return probs
        if isinstance(data, Instance) and not multitarget:
            value = Value(self.domain.class_var, value[0])
        if ret == Model.Value:
            return value
        else:  # ret == Model.ValueProbs
            return value, probs
Example #11
0
 def test_one_hot(self):
     """Integer encoding maps each value to a row of the identity matrix."""
     expected = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]]
     np.testing.assert_equal(one_hot([0, 1, 2, 1], int), expected)
Example #12
0
 def test_one_hot(self):
     """Integer encoding maps each value to a row of the identity matrix."""
     encoded = one_hot([0, 1, 2, 1], int)
     np.testing.assert_equal(
         encoded, [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0, 1, 0]])
 def test_one_hot(self):
     """Encoding works for both non-empty and empty integer inputs."""
     np.testing.assert_equal(
         one_hot([0, 1, 2, 1], int),
         np.eye(3, dtype=int)[[0, 1, 2, 1]],
     )
     # An empty input produces an empty (0, 0) integer matrix.
     np.testing.assert_equal(one_hot([], int), np.zeros((0, 0), dtype=int))
Example #14
0
 def test_dim_too_low(self):
     with self.assertRaises(ValueError):
         one_hot(self.values, dim=2)
Example #15
0
 def test_dtype(self):
     """The encoding defaults to float and honours an explicit dtype."""
     self.assertEqual(one_hot(self.values).dtype, float)
     self.assertEqual(one_hot(self.values, dtype=int).dtype, int)