Example 1
    def test_bincount(self):
        hist, n_nans = bincount([0., 1., np.nan, 3])
        self.assertEqual(n_nans, 1)
        np.testing.assert_equal(hist, [1, 1, 0, 1])

        hist, n_nans = bincount([0., 1., 3], max_val=3)
        self.assertEqual(n_nans, 0)
        np.testing.assert_equal(hist, [1, 1, 0, 1])
Example 2
    def test_all_zeros_or_nans(self, array):
        """Sparse arrays with only nans with no explicit zeros will have no non
        zero indices. Check that this counts the zeros properly."""
        x = array([np.nan] * 5 + [0] * 5)
        expected = [5]

        np.testing.assert_equal(bincount(x)[0], expected)
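
For context, what this test exercises can be shown in isolation: implicit zeros in a sparse row are never stored, so they do not appear among the non-zero indices, yet bincount must still count them. A minimal sketch, assuming the test's sparse fixture is scipy.sparse.csr_matrix and that bincount is importable from Orange.statistics.util:

import numpy as np
import scipy.sparse as sp
from Orange.statistics.util import bincount  # assumed import path

# Five NaNs and five implicit (unstored) zeros in a sparse row; the zeros
# never appear among the stored entries, but the first histogram bin must
# still report 5, as the test above asserts.
x = sp.csr_matrix(np.array([np.nan] * 5 + [0.] * 5))
hist, n_nans = bincount(x)
print(hist)     # the single bin holds the 5 implicit zeros
print(n_nans)   # NaNs are tallied separately (5 here)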
Example 3
    def _get_bin_distributions(self, bin_indices):
        """Compute the distribution of instances within bins.

        Parameters
        ----------
        bin_indices : np.ndarray
            An array with the same shape as `x`, containing the bin index of
            each instance.

        Returns
        -------
        np.ndarray
            A 2d array; the first dimension represents different bins, the
            second - the counts of different target values.

        """
        if self.target_var and self.target_var.is_discrete:
            y = self.y
            # TODO This probably also isn't the best handling of sparse data...
            if sp.issparse(y):
                y = np.squeeze(np.array(y.todense()))

            # Since y can contain missing values, we need to filter them out as
            # well as their corresponding `x` values
            y_nan_mask = np.isnan(y)
            y, bin_indices = y[~y_nan_mask], bin_indices[~y_nan_mask]

            y = one_hot(y)
            # In the event that y does not take up all the values and the
            # largest discrete value does not appear at all, one hot encoding
            # will produce too few columns. This causes problems, so we need to
            # pad y with zeros to properly compute the distribution
            if y.shape[1] != len(self.target_var.values):
                n_missing_columns = len(self.target_var.values) - y.shape[1]
                y = np.hstack((y, np.zeros((y.shape[0], n_missing_columns))))

            bins = np.arange(self.n_bins)[:, np.newaxis]
            mask = bin_indices == bins
            distributions = np.zeros((self.n_bins, y.shape[1]))
            for bin_idx in range(self.n_bins):
                distributions[bin_idx] = y[mask[bin_idx]].sum(axis=0)
        else:
            distributions, _ = ut.bincount(bin_indices.astype(np.int64))
            # To keep things consistent across different variable types, we
            # want to return a 2d array where the first dim represent different
            # bins, and the second the distributions.
            distributions = distributions[:, np.newaxis]

        return distributions
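
The padding step in the discrete branch above is the subtle part: when the largest category code never occurs in y, a naive one-hot encoding comes out with too few columns. A self-contained sketch of that situation; the helper name one_hot_padded and the np.eye-based encoding are illustrative, not Orange's one_hot:

import numpy as np

def one_hot_padded(y, n_values):
    """Illustrative helper: one-hot encode integer codes in `y` and pad with
    zero columns so the result always has `n_values` columns."""
    y = np.asarray(y, dtype=int)
    encoded = np.eye(y.max() + 1)[y]          # columns only up to max(y)
    if encoded.shape[1] < n_values:           # e.g. the last category never occurs
        pad = np.zeros((encoded.shape[0], n_values - encoded.shape[1]))
        encoded = np.hstack((encoded, pad))
    return encoded

# The target has 3 categories, but code 2 never appears in this subset,
# so the third column is all zeros instead of being missing entirely.
print(one_hot_padded([0, 1, 1, 0], n_values=3))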
Example 4
def _discrete_counts():
    """
    Generate pairs similar to _string_counts, except that the arrays
    contain bin counts for the attribute's values matching the pattern.
    """
    attr_vals = np.array(attr.values)
    attr_vals = _lower_if_needed(attr_vals)
    bins = bincount(data, max_val=len(attr.values) - 1)[0]
    remaining = np.array(bins)
    for _, pattern in self.active_rules:
        matching = _matcher(attr_vals, pattern)
        yield remaining[matching], bins[matching]
        remaining[matching] = 0
        if not np.any(remaining):
            break
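
The generator above depends on surrounding widget state (self.active_rules, _matcher, _lower_if_needed), but the underlying claim-and-zero-out pattern can be shown in isolation. In this sketch the substring matcher and the rule list are stand-ins, not Orange's:

import numpy as np

values = np.array(["apple", "apricot", "banana", "cherry"])
bins = np.array([10, 5, 7, 3])           # counts per value, e.g. from bincount
remaining = bins.copy()

for pattern in ["ap", "an"]:             # stand-in for self.active_rules
    matching = np.char.find(values, pattern) >= 0
    # Each rule sees only the counts that no earlier rule has claimed.
    print(pattern, remaining[matching], bins[matching])
    remaining[matching] = 0              # claimed bins are zeroed out
    if not np.any(remaining):
        break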
Example 5
    def get_discrete_stats(self, column, n_bins):
        """
        Return tables used to compute distances when discrete values are missing.

        Args:
            column (np.ndarray): column data
            n_bins (int): maximal number of bins in the data set

        Returns:
            dist_missing_disc (np.ndarray): `dist_missing_disc[value]` is
                1 - probability of `value`; it is used as the distance added
                for the given `value` in `column` when the value in the
                other row is missing
            dist_missing2_disc (float): the distance between two missing
                values in this column
        """
        dist = util.bincount(column, minlength=n_bins)[0]
        dist /= max(1, sum(dist))
        return 1 - dist, 1 - np.sum(dist ** 2)
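
A worked sketch of the two returned tables, following the probability model described in the docstring; plain np.bincount stands in for util.bincount here:

import numpy as np

column = np.array([0., 0., 1., 2., 2., 2.])
counts = np.bincount(column.astype(int), minlength=3).astype(float)
p = counts / max(1, counts.sum())          # [1/3, 1/6, 1/2]

dist_missing_disc = 1 - p                  # distance added when the other value is missing
dist_missing2_disc = 1 - np.sum(p ** 2)    # distance between two missing values
print(dist_missing_disc)                   # [0.667, 0.833, 0.5]
print(dist_missing2_disc)                  # ~0.611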
Example 6
    def test_adds_empty_bins(self, array):
        x = array([0, 1, 3, 5])
        expected = [1, 1, 0, 1, 0, 1]

        np.testing.assert_equal(bincount(x)[0], expected)
Example 7
    def test_count_nans(self, array):
        x = array([0, 0, 1, 2, np.nan, 2])
        expected = 1

        np.testing.assert_equal(bincount(x)[1], expected)
Example 8
def _categorical_entropy(x):
    """Compute the entropy of a dense/sparse matrix, column-wise. Assuming
    categorical values."""
    p = [ut.bincount(row)[0] for row in x.T]
    p = [pk / np.sum(pk) for pk in p]
    return np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64)
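
The same computation on a concrete dense matrix, with np.bincount standing in for ut.bincount (which, as the tests above show, additionally reports NaN counts and accepts sparse input):

import numpy as np
from scipy import stats as ss

x = np.array([[0, 1],
              [0, 1],
              [1, 0],
              [2, 0]], dtype=float)
p = [np.bincount(col.astype(int)) for col in x.T]
p = [pk / pk.sum() for pk in p]
print(np.fromiter((ss.entropy(pk) for pk in p), dtype=np.float64))
# first column: counts [2, 1, 1] -> entropy ~1.04 nats; second: [2, 2] -> ~0.69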
Example 9
    def get_column(self,
                   attr,
                   filter_valid=True,
                   max_categories=None,
                   return_labels=False):
        """
        Retrieve the data from the given column in the data table

        The method:
        - densifies sparse data,
        - converts arrays with dtype object to floats if the attribute is
          actually primitive,
        - filters out invalid data (if `filter_valid` is `True`),
        - merges infrequent (discrete) values into a single value
          (if `max_categories` is set).

        The latter feature is used for shapes and labels, where only a
        specified number of different values is shown, and others are
        merged into category 'Other'. In this case, the method may return
        either the data (e.g. color indices, shape indices) or the list
        of retained values, followed by `['Other']`.

        Args:
            attr (:obj:~Orange.data.Variable): the column to extract
            filter_valid (bool): filter out invalid data (default: `True`)
            max_categories (int): merge infrequent values (default: `None`);
                ignored for non-discrete attributes
            return_labels (bool): return a list of labels instead of data
                (default: `False`)

        Returns:
            (np.ndarray): (valid) data from the column, or a list of labels
        """
        if attr is None:
            return None

        needs_merging = attr.is_discrete \
                        and max_categories is not None \
                        and len(attr.values) >= max_categories
        if return_labels and not needs_merging:
            assert attr.is_discrete
            return attr.values

        all_data = self.data.get_column_view(attr)[0]
        if all_data.dtype == object and attr.is_primitive():
            all_data = all_data.astype(float)
        if filter_valid and self.valid_data is not None:
            all_data = all_data[self.valid_data]
        if not needs_merging:
            return all_data

        dist = bincount(all_data, max_val=len(attr.values) - 1)[0]
        infrequent = np.zeros(len(attr.values), dtype=bool)
        infrequent[np.argsort(dist)[:-(max_categories - 1)]] = True
        if return_labels:
            return [
                value
                for value, infreq in zip(attr.values, infrequent) if not infreq
            ] + ["Other"]
        else:
            result = all_data.copy()
            freq_vals = [i for i, f in enumerate(infrequent) if not f]
            for i, infreq in enumerate(infrequent):
                if infreq:
                    result[all_data == i] = max_categories - 1
                else:
                    result[all_data == i] = freq_vals.index(i)
            return result
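
A minimal sketch of the remapping performed at the end of get_column: the max_categories - 1 most frequent codes keep re-indexed slots and every other code collapses into a trailing "Other" index. The function name and the use of plain np.bincount are illustrative only:

import numpy as np

def merge_infrequent(codes, n_values, max_categories):
    dist = np.bincount(codes, minlength=n_values)
    infrequent = np.zeros(n_values, dtype=bool)
    # Mark everything except the (max_categories - 1) most frequent codes.
    infrequent[np.argsort(dist)[:-(max_categories - 1)]] = True

    freq_vals = [i for i, f in enumerate(infrequent) if not f]
    result = codes.copy()
    for i, infreq in enumerate(infrequent):
        result[codes == i] = max_categories - 1 if infreq else freq_vals.index(i)
    return result

codes = np.array([0, 0, 0, 1, 2, 2, 3])
print(merge_infrequent(codes, n_values=4, max_categories=3))
# -> [0 0 0 2 1 1 2]: codes 0 and 2 keep slots, 1 and 3 collapse to "Other" (index 2)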
Example 10
    def test_weights_with_transposed_x(self, array):
        x = array([0, 0, 1, 1, 2, 2, 3, 3]).T
        w = np.array([1, 2, 0, 0, 1, 1, 0, 1])

        expected = [3, 0, 2, 1]
        np.testing.assert_equal(bincount(x, w)[0], expected)
Example 11
    def test_minlength_adds_empty_bins(self, array):
        x = array([1, 1, 1, 2, 3, 2])
        minlength = 5
        expected = [0, 3, 2, 1, 0]

        np.testing.assert_equal(bincount(x, minlength=minlength)[0], expected)
Example 12
    def test_weights(self, array):
        x = array([0, 0, 1, 1, 2, 2, 3, 3])
        w = np.array([1, 2, 0, 0, 1, 1, 0, 1])

        expected = [3, 0, 2, 1]
        np.testing.assert_equal(bincount(x, w)[0], expected)
Example 13
    def test_maxval_doesnt_truncate_values_when_too_small(self, array):
        x = array([1, 1, 1, 2, 3, 2])
        max_val = 1
        expected = [0, 3, 2, 1]

        np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected)
Example 14
    def test_maxval_adds_empty_bins(self, array):
        x = array([1, 1, 1, 2, 3, 2])
        max_val = 5
        expected = [0, 3, 2, 1, 0, 0]

        np.testing.assert_equal(bincount(x, max_val=max_val)[0], expected)
Example 15
    def test_all_nans(self, array):
        x = array([np.nan] * 5)
        expected = []

        np.testing.assert_equal(bincount(x)[0], expected)
Example 16
    def test_weights_with_nans(self, array):
        x = array([0, 0, 1, 1, np.nan, 2, np.nan, 3])
        w = np.array([1, 2, 0, 0, 1, 1, 0, 1])

        expected = [3, 0, 1, 1]
        np.testing.assert_equal(bincount(x, w)[0], expected)
Example 17
def majority(x):
    if x.shape[0] == 0:
        return np.nan
    counts = bincount(x)[0]
    return np.argmax(counts) if counts.shape[0] else np.nan
Example 18
    def get_column(self, attr, filter_valid=True,
                   merge_infrequent=False, return_labels=False):
        """
        Retrieve the data from the given column in the data table

        The method:
        - densifies sparse data,
        - converts arrays with dtype object to floats if the attribute is
          actually primitive,
        - filters out invalid data (if `filter_valid` is `True`),
        - merges infrequent (discrete) values into a single value
          (if `merge_infrequent` is `True`).

        The latter feature is used for shapes and labels, where only a set
        number (`MAX_CATEGORIES`) of different values is shown, and others are
        merged into category 'Other'. In this case, the method may return
        either the data (e.g. color indices, shape indices) or the list
        of retained values, followed by `['Other']`.

        Args:
            attr (:obj:~Orange.data.Variable): the column to extract
            filter_valid (bool): filter out invalid data (default: `True`)
            merge_infrequent (bool): merge infrequent values (default: `False`);
                ignored for non-discrete attributes
            return_labels (bool): return a list of labels instead of data
                (default: `False`)

        Returns:
            (np.ndarray): (valid) data from the column, or a list of labels
        """
        if attr is None:
            return None

        needs_merging = \
            attr.is_discrete \
            and merge_infrequent and len(attr.values) >= MAX_CATEGORIES
        if return_labels and not needs_merging:
            assert attr.is_discrete
            return attr.values

        all_data = self.data.get_column_view(attr)[0]
        if all_data.dtype == object and attr.is_primitive():
            all_data = all_data.astype(float)
        if filter_valid and self.valid_data is not None:
            all_data = all_data[self.valid_data]
        if not needs_merging:
            return all_data

        dist = bincount(all_data, max_val=len(attr.values) - 1)[0]
        infrequent = np.zeros(len(attr.values), dtype=bool)
        infrequent[np.argsort(dist)[:-(MAX_CATEGORIES-1)]] = True
        if return_labels:
            return [value for value, infreq in zip(attr.values, infrequent)
                    if not infreq] + ["Other"]
        else:
            result = all_data.copy()
            freq_vals = [i for i, f in enumerate(infrequent) if not f]
            for i, infreq in enumerate(infrequent):
                if infreq:
                    result[all_data == i] = MAX_CATEGORIES - 1
                else:
                    result[all_data == i] = freq_vals.index(i)
            return result
Example 19
    def test_count_nans_objectarray(self):
        x = np.array([0, 0, 1, 2, np.nan, 2], dtype=object)
        expected = 1

        np.testing.assert_equal(bincount(x)[1], expected)