예제 #1
0
def su_measure(X, y):
    """
    Score every column of X against y with a normalised entropy ratio.

    For column j the score is
    2 * (entropy(y) - conditional_entropy(X[:, j], y))
      / (entropy(X[:, j]) + entropy(y)).

    Parameters
    ----------
    X : numpy array, indexed as X[:, j] (one column per feature)
        The input samples.
    y : array-like
        The target values passed to ``entropy`` and ``conditional_entropy``.
        NOTE(review): presumably one label per row of X - confirm.

    Returns
    -------
    Score for each feature as a numpy array, shape (X.shape[1], )
    """
    target_entropy = entropy(y)
    scores = np.empty(X.shape[1])
    for position in range(scores.size):
        column = X[:, position]
        feature_entropy = entropy(column)
        # Entropy of y minus the helper's conditional term for this column.
        information = target_entropy - conditional_entropy(column, y)
        scores[position] = 2 * information / (feature_entropy + target_entropy)
    return scores
예제 #2
0
def information_gain(X, y):
    """
    Calculates the information gain of each feature with respect to y.

    For every column x of X the score is
    entropy(y) - conditional_entropy(x, y).

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The input samples.
    y : numpy array, shape (n_samples, )
        The classes for the samples.

    Returns
    -------
    Score for each feature as a numpy array, shape (n_features, )

    Examples
    --------
    >>> import numpy as np
    >>> from ITMO_FS.filters.univariate import information_gain
    >>> X = np.array([[1, 2, 3, 3, 1], [2, 2, 3, 3, 2], [1, 3, 3, 1, 3],
    ...               [3, 1, 3, 1, 4], [4, 4, 3, 1, 5]], dtype=int)
    >>> y = np.array([1, 2, 3, 4, 5], dtype=int)
    >>> scores = information_gain(X, y)
    >>> scores.shape
    (5,)
    """
    target_entropy = entropy(y)
    # axis=0 hands each column of X to conditional_entropy(column, y).
    per_feature_conditional = np.apply_along_axis(
        conditional_entropy, 0, X, y)
    return target_entropy - per_feature_conditional
예제 #3
0
파일: measures.py 프로젝트: hugoas/ITMO_FS
def su_measure(X, y):
    """
    SU (symmetric uncertainty) is a correlation measure between the
    features and the class, calculated via the formula
    SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y)),
    where I(X; Y) = H(X) - H(X|Y) is the mutual information.

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The input samples.
    y : numpy array, shape (n_samples, )
        The classes for the samples.

    Returns
    -------
    Score for each feature as a numpy array, shape (n_features, )

    Notes
    -----
    The score is undefined when both the feature and the target have zero
    entropy (the denominator is zero); no special handling is applied.

    See Also
    --------
    https://www.matec-conferences.org/articles/matecconf/pdf/2016/05/matecconf_iccma2016_06002.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from ITMO_FS.filters.univariate import su_measure
    >>> X = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
    ...               [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
    >>> y = np.array([1, 3, 2, 1, 2])
    >>> scores = su_measure(X, y)
    >>> print(np.round(scores, 4))
    [0.2869 0.1372 0.7919 0.4744 0.6713]
    """
    entropy_y = entropy(y)
    f_ratios = np.empty(X.shape[1])
    for index in range(X.shape[1]):
        column = X[:, index]
        entropy_x = entropy(column)
        # conditional_entropy(y, x) is taken to be H(X | Y).
        # NOTE(review): confirm the argument convention of the helper.
        cond_entropy = conditional_entropy(y, column)
        # Mutual information I(X; Y) = H(X) - H(X | Y). The numerator must be
        # twice this value, not H(X) + H(Y) - H(X | Y).
        mutual_information = entropy_x - cond_entropy
        f_ratios[index] = 2 * mutual_information / (entropy_x + entropy_y)
    return f_ratios
예제 #4
0
def _complementarity(x_i, x_j, y):
    """
    Combine the entropies of x_i, x_j and y by inclusion-exclusion:

    H(x_i) + H(x_j) + H(y)
    - H(x_i, x_j) - H(x_i, y) - H(x_j, y)
    + H(x_i, x_j, y)

    Joint terms are obtained by calling ``entropy`` on the list of zipped
    tuples of the corresponding sequences.
    """
    # Terms are accumulated strictly left to right, one at a time.
    total = entropy(x_i)
    total = total + entropy(x_j)
    total = total + entropy(y)
    total = total - entropy(list(zip(x_i, x_j)))
    total = total - entropy(list(zip(x_i, y)))
    total = total - entropy(list(zip(x_j, y)))
    total = total + entropy(list(zip(x_i, x_j, y)))
    return total
예제 #5
0
    def run(self, X, y):
        """
            Fits filter

            Builds a pairwise edge matrix over the features, prunes the
            lowest-weighted features until ``expected_size`` remain, then
            swaps removed features back in while that improves the weights.

            Parameters
            ----------
            X : numpy array, shape (n_samples, n_features)

            y : numpy array, shape (n_samples, )

            Returns
            ----------
            selected_features : numpy array
                selected pool of features (column indices of X)

        """

        self.n_features = X.shape[1]
        if self.expected_size is None:
            # Integer division: the target is compared with an array size, so
            # a fractional value could never be reached.
            self.expected_size = self.n_features // 3
        free_features = np.array([], dtype=int)
        self.selected_features = np.arange(self.n_features, dtype=int)
        self._vertices = np.ones(self.n_features)
        self._edges = np.zeros((self.n_features, self.n_features))
        for i in range(self.n_features):
            for j in range(self.n_features):
                entropy_pair = entropy(list(zip(X[:, i], X[:, j])))
                # Pairs with zero joint entropy keep an edge weight of 0.
                if entropy_pair != 0.:
                    self._edges[i][j] = _chained_information(
                        X[:, i], X[:, j], y) / entropy_pair

        # Pruning: repeatedly drop the currently selected feature with the
        # smallest weight. Weights are evaluated only for features still
        # selected, and the argmin position is mapped back to a feature id
        # before touching the vertex mask.
        while self.selected_features.size > self.expected_size:
            weights = [self.__count_weight(feature)
                       for feature in self.selected_features]
            position = int(np.argmin(weights))
            feature = self.selected_features[position]
            self._vertices[feature] = 0
            free_features = np.append(free_features, feature)
            self.selected_features = np.delete(self.selected_features,
                                               position)

        # Refinement: while some removed feature outweighs some selected one,
        # exchange the best removed feature with the worst selected feature.
        # NOTE(review): _vertices is left untouched here, as before; confirm
        # whether __count_weight expects the mask to follow the swaps.
        while free_features.size and self.selected_features.size:
            free_weights = [self.__count_weight(feature)
                            for feature in free_features]
            kept_weights = [self.__count_weight(feature)
                            for feature in self.selected_features]
            best_free = int(np.argmax(free_weights))
            worst_kept = int(np.argmin(kept_weights))
            if not free_weights[best_free] > kept_weights[worst_kept]:
                break
            # Move feature ids (not loop positions) between the two pools.
            incoming = free_features[best_free]
            outgoing = self.selected_features[worst_kept]
            free_features = np.append(
                np.delete(free_features, best_free), outgoing)
            self.selected_features = np.append(
                np.delete(self.selected_features, worst_kept), incoming)

        return self.selected_features
예제 #6
0
    def fit(self, X, y, feature_names=None):
        """
            Fits filter

            Builds a pairwise edge matrix over the features, prunes the
            lowest-weighted features until ``expected_size`` remain, then
            swaps removed features back in while that improves the weights.
            The result is stored in ``self.selected_features``.

            Parameters
            ----------
            X : array-like, shape (n_samples, n_features)
                The training input samples.
            y : array-like, shape (n_samples, )
                The target values.
            feature_names : list of strings, optional
                In case you want to define feature names

            Returns
            -------
            None
        """

        features = generate_features(X)
        X, y, feature_names = self._check_input(X, y, feature_names)
        self.feature_names = dict(zip(features, feature_names))
        self.n_features = X.shape[1]
        if self.expected_size is None:
            self.expected_size = self.n_features // 3
        # NOTE(review): the ids produced here are used below as indices into
        # _vertices and into `features`, so they are assumed to be integer
        # column positions - confirm against generate_features.
        self.selected_features = generate_features(X)
        free_features = np.array([], dtype=self.selected_features.dtype)
        self._vertices = np.ones(self.n_features)
        self._edges = np.zeros((self.n_features, self.n_features))
        for i in range(self.n_features):
            for j in range(self.n_features):
                entropy_pair = entropy(list(zip(X[:, i], X[:, j])))
                # Pairs with zero joint entropy keep an edge weight of 0.
                if entropy_pair != 0.:
                    self._edges[i][j] = _chained_information(
                        X[:, i], X[:, j], y) / entropy_pair

        # Pruning: repeatedly drop the currently selected feature with the
        # smallest weight. The argmin is a position inside selected_features
        # and is mapped back to the feature id before the vertex mask and the
        # free pool are updated.
        while self.selected_features.size > self.expected_size:
            weights = [self.__count_weight(feature)
                       for feature in self.selected_features]
            position = int(np.argmin(weights))
            feature = self.selected_features[position]
            self._vertices[feature] = 0
            free_features = np.append(free_features, feature)
            self.selected_features = np.delete(self.selected_features,
                                               position)

        # Refinement: while some removed feature outweighs some selected one,
        # exchange the best removed feature with the worst selected feature.
        # NOTE(review): _vertices is left untouched here, as before; confirm
        # whether __count_weight expects the mask to follow the swaps.
        while free_features.size and self.selected_features.size:
            free_weights = [self.__count_weight(feature)
                            for feature in free_features]
            kept_weights = [self.__count_weight(feature)
                            for feature in self.selected_features]
            best_free = int(np.argmax(free_weights))
            worst_kept = int(np.argmin(kept_weights))
            if not free_weights[best_free] > kept_weights[worst_kept]:
                break
            # Move feature ids (not loop positions) between the two pools.
            incoming = free_features[best_free]
            outgoing = self.selected_features[worst_kept]
            free_features = np.append(
                np.delete(free_features, best_free), outgoing)
            self.selected_features = np.append(
                np.delete(self.selected_features, worst_kept), incoming)
        # Translate the surviving ids back into the caller's feature labels.
        self.selected_features = features[self.selected_features]