def su_measure(X, y):
    """Symmetrical uncertainty between each feature and the class labels.

    For every column x of X the score is
    SU(x, y) = 2 * (H(y) - H(y|x)) / (H(x) + H(y)).

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The input samples.
    y : numpy array, shape (n_samples, )
        The classes for the samples.

    Returns
    -------
    Score for each feature as a numpy array, shape (n_features, )
    """
    entropy_of_labels = entropy(y)

    def _score(feature):
        # H(feature) and H(y | feature) for a single column.
        feature_entropy = entropy(feature)
        cond = conditional_entropy(feature, y)
        return 2 * (entropy_of_labels - cond) / (feature_entropy + entropy_of_labels)

    return np.array([_score(X[:, col]) for col in range(X.shape[1])])
def information_gain(X, y):
    """
    Calculates mutual information for each feature by formula,
    I(X,Y) = H(X) - H(X|Y)

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The input samples.
    y : numpy array, shape (n_samples, )
        The classes for the samples.

    Returns
    -------
    Score for each feature as a numpy array, shape (n_features, )

    See Also
    --------

    Examples
    --------
    >>> import sklearn.datasets as datasets
    >>> from ITMO_FS.filters.univariate import information_gain
    >>> X = np.array([[1, 2, 3, 3, 1],[2, 2, 3, 3, 2], [1, 3, 3, 1, 3],[3, 1, 3, 1, 4],[4, 4, 3, 1, 5]], dtype = np.integer)
    >>> y = np.array([1, 2, 3, 4, 5], dtype=np.integer)
    >>> scores = information_gain(X, y)
    >>> print(scores)
    """
    # H(y), constant across features.  (The original named this `entropy_x`,
    # which was misleading: it is the entropy of the class labels.)
    class_entropy = entropy(y)
    # One H(x_i | y) per column, computed column-wise over X.
    conditional_entropies = np.apply_along_axis(conditional_entropy, 0, X, y)
    return class_entropy - conditional_entropies
def su_measure(X, y):
    """
    SU is a correlation measure between the features and the class
    calculated, via formula SU(X,Y) = 2 * I(X|Y) / (H(X) + H(Y))

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The input samples.
    y : numpy array, shape (n_samples, )
        The classes for the samples.

    Returns
    -------
    Score for each feature as a numpy array, shape (n_features, )

    See Also
    --------
    https://www.matec-conferences.org/articles/matecconf/pdf/2016/05/matecconf_iccma2016_06002.pdf

    Examples
    --------
    >>> import sklearn.datasets as datasets
    >>> from ITMO_FS.filters.univariate import su_measure
    >>> X = np.array([[3, 3, 3, 2, 2], [3, 3, 1, 2, 3], [1, 3, 5, 1, 1],
    ...               [3, 1, 4, 3, 1], [3, 1, 2, 3, 1]])
    >>> y = np.array([1, 3, 2, 1, 2])
    >>> scores = su_measure(X, y)
    >>> print(scores)
    """
    entropy_y = entropy(y)
    f_ratios = np.empty(X.shape[1])
    for index in range(X.shape[1]):
        entropy_x = entropy(X[:, index])
        # conditional_entropy(y, x) = H(x|y), so mutual information is
        # I(x, y) = H(x) - H(x|y) = entropy_x - cond_entropy.
        cond_entropy = conditional_entropy(y, X[:, index])
        # BUG FIX: previously this computed
        # (entropy_x + entropy_y - cond_entropy) / (entropy_x + entropy_y),
        # i.e. (I + H(y)) / (H(x) + H(y)), which is not the symmetrical
        # uncertainty 2 * I / (H(x) + H(y)) documented above.
        f_ratios[index] = 2 * (entropy_x - cond_entropy) / (entropy_x + entropy_y)
    return f_ratios
def _complementarity(x_i, x_j, y):
    """Three-way interaction information of two features and the class.

    Computed from the inclusion-exclusion expansion over the single, pairwise
    and triple joint entropies:
    H(i) + H(j) + H(y) - H(i,j) - H(i,y) - H(j,y) + H(i,j,y).
    """
    # Marginal entropies.
    h_i = entropy(x_i)
    h_j = entropy(x_j)
    h_y = entropy(y)
    # Pairwise joint entropies (joint symbols built by zipping the sequences).
    h_ij = entropy(list(zip(x_i, x_j)))
    h_iy = entropy(list(zip(x_i, y)))
    h_jy = entropy(list(zip(x_j, y)))
    # Triple joint entropy.
    h_ijy = entropy(list(zip(x_i, x_j, y)))
    return h_i + h_j + h_y - h_ij - h_iy - h_jy + h_ijy
def run(self, X, y):
    """
    Fits filter

    Parameters
    ----------
    X : numpy array, shape (n_samples, n_features)
        The training input samples.
    y : numpy array, shape (n_samples, )
        The target values.

    Returns
    ----------
    selected_features : numpy array
        selected pool of features
    """
    self.n_features = X.shape[1]
    # Default target size: a third of all features.
    # NOTE(review): true division makes expected_size a float; the loop below
    # compares it with an int `.size` — works only when n_features % 3 == 0.
    # The fit() variant of this class uses `// 3`. Confirm which is intended.
    if self.expected_size is None:
        self.expected_size = self.n_features / 3
    free_features = np.array([], dtype=np.integer)
    # Start with every feature selected (feature ids 0..n_features-1).
    self.selected_features = np.arange(self.n_features, dtype=np.integer)
    # Graph representation: one vertex per feature, edge weights below.
    self._vertices = np.ones(self.n_features)
    self._edges = np.zeros((self.n_features, self.n_features))
    # Edge (i, j) = chained information of features i, j with y, normalized
    # by the joint entropy of the pair; left at 0 when the pair entropy is 0.
    for i in range(self.n_features):
        for j in range(self.n_features):
            entropy_pair = entropy(list(zip(X[:, i], X[:, j])))
            if entropy_pair != 0.:
                self._edges[i][j] = _chained_information(
                    X[:, i], X[:, j], y) / entropy_pair
    # Greedily remove the lowest-weight vertex until the target size remains.
    while self.selected_features.size != self.expected_size:
        # Weight of every feature id 0..n_features-1 (including ones already
        # removed; presumably __count_weight uses self._vertices to mask them
        # out — TODO confirm).
        min_index = np.argmin(
            np.vectorize(lambda x: self.__count_weight(x))(np.arange(
                self.n_features)))
        self._vertices[min_index] = 0
        free_features = np.append(free_features, min_index)
        # NOTE(review): np.delete removes by POSITION, while min_index is a
        # feature id; after the first removal positions and ids diverge, so
        # this may delete the wrong entry. Verify against the original repo.
        self.selected_features = np.delete(self.selected_features, min_index)
    # Local search: keep swapping a free feature with a selected one while
    # that raises the weight difference.
    change = True
    while change:
        change = False
        swap_index = (-1, -1)
        max_difference = 0
        # Find the (free, selected) pair with the largest positive weight gap.
        for i in range(len(free_features)):
            for j in range(len(self.selected_features)):
                temp_difference = self.__count_weight(
                    free_features[i]) - self.__count_weight(
                    self.selected_features[j])
                if temp_difference > max_difference:
                    max_difference = temp_difference
                    swap_index = (i, j)
        if max_difference > 0:
            change = True
            # NOTE(review): swap_index holds POSITIONS (i, j), yet the
            # positions themselves are appended as if they were feature ids
            # (one would expect free_features[i] / selected_features[j]).
            # Looks like a bug — confirm before relying on the swap phase.
            new_selected, new_free = swap_index
            free_features = np.append(free_features, new_free)
            free_features = np.delete(free_features, new_selected)
            self.selected_features = np.append(self.selected_features,
                                               new_selected)
            self.selected_features = np.delete(self.selected_features,
                                               new_free)
    return self.selected_features
def fit(self, X, y, feature_names=None):
    """
    Fits filter

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The training input samples.
    y : array-like, shape (n_samples, )
        The target values.
    feature_names : list of strings, optional
        In case you want to define feature names

    Returns
    -------
    None
    """
    features = generate_features(X)
    X, y, feature_names = self._check_input(X, y, feature_names)
    # Map generated feature indices to the (possibly user-supplied) names.
    self.feature_names = dict(zip(features, feature_names))
    self.n_features = X.shape[1]
    # Default target size: a third of all features (integer division here,
    # unlike the run() variant which uses true division).
    if self.expected_size is None:
        self.expected_size = self.n_features // 3
    free_features = np.array([], dtype='object')
    self.selected_features = generate_features(X)
    # Graph representation: one vertex per feature, edge weights below.
    self._vertices = np.ones(self.n_features)
    self._edges = np.zeros((self.n_features, self.n_features))
    # Edge (i, j) = chained information of features i, j with y, normalized
    # by the joint entropy of the pair; left at 0 when the pair entropy is 0.
    for i in range(self.n_features):
        for j in range(self.n_features):
            entropy_pair = entropy(list(zip(X[:, i], X[:, j])))
            if entropy_pair != 0.:
                self._edges[i][j] = _chained_information(
                    X[:, i], X[:, j], y) / entropy_pair
    # TODO apply vectorize to selected_features and not arange(n_features)?
    # Greedily remove the lowest-weight vertex until the target size remains.
    while self.selected_features.size != self.expected_size:
        # NOTE(review): argmin over selected_features yields a POSITION in
        # that (shrinking) array, but it is then used as a vertex id for
        # self._vertices and appended to free_features as a feature id.
        # After the first removal these diverge — confirm against the repo.
        min_index = np.argmin(
            np.vectorize(lambda x: self.__count_weight(x))(
                self.selected_features))
        self._vertices[min_index] = 0
        free_features = np.append(free_features, min_index)
        self.selected_features = np.delete(self.selected_features, min_index)
    # Local search: keep swapping a free feature with a selected one while
    # that raises the weight difference.
    change = True
    while change:
        change = False
        swap_index = (-1, -1)
        max_difference = 0
        # Find the (free, selected) pair with the largest positive weight gap.
        for i in range(len(free_features)):
            for j in range(len(self.selected_features)):
                temp_difference = self.__count_weight(
                    free_features[i]) - self.__count_weight(
                    self.selected_features[j])
                if temp_difference > max_difference:
                    max_difference = temp_difference
                    swap_index = (i, j)
        if max_difference > 0:
            change = True
            # NOTE(review): swap_index holds POSITIONS (i, j), yet the
            # positions themselves are appended as if they were feature ids
            # (one would expect free_features[i] / selected_features[j]).
            # Looks like a bug — confirm before relying on the swap phase.
            new_selected, new_free = swap_index
            free_features = np.append(free_features, new_free)
            free_features = np.delete(free_features, new_selected)
            self.selected_features = np.append(self.selected_features,
                                               new_selected)
            self.selected_features = np.delete(self.selected_features,
                                               new_free)
    # Translate the surviving indices back through the generated feature set.
    self.selected_features = features[self.selected_features]