def _encode_categorical(self, X, y): """TODO""" # compute the median of the standard deviation of the minority class target_stats = Counter(y) class_minority = min(target_stats, key=target_stats.get) # Separate categorical features from continuous features X_continuous = X[:, self.continuous_features_] X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"]) X_categorical = X[:, self.categorical_features_].copy() X_minority = X_continuous[np.flatnonzero(y == class_minority)] if sparse.issparse(X): if X.format == "csr": _, var = csr_mean_variance_axis0(X_minority) else: _, var = csc_mean_variance_axis0(X_minority) else: var = X_minority.var(axis=0) self.median_std_ = np.median(np.sqrt(var)) if X_continuous.dtype.name != "object": dtype_ohe = X_continuous.dtype else: dtype_ohe = np.float64 self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe) # the input of the OneHotEncoder needs to be dense X_ohe = self.ohe_.fit_transform( X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical ) # we can replace the 1 entries of the categorical features with the # median of the standard deviation. It will ensure that whenever # distance is computed between 2 samples, the difference will be equal # to the median of the standard deviation as in the original paper. # In the edge case where the median of the std is equal to 0, the 1s # entries will be also nullified. In this case, we store the original # categorical encoding which will be later used for inversing the OHE if math.isclose(self.median_std_, 0): self._X_categorical_encoded = X_ohe.toarray() X_ohe.data = np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2 if self._issparse: X_encoded = np.hstack([X_continuous.toarray(), X_ohe.toarray()]) else: X_encoded = np.hstack([X_continuous, X_ohe.toarray()]) return X_encoded
def _fit_resample(self, X, y):
    """Resample mixed continuous/categorical data with SMOTE-NC.

    Pipeline: (1) compute the median of the minority-class per-feature
    standard deviations; (2) one-hot encode the categorical columns and
    rescale their entries to ``median_std_ / 2``; (3) delegate the actual
    over-sampling to the parent SMOTE on the encoded matrix; (4) invert
    the one-hot encoding on the resampled output and restore the original
    column ordering.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input samples.
    y : array-like
        Target labels.

    Returns
    -------
    X_resampled, y_resampled : resampled data and targets, with columns
        back in the caller's original order.
    """
    self.n_features_ = X.shape[1]
    self._validate_estimator()

    # compute the median of the standard deviation of the minority class
    target_stats = Counter(y)
    # minority class = label with the fewest samples
    class_minority = min(target_stats, key=target_stats.get)

    X_continuous = X[:, self.continuous_features_]
    X_continuous = check_array(X_continuous, accept_sparse=['csr', 'csc'])
    # continuous rows belonging to the minority class only
    X_minority = safe_indexing(X_continuous, np.flatnonzero(y == class_minority))

    if sparse.issparse(X):
        # NOTE(review): format checked on X but variance taken on
        # X_minority (a slice of X_continuous) — assumes the slice keeps
        # the same sparse format; confirm.
        if X.format == 'csr':
            _, var = csr_mean_variance_axis0(X_minority)
        else:
            _, var = csc_mean_variance_axis0(X_minority)
    else:
        var = X_minority.var(axis=0)
    # median of the per-feature standard deviations of the minority class
    self.median_std_ = np.median(np.sqrt(var))

    X_categorical = X[:, self.categorical_features_]
    # reuse the continuous dtype for the OHE output, unless it is object
    if X_continuous.dtype.name != 'object':
        dtype_ohe = X_continuous.dtype
    else:
        dtype_ohe = np.float64

    self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore', dtype=dtype_ohe)
    # the input of the OneHotEncoder needs to be dense
    X_ohe = self.ohe_.fit_transform(
        X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical)

    # we can replace the 1 entries of the categorical features with the
    # median of the standard deviation. It will ensure that whenever
    # distance is computed between 2 samples, the difference will be equal
    # to the median of the standard deviation as in the original paper.
    X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2)
    # stack as [continuous | one-hot categorical] and let SMOTE resample
    X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

    X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
        X_encoded, y)

    # reverse the encoding of the categorical features
    X_res_cat = X_resampled[:, self.continuous_features_.size:]
    # restore the 1s expected by OneHotEncoder.inverse_transform
    # (they were scaled to median_std_ / 2 above)
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

    if sparse.issparse(X):
        X_resampled = sparse.hstack(
            (X_resampled[:, :self.continuous_features_.size],
             X_res_cat_dec), format='csr'
        )
    else:
        X_resampled = np.hstack(
            (X_resampled[:, :self.continuous_features_.size].toarray(),
             X_res_cat_dec)
        )

    # argsort of the concatenated column indices gives the permutation
    # that maps [continuous | categorical] back to the original order
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_))
    )
    if sparse.issparse(X_resampled):
        # the matrix is supposed to be in the CSR format after the stacking
        # remap each stored column index through the inverse permutation,
        # in place, instead of materialising a column-sliced copy
        col_indices = X_resampled.indices.copy()
        for idx, col_idx in enumerate(indices_reordered):
            mask = X_resampled.indices == col_idx
            col_indices[mask] = idx
        X_resampled.indices = col_indices
    else:
        X_resampled = X_resampled[:, indices_reordered]
    return X_resampled, y_resampled
def _fit_resample(self, X, y):
    """Resample mixed continuous/categorical data with SMOTE-NC.

    Pipeline: (1) compute the median of the minority-class per-feature
    standard deviations; (2) one-hot encode the categorical columns and
    rescale their entries to ``median_std_ / 2``; (3) delegate the actual
    over-sampling to the parent SMOTE on the encoded matrix; (4) invert
    the one-hot encoding on the resampled output and restore the original
    column ordering.

    Parameters
    ----------
    X : array-like or sparse matrix
        Input samples.
    y : array-like
        Target labels.

    Returns
    -------
    X_resampled, y_resampled : resampled data and targets, with columns
        back in the caller's original order.
    """
    # FIXME: to be removed in 0.12
    if self.n_jobs is not None:
        warnings.warn(
            "The parameter `n_jobs` has been deprecated in 0.10 and will be "
            "removed in 0.12. You can pass an nearest neighbors estimator where "
            "`n_jobs` is already set instead.",
            FutureWarning,
        )
    self.n_features_ = X.shape[1]
    self._validate_estimator()

    # compute the median of the standard deviation of the minority class
    target_stats = Counter(y)
    # minority class = label with the fewest samples
    class_minority = min(target_stats, key=target_stats.get)

    X_continuous = X[:, self.continuous_features_]
    X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
    # continuous rows belonging to the minority class only
    X_minority = _safe_indexing(X_continuous, np.flatnonzero(y == class_minority))

    if sparse.issparse(X):
        # NOTE(review): format checked on X but variance taken on
        # X_minority (a slice of X_continuous) — assumes the slice keeps
        # the same sparse format; confirm.
        if X.format == "csr":
            _, var = csr_mean_variance_axis0(X_minority)
        else:
            _, var = csc_mean_variance_axis0(X_minority)
    else:
        var = X_minority.var(axis=0)
    # median of the per-feature standard deviations of the minority class
    self.median_std_ = np.median(np.sqrt(var))

    X_categorical = X[:, self.categorical_features_]
    # reuse the continuous dtype for the OHE output, unless it is object
    if X_continuous.dtype.name != "object":
        dtype_ohe = X_continuous.dtype
    else:
        dtype_ohe = np.float64

    self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe)
    # the input of the OneHotEncoder needs to be dense
    X_ohe = self.ohe_.fit_transform(
        X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical)

    # we can replace the 1 entries of the categorical features with the
    # median of the standard deviation. It will ensure that whenever
    # distance is computed between 2 samples, the difference will be equal
    # to the median of the standard deviation as in the original paper.
    # In the edge case where the median of the std is equal to 0, the 1s
    # entries will be also nullified. In this case, we store the original
    # categorical encoding which will be later used for inversing the OHE
    if math.isclose(self.median_std_, 0):
        self._X_categorical_minority_encoded = _safe_indexing(
            X_ohe.toarray(), np.flatnonzero(y == class_minority))

    X_ohe.data = np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2
    # stack as [continuous | one-hot categorical] and let SMOTE resample
    X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

    X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

    # reverse the encoding of the categorical features
    X_res_cat = X_resampled[:, self.continuous_features_.size:]
    # restore the 1s expected by OneHotEncoder.inverse_transform
    # (they were scaled to median_std_ / 2 above)
    X_res_cat.data = np.ones_like(X_res_cat.data)
    X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

    if sparse.issparse(X):
        X_resampled = sparse.hstack(
            (
                X_resampled[:, :self.continuous_features_.size],
                X_res_cat_dec,
            ),
            format="csr",
        )
    else:
        X_resampled = np.hstack((
            X_resampled[:, :self.continuous_features_.size].toarray(),
            X_res_cat_dec,
        ))

    # argsort of the concatenated column indices gives the permutation
    # that maps [continuous | categorical] back to the original order
    indices_reordered = np.argsort(
        np.hstack((self.continuous_features_, self.categorical_features_)))
    if sparse.issparse(X_resampled):
        # the matrix is supposed to be in the CSR format after the stacking
        # remap each stored column index through the inverse permutation,
        # in place, instead of materialising a column-sliced copy
        col_indices = X_resampled.indices.copy()
        for idx, col_idx in enumerate(indices_reordered):
            mask = X_resampled.indices == col_idx
            col_indices[mask] = idx
        X_resampled.indices = col_indices
    else:
        X_resampled = X_resampled[:, indices_reordered]
    return X_resampled, y_resampled