Example #1
    def _encode_categorical(self, X, y):
        """TODO"""
        # compute the median of the standard deviation of the minority class
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        # Separate categorical features from continuous features
        X_continuous = X[:, self.continuous_features_]
        X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
        X_categorical = X[:, self.categorical_features_].copy()
        X_minority = X_continuous[np.flatnonzero(y == class_minority)]

        if sparse.issparse(X):
            if X.format == "csr":
                _, var = csr_mean_variance_axis0(X_minority)
            else:
                _, var = csc_mean_variance_axis0(X_minority)
        else:
            var = X_minority.var(axis=0)
        self.median_std_ = np.median(np.sqrt(var))

        if X_continuous.dtype.name != "object":
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64
        self.ohe_ = OneHotEncoder(sparse=True, handle_unknown="ignore", dtype=dtype_ohe)

        # the input of the OneHotEncoder needs to be dense
        X_ohe = self.ohe_.fit_transform(
            X_categorical.toarray() if sparse.issparse(X_categorical) else X_categorical
        )

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be equal
        # to the median of the standard deviation as in the original paper.

        # In the edge case where the median of the std is equal to 0, the 1
        # entries will also be nullified. In this case, we store the original
        # categorical encoding, which will later be used to invert the OHE
        if math.isclose(self.median_std_, 0):
            self._X_categorical_encoded = X_ohe.toarray()

        X_ohe.data = np.ones_like(X_ohe.data, dtype=X_ohe.dtype) * self.median_std_ / 2

        # NOTE: `self._issparse` is assumed to be set during earlier input
        # validation; it is not defined in this snippet.
        if self._issparse:
            X_encoded = np.hstack([X_continuous.toarray(), X_ohe.toarray()])
        else:
            X_encoded = np.hstack([X_continuous, X_ohe.toarray()])

        return X_encoded
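
Every snippet on this page routes a sparse matrix through csr_mean_variance_axis0 (or its CSC counterpart) to get per-column means and variances without densifying. Below is a minimal, self-contained sketch of that core call and of the median-of-std statistic the SMOTENC examples derive from it; the random matrix and the median_std name are illustrative, not taken from the snippets.

import numpy as np
from scipy import sparse
from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0

rng = np.random.RandomState(0)
X = sparse.random(100, 5, density=0.3, format="csr",
                  random_state=rng, dtype=np.float64)

# per-column means and variances, computed directly on the CSR buffers
mean, var = csr_mean_variance_axis0(X)

# the statistic the SMOTENC examples store as `self.median_std_`
median_std = np.median(np.sqrt(var))
print(mean.shape, var.shape, median_std)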
Example #2
    def _fit_resample(self, X, y):
        self.n_features_ = X.shape[1]
        self._validate_estimator()

        # compute the median of the standard deviation of the minority class
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        X_continuous = X[:, self.continuous_features_]
        X_continuous = check_array(X_continuous, accept_sparse=['csr', 'csc'])
        X_minority = safe_indexing(X_continuous,
                                   np.flatnonzero(y == class_minority))

        if sparse.issparse(X):
            if X.format == 'csr':
                _, var = csr_mean_variance_axis0(X_minority)
            else:
                _, var = csc_mean_variance_axis0(X_minority)
        else:
            var = X_minority.var(axis=0)
        self.median_std_ = np.median(np.sqrt(var))

        X_categorical = X[:, self.categorical_features_]
        if X_continuous.dtype.name != 'object':
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64
        self.ohe_ = OneHotEncoder(sparse=True, handle_unknown='ignore',
                                  dtype=dtype_ohe)
        # the input of the OneHotEncoder needs to be dense
        X_ohe = self.ohe_.fit_transform(
            X_categorical.toarray() if sparse.issparse(X_categorical)
            else X_categorical)

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be equal
        # to the median of the standard deviation as in the original paper.
        X_ohe.data = (np.ones_like(X_ohe.data, dtype=X_ohe.dtype) *
                      self.median_std_ / 2)
        X_encoded = sparse.hstack((X_continuous, X_ohe), format='csr')

        X_resampled, y_resampled = super(SMOTENC, self)._fit_resample(
            X_encoded, y)

        # reverse the encoding of the categorical features
        X_res_cat = X_resampled[:, self.continuous_features_.size:]
        X_res_cat.data = np.ones_like(X_res_cat.data)
        X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

        if sparse.issparse(X):
            X_resampled = sparse.hstack(
                (X_resampled[:, :self.continuous_features_.size],
                 X_res_cat_dec), format='csr'
            )
        else:
            X_resampled = np.hstack(
                (X_resampled[:, :self.continuous_features_.size].toarray(),
                 X_res_cat_dec)
            )

        indices_reordered = np.argsort(
            np.hstack((self.continuous_features_, self.categorical_features_))
        )
        if sparse.issparse(X_resampled):
            # the matrix is supposed to be in the CSR format after the stacking
            col_indices = X_resampled.indices.copy()
            for idx, col_idx in enumerate(indices_reordered):
                mask = X_resampled.indices == col_idx
                col_indices[mask] = idx
            X_resampled.indices = col_indices
        else:
            X_resampled = X_resampled[:, indices_reordered]

        return X_resampled, y_resampled
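
The closing block of Example #2 permutes the columns of a CSR matrix by rewriting its `indices` array in place, which restores the original feature order without densifying the resampled matrix. A standalone sketch of just that trick, on a toy matrix that is not from the example:

import numpy as np
from scipy import sparse

X = sparse.csr_matrix(np.array([[1., 0., 2.],
                                [0., 3., 0.]]))
order = np.array([2, 0, 1])  # new column k receives old column order[k]
col_indices = X.indices.copy()
for new_col, old_col in enumerate(order):
    col_indices[X.indices == old_col] = new_col
X.indices = col_indices
# per-row indices may no longer be sorted; restore the canonical layout
X.sort_indices()
print(X.toarray())  # [[2. 1. 0.]
                    #  [0. 0. 3.]]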
Example #3
def _mutual_proximity_gammai_sparse(S, sample_size=0, train_set_ind=None,
                                    verbose=0, log=None, mv=None, n_jobs=-1):
    """MP gaussi for sparse similarity matrices. 
    
    Please do not directly use this function, but invoke via 
    mutual_proximity_gaussi()
    """
    self_value = 1.  # similarity matrix
    if S.diagonal().max() != self_value or S.diagonal().min() != self_value:
        raise ValueError("Self similarities must be 1.")
    S_param = S[train_set_ind]

    if mv is None:
        # mean, variance WITH zero values:
        from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0  # @UnresolvedImport
        mu, va = csr_mean_variance_axis0(S_param)
    elif mv == 0: # mean, variance WITHOUT zero values (missing values)
        # the -1 accounts for self sim that must be excluded from the calc
        mu = np.array((S_param.sum(0) - 1) / (S_param.getnnz(0) - 1)).ravel()
        E2 = mu**2
        X = S_param.copy()
        X.data **= 2
        n_x = (X.getnnz(0) - 1)
        E1 = np.array((X.sum(0) - 1) / n_x).ravel()
        del X
        # for an unbiased sample variance
        va = n_x / (n_x - 1) * (E1 - E2)
        del E1
    else:
        log.error("MP only supports missing values as zeros.", flush=True)
        raise ValueError("mv must be None or 0.")
    
    A = (mu**2) / va
    B = va / mu
    del mu, va
    A[A < 0] = np.nan
    B[B <= 0] = np.nan

    S_mp = lil_matrix(S.shape, dtype=np.float32)
    n = S.shape[0]
    
    # Parallelization
    if n_jobs == -1: # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count()
    else:
        NUMBER_OF_PROCESSES = n_jobs
    tasks = []
    
    batches = _get_weighted_batches(n, NUMBER_OF_PROCESSES)
    #   create jobs
    for idx, batch in enumerate(batches):
        matrix = S
        tasks.append((_partial_mp_gammai_sparse, 
                      (batch, matrix, idx, n, A, B, verbose)))   
    
    task_queue = mp.Queue()
    done_queue = mp.Queue()
    
    for task in tasks:
        task_queue.put(task)
    #   start jobs
    processes = []
    for i in range(NUMBER_OF_PROCESSES):
        processes.append(mp.Process(target=_worker, 
                                    args=(task_queue, done_queue)))
        processes[i].start()  
    #   collect results
    for i in range(len(tasks)):
        rows, Dmp_part = done_queue.get()
        task_queue.put('STOP')
        if verbose:
            log.message("Merging submatrix {} (rows {}..{})".
                        format(i, rows[0], rows[-1]), flush=True)
        if rows.size > 0:
            row_slice = slice(rows[0], rows[-1]+1)
        else: # for very small matrices, some batches might be empty
            row_slice = slice(0, 0)
        S_mp[row_slice] = Dmp_part
     
    for p in processes:
        p.join()
    
    S_mp = S_mp.tolil()
    if verbose:
        log.message("Mirroring distance matrix", flush=True)
    S_mp += S_mp.T
    
    if verbose:
        log.message("Setting self distances", flush=True)
    for i in range(S_mp.shape[0]):
        S_mp[i, i] = self_value

    if verbose:
        log.message("Converting to CSR matrix", flush=True)
    return S_mp.tocsr()
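The arrays A and B above are method-of-moments estimates of per-column Gamma shape (mu**2 / va) and scale (va / mu) parameters; the batch helper _partial_mp_gammai_sparse, which is referenced but not shown in this listing, rescales each similarity through Gamma CDFs from the viewpoint of both samples. A sketch of that per-entry computation, assuming the Gamma CDF is evaluated via scipy's regularized lower incomplete gamma; the numbers are illustrative:

import numpy as np
from scipy.special import gammainc

def gamma_cdf(x, shape, scale):
    # CDF of Gamma(shape, scale) via the regularized lower incomplete gamma
    return gammainc(shape, x / scale)

mu = np.array([0.40, 0.55])   # per-column means, as computed above
va = np.array([0.05, 0.08])   # per-column variances
A = mu ** 2 / va              # shape parameters
B = va / mu                   # scale parameters

i, j = 0, 1
s_ij = 0.7                    # observed similarity between samples i and j
# mutual proximity: how high s_ij ranks under both samples' distributions
s_mp = gamma_cdf(s_ij, A[i], B[i]) * gamma_cdf(s_ij, A[j], B[j])
print(s_mp)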
Example #4
    def _fit_resample(self, X, y):
        # FIXME: to be removed in 0.12
        if self.n_jobs is not None:
            warnings.warn(
                "The parameter `n_jobs` has been deprecated in 0.10 and will be "
                "removed in 0.12. You can pass an nearest neighbors estimator where "
                "`n_jobs` is already set instead.",
                FutureWarning,
            )

        self.n_features_ = X.shape[1]
        self._validate_estimator()

        # compute the median of the standard deviation of the minority class
        target_stats = Counter(y)
        class_minority = min(target_stats, key=target_stats.get)

        X_continuous = X[:, self.continuous_features_]
        X_continuous = check_array(X_continuous, accept_sparse=["csr", "csc"])
        X_minority = _safe_indexing(X_continuous,
                                    np.flatnonzero(y == class_minority))

        if sparse.issparse(X):
            if X.format == "csr":
                _, var = csr_mean_variance_axis0(X_minority)
            else:
                _, var = csc_mean_variance_axis0(X_minority)
        else:
            var = X_minority.var(axis=0)
        self.median_std_ = np.median(np.sqrt(var))

        X_categorical = X[:, self.categorical_features_]
        if X_continuous.dtype.name != "object":
            dtype_ohe = X_continuous.dtype
        else:
            dtype_ohe = np.float64
        self.ohe_ = OneHotEncoder(sparse=True,
                                  handle_unknown="ignore",
                                  dtype=dtype_ohe)

        # the input of the OneHotEncoder needs to be dense
        X_ohe = self.ohe_.fit_transform(
            X_categorical.toarray() if sparse.issparse(X_categorical)
            else X_categorical
        )

        # we can replace the 1 entries of the categorical features with the
        # median of the standard deviation. It will ensure that whenever
        # distance is computed between 2 samples, the difference will be equal
        # to the median of the standard deviation as in the original paper.

        # In the edge case where the median of the std is equal to 0, the 1
        # entries will also be nullified. In this case, we store the original
        # categorical encoding, which will later be used to invert the OHE
        if math.isclose(self.median_std_, 0):
            self._X_categorical_minority_encoded = _safe_indexing(
                X_ohe.toarray(), np.flatnonzero(y == class_minority))

        X_ohe.data = np.ones_like(X_ohe.data,
                                  dtype=X_ohe.dtype) * self.median_std_ / 2
        X_encoded = sparse.hstack((X_continuous, X_ohe), format="csr")

        X_resampled, y_resampled = super()._fit_resample(X_encoded, y)

        # reverse the encoding of the categorical features
        X_res_cat = X_resampled[:, self.continuous_features_.size:]
        X_res_cat.data = np.ones_like(X_res_cat.data)
        X_res_cat_dec = self.ohe_.inverse_transform(X_res_cat)

        if sparse.issparse(X):
            X_resampled = sparse.hstack(
                (
                    X_resampled[:, :self.continuous_features_.size],
                    X_res_cat_dec,
                ),
                format="csr",
            )
        else:
            X_resampled = np.hstack((
                X_resampled[:, :self.continuous_features_.size].toarray(),
                X_res_cat_dec,
            ))

        indices_reordered = np.argsort(
            np.hstack((self.continuous_features_, self.categorical_features_)))
        if sparse.issparse(X_resampled):
            # the matrix is supposed to be in the CSR format after the stacking
            col_indices = X_resampled.indices.copy()
            for idx, col_idx in enumerate(indices_reordered):
                mask = X_resampled.indices == col_idx
                col_indices[mask] = idx
            X_resampled.indices = col_indices
        else:
            X_resampled = X_resampled[:, indices_reordered]

        return X_resampled, y_resampled
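
The scaled one-hot step is the heart of the SMOTENC encoding: every 1 in the one-hot block is replaced by median_std_ / 2 before resampling, and the entries are set back to 1 before inverse_transform so the encoder can recover the original categories. A minimal standalone sketch of both steps with illustrative toy data; note that scikit-learn >= 1.2 renames the OneHotEncoder `sparse` parameter to `sparse_output`, and sparse output is the default either way.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

median_std = 0.8
X_cat = np.array([["red"], ["blue"], ["red"]])

ohe = OneHotEncoder(handle_unknown="ignore")
X_ohe = ohe.fit_transform(X_cat)          # sparse matrix of 0/1 entries
X_ohe.data = np.ones_like(X_ohe.data) * median_std / 2
print(X_ohe.toarray())
# rows with different categories now differ in two entries of
# median_std / 2, which is what enters the nearest-neighbour distances

X_ohe.data = np.ones_like(X_ohe.data)     # restore the 1s for decoding
print(ohe.inverse_transform(X_ohe))       # [['red'] ['blue'] ['red']]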
Example #5
def _mutual_proximity_gaussi_sparse(S, sample_size, train_set_ind,
                                    verbose, log, mv, n_jobs):
    """MP gaussi for sparse similarity matrices. 
    
    Please do not directly use this function, but invoke via 
    mutual_proximity_gaussi()
    """
    n = S.shape[0]
    self_value = 1. # similarity matrix
    
    if mv is None:
        # mean, variance WITH zero values:
        from sklearn.utils.sparsefuncs_fast \
            import csr_mean_variance_axis0  # @UnresolvedImport
        mu, va = csr_mean_variance_axis0(S[train_set_ind])
    elif mv == 0:
        # mean, variance WITHOUT zero values (missing values)
        mu = np.array((S.sum(0) - 1) / (S.getnnz(0) - 1)).ravel()
        X = S.copy()
        X.data **= 2
        E1 = np.array((X.sum(0) - 1) / (X.getnnz(0) - 1)).ravel()
        del X
        va = E1 - mu**2
        del E1
    else:
        log.error("MP only supports missing values as zeros.", flush=True)
        raise ValueError("mv must be None or 0.")
    sd = np.sqrt(va)
    del va
            
    Dmp = lil_matrix(S.shape)
    
    # Parallelization
    if n_jobs == -1: # take all cpus
        NUMBER_OF_PROCESSES = mp.cpu_count()
    else:
        NUMBER_OF_PROCESSES = n_jobs
    tasks = []
    
    batches = _get_weighted_batches(n, NUMBER_OF_PROCESSES)
    
    for idx, batch in enumerate(batches):
        matrix = S
        tasks.append((_partial_mp_gaussi_sparse, 
                      (batch, matrix, idx, n, mu, sd, verbose)))   
    
    task_queue = mp.Queue()
    done_queue = mp.Queue()
    
    for task in tasks:
        task_queue.put(task)
        
    processes = []
    for i in range(NUMBER_OF_PROCESSES):
        processes.append(mp.Process(target=_worker, 
                                    args=(task_queue, done_queue))) 
        processes[i].start()  
    
    for i in range(len(tasks)):  # @UnusedVariable
        rows, Dmp_part = done_queue.get()
        task_queue.put('STOP')
        if verbose:
            log.message("Merging submatrix {} (rows {}..{})".
                        format(i, rows[0], rows[-1]), flush=True)
        if rows.size > 0:
            rows_slice = slice(rows[0], rows[-1]+1)
        else:
            rows_slice = slice(0, 0)
        Dmp[rows_slice, :] = Dmp_part
     
    for p in processes:
        p.join()
    
    Dmp = Dmp.tolil()
    if verbose:
        log.message("Mirroring distance matrix", flush=True)
    Dmp += Dmp.T
    
    if verbose:
        log.message("Setting self distances", flush=True)
    for i in range(Dmp.shape[0]):
        Dmp[i, i] = self_value

    if verbose:
        log.message("Converting to CSR matrix", flush=True)
    return Dmp.tocsr()
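
Both mutual-proximity functions depend on a _worker helper and a 'STOP' sentinel protocol that are referenced but not shown in this listing. The sketch below is an assumption about their shape, inferred from how task_queue and done_queue are used above, with a trivial task standing in for the partial-matrix computations:

import multiprocessing as mp

def _worker(task_queue, done_queue):
    # assumed worker: consume (func, args) tuples until 'STOP' arrives
    for func, args in iter(task_queue.get, 'STOP'):
        done_queue.put(func(*args))

def _square_batch(batch):
    return [x * x for x in batch]

if __name__ == '__main__':
    task_queue, done_queue = mp.Queue(), mp.Queue()
    batches = [[0, 1], [2, 3], [4, 5]]
    for batch in batches:
        task_queue.put((_square_batch, (batch,)))

    n_procs = 2
    processes = [mp.Process(target=_worker, args=(task_queue, done_queue))
                 for _ in range(n_procs)]
    for p in processes:
        p.start()

    # results arrive in completion order, not submission order, which is
    # why the functions above tag each partial result with its row range
    results = [done_queue.get() for _ in batches]
    for _ in processes:
        task_queue.put('STOP')  # one sentinel per worker
    for p in processes:
        p.join()
    print(results)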