class SMOTE(UnbalancedDataset): """ This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and SVM-SMOTE. See the original papers: [1], [2], [3] for more details. * It does not support multiple classes automatically, but can be called multiple times """ def __init__(self, k=5, m=10, out_step=0.5, ratio='auto', random_state=None, kind='regular', nn_method='exact', verbose=False, **kwargs): """ SMOTE over sampling algorithm and variations. Choose one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' :param k: Number of nearest neighbours to used to construct synthetic samples. :param m: The number of nearest neighbours to use to determine if a minority sample is in danger. :param out_step: Step size when extrapolating :param ratio: If 'auto', the ratio will be defined automatically to balanced the dataset. If an integer is given, the number of samples generated is equal to the number of samples in the minority class mulitply by this ratio. :param random_state: Seed for random number generation :param kind: The type of smote algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' :param nn_method: The nearest neighbors method to use which can be either: 'approximate' or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an exact search. :param verbose: Whether or not to print status information :param kwargs: Additional arguments passed to sklearn SVC object """ # Parent class methods UnbalancedDataset.__init__(self, ratio=ratio, random_state=random_state) # Do not expect any support regarding the selection with this method if (kwargs.pop('indices_support', False)): raise ValueError('No indices support with this method.') # Get the number of processor that the user wants to use self.n_jobs = kwargs.pop('n_jobs', multiprocessing.cpu_count()) # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods.# self.kind = kind # --- Verbose # Control whether or not status and progress information should be# self.verbose = verbose # --- Nearest Neighbours for synthetic samples # The smote algorithm uses the k-th nearest neighbours of a minority # sample to generate new synthetic samples.# self.k = k # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen# if nn_method == 'exact': from sklearn.neighbors import NearestNeighbors elif nn_method == 'approximate': from sklearn.neighbors import LSHForest if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours with not filtering# if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=k + 1) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first look # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary.# if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=m + 1) # --- Nearest Neighbours for noise and boundary (in danger) # Before creating synthetic samples we must first decide if # a given entry is noise or in danger. We use m nns in this step# self.m = m # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controled by the out_step.# if kind == 'svm': # As usual, use scikit-learn object# from sklearn.svm import SVC # Store extrapolation size# self.out_step = out_step # Store SVM object with any parameters# self.svm_ = SVC(**kwargs) def in_danger_noise(self, samples, kind='danger'): """Estimate if a set of sample are in danger or not. Parameters ---------- samples : ndarray, shape (n_samples, n_features) The samples to check if either they are in danger or not. kind : str, optional (default='danger') The type of classification to use. Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray, shape (n_samples, ) A boolean array where True refer to samples in danger or noise. """ # Find the NN for each samples # Exclude the sample itself x = self.nearest_neighbour_.kneighbors(samples, return_distance=False)[:, 1:] # Count how many NN belong to the minority class # Find the class corresponding to the label in x nn_label = (self.y[x] != self.minc).astype(int) # Compute the number of majority samples in the NN n_maj = np.sum(nn_label, axis=1) if kind == 'danger': # Samples are in danger for m/2 <= m' < m return np.bitwise_and(n_maj >= float(self.m) / 2., n_maj < self.m) elif kind == 'noise': # Samples are noise for m = m' return n_maj == self.m def resample(self): """ Main method of all children classes. :return: Over-sampled data set. """ # Compute the ratio if it is auto if self.ratio == 'auto': self.ratio = (float(self.ucd[self.maxc] - self.ucd[self.minc]) / float(self.ucd[self.minc])) # Start by separating minority class features and target values. minx = self.x[self.y == self.minc] miny = self.y[self.y == self.minc] # If regular SMOTE is to be performed# if self.kind == 'regular': # Print if verbose is true# if self.verbose: print("Finding the %i nearest neighbours..." % self.k, end="") # Look for k-th nearest neighbours, excluding, of course, the # point itself.# self.nearest_neighbour_.fit(minx) # Matrix with k-th nearest neighbours indexes for each minority # element.# nns = self.nearest_neighbour_.kneighbors(minx, return_distance=False)[:, 1:] # Print status if verbose is true# if self.verbose: ## print("done!") # Creating synthetic samples # print("Creating synthetic samples...", end="") # --- Generating synthetic samples # Use static method make_samples to generate minority samples # FIX THIS SHIT!!!# sx, sy = self.make_samples(x=minx, nn_data=minx, y_type=self.minc, nn_num=nns, n_samples=int(self.ratio * len(miny)), step_size=1.0, random_state=self.rs, verbose=self.verbose) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx), axis=0) ret_y = concatenate((self.y, sy), axis=0) return ret_x, ret_y if (self.kind == 'borderline1') or (self.kind == 'borderline2'): if self.verbose: print("Finding the %i nearest neighbours..." % self.m, end="") # Find the NNs for all samples in the data set. self.nearest_neighbour_.fit(self.x) if self.verbose: print("done!") # Boolean array with True for minority samples in danger danger_index = self.in_danger_noise(minx, kind='danger') # If all minority samples are safe, return the original data set. if not any(danger_index): ## if self.verbose: print('There are no samples in danger. No borderline ' 'synthetic samples created.') # All are safe, nothing to be done here.# return self.x, self.y # If we got here is because some samples are in danger, we need to # find the NNs among the minority class to create the new synthetic # samples. # # We start by changing the number of NNs to consider from m + 1 # to k + 1 self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(minx) # nns...# nns = self.nearest_neighbour_.kneighbors(minx[danger_index], return_distance=False)[:, 1:] # B1 and B2 types diverge here!!! if self.kind == 'borderline1': # Create synthetic samples for borderline points. sx, sy = self.make_samples(minx[danger_index], minx, miny[0], nns, int(self.ratio * len(miny)), random_state=self.rs, verbose=self.verbose) # Concatenate the newly generated samples to the original data set ret_x = concatenate((self.x, sx), axis=0) ret_y = concatenate((self.y, sy), axis=0) return ret_x, ret_y else: # Split the number of synthetic samples between only minority # (type 1), or minority and majority (with reduced step size) # (type 2). np.random.seed(self.rs) # The fraction is sampled from a beta distribution centered # around 0.5 with variance ~0.01# fractions = betavariate(alpha=10, beta=10) # Only minority sx1, sy1 = self.make_samples(minx[danger_index], minx, self.minc, nns, fractions * (int(self.ratio * len(miny)) + 1), step_size=1, random_state=self.rs, verbose=self.verbose) # Only majority with smaller step size sx2, sy2 = self.make_samples(minx[danger_index], self.x[self.y != self.minc], self.minc, nns, (1 - fractions) * int(self.ratio * len(miny)), step_size=0.5, random_state=self.rs, verbose=self.verbose) # Concatenate the newly generated samples to the original data set ret_x = np.concatenate((self.x, sx1, sx2), axis=0) ret_y = np.concatenate((self.y, sy1, sy2), axis=0) return ret_x, ret_y if self.kind == 'svm': # The SVM smote model fits a support vector machine # classifier to the data and uses the support vector to # provide a notion of boundary. Unlike regular smote, where # such notion relies on proportion of nearest neighbours # belonging to each class.# # Fit SVM to the full data# self.svm_.fit(self.x, self.y) # Find the support vectors and their corresponding indexes support_index = self.svm_.support_[self.y[self.svm_.support_] == self.minc] support_vector = self.x[support_index] # First, find the nn of all the samples to identify samples in danger # and noisy ones if self.verbose: print("Finding the %i nearest neighbours..." % self.m, end="") # As usual, fit a nearest neighbour model to the data self.nearest_neighbour_.fit(self.x) if self.verbose: print("done!") # Now, get rid of noisy support vectors noise_bool = self.in_danger_noise(support_vector, kind='noise') # Remove noisy support vectors support_vector = support_vector[np.logical_not(noise_bool)] danger_bool = self.in_danger_noise(support_vector, kind='danger') # Something ...# safety_bool = np.logical_not(danger_bool) if self.verbose: print("Out of {0} support vectors, {1} are noisy, " "{2} are in danger " "and {3} are safe.".format( support_vector.shape[0], noise_bool.sum().astype(int), danger_bool.sum().astype(int), safety_bool.sum().astype(int))) # Proceed to find support vectors NNs among the minority class print("Finding the %i nearest neighbours..." % self.k, end="") self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(minx) if self.verbose: print("done!") print("Creating synthetic samples...", end="") # Split the number of synthetic samples between interpolation and # extrapolation # The fraction are sampled from a beta distribution with mean # 0.5 and variance 0.01# np.random.seed(self.rs) fractions = betavariate(alpha=10, beta=10) # Interpolate samples in danger if (np.count_nonzero(danger_bool) > 0): nns = self.nearest_neighbour_.kneighbors( support_vector[danger_bool], return_distance=False)[:, 1:] sx1, sy1 = self.make_samples(support_vector[danger_bool], minx, self.minc, nns, fractions * (int(self.ratio * len(minx)) + 1), step_size=1, random_state=self.rs, verbose=self.verbose) # Extrapolate safe samples if (np.count_nonzero(safety_bool) > 0): nns = self.nearest_neighbour_.kneighbors( support_vector[safety_bool], return_distance=False)[:, 1:] sx2, sy2 = self.make_samples(support_vector[safety_bool], minx, self.minc, nns, (1 - fractions) * int(self.ratio * len(minx)), step_size=-self.out_step, random_state=self.rs, verbose=self.verbose) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set if ((np.count_nonzero(danger_bool) > 0) and (np.count_nonzero(safety_bool) > 0)): ret_x = concatenate((self.x, sx1, sx2), axis=0) ret_y = concatenate((self.y, sy1, sy2), axis=0) # not any support vectors in danger elif np.count_nonzero(danger_bool) == 0: ret_x = concatenate((self.x, sx2), axis=0) ret_y = concatenate((self.y, sy2), axis=0) # All the support vector in danger elif np.count_nonzero(safety_bool) == 0: ret_x = concatenate((self.x, sx1), axis=0) ret_y = concatenate((self.y, sy1), axis=0) return ret_x, ret_y
class SMOTE(OverSampler): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and SVM-SMOTE. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Boolean to either or not print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' nn_method : str, optional (default='exact') The nearest neighbors method to use which can be either: 'approximate' or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an exact search. Attributes ---------- ratio_ : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. rs_ : int or None, optional (default=None) Seed for random number generation. min_c_ : str or int The identifier of the minority class. max_c_ : str or int The identifier of the majority class. stats_c_ : dict of str/int : int A dictionary in which the number of occurences of each class is reported. Notes ----- See the original papers: [1]_, [2]_, [3]_ for more details. It does not support multiple classes automatically, but can be called multiple times. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning," Advances in intelligent computing, 878-887, 2005. .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for imbalanced data classification," International Journal of Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001. """ def __init__(self, ratio='auto', random_state=None, verbose=True, k=5, m=10, out_step=0.5, kind='regular', nn_method='exact', n_jobs=-1, **kwargs): """Initialisation of SMOTE object. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Boolean to either or not print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' nn_method : str, optional (default='exact') The nearest neighbors method to use which can be either: 'approximate' or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an exact search. n_jobs : int, optional (default=-1) Number of threads to run the algorithm when it is possible. """ super(SMOTE, self).__init__(ratio=ratio, random_state=random_state, verbose=verbose) # Check the number of thread to use self.n_jobs = n_jobs # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods. possible_kind = ('regular', 'borderline1', 'borderline2', 'svm') if kind in possible_kind: self.kind = kind else: raise ValueError('Unknown kind for SMOTE algorithm.') # --- Verbose # Control whether or not status and progress information should be self.verbose = verbose # --- Nearest Neighbours for synthetic samples # The smote algorithm uses the k-th nearest neighbours of a minority # sample to generate new synthetic samples. self.k = k # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours with not filtering if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=k+1) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first look # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary. if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=m+1) # --- Nearest Neighbours for noise and boundary (in danger) # Before creating synthetic samples we must first decide if # a given entry is noise or in danger. We use m nns in this step self.m = m # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controled by the out_step. if kind == 'svm': # Store extrapolation size self.out_step = out_step # Store SVM object with any parameters self.svm_ = SVC(**kwargs) def fit(self, X, y): """Find the classes statistics before to perform sampling. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- self : object, Return self. """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).fit(X, y) return self def _in_danger_noise(self, samples, y, kind='danger'): """Estimate if a set of sample are in danger or not. Parameters ---------- samples : ndarray, shape (n_samples, n_features) The samples to check if either they are in danger or not. y : ndarray, shape (n_samples, ) The true label in order to check the neighbour labels. kind : str, optional (default='danger') The type of classification to use. Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray, shape (n_samples, ) A boolean array where True refer to samples in danger or noise. """ # Find the NN for each samples # Exclude the sample itself x = self.nearest_neighbour_.kneighbors(samples, return_distance=False)[:, 1:] # Count how many NN belong to the minority class # Find the class corresponding to the label in x nn_label = (y[x] != self.min_c_).astype(int) # Compute the number of majority samples in the NN n_maj = np.sum(nn_label, axis=1) if kind == 'danger': # Samples are in danger for m/2 <= m' < m return np.bitwise_and(n_maj >= float(self.m) / 2., n_maj < self.m) elif kind == 'noise': # Samples are noise for m = m' return n_maj == self.m else: raise ValueError('Unknown string for parameter kind.') def _make_samples(self, X, y_type, nn_data, nn_num, n_samples, step_size=1.): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. Parameters ---------- X : ndarray, shape (n_samples, n_features) Points from which the points will be created. y_type : str or int The minority target value, just so the function can return the target values for the synthetic variables with correct length in a clear format. nn_data : ndarray, shape (n_samples_all, n_features) Data set carrying all the neighbours to be used nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in nn_data. n_samples : int The number of samples to generate. step_size : float, optional (default=1.) The step size to create samples. Returns ------- X_new : ndarray, shape (n_samples_new, n_features) Synthetically generated samples. y_new : ndarray, shape (n_samples_new, ) Target values for synthetic samples. """ # Check the consistency of X X = check_array(X) # A matrix to store the synthetic samples X_new = np.zeros((n_samples, X.shape[1])) # Set seeds np.random.seed(self.rs_) seeds = np.random.randint(low=0, high=100*len(nn_num.flatten()), size=n_samples) # Randomly pick samples to construct neighbours from np.random.seed(self.rs_) samples = np.random.randint(low=0, high=len(nn_num.flatten()), size=n_samples) # Loop over the NN matrix and create new samples for i, n in enumerate(samples): # NN lines relate to original sample, columns to its # nearest neighbours row, col = divmod(n, nn_num.shape[1]) # Take a step of random size (0,1) in the direction of the # n nearest neighbours np.random.seed(seeds[i]) step = step_size * np.random.uniform() # Construct synthetic sample X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) # The returned target vector is simply a repetition of the # minority label y_new = np.array([y_type] * len(X_new)) if self.verbose: print("Generated {} new samples ...".format(len(X_new))) return X_new, y_new def transform(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).transform(X, y) # Define the number of sample to create # We handle only two classes problem for the moment. if self.ratio_ == 'auto': num_samples = (self.stats_c_[self.maj_c_] - self.stats_c_[self.min_c_]) else: num_samples = ((self.ratio_ * self.stats_c_[self.maj_c_]) - self.stats_c_[self.min_c_]) # Start by separating minority class features and target values. X_min = X[y == self.min_c_] # If regular SMOTE is to be performed if self.kind == 'regular': # Print if verbose is true# if self.verbose: print('Finding the {} nearest neighbours...'.format(self.k)) # Look for k-th nearest neighbours, excluding, of course, the # point itself. self.nearest_neighbour_.fit(X_min) # Matrix with k-th nearest neighbours indexes for each minority # element. nns = self.nearest_neighbour_.kneighbors( X_min, return_distance=False)[:, 1:] # Print status if verbose is true if self.verbose: print("done!") print("Creating synthetic samples...", end="") # --- Generating synthetic samples # Use static method make_samples to generate minority samples X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns, num_samples, 1.0) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) return X_resampled, y_resampled if self.kind == 'borderline1' or self.kind == 'borderline2': if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # Find the NNs for all samples in the data set. self.nearest_neighbour_.fit(X) if self.verbose: print("done!") # Boolean array with True for minority samples in danger danger_index = self._in_danger_noise(X_min, y, kind='danger') # If all minority samples are safe, return the original data set. if not any(danger_index): if self.verbose: print('There are no samples in danger. No borderline ' 'synthetic samples created.') # All are safe, nothing to be done here. return X, y # If we got here is because some samples are in danger, we need to # find the NNs among the minority class to create the new synthetic # samples. # # We start by changing the number of NNs to consider from m + 1 # to k + 1 self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(X_min) # nns...# nns = self.nearest_neighbour_.kneighbors( X_min[danger_index], return_distance=False)[:, 1:] # B1 and B2 types diverge here!!! if self.kind == 'borderline1': # Create synthetic samples for borderline points. X_new, y_new = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, num_samples) # Concatenate the newly generated samples to the original # dataset X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled else: # Split the number of synthetic samples between only minority # (type 1), or minority and majority (with reduced step size) # (type 2). np.random.seed(self.rs_) # The fraction is sampled from a beta distribution centered # around 0.5 with variance ~0.01 fractions = betavariate(alpha=10, beta=10) # Only minority X_new_1, y_new_1 = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) # Only majority with smaller step size X_new_2, y_new_2 = self._make_samples(X_min[danger_index], self.min_c_, X[y != self.min_c_], nns, int((1 - fractions) * num_samples), step_size=0.5) # Concatenate the newly generated samples to the original # data set X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled if self.kind == 'svm': # The SVM smote model fits a support vector machine # classifier to the data and uses the support vector to # provide a notion of boundary. Unlike regular smote, where # such notion relies on proportion of nearest neighbours # belonging to each class. # Fit SVM to the full data# self.svm_.fit(X, y) # Find the support vectors and their corresponding indexes support_index = self.svm_.support_[y[self.svm_.support_] == self.min_c_] support_vector = X[support_index] # First, find the nn of all the samples to identify samples # in danger and noisy ones if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # As usual, fit a nearest neighbour model to the data self.nearest_neighbour_.fit(X) if self.verbose: print("done!") # Now, get rid of noisy support vectors noise_bool = self._in_danger_noise(support_vector, y, kind='noise') # Remove noisy support vectors support_vector = support_vector[np.logical_not(noise_bool)] danger_bool = self._in_danger_noise(support_vector, y, kind='danger') safety_bool = np.logical_not(danger_bool) if self.verbose: print("Out of {0} support vectors, {1} are noisy, " "{2} are in danger " "and {3} are safe.".format(support_vector.shape[0], noise_bool.sum().astype(int), danger_bool.sum().astype(int), safety_bool.sum().astype(int) )) # Proceed to find support vectors NNs among the minority class print("Finding the {} nearest neighbours...".format(self.k)) self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(X_min) if self.verbose: print("done!") print("Creating synthetic samples...", end="") # Split the number of synthetic samples between interpolation and # extrapolation # The fraction are sampled from a beta distribution with mean # 0.5 and variance 0.01# np.random.seed(self.rs_) fractions = betavariate(alpha=10, beta=10) # Interpolate samples in danger if np.count_nonzero(danger_bool) > 0: nns = self.nearest_neighbour_.kneighbors( support_vector[danger_bool], return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( support_vector[danger_bool], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) # Extrapolate safe samples if np.count_nonzero(safety_bool) > 0: nns = self.nearest_neighbour_.kneighbors( support_vector[safety_bool], return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( support_vector[safety_bool], self.min_c_, X_min, nns, int((1 - fractions) * num_samples), step_size=-self.out_step) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # not any support vectors in danger elif np.count_nonzero(danger_bool) == 0: X_resampled = np.concatenate((X, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_2), axis=0) # All the support vector in danger elif np.count_nonzero(safety_bool) == 0: X_resampled = np.concatenate((X, X_new_1), axis=0) y_resampled = np.concatenate((y, y_new_1), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params(**{'n_neighbors': self.m+1}) return X_resampled, y_resampled
class SMOTE(OverSampler): """Class to perform over-sampling using SMOTE. This object is an implementation of SMOTE - Synthetic Minority Over-sampling Technique, and the variations Borderline SMOTE 1, 2 and SVM-SMOTE. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Boolean to either or not print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' nn_method : str, optional (default='exact') The nearest neighbors method to use which can be either: 'approximate' or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an exact search. Attributes ---------- ratio_ : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. rs_ : int or None, optional (default=None) Seed for random number generation. min_c_ : str or int The identifier of the minority class. max_c_ : str or int The identifier of the majority class. stats_c_ : dict of str/int : int A dictionary in which the number of occurences of each class is reported. Notes ----- See the original papers: [1]_, [2]_, [3]_ for more details. It does not support multiple classes automatically, but can be called multiple times. References ---------- .. [1] N. V. Chawla, K. W. Bowyer, L. O.Hall, W. P. Kegelmeyer, "SMOTE: synthetic minority over-sampling technique," Journal of artificial intelligence research, 321-357, 2002. .. [2] H. Han, W. Wen-Yuan, M. Bing-Huan, "Borderline-SMOTE: a new over-sampling method in imbalanced data sets learning," Advances in intelligent computing, 878-887, 2005. .. [3] H. M. Nguyen, E. W. Cooper, K. Kamei, "Borderline over-sampling for imbalanced data classification," International Journal of Knowledge Engineering and Soft Data Paradigms, 3(1), pp.4-21, 2001. """ def __init__(self, ratio='auto', random_state=None, verbose=True, k=5, m=10, out_step=0.5, kind='regular', nn_method='exact', n_jobs=-1, **kwargs): """Initialisation of SMOTE object. Parameters ---------- ratio : str or float, optional (default='auto') If 'auto', the ratio will be defined automatically to balanced the dataset. Otherwise, the ratio will corresponds to the number of samples in the minority class over the the number of samples in the majority class. random_state : int or None, optional (default=None) Seed for random number generation. verbose : bool, optional (default=True) Boolean to either or not print information about the processing. k : int, optional (default=5) Number of nearest neighbours to used to construct synthetic samples. m : int, optional (default=10) Number of nearest neighbours to use to determine if a minority sample is in danger. out_step : float, optional (default=0.5) Step size when extrapolating. kind : str, optional (default='regular') The type of SMOTE algorithm to use one of the following options: 'regular', 'borderline1', 'borderline2', 'svm' nn_method : str, optional (default='exact') The nearest neighbors method to use which can be either: 'approximate' or 'exact'. 'approximate' will use LSH Forest while 'exact' will be an exact search. n_jobs : int, optional (default=-1) Number of threads to run the algorithm when it is possible. """ super(SMOTE, self).__init__(ratio=ratio, random_state=random_state, verbose=verbose) # Check the number of thread to use self.n_jobs = n_jobs # --- The type of smote # This object can perform regular smote over-sampling, borderline 1, # borderline 2 and svm smote. Since the algorithms are fairly simple # they share most methods. possible_kind = ('regular', 'borderline1', 'borderline2', 'svm') if kind in possible_kind: self.kind = kind else: raise ValueError('Unknown kind for SMOTE algorithm.') # --- Verbose # Control whether or not status and progress information should be self.verbose = verbose # --- Nearest Neighbours for synthetic samples # The smote algorithm uses the k-th nearest neighbours of a minority # sample to generate new synthetic samples. self.k = k # --- NN object # Import the NN object from scikit-learn library. Since in the smote # variations we must first find samples that are in danger, we # initialize the NN object differently depending on the method chosen if kind == 'regular': # Regular smote does not look for samples in danger, instead it # creates synthetic samples directly from the k-th nearest # neighbours with not filtering if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=k + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=k + 1) else: # Borderline1, 2 and SVM variations of smote must first look for # samples that could be considered noise and samples that live # near the boundary between the classes. Therefore, before # creating synthetic samples from the k-th nns, it first look # for m nearest neighbors to decide whether or not a sample is # noise or near the boundary. if nn_method == 'exact': self.nearest_neighbour_ = NearestNeighbors(n_neighbors=m + 1, n_jobs=self.n_jobs) elif nn_method == 'approximate': self.nearest_neighbour_ = LSHForest(n_estimators=50, n_candidates=500, n_neighbors=m + 1) # --- Nearest Neighbours for noise and boundary (in danger) # Before creating synthetic samples we must first decide if # a given entry is noise or in danger. We use m nns in this step self.m = m # --- SVM smote # Unlike the borderline variations, the SVM variation uses the support # vectors to decide which samples are in danger (near the boundary). # Additionally it also introduces extrapolation for samples that are # considered safe (far from boundary) and interpolation for samples # in danger (near the boundary). The level of extrapolation is # controled by the out_step. if kind == 'svm': # Store extrapolation size self.out_step = out_step # Store SVM object with any parameters self.svm_ = SVC(**kwargs) def fit(self, X, y): """Find the classes statistics before to perform sampling. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- self : object, Return self. """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).fit(X, y) return self def _in_danger_noise(self, samples, y, kind='danger'): """Estimate if a set of sample are in danger or not. Parameters ---------- samples : ndarray, shape (n_samples, n_features) The samples to check if either they are in danger or not. y : ndarray, shape (n_samples, ) The true label in order to check the neighbour labels. kind : str, optional (default='danger') The type of classification to use. Can be either: - If 'danger', check if samples are in danger, - If 'noise', check if samples are noise. Returns ------- output : ndarray, shape (n_samples, ) A boolean array where True refer to samples in danger or noise. """ # Find the NN for each samples # Exclude the sample itself x = self.nearest_neighbour_.kneighbors(samples, return_distance=False)[:, 1:] # Count how many NN belong to the minority class # Find the class corresponding to the label in x nn_label = (y[x] != self.min_c_).astype(int) # Compute the number of majority samples in the NN n_maj = np.sum(nn_label, axis=1) if kind == 'danger': # Samples are in danger for m/2 <= m' < m return np.bitwise_and(n_maj >= float(self.m) / 2., n_maj < self.m) elif kind == 'noise': # Samples are noise for m = m' return n_maj == self.m else: raise ValueError('Unknown string for parameter kind.') def _make_samples(self, X, y_type, nn_data, nn_num, n_samples, step_size=1.): """A support function that returns artificial samples constructed along the line connecting nearest neighbours. Parameters ---------- X : ndarray, shape (n_samples, n_features) Points from which the points will be created. y_type : str or int The minority target value, just so the function can return the target values for the synthetic variables with correct length in a clear format. nn_data : ndarray, shape (n_samples_all, n_features) Data set carrying all the neighbours to be used nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours) The nearest neighbours of each sample in nn_data. n_samples : int The number of samples to generate. step_size : float, optional (default=1.) The step size to create samples. Returns ------- X_new : ndarray, shape (n_samples_new, n_features) Synthetically generated samples. y_new : ndarray, shape (n_samples_new, ) Target values for synthetic samples. """ # Check the consistency of X X = check_array(X) # A matrix to store the synthetic samples X_new = np.zeros((n_samples, X.shape[1])) # Set seeds np.random.seed(self.rs_) seeds = np.random.randint(low=0, high=100 * len(nn_num.flatten()), size=n_samples) # Randomly pick samples to construct neighbours from np.random.seed(self.rs_) samples = np.random.randint(low=0, high=len(nn_num.flatten()), size=n_samples) # Loop over the NN matrix and create new samples for i, n in enumerate(samples): # NN lines relate to original sample, columns to its # nearest neighbours row, col = divmod(n, nn_num.shape[1]) # Take a step of random size (0,1) in the direction of the # n nearest neighbours np.random.seed(seeds[i]) step = step_size * np.random.uniform() # Construct synthetic sample X_new[i] = X[row] - step * (X[row] - nn_data[nn_num[row, col]]) # The returned target vector is simply a repetition of the # minority label y_new = np.array([y_type] * len(X_new)) if self.verbose: print("Generated {} new samples ...".format(len(X_new))) return X_new, y_new def transform(self, X, y): """Resample the dataset. Parameters ---------- X : ndarray, shape (n_samples, n_features) Matrix containing the data which have to be sampled. y : ndarray, shape (n_samples, ) Corresponding label for each sample in X. Returns ------- X_resampled : ndarray, shape (n_samples_new, n_features) The array containing the resampled data. y_resampled : ndarray, shape (n_samples_new) The corresponding label of `X_resampled` """ # Check the consistency of X and y X, y = check_X_y(X, y) # Call the parent function super(SMOTE, self).transform(X, y) # Define the number of sample to create # We handle only two classes problem for the moment. if self.ratio_ == 'auto': num_samples = (self.stats_c_[self.maj_c_] - self.stats_c_[self.min_c_]) else: num_samples = ((self.ratio_ * self.stats_c_[self.maj_c_]) - self.stats_c_[self.min_c_]) # Start by separating minority class features and target values. X_min = X[y == self.min_c_] # If regular SMOTE is to be performed if self.kind == 'regular': # Print if verbose is true# if self.verbose: print('Finding the {} nearest neighbours...'.format(self.k)) # Look for k-th nearest neighbours, excluding, of course, the # point itself. self.nearest_neighbour_.fit(X_min) # Matrix with k-th nearest neighbours indexes for each minority # element. nns = self.nearest_neighbour_.kneighbors(X_min, return_distance=False)[:, 1:] # Print status if verbose is true if self.verbose: print("done!") print("Creating synthetic samples...", end="") # --- Generating synthetic samples # Use static method make_samples to generate minority samples X_new, y_new = self._make_samples(X_min, self.min_c_, X_min, nns, num_samples, 1.0) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) return X_resampled, y_resampled if self.kind == 'borderline1' or self.kind == 'borderline2': if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # Find the NNs for all samples in the data set. self.nearest_neighbour_.fit(X) if self.verbose: print("done!") # Boolean array with True for minority samples in danger danger_index = self._in_danger_noise(X_min, y, kind='danger') # If all minority samples are safe, return the original data set. if not any(danger_index): if self.verbose: print('There are no samples in danger. No borderline ' 'synthetic samples created.') # All are safe, nothing to be done here. return X, y # If we got here is because some samples are in danger, we need to # find the NNs among the minority class to create the new synthetic # samples. # # We start by changing the number of NNs to consider from m + 1 # to k + 1 self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(X_min) # nns...# nns = self.nearest_neighbour_.kneighbors(X_min[danger_index], return_distance=False)[:, 1:] # B1 and B2 types diverge here!!! if self.kind == 'borderline1': # Create synthetic samples for borderline points. X_new, y_new = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, num_samples) # Concatenate the newly generated samples to the original # dataset X_resampled = np.concatenate((X, X_new), axis=0) y_resampled = np.concatenate((y, y_new), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params( **{'n_neighbors': self.m + 1}) return X_resampled, y_resampled else: # Split the number of synthetic samples between only minority # (type 1), or minority and majority (with reduced step size) # (type 2). np.random.seed(self.rs_) # The fraction is sampled from a beta distribution centered # around 0.5 with variance ~0.01 fractions = betavariate(alpha=10, beta=10) # Only minority X_new_1, y_new_1 = self._make_samples(X_min[danger_index], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) # Only majority with smaller step size X_new_2, y_new_2 = self._make_samples(X_min[danger_index], self.min_c_, X[y != self.min_c_], nns, int((1 - fractions) * num_samples), step_size=0.5) # Concatenate the newly generated samples to the original # data set X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params( **{'n_neighbors': self.m + 1}) return X_resampled, y_resampled if self.kind == 'svm': # The SVM smote model fits a support vector machine # classifier to the data and uses the support vector to # provide a notion of boundary. Unlike regular smote, where # such notion relies on proportion of nearest neighbours # belonging to each class. # Fit SVM to the full data# self.svm_.fit(X, y) # Find the support vectors and their corresponding indexes support_index = self.svm_.support_[y[self.svm_.support_] == self.min_c_] support_vector = X[support_index] # First, find the nn of all the samples to identify samples # in danger and noisy ones if self.verbose: print("Finding the {} nearest neighbours...".format(self.m)) # As usual, fit a nearest neighbour model to the data self.nearest_neighbour_.fit(X) if self.verbose: print("done!") # Now, get rid of noisy support vectors noise_bool = self._in_danger_noise(support_vector, y, kind='noise') # Remove noisy support vectors support_vector = support_vector[np.logical_not(noise_bool)] danger_bool = self._in_danger_noise(support_vector, y, kind='danger') safety_bool = np.logical_not(danger_bool) if self.verbose: print("Out of {0} support vectors, {1} are noisy, " "{2} are in danger " "and {3} are safe.".format( support_vector.shape[0], noise_bool.sum().astype(int), danger_bool.sum().astype(int), safety_bool.sum().astype(int))) # Proceed to find support vectors NNs among the minority class print("Finding the {} nearest neighbours...".format(self.k)) self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1}) self.nearest_neighbour_.fit(X_min) if self.verbose: print("done!") print("Creating synthetic samples...", end="") # Split the number of synthetic samples between interpolation and # extrapolation # The fraction are sampled from a beta distribution with mean # 0.5 and variance 0.01# np.random.seed(self.rs_) fractions = betavariate(alpha=10, beta=10) # Interpolate samples in danger if np.count_nonzero(danger_bool) > 0: nns = self.nearest_neighbour_.kneighbors( support_vector[danger_bool], return_distance=False)[:, 1:] X_new_1, y_new_1 = self._make_samples( support_vector[danger_bool], self.min_c_, X_min, nns, int(fractions * (num_samples + 1)), step_size=1.) # Extrapolate safe samples if np.count_nonzero(safety_bool) > 0: nns = self.nearest_neighbour_.kneighbors( support_vector[safety_bool], return_distance=False)[:, 1:] X_new_2, y_new_2 = self._make_samples( support_vector[safety_bool], self.min_c_, X_min, nns, int((1 - fractions) * num_samples), step_size=-self.out_step) if self.verbose: print("done!") # Concatenate the newly generated samples to the original data set if (np.count_nonzero(danger_bool) > 0 and np.count_nonzero(safety_bool) > 0): X_resampled = np.concatenate((X, X_new_1, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_1, y_new_2), axis=0) # not any support vectors in danger elif np.count_nonzero(danger_bool) == 0: X_resampled = np.concatenate((X, X_new_2), axis=0) y_resampled = np.concatenate((y, y_new_2), axis=0) # All the support vector in danger elif np.count_nonzero(safety_bool) == 0: X_resampled = np.concatenate((X, X_new_1), axis=0) y_resampled = np.concatenate((y, y_new_1), axis=0) # Reset the k-neighbours to m+1 neighbours self.nearest_neighbour_.set_params(**{'n_neighbors': self.m + 1}) return X_resampled, y_resampled