def resample(x_matrix, y_vector, sampler_type):
    """
    Resample an imbalanced dataset so that the labels contained in
    ``y_vector`` are distributed equally.

    This is done to prevent a classifier from being biased by the number of
    samples of a certain class.

    :param x_matrix: a numpy matrix with the independent variables
    :param y_vector: a numpy vector with the dependent variables
    :param sampler_type: the name of the sampler used to resample the data:
        ``'random_over_sampler'``, ``'smote_regular'``, ``'smote_bl1'``,
        ``'smote_bl2'``, ``'smote_tomek'`` or ``'smoteenn'``; ``None`` leaves
        the data untouched
    :return: a numpy matrix and a numpy vector with the data resampled using
        the selected sampler (the inputs themselves when ``sampler_type`` is
        ``None``)
    :raises KeyError: if ``sampler_type`` is neither ``None`` nor one of the
        names listed above
    """
    if sampler_type is None:
        return x_matrix, y_vector

    # Settings shared by every sampler. The fixed seed is applied to all of
    # them, including the random over-sampler, so repeated calls with the same
    # input give the same output.
    common = {'ratio': 'auto', 'random_state': 0, 'verbose': False}

    # Factories rather than instances: only the requested sampler is built,
    # and an unknown name fails on the lookup before anything is constructed.
    sampler_factories = {
        'random_over_sampler': lambda: RandomOverSampler(**common),
        'smote_regular': lambda: SMOTE(kind='regular', **common),
        'smote_bl1': lambda: SMOTE(kind='borderline1', **common),
        'smote_bl2': lambda: SMOTE(kind='borderline2', **common),
        'smote_tomek': lambda: SMOTETomek(**common),
        'smoteenn': lambda: SMOTEENN(**common),
    }

    sampler = sampler_factories[sampler_type]()
    resampled_x, resampled_y = sampler.fit_transform(x_matrix, y_vector)

    return resampled_x, resampled_y
def test_smote_transform_wt_fit():
    """Check that calling transform on an unfitted SMOTE raises RuntimeError."""
    unfitted_sampler = SMOTE(random_state=RND_SEED)

    # No fit() call on purpose: transform must refuse to run.
    with assert_raises(RuntimeError):
        unfitted_sampler.transform(X, Y)
def test_smote(x, y):
    """Run each SMOTE variant once on ``(x, y)``, announcing it first.

    The resampled data is not inspected or returned; the function only
    exercises ``fit_transform`` for every ``kind``.
    """
    # (banner printed before the run, SMOTE kind, extra constructor kwargs)
    variants = (
        ('SMOTE', 'regular', {}),
        ('SMOTE bordeline 1', 'borderline1', {}),
        ('SMOTE bordeline 2', 'borderline2', {}),
        ('SMOTE SVM', 'svm', {'class_weight': 'auto'}),
    )

    for banner, kind, extra_kwargs in variants:
        print(banner)
        sampler = SMOTE(kind=kind, verbose=verbose, **extra_kwargs)
        # Unpacked only to mirror the two-value result; both are discarded.
        resampled_x, resampled_y = sampler.fit_transform(x, y)
def balance_data_oversampling(self, ratio=2, balance_type="OverSampler"):
    """
    Rebalance ``self.train_x`` / ``self.train_y`` in place.

    ``balance_type`` selects the sampler: ``"OverSampler"``,
    ``'SMOTE_borderline1'``, ``'SMOTE_regular'`` or ``'SMOTE_borderline2'``.
    Any other value falls back to ``TomekLinks``, which is not given
    ``ratio``.
    """
    verbose = True

    # balance_type name -> SMOTE ``kind``; matched with ``==`` like the
    # other comparison below.
    smote_variants = (
        ('SMOTE_borderline1', 'borderline1'),
        ('SMOTE_regular', 'regular'),
        ('SMOTE_borderline2', 'borderline2'),
    )

    if balance_type == "OverSampler":
        balancer = OverSampler(verbose=verbose, ratio=ratio)
    else:
        matching_kinds = [kind for name, kind in smote_variants
                          if balance_type == name]
        if matching_kinds:
            balancer = SMOTE(kind=matching_kinds[0], verbose=verbose,
                             ratio=ratio)
        else:
            balancer = TomekLinks(verbose=verbose)

    self.train_x, self.train_y = balancer.fit_transform(self.train_x,
                                                        self.train_y)
def test_smote_fit_single_class():
    """Check that fitting on a target with a single class raises RuntimeError."""
    sampler = SMOTE(random_state=RND_SEED)

    # Deliberately invalid target: all zeros, one entry per row of X.
    single_class_target = np.zeros(X.shape[0])

    with assert_raises(RuntimeError):
        sampler.fit(X, single_class_target)
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    """
    Resample the training data with the technique selected by ``sample_type``.

    ``sample_type`` is compared against the module-level technique constants
    (``SMOTE_REG``, ``SMOTE_SVM``, ``SMOTE_BORDERLINE_1``,
    ``SMOTE_BORDERLINE_2``, ``SMOTE_ENN``, ``SMOTE_TOMEK``, ``UNDERSAMPLER``,
    ``ADASYN_SAMPLER``, ``TOMEK_LINKS``, ``CLUSTER_CENTROIDS``, ``NEARMISS``).
    If it matches none of them, a notice is printed and the data is returned
    unchanged.

    :param sample_type: identifier of the sampling technique to apply
    :param train_x: training features handed to the sampler
    :param train_y: training labels handed to the sampler
    :param ratio: forwarded as ``ratio`` to every sampler except TomekLinks
    :return: whatever the chosen sampler's ``fit_transform(train_x, train_y)``
        returns, or the tuple ``(train_x, train_y)`` when ``sample_type`` is
        not recognised
    """
    verbose = True

    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15,
                        **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio, verbose=verbose,
                               replacement=False, random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        # %-formatting instead of ``+``: a non-string sample_type (e.g. None)
        # must reach the "return original data" path rather than die with a
        # TypeError while building the message. Single-argument print(...)
        # gives the same output as a Python 2 print statement and as the
        # Python 3 print function.
        print("Unrecoqnized sample technique: %s" % (sample_type,))
        print("Returning original data")
        return train_x, train_y

    return sampler.fit_transform(train_x, train_y)
def __init__(self, use_cache=False):
    """
    Set up the instance with the classifier and resampler named in
    ``Constants``.

    ``Constants.DOCUMENT_CLASSIFIER`` and ``Constants.RESAMPLER`` are used as
    keys into the tables built below, so a name missing from a table raises
    ``KeyError``. ``Constants.DOCUMENT_CLASSIFIER_SEED`` seeds every sampler.

    :param use_cache: stored as ``self.use_cache``; not used further here
    """
    self.use_cache = use_cache
    self.records = None
    self.dictionary = None

    resampler_name = Constants.RESAMPLER
    classifier_name = Constants.DOCUMENT_CLASSIFIER
    seed = Constants.DOCUMENT_CLASSIFIER_SEED

    available_classifiers = {
        'logistic_regression': LogisticRegression(C=100),
        'svc': SVC(),
        'kneighbors': KNeighborsClassifier(n_neighbors=10),
        'decision_tree': DecisionTreeClassifier(),
        'nu_svc': NuSVC(),
        'random_forest': RandomForestClassifier(n_estimators=100),
    }

    # Every sampler takes the ratio as its first positional argument plus the
    # same seed and verbosity.
    sampling_ratio = 'auto'
    sampler_options = {'random_state': seed, 'verbose': False}
    available_samplers = {
        'random_over_sampler': RandomOverSampler(
            sampling_ratio, **sampler_options),
        'smote_regular': SMOTE(
            sampling_ratio, kind='regular', **sampler_options),
        'smote_bl1': SMOTE(
            sampling_ratio, kind='borderline1', **sampler_options),
        'smote_bl2': SMOTE(
            sampling_ratio, kind='borderline2', **sampler_options),
        'smote_tomek': SMOTETomek(sampling_ratio, **sampler_options),
        'smote-enn': SMOTEENN(sampling_ratio, **sampler_options),
    }

    self.classifier = available_classifiers[classifier_name]
    self.resampler = available_samplers[resampler_name]
def test_smote_fit():
    """Check the statistics SMOTE records about the data when it is fitted."""
    sampler = SMOTE(random_state=RND_SEED)
    sampler.fit(X, Y)

    # min_c_ / maj_c_ are presumably the minority / majority class labels.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-label entries of stats_c_ (presumably sample counts per class).
    for label, expected in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected)
def smote_oversampling(X, y):
    """
    Over-sample the data with SMOTE using its default settings.

    Keyword arguments:
    X -- The feature vectors
    y -- The target classes

    Returns the over-sampled feature vectors and target classes as a pair.
    The module-level ``verbose`` flag controls both the progress message
    printed here and the verbosity passed to SMOTE.
    """
    if verbose:
        print('\nOversampling with SMOTE ...')

    sampler = SMOTE(verbose=verbose)
    X_balanced, y_balanced = sampler.fit_transform(X, y)

    return X_balanced, y_balanced
def balance_data_oversampling_smote_regular(self):
    """
    Replace ``self.X`` / ``self.y`` with their regular-SMOTE resampling.

    ``self.y`` is flattened in place before resampling, and the resampled
    labels are stored back with shape ``(n, 1)``.
    """
    features = self.X
    labels = self.y
    # In-place reshape (no copy): SMOTE is handed a 1-D label vector.
    labels.shape = len(labels)

    sampler = SMOTE(kind='regular', verbose=True)
    self.X, self.y = sampler.fit_transform(features, labels)

    # Store the labels back as a single column.
    self.y.shape = (len(self.y), 1)
def balance_data_oversampling_smote_borderline2(self):
    """
    Replace ``self.X`` / ``self.y`` with their borderline-2 SMOTE resampling.

    ``self.y`` is flattened in place before resampling, and the resampled
    labels are stored back with shape ``(n, 1)``.
    """
    features = self.X
    labels = self.y
    # In-place reshape (no copy): SMOTE is handed a 1-D label vector.
    labels.shape = len(labels)

    sampler = SMOTE(kind='borderline2', verbose=True)
    self.X, self.y = sampler.fit_transform(features, labels)

    # Store the labels back as a single column.
    self.y.shape = (len(self.y), 1)
def balance_data_oversampling_smote_svm(self):
    """
    Replace ``self.X`` / ``self.y`` with their SVM-SMOTE resampling.

    ``self.y`` is flattened in place before resampling, and the resampled
    labels are stored back with shape ``(n, 1)``. ``class_weight='auto'`` is
    passed to SMOTE as an extra keyword argument.
    """
    features = self.X
    labels = self.y
    # In-place reshape (no copy): SMOTE is handed a 1-D label vector.
    labels.shape = len(labels)

    sampler = SMOTE(kind='svm', verbose=True, class_weight='auto')
    self.X, self.y = sampler.fit_transform(features, labels)

    # Store the labels back as a single column.
    self.y.shape = (len(self.y), 1)
def test_transform_regular():
    """Compare regular SMOTE output with the reference arrays stored on disk."""
    sampler = SMOTE(kind='regular', random_state=RND_SEED)

    # The sampler is fitted explicitly and then run through fit_transform.
    sampler.fit(X, Y)
    X_resampled, y_resampled = sampler.fit_transform(X, Y)

    # Reference arrays live in the ``data`` directory next to this file.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            'data')
    expected_X = np.load(os.path.join(data_dir, 'smote_reg_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'smote_reg_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
from unbalanced_dataset.over_sampling import SMOTE

# Synthetic two-class dataset (class weights 0.1 / 0.9) with a fixed seed
X, y = make_classification(n_samples=5000, n_features=20, n_informative=3,
                           n_redundant=1, n_classes=2,
                           n_clusters_per_class=1, weights=[0.1, 0.9],
                           class_sep=2, flip_y=0, random_state=10)

# PCA down to two components, used only so the data can be drawn in 2D
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Over-sample with borderline-1 SMOTE, then project the resampled points
# with the PCA already fitted on the original data
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# One figure with two side-by-side axes
f, (ax1, ax2) = plt.subplots(1, 2)

# Drawing options shared by the scatter calls below
_marker_style = dict(alpha=0.5, edgecolor=almost_black, linewidth=0.15)

# Left panel: the original data, one scatter per class
_orig_class0 = y == 0
_orig_class1 = y == 1
ax1.scatter(X_vis[_orig_class0, 0], X_vis[_orig_class0, 1],
            label="Class #0", facecolor=palette[0], **_marker_style)
ax1.scatter(X_vis[_orig_class1, 0], X_vis[_orig_class1, 1],
            label="Class #1", facecolor=palette[2], **_marker_style)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled data
_res_class0 = y_resampled == 0
ax2.scatter(X_res_vis[_res_class0, 0], X_res_vis[_res_class0, 1],
            label="Class #0", facecolor=palette[0], **_marker_style)
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling sm = SMOTE(kind='borderline2') X_resampled, y_resampled = sm.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
# Script fragment (Python 2): L1-penalised LinearSVC feature selection on
# (X, y), SMOTE over-sampling of the reduced features, then a joint shuffle of
# the resampled rows and labels. X, y and the imported names are defined
# outside this fragment.
print 'number of features before: ', X.shape[1]
print 'feature selection via Linear SVM...'
# according the validation curve (not output here), C=10 gives the best result
# NOTE(review): the call below uses C=100 while the comment above says C=10 --
# confirm which value is intended.
lsvc = LinearSVC(C=100, penalty='l1', dual=False).fit(X, y)
# prefit=True: the selector wraps the already-fitted lsvc
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print 'number of features after: ', X_new.shape[1]
# Use SMOTE to 'fix' the imbalanced problem:
# the python implementation of SMOTE comes from
# https://github.com/fmfn/UnbalancedDataset/tree/master/unbalanced_dataset
# ratio = (number of -1 labels) / (number of +1 labels)
ratio = float(len([t for t in y if t==-1]))/float(len([t for t in y if t==1]))
# oversampler = OverSampler(ratio = ratio-1)
smote = SMOTE(k=3, ratio = ratio-1)
# The sampler's attributes are filled in by hand and resample() is called
# directly; no fit() call appears in this fragment.
# NOTE(review): presumably x/y are the data, minc/maxc the minority/majority
# labels and ucd the per-label counts -- verify against the SMOTE version used.
smote.x = X_new
smote.y = y
smote.minc = 1
smote.maxc = -1
smote.ucd ={1: len([tg for tg in y if tg==1]), -1: len([tg for tg in y if tg==-1])}
ret_X, ret_y = smote.resample()
# overX, overy = oversampler.resample()
# Shuffle rows and labels together. This relies on Python 2, where zip()
# returns a list that random.shuffle can reorder in place.
combined = zip(ret_X, ret_y)
random.shuffle(combined)
# Write the shuffled pairs back into the existing containers
ret_X[:], ret_y[:] = zip(*combined)
print 'shuffled??\n', ret_y
print 'training and predicting...'
# clf = SVC(kernel='linear', C=1, probability=True)
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling sm = SMOTE(kind='regular') X_resampled, y_resampled = sm.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
from unbalanced_dataset.over_sampling import SMOTE

# Synthetic two-class dataset (class weights 0.1 / 0.9) with a fixed seed
X, y = make_classification(n_samples=5000, n_features=20, n_informative=3,
                           n_redundant=1, n_classes=2,
                           n_clusters_per_class=1, weights=[0.1, 0.9],
                           class_sep=2, flip_y=0, random_state=10)

# PCA down to two components, used only so the data can be drawn in 2D
pca = PCA(n_components=2)
X_vis = pca.fit_transform(X)

# Over-sample with SVM SMOTE, then project the resampled points with the
# PCA already fitted on the original data
sm = SMOTE(kind='svm')
X_resampled, y_resampled = sm.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# One figure with two side-by-side axes
f, (ax1, ax2) = plt.subplots(1, 2)

# Drawing options shared by the scatter calls below
_marker_style = dict(alpha=0.5, edgecolor=almost_black, linewidth=0.15)

# Left panel: the original data, one scatter per class
_orig_class0 = y == 0
_orig_class1 = y == 1
ax1.scatter(X_vis[_orig_class0, 0], X_vis[_orig_class0, 1],
            label="Class #0", facecolor=palette[0], **_marker_style)
ax1.scatter(X_vis[_orig_class1, 0], X_vis[_orig_class1, 1],
            label="Class #1", facecolor=palette[2], **_marker_style)
ax1.set_title('Original set')

# Right panel: class 0 of the resampled data
_res_class0 = y_resampled == 0
ax2.scatter(X_res_vis[_res_class0, 0], X_res_vis[_res_class0, 1],
            label="Class #0", facecolor=palette[0], **_marker_style)