Example #1
0
def resample(x_matrix, y_vector, sampler_type):
    """
    Resamples a dataset with imbalanced data so that the labels contained in
    the y_vector are distributed equally. This is done to prevent a classifier
    from being biased by the number of samples of a certain class.

    Every sampler is created with ``ratio='auto'``, ``random_state=0`` and
    ``verbose=False``, so repeated calls use the same seed whichever sampler
    is chosen.

    :param x_matrix: a numpy matrix with the independent variables
    :param y_vector: a numpy vector with the dependent variables
    :param sampler_type: the type of sampler that is going to be used to
    resample the data. One of ``'random_over_sampler'``, ``'smote_regular'``,
    ``'smote_bl1'``, ``'smote_bl2'``, ``'smote_tomek'``, ``'smoteenn'``, or
    ``None`` to skip resampling
    :return: a numpy matrix and a numpy vector with the data resampled using the
    selected sampler (the inputs themselves when sampler_type is None)
    :raises KeyError: if sampler_type is not None and not a known sampler name
    """

    if sampler_type is None:
        return x_matrix, y_vector

    # Settings shared by every sampler, the random over-sampler included, so
    # that no choice of sampler_type is left unseeded.
    shared_options = {'ratio': 'auto', 'random_state': 0, 'verbose': False}

    # name -> (sampler class, sampler-specific keyword arguments). Classes are
    # stored rather than instances so only the requested sampler gets built.
    sampler_specs = {
        'random_over_sampler': (RandomOverSampler, {}),
        'smote_regular': (SMOTE, {'kind': 'regular'}),
        'smote_bl1': (SMOTE, {'kind': 'borderline1'}),
        'smote_bl2': (SMOTE, {'kind': 'borderline2'}),
        'smote_tomek': (SMOTETomek, {}),
        'smoteenn': (SMOTEENN, {}),
    }

    sampler_class, specific_options = sampler_specs[sampler_type]
    sampler = sampler_class(**dict(shared_options, **specific_options))
    resampled_x, resampled_y = sampler.fit_transform(x_matrix, y_vector)

    return resampled_x, resampled_y
def test_smote_transform_wt_fit():
    """Check that calling transform on a SMOTE object that was never fitted
    raises a RuntimeError"""

    unfitted = SMOTE(random_state=RND_SEED)
    # fit() is deliberately skipped: transform is expected to refuse to run.
    transform = unfitted.transform
    assert_raises(RuntimeError, transform, X, Y)
Example #3
0
def test_smote(x, y):
    """Run each SMOTE variant once over (x, y), printing a banner before each.

    The resampled arrays are discarded and nothing is returned.
    """
    # (banner, SMOTE kind, extra keyword arguments), in execution order.
    runs = (
        ('SMOTE', 'regular', {}),
        ('SMOTE bordeline 1', 'borderline1', {}),
        ('SMOTE bordeline 2', 'borderline2', {}),
        ('SMOTE SVM', 'svm', {'class_weight': 'auto'}),
    )

    for banner, kind, extra in runs:
        print(banner)
        sampler = SMOTE(kind=kind, verbose=verbose, **extra)
        sampler.fit_transform(x, y)
Example #4
0
    def balance_data_oversampling(self, ratio = 2, balance_type = "OverSampler"):
        '''
        Resample the training data with the sampler named by balance_type.

        "OverSampler" selects OverSampler; 'SMOTE_borderline1', 'SMOTE_regular'
        and 'SMOTE_borderline2' select SMOTE of the matching kind. Any other
        value falls back to TomekLinks, which is built without the ratio.
        The pair returned by fit_transform replaces self.train_x and
        self.train_y.
        '''
        options = {'verbose': True, 'ratio': ratio}
        # Checked in this order, comparing with == exactly like an elif chain.
        smote_kinds = (
            ('SMOTE_borderline1', 'borderline1'),
            ('SMOTE_regular', 'regular'),
            ('SMOTE_borderline2', 'borderline2'),
        )

        if balance_type == "OverSampler":
            sampler = OverSampler(**options)
        else:
            kind = next(
                (k for label, k in smote_kinds if balance_type == label), None)
            if kind is None:
                sampler = TomekLinks(verbose=options['verbose'])
            else:
                sampler = SMOTE(kind=kind, **options)

        balanced = sampler.fit_transform(self.train_x, self.train_y)
        self.train_x, self.train_y = balanced
def test_smote_fit_single_class():
    """Check that fitting SMOTE on a target holding a single class raises a
    RuntimeError"""

    sampler = SMOTE(random_state=RND_SEED)
    # An all-zero target with one entry per row of X: only one class present.
    n_rows = X.shape[0]
    constant_target = np.zeros((n_rows, ))
    assert_raises(RuntimeError, sampler.fit, X, constant_target)
Example #6
0
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    """Resample (train_x, train_y) with the technique selected by sample_type.

    sample_type is compared against the module-level constants SMOTE_REG,
    SMOTE_SVM, SMOTE_BORDERLINE_1, SMOTE_BORDERLINE_2, SMOTE_ENN, SMOTE_TOMEK,
    UNDERSAMPLER, ADASYN_SAMPLER, TOMEK_LINKS, CLUSTER_CENTROIDS and NEARMISS.

    :param sample_type: identifier of the resampling technique
    :param train_x: training features, handed to the sampler unchanged
    :param train_y: training labels, handed to the sampler unchanged
    :param ratio: ratio forwarded to every sampler except TomekLinks
    :return: whatever the chosen sampler's fit_transform(train_x, train_y)
        returns; for an unrecognised sample_type a message is printed and
        (train_x, train_y) is returned untouched
    """
    verbose = True
    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm',
                        ratio=ratio,
                        verbose=verbose,
                        k=15,
                        **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio,
                               verbose=verbose,
                               replacement=False,
                               random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        # %-formatting instead of "+" so a non-string sample_type (None, an
        # int code, ...) is reported rather than raising TypeError before the
        # data can be returned. The one-element tuple keeps a tuple-valued
        # sample_type from being spread over the format string.
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("Unrecoqnized sample technique: %s" % (sample_type,))
        print("Returning original data")
        return train_x, train_y
    return sampler.fit_transform(train_x, train_y)
Example #7
0
    def __init__(self, use_cache=False):
        """Store the cache flag and pick the classifier and resampler to use.

        One instance of every supported classifier and sampler is built; the
        two named by Constants.DOCUMENT_CLASSIFIER and Constants.RESAMPLER
        are kept on the instance. A name missing from the tables below
        raises KeyError.

        :param use_cache: stored as-is on self.use_cache
        """
        self.use_cache = use_cache
        self.records = None
        self.dictionary = None

        resampler_name = Constants.RESAMPLER
        classifier_name = Constants.DOCUMENT_CLASSIFIER
        # Keyword settings shared by every sampler.
        sampler_options = {
            'random_state': Constants.DOCUMENT_CLASSIFIER_SEED,
            'verbose': False,
        }
        # Passed positionally as each sampler's first argument.
        balance_ratio = 'auto'

        available_classifiers = {
            'logistic_regression': LogisticRegression(C=100),
            'svc': SVC(),
            'kneighbors': KNeighborsClassifier(n_neighbors=10),
            'decision_tree': DecisionTreeClassifier(),
            'nu_svc': NuSVC(),
            'random_forest': RandomForestClassifier(n_estimators=100),
        }
        available_samplers = {
            'random_over_sampler':
                RandomOverSampler(balance_ratio, **sampler_options),
            'smote_regular':
                SMOTE(balance_ratio, kind='regular', **sampler_options),
            'smote_bl1':
                SMOTE(balance_ratio, kind='borderline1', **sampler_options),
            'smote_bl2':
                SMOTE(balance_ratio, kind='borderline2', **sampler_options),
            'smote_tomek':
                SMOTETomek(balance_ratio, **sampler_options),
            'smote-enn':
                SMOTEENN(balance_ratio, **sampler_options),
        }

        self.classifier = available_classifiers[classifier_name]
        self.resampler = available_samplers[resampler_name]
def test_smote_fit():
    """Check the statistics SMOTE records about the data when it is fitted"""

    sampler = SMOTE(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Minority / majority labels identified during fit.
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Number of samples counted for each label.
    class_counts = sampler.stats_c_
    assert_equal(class_counts[0], 500)
    assert_equal(class_counts[1], 4500)
def smote_oversampling(X, y):
    """
    Perform the SMOTE oversampling.

    Reads the module-level ``verbose`` flag: when it is truthy a progress
    line is printed, and the flag is also passed on to SMOTE.

    Keyword arguments:
    X -- The feature vectors
    y -- The target classes

    Returns the (features, targets) pair produced by SMOTE.fit_transform.
    """

    if verbose:
        # Single-argument print(...) prints the same text on Python 2 and is
        # valid syntax on Python 3, unlike the bare print statement.
        print('\nOversampling with SMOTE ...')
    over_sampler = SMOTE(verbose=verbose)
    X_over_sampled, y_over_sampled = over_sampler.fit_transform(X, y)
    return X_over_sampled, y_over_sampled
    def balance_data_oversampling_smote_regular(self):
        '''
        Replace self.X and self.y with the output of regular SMOTE.

        The target is given a one-dimensional shape before resampling and a
        (n, 1) shape afterwards.
        '''
        features, target = self.X, self.y
        # Assigning to .shape changes the existing object rather than making
        # a copy (assumes a numpy array -- TODO confirm).
        target.shape = len(target)

        sampler = SMOTE(kind='regular', verbose=True)
        resampled = sampler.fit_transform(features, target)

        self.X, self.y = resampled
        self.y.shape = (len(self.y), 1)
    def balance_data_oversampling_smote_borderline2(self):
        '''
        Replace self.X and self.y with the output of SMOTE borderline 2.

        The target is given a one-dimensional shape before resampling and a
        (n, 1) shape afterwards.
        '''
        features, target = self.X, self.y
        # Assigning to .shape changes the existing object rather than making
        # a copy (assumes a numpy array -- TODO confirm).
        target.shape = len(target)

        sampler = SMOTE(kind='borderline2', verbose=True)
        resampled = sampler.fit_transform(features, target)

        self.X, self.y = resampled
        self.y.shape = (len(self.y), 1)
    def balance_data_oversampling_smote_svm(self):
        '''
        Replace self.X and self.y with the output of SMOTE SVM.

        SMOTE is given class_weight='auto' as an extra keyword argument. The
        target is given a one-dimensional shape before resampling and a
        (n, 1) shape afterwards.
        '''
        features, target = self.X, self.y
        # Assigning to .shape changes the existing object rather than making
        # a copy (assumes a numpy array -- TODO confirm).
        target.shape = len(target)

        sampler = SMOTE(kind='svm', verbose=True, class_weight='auto')
        resampled = sampler.fit_transform(features, target)

        self.X, self.y = resampled
        self.y.shape = (len(self.y), 1)
def test_transform_regular():
    """Compare regular SMOTE output with the arrays stored next to this file."""

    sampler = SMOTE(random_state=RND_SEED, kind='regular')
    # Fitted once explicitly, then again as part of fit_transform.
    sampler.fit(X, Y)
    resampled = sampler.fit_transform(X, Y)
    X_resampled, y_resampled = resampled

    # Reference arrays live in the 'data' directory beside this module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'smote_reg_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'smote_reg_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
from unbalanced_dataset.over_sampling import SMOTE

# Generate a two-class dataset (5000 samples, 20 features) with class
# weights of 0.1 and 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE (kind 'borderline1') to the original, unprojected features
sm = SMOTE(kind='borderline1')
X_resampled, y_resampled = sm.fit_transform(X, y)
# Project the resampled points with the PCA already fitted on X
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class label
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data, class 0
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
Example #15
0
                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE (kind 'borderline2') to the original, unprojected features
sm = SMOTE(kind='borderline2')
X_resampled, y_resampled = sm.fit_transform(X, y)
# Project the resampled points with the PCA already fitted on X
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
Example #16
0
# Feature selection: fit an L1-penalised linear SVM on (X, y) and keep only
# the features SelectFromModel retains from the already-fitted model.
# NOTE(review): the print statements in this script are Python 2 syntax.
print 'number of features before: ', X.shape[1]
print 'feature selection via Linear SVM...'
lsvc = LinearSVC(C=100, penalty='l1', dual=False).fit(X, y) 
# according to the validation curve (not output here), C=10 gives the best result
# NOTE(review): the model above is fitted with C=100, not C=10 -- confirm
# which value is intended.
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print 'number of features after: ', X_new.shape[1]


# Use SMOTE to 'fix' the imbalanced problem:
# the python implementation of SMOTE comes from
# https://github.com/fmfn/UnbalancedDataset/tree/master/unbalanced_dataset 
# ratio = (number of samples labelled -1) / (number of samples labelled 1)
ratio = float(len([t for t in y if t==-1]))/float(len([t for t in y if t==1]))
# oversampler = OverSampler(ratio = ratio-1)
smote = SMOTE(k=3, ratio = ratio-1)  
# The sampler's attributes are set by hand instead of calling a fit method.
# Presumably these are the fields resample() reads in the library version
# linked above -- TODO confirm against that version.
smote.x = X_new
smote.y = y
# Label 1 is stored as minc and label -1 as maxc (presumably the minority
# and majority class labels -- confirm).
smote.minc = 1
smote.maxc = -1
# Sample count for each of the two labels.
smote.ucd ={1: len([tg for tg in y if tg==1]), -1: len([tg for tg in y if tg==-1])}
ret_X, ret_y = smote.resample()
# overX, overy = oversampler.resample()

# Shuffle samples and labels as pairs so they stay aligned, then write the
# shuffled values back into the existing containers.
# NOTE(review): random.shuffle needs a mutable sequence, so this relies on
# zip() returning a list (Python 2 behaviour).
combined = zip(ret_X, ret_y)
random.shuffle(combined)
ret_X[:], ret_y[:] = zip(*combined)

print 'shuffled??\n', ret_y
print 'training and predicting...'
# clf = SVC(kernel='linear', C=1, probability=True)
                           weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE (kind 'regular') to the original, unprojected features
sm = SMOTE(kind='regular')
X_resampled, y_resampled = sm.fit_transform(X, y)
# Project the resampled points with the PCA already fitted on X
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
Example #18
0
from unbalanced_dataset.over_sampling import SMOTE

# Generate a two-class dataset (5000 samples, 20 features) with class
# weights of 0.1 and 0.9
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply SMOTE (kind 'svm') to the original, unprojected features
sm = SMOTE(kind='svm')
X_resampled, y_resampled = sm.fit_transform(X, y)
# Project the resampled points with the PCA already fitted on X
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: the original data, one scatter call per class label
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the resampled data, class 0
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)