def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio):
    sampler = None
    verbose = True
    if sample_type == SMOTE_REG:
        sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15)
    elif sample_type == SMOTE_SVM:
        # TODO: Make this configurable?
        svm_args = {'class_weight': 'balanced'}
        sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15, **svm_args)
    elif sample_type == SMOTE_BORDERLINE_1:
        sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_BORDERLINE_2:
        sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose)
    elif sample_type == SMOTE_ENN:
        sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == SMOTE_TOMEK:
        sampler = SMOTETomek(ratio=ratio, verbose=verbose, k=15)
    elif sample_type == UNDERSAMPLER:
        sampler = UnderSampler(ratio=ratio, verbose=verbose, replacement=False,
                               random_state=17)
    elif sample_type == ADASYN_SAMPLER:
        sampler = ADASYN(k=15, imb_threshold=0.6, ratio=ratio)
    elif sample_type == TOMEK_LINKS:
        sampler = TomekLinks()
    elif sample_type == CLUSTER_CENTROIDS:
        sampler = ClusterCentroids(ratio=ratio)
    elif sample_type == NEARMISS:
        sampler = NearMiss(ratio=ratio)
    else:
        print "Unrecoqnized sample technique: " + sample_type
        print "Returning original data"
        return train_x, train_y
    return sampler.fit_transform(train_x, train_y)
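
The sampler-type constants and most imports are defined elsewhere in the original module. A minimal sketch of that setup, assuming string tags for the constants: the values below are illustrative, and apart from the SMOTE import (which appears in the later examples) the import paths are assumptions that depend on the installed sampler library.

# Hypothetical module-level setup for __get_sample_transformed_examples.
# The SMOTE import matches the one used in the later examples; the other
# samplers' import locations vary by library version and are assumptions.
from unbalanced_dataset.over_sampling import SMOTE

# Illustrative tag values; any unique strings work.
SMOTE_REG = 'smote_regular'
SMOTE_SVM = 'smote_svm'
SMOTE_BORDERLINE_1 = 'smote_borderline_1'
SMOTE_BORDERLINE_2 = 'smote_borderline_2'
SMOTE_ENN = 'smote_enn'
SMOTE_TOMEK = 'smote_tomek'
UNDERSAMPLER = 'random_undersampler'
ADASYN_SAMPLER = 'adasyn'
TOMEK_LINKS = 'tomek_links'
CLUSTER_CENTROIDS = 'cluster_centroids'
NEARMISS = 'nearmiss'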
Example #3
def grid_search_rf():
    rf_grid = {
        'max_depth': [4, 8, None],
        'max_features': ['sqrt', 'log2', None],
        'min_samples_split': [1, 2, 4],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True],  # Mandatory with oob_score=True
        'n_estimators': [50, 100, 200, 400],
        'random_state': [67],
        'oob_score': [True],
        'n_jobs': [-1]
    }

    rf_grid_cv = GridSearchCV(RandomForestClassifier(),
                                 rf_grid,
                                 n_jobs=-1,
                                 verbose=True,
                                 scoring='roc_auc')

    sm = SMOTE(kind='regular', ratio=0.4)
    X_resampled, y_resampled = sm.fit_transform(X, y)

    # Splitting train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3)

    rf_grid_cv.fit(X_train, y_train)

    print "Best Parameters found:\n", rf_grid_cv.best_params_

    best_model = rf_grid_cv.best_estimator_

    print "OOB:", best_model.oob_score_
Example #4
def test_smote(x, y):
    print('SMOTE')
    sm = SMOTE(kind='regular', verbose=verbose)
    svmx, svmy = sm.fit_transform(x, y)

    print('SMOTE borderline 1')
    sm = SMOTE(kind='borderline1', verbose=verbose)
    svmx, svmy = sm.fit_transform(x, y)

    print('SMOTE borderline 2')
    sm = SMOTE(kind='borderline2', verbose=verbose)
    svmx, svmy = sm.fit_transform(x, y)

    print('SMOTE SVM')
    svm_args = {'class_weight': 'auto'}
    sm = SMOTE(kind='svm', verbose=verbose, **svm_args)
    svmx, svmy = sm.fit_transform(x, y)
def smote_oversampling(X, y):
    """
    Perform the SMOTE oversampling

    Keyword arguments:
    X -- The feature vectors
    y -- The target classes
    """

    if verbose:
        print '\nOversampling with SMOTE ...'
    over_sampler = SMOTE(verbose=verbose)
    X_over_sampled, y_over_sampled = over_sampler.fit_transform(X, y)
    return X_over_sampled, y_over_sampled
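
A quick usage sketch for smote_oversampling, assuming a module-level verbose flag (which the function reads but never defines) and a synthetic imbalanced dataset:

from sklearn.datasets import make_classification

verbose = True  # module-level flag assumed by smote_oversampling

# Build a 10%/90% imbalanced toy problem and rebalance it.
X, y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=2000, random_state=0)
X_bal, y_bal = smote_oversampling(X, y)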
Example #7
def train(train, test):
    smote = SMOTE(kind='regular', verbose=False)
    train_matrix, train_labels = smote.fit_transform(train.drop('label', 1), train.label)

    if algorithm == 'random-forest':
        clf = RandomForestClassifier(n_estimators=5, n_jobs=3, criterion='entropy')
    elif algorithm == 'adaboost':
        clf = AdaBoostClassifier()

    # clf = SVC(class_weight="balanced", probability=True, verbose=False)

    clf.fit(train_matrix, train_labels)

    return [testMeanDiff(clf, test), clf.score(test.drop('label', 1), test.label), clf]
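
train expects a module-level algorithm switch, DataFrames carrying a 'label' column, and a testMeanDiff helper defined elsewhere in the project; a hypothetical call site (the names and data layout are assumptions):

# Hypothetical driver for train(); `algorithm` is a module-level setting
# and train_df / test_df are assumed DataFrames with a 'label' column.
algorithm = 'random-forest'
mean_diff, test_accuracy, model = train(train_df, test_df)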
    def balance_data_oversampling_smote_borderline2(self):
        '''
        Balance data using SMOTE borderline 2.
        '''
        x = self.X
        y = self.y
        y.shape = (len(self.y),)
        verbose = True

        sm = SMOTE(kind='borderline2', verbose=verbose)
        svmx, svmy = sm.fit_transform(x, y)

        self.X = svmx
        self.y = svmy
        self.y.shape = (len(self.y), 1)
    def balance_data_oversampling_smote_regular(self):
        '''
        Balance data using SMOTE regular.
        '''
        x = self.X
        y = self.y
        y.shape = (len(self.y),)
        verbose = True

        sm = SMOTE(kind='regular', verbose=verbose)
        svmx, svmy = sm.fit_transform(x, y)

        self.X = svmx
        self.y = svmy
        self.y.shape = (len(self.y), 1)
def test_transform_regular():
    """Test transform function with regular SMOTE."""

    # Create the object
    kind = 'regular'
    smote = SMOTE(random_state=RND_SEED, kind=kind)
    # Fit the data
    smote.fit(X, Y)

    X_resampled, y_resampled = smote.fit_transform(X, Y)

    currdir = os.path.dirname(os.path.abspath(__file__))
    X_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_x.npy'))
    y_gt = np.load(os.path.join(currdir, 'data', 'smote_reg_y.npy'))
    assert_array_equal(X_resampled, X_gt)
    assert_array_equal(y_resampled, y_gt)
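
The test relies on module-level X, Y and RND_SEED fixtures whose exact values must match the stored smote_reg_*.npy ground-truth arrays; a sketch of what such a fixture block typically looks like (the values here are placeholders, not the originals):

import os
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import make_classification

RND_SEED = 0  # placeholder seed; the real one must match the .npy files
X, Y = make_classification(n_classes=2, weights=[0.1, 0.9],
                           n_samples=500, random_state=RND_SEED)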
    def balance_data_oversampling_smote_svm(self):
        '''
        Balance data using SMOTE SVM.
        '''
        x = self.X
        y = self.y
        y.shape = (len(self.y),)
        verbose = True

        svm_args = {'class_weight': 'auto'}
        sm = SMOTE(kind='svm', verbose=verbose, **svm_args)
        svmx, svmy = sm.fit_transform(x, y)

        self.X = svmx
        self.y = svmy
        self.y.shape = (len(self.y), 1)
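
The three balance_data_oversampling_* methods above evidently live on a class that stores the feature matrix and labels on self; a minimal hypothetical container they would fit into:

class BalancedDataset(object):
    """Hypothetical holder for the balance_data_oversampling_* methods."""

    def __init__(self, X, y):
        # X: 2-D feature array; y: label array reshaped by the methods above
        self.X = X
        self.y = y

    # balance_data_oversampling_smote_regular, ..._smote_borderline2 and
    # ..._smote_svm from the snippets above would be defined here.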
Example #14
def fit_random_forest(X, y):
    sm = SMOTE(kind='regular', ratio=0.5)
    X_resampled, y_resampled = sm.fit_transform(X, y)

    # Splitting train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3)

    rf = RandomForestClassifier(oob_score=True, n_jobs=-1, bootstrap=True, min_samples_leaf=2,
                                n_estimators=400, min_samples_split=1, random_state=67,
                                max_features=None, max_depth=None)
    rf.fit(X_train, y_train)

    # Draw a confusion matrix for the results
    y_predict = rf.predict(X_test)
    y_proba = rf.predict_proba(X_test)
    cm = standard_confusion_matrix(y_test, y_predict)

    print "\nRandom Forest Scores:\n"
    print "accuracy:", rf.score(X_test, y_test)
    print "precision:", precision_score(y_test, y_predict)
    print "recall:", recall_score(y_test, y_predict)

    # roc_curve expects (y_true, y_score); plot positive-class probabilities
    fpr, tpr, thresh = roc_curve(y_test, y_proba[:, 1])
    plt.plot(fpr, tpr)
    plt.show()

    fix, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='', square=True,
                xticklabels=['1', '0'],
                yticklabels=['1', '0'])
    plt.show()

    cols = list(df.columns)

    print "\nFeature Importance: \n"
    for name, importance in izip(cols, rf.feature_importances_):
        print round(importance,4), '\t\t', name

    plot_importance(rf, merged_df, max_features=16)

    return rf
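
standard_confusion_matrix and plot_importance are project helpers that are not shown here (as are the df and merged_df globals). One plausible reading of the former, an assumption rather than the original code, reorders scikit-learn's confusion matrix so the positive class occupies the first row and column, matching the ['1', '0'] tick labels above:

import numpy as np
from sklearn.metrics import confusion_matrix

def standard_confusion_matrix(y_true, y_pred):
    # Hypothetical helper: sklearn returns [[tn, fp], [fn, tp]];
    # flip it to [[tp, fp], [fn, tn]] so label 1 comes first.
    [[tn, fp], [fn, tp]] = confusion_matrix(y_true, y_pred)
    return np.array([[tp, fp], [fn, tn]])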
Example #16
import pandas as pd
import sklearn
import scipy
import numpy
from unbalanced_dataset.over_sampling import SMOTE

train = pd.DataFrame.from_csv('train.csv')
features = ['var38',
            'var15',
            'saldo_var30',
            'saldo_medio_var5_hace2',
            'saldo_medio_var5_hace3',
            'num_var22_ult1',
            'num_var22_ult3',
            'num_var45_hace3',
            'saldo_medio_var5_ult3',
            'num_var22_hace3']
X = train[features]
Y = train['TARGET']

sm = SMOTE(kind='regular', verbose=True)
svmx, svmy = sm.fit_transform(X, Y)
print len(svmx)
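
To see what the oversampling did on this Santander-style dataset, one can compare class counts before and after resampling; a small sketch, assuming Y is a 0/1 target column:

from collections import Counter

# Class distribution before and after SMOTE; svmy is the resampled target.
print(Counter(Y))
print(Counter(svmy))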
Example #17

from unbalanced_dataset.over_sampling import SMOTE

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3,
                           n_redundant=1,
                           flip_y=0,
                           n_features=20,
                           n_clusters_per_class=1,
                           n_samples=5000,
                           random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply regular SMOTE over-sampling
sm = SMOTE(kind='regular')
X_resampled, y_resampled = sm.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0],
            X_vis[y == 0, 1],
            label="Class #0",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[0],
            linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0],
            X_vis[y == 1, 1],
            label="Class #1",
            alpha=0.5,
            edgecolor=almost_black,
            facecolor=palette[2],
            linewidth=0.15)
Example #18
def fit_logistic_regression(X, y):

    sm = SMOTE(kind='regular')
    X_resampled, y_resampled = sm.fit_transform(X, y)

    # Splitting train and test data
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3)

    # Fitting regression and getting its scores
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    predict_log = log_reg.predict(X_test)
    print "\nLogistic Regression Scores:\n"
    print "Accuracy on test set:", log_reg.score(X_test, y_test)
    print "Precision:", precision_score(y_test, predict_log)
    print "Recall:", recall_score(y_test, predict_log)

    # Fitting multiple k-fold cross validations and getting mean scores
    kfold = KFold(len(y))

    accuracies = []
    precisions = []
    recalls = []

    for train_index, test_index in kfold:
        model = LogisticRegression()
        model.fit(X[train_index], y[train_index])
        y_predict = model.predict(X[test_index])
        y_true = y[test_index]
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))

    print "\nK-Fold Cross Validation on Logistic Regression Scores:\n"
    print "accuracy:", np.average(accuracies)
    print "precision:", np.average(precisions)
    print "recall:", np.average(recalls)

    cols = list(df.columns)

    print
    print "Beta scores:"
    for name, coef in izip(df.columns, model.coef_[0]):
        print "%s: %.4f" % (name, coef)

    y_predict = log_reg.predict(X_test)
    y_proba = log_reg.predict_proba(X_test)
    cm = standard_confusion_matrix(y_test, y_predict)

    # roc_curve expects (y_true, y_score); plot positive-class probabilities
    fpr, tpr, thresh = roc_curve(y_test, y_proba[:, 1])
    plt.plot(fpr, tpr)
    plt.show()

    fix, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='', square=True,
                xticklabels=['1', '0'],
                yticklabels=['1', '0'])
    plt.show()

    print
    print "Likelihoods:"
    for i, coef in enumerate(log_reg.coef_[0]):
        # print "beta %s: %.5f" % (cols[i], exp(coef))
        if coef < 0:
            print "*Increasing the %s by 1 point decreases the chance of label=1 by a factor of %.4f.*" % (cols[i], exp(coef))
        else:
            print "*Increasing the %s by 1 point increases the chance of label=1 by a factor of %.4f.*" % (cols[i], exp(coef))
        print

    print "To double:"
    for i, coef in enumerate(model.coef_[0]):
#     print "beta %s: %.5f" % (cols[i], log(2) / coef)
        if coef < 0:
            print "*Decreasing the %s score by %d points doubles the chance of label=1.*" % (cols[i], log(2) / coef)
        else:
            print "*Increasing the %s score by %d points doubles the chance of label=1.*" % (cols[i], log(2) / coef)
        print
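
fit_logistic_regression also depends on module-level imports and a global df; an assumed header for it is sketched below. The KFold(n) iteration as written matches the old sklearn.cross_validation API; newer scikit-learn versions split via KFold().split(X) instead.

# Assumed imports/globals for fit_logistic_regression (Python 2 era).
from itertools import izip
from math import exp, log
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import (accuracy_score, precision_score,
                             recall_score, roc_curve)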
from unbalanced_dataset.over_sampling import SMOTE

# Generate the dataset
X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9],
                           n_informative=3, n_redundant=1, flip_y=0,
                           n_features=20, n_clusters_per_class=1,
                           n_samples=5000, random_state=10)

# Instantiate a PCA object for the sake of easy visualisation
pca = PCA(n_components=2)
# Fit and transform X to visualise inside a 2D feature space
X_vis = pca.fit_transform(X)

# Apply borderline-2 SMOTE over-sampling
sm = SMOTE(kind='borderline2')
X_resampled, y_resampled = sm.fit_transform(X, y)
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],