def grid_search_rf(): rf_grid = { 'max_depth': [4, 8, None], 'max_features': ['sqrt', 'log2', None], 'min_samples_split': [1, 2, 4], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True], # Mandatory with oob_score=True 'n_estimators': [50, 100, 200, 400], 'random_state': [67], 'oob_score': [True], 'n_jobs': [-1] } rf_grid_cv = GridSearchCV(RandomForestClassifier(), rf_grid, n_jobs=-1, verbose=True, scoring='roc_auc') sm = SMOTE(kind='regular', ratio=0.4) X_resampled, y_resampled = sm.fit_transform(X, y) # Splitting train and test data X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3) rf_grid_cv.fit(X_train, y_train) print "Best Parameters found:\n", rf_grid_cv.best_params_ best_model = rf_grid_cv.best_estimator_ print "OOB:", best_model.oob_score_
def test_smote_fit():
    """Check the attributes that ``SMOTE.fit`` populates."""
    sampler = SMOTE(random_state=RND_SEED)
    sampler.fit(X, Y)

    # Class labels recorded by the fit
    assert_equal(sampler.min_c_, 0)
    assert_equal(sampler.maj_c_, 1)

    # Per-label counts recorded by the fit
    for label, expected_count in ((0, 500), (1, 4500)):
        assert_equal(sampler.stats_c_[label], expected_count)
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio): sampler = None verbose = True if sample_type == SMOTE_REG: sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15) elif sample_type == SMOTE_SVM: # TODO: Make this configurable? svm_args = {'class_weight' : 'balanced'} sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15, **svm_args) elif sample_type == SMOTE_BORDERLINE_1: sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose) elif sample_type == SMOTE_BORDERLINE_2: sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose) elif sample_type == SMOTE_ENN: sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15) elif sample_type == SMOTE_TOMEK: sampler = SMOTETomek(ratio=ratio,verbose=verbose, k=15) elif sample_type == UNDERSAMPLER: sampler = UnderSampler(ratio=ratio, verbose=verbose, replacement=False, random_state=17) else: print "Unrecoqnized sample technique: " + sample_type print "Returning original data" return train_x, train_y return sampler.fit_transform(train_x, train_y)
def train(train, test) :
    """Fit the classifier selected by the module-level ``algorithm``.

    The training frame is oversampled with regular SMOTE before fitting;
    the ``label`` column is the target and all other columns are features.

    Parameters:
        train -- training data with a ``label`` column
        test  -- evaluation data with a ``label`` column

    Returns:
        A list ``[testMeanDiff(clf, test), accuracy_on_test, clf]``.

    Raises:
        ValueError -- if ``algorithm`` is neither 'random-forest' nor
                      'adaboost'.
    """
    # Pick the classifier first so an unsupported setting fails immediately
    # with a clear message instead of an unbound-variable error after the
    # resampling work has already been done.
    if algorithm == 'random-forest':
        clf = RandomForestClassifier(n_estimators=5, n_jobs=3, criterion='entropy')
    elif algorithm == 'adaboost':
        clf = AdaBoostClassifier()
    else:
        raise ValueError("Unsupported algorithm: %r" % (algorithm,))
    # clf = SVC(class_weight="balanced", probability=True, verbose=False)

    smote = SMOTE(kind='regular', verbose=False)
    train_matrix, train_labels = smote.fit_transform(train.drop('label', 1), train.label)

    clf.fit(train_matrix, train_labels)
    return [testMeanDiff(clf, test), clf.score(test.drop('label', 1), test.label), clf]
def smote_oversampling(X,y): """ Perform the SMOTE oversampling Keyword arguments: X -- The feature vectors y -- The target classes """ if verbose: print '\nOversampling with SMOTE ...' over_sampler=SMOTE(verbose=verbose) X_over_sampled,y_over_sampled = over_sampler.fit_transform(X,y) return X_over_sampled,y_over_sampled
def test_transform_regular():
    """Compare regular-SMOTE output against the stored reference arrays."""
    smote = SMOTE(random_state=RND_SEED, kind='regular')

    # Fit once, then fit and resample in a single call
    smote.fit(X, Y)
    X_resampled, y_resampled = smote.fit_transform(X, Y)

    # Reference arrays live in the "data" directory next to this file
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'smote_reg_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'smote_reg_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def fit_random_forest(X, y): sm = SMOTE(kind='regular', ratio=0.5) X_resampled, y_resampled = sm.fit_transform(X, y) # Splitting train and test data X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3) rf = RandomForestClassifier(oob_score=True, n_jobs=-1, bootstrap=True, min_samples_leaf=2, n_estimators=400, min_samples_split=1, random_state=67, max_features=None, max_depth=None) rf.fit(X_train, y_train) # Draw a confusion matrix for the results y_predict = rf.predict(X_test) y_proba = rf.predict_proba(X_test) cm = standard_confusion_matrix(y_test, y_predict) print "\nRandom Forest Scores:\n" print "accuracy:", rf.score(X_test, y_test) print "precision:", precision_score(y_test, y_predict) print "recall:", recall_score(y_test, y_predict) tpr, fpr, thres = roc_curve(y_proba[:,0:1].flatten(), y_test) plt.plot(tpr, fpr) plt.show() fix, ax = plt.subplots(figsize=(10, 7)) sns.heatmap(cm, annot=True, fmt='', square=True, \ xticklabels=['1', '0'], \ yticklabels=['1', '0']); plt.show() cols = list(df.columns) print "\nFeature Importance: \n" for name, importance in izip(cols, rf.feature_importances_): print round(importance,4), '\t\t', name plot_importance(rf, merged_df, max_features=16) return rf
def test_smote(x, y):
    """Run every SMOTE variant once on ``x``/``y``, announcing each run.

    Verbosity is taken from the module-level ``verbose`` flag.
    """
    # (printed title, SMOTE kind, extra keyword arguments)
    variants = (
        ('SMOTE', 'regular', {}),
        ('SMOTE bordeline 1', 'borderline1', {}),
        ('SMOTE bordeline 2', 'borderline2', {}),
        ('SMOTE SVM', 'svm', {'class_weight': 'auto'}),
    )
    for title, kind, extra_args in variants:
        print(title)
        sampler = SMOTE(kind=kind, verbose=verbose, **extra_args)
        svmx, svmy = sampler.fit_transform(x, y)
import pandas as pd import sklearn import scipy import numpy from unbalanced_dataset.over_sampling import SMOTE train = pd.DataFrame.from_csv('train.csv') features = [ 'var38', 'var15', 'saldo_var30', 'saldo_medio_var5_hace2', 'saldo_medio_var5_hace3', 'num_var22_ult1', 'num_var22_ult3', 'num_var45_hace3', 'saldo_medio_var5_ult3', 'num_var22_hace3'] X = train[features] Y = train['TARGET'] sm = SMOTE(kind='regular', verbose='verbose') svmx, svmy = sm.fit_transform(X, Y) print len(svmx)
from unbalanced_dataset.over_sampling import SMOTE # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling sm = SMOTE(kind='borderline2') X_resampled, y_resampled = sm.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
def __get_sample_transformed_examples(sample_type, train_x, train_y, ratio): sampler = None verbose = True if sample_type == SMOTE_REG: sampler = SMOTE(kind='regular', verbose=verbose, ratio=ratio, k=15) elif sample_type == SMOTE_SVM: # TODO: Make this configurable? svm_args = {'class_weight' : 'balanced'} sampler = SMOTE(kind='svm', ratio=ratio, verbose=verbose, k=15, **svm_args) elif sample_type == SMOTE_BORDERLINE_1: sampler = SMOTE(kind='borderline1', ratio=ratio, verbose=verbose) elif sample_type == SMOTE_BORDERLINE_2: sampler = SMOTE(kind='borderline2', ratio=ratio, verbose=verbose) elif sample_type == SMOTE_ENN: sampler = SMOTEENN(ratio=ratio, verbose=verbose, k=15) elif sample_type == SMOTE_TOMEK: sampler = SMOTETomek(ratio=ratio,verbose=verbose, k=15) elif sample_type == UNDERSAMPLER: sampler = UnderSampler(ratio=ratio, verbose=verbose, replacement=False, random_state=17) elif sample_type == ADASYN_SAMPLER: sampler = ADASYN(k=15,imb_threshold=0.6, ratio=ratio) elif sample_type == TOMEK_LINKS: sampler = TomekLinks() elif sample_type == CLUSTER_CENTROIDS: sampler = ClusterCentroids(ratio=ratio) elif sample_type == NEARMISS: sampler = NearMiss(ratio=ratio) else: print "Unrecoqnized sample technique: " + sample_type print "Returning original data" return train_x, train_y return sampler.fit_transform(train_x, train_y)
weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling sm = SMOTE(kind='borderline2') X_resampled, y_resampled = sm.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1],
def fit_logistic_regression(X, y): sm = SMOTE(kind='regular') X_resampled, y_resampled = sm.fit_transform(X, y) # Splitting train and test data X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3) # Fitting regression and getting its scores log_reg = LogisticRegression() log_reg.fit(X_train, y_train) predict_log = log_reg.predict(X_test) print "\nLogistic Regression Scores:\n" print "Accuracy on test set:", log_reg.score(X_test, y_test) print "Precision:", precision_score(y_test, predict_log) print "Recall:", recall_score(y_test, predict_log) # Fitting multiple k-fold cross validations and getting mean scores kfold = KFold(len(y)) accuracies = [] precisions = [] recalls = [] for train_index, test_index in kfold: model = LogisticRegression() model.fit(X[train_index], y[train_index]) y_predict = model.predict(X[test_index]) y_true = y[test_index] accuracies.append(accuracy_score(y_true, y_predict)) precisions.append(precision_score(y_true, y_predict)) recalls.append(recall_score(y_true, y_predict)) print "\nK-Fold Cross Validation on Logistic Regression Scores:\n" print "accuracy:", np.average(accuracies) print "precision:", np.average(precisions) print "recall:", np.average(recalls) cols = list(df.columns) print print "Beta scores:" for name, coef in izip(df.columns, model.coef_[0]): print "%s: %.4f" % (name, coef) y_predict = log_reg.predict(X_test) y_proba = log_reg.predict_proba(X_test) cm = standard_confusion_matrix(y_test, y_predict) tpr, fpr, thres = roc_curve(y_proba[:,0:1].flatten(), y_test) plt.plot(tpr, fpr) plt.show() fix, ax = plt.subplots(figsize=(10, 7)) sns.heatmap(cm, annot=True, fmt='', square=True, \ xticklabels=['1', '0'], \ yticklabels=['1', '0']); plt.show() print print "Likelihoods:" for i, coef in enumerate(log_reg.coef_[0]): # print "beta %s: %.5f" % (cols[i], exp(coef)) if coef <0: print "*Increasing the %s by 1 point decreases the chance of label=1 by a factor of %.4f.*" % (cols[i], exp(coef)) else: 
print "*Increasing the %s by 1 point increases the chance of label=1 by a factor of %.4f.*" % (cols[i], exp(coef)) print print "To double:" for i, coef in enumerate(model.coef_[0]): # print "beta %s: %.5f" % (cols[i], log(2) / coef) if coef < 0: print "*Decreasing the %s score by %d points doubles the chance of label=1.*" % (cols[i], log(2) / coef) else: print "*Increasing the %s score by %d points doubles the chance of label=1.*" % (cols[i], log(2) / coef) print
# Feature selection with an L1-penalised linear SVM, followed by SMOTE
# oversampling of the selected features and a shuffle of the result.
print 'number of features before: ', X.shape[1]
print 'feature selection via Linear SVM...'
lsvc = LinearSVC(C=100, penalty='l1', dual=False).fit(X, y)
# According to the validation curve (not output here), C=10 gives the best result
# NOTE(review): the call above uses C=100, not C=10 -- confirm which is intended.
# Keep only the features selected by the already-fitted SVM
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
print 'number of features after: ', X_new.shape[1]

# Use SMOTE to 'fix' the imbalanced problem:
# the python implementation of SMOTE comes from
# https://github.com/fmfn/UnbalancedDataset/tree/master/unbalanced_dataset
# Count of samples labelled -1 divided by count of samples labelled 1
ratio = float(len([t for t in y if t==-1]))/float(len([t for t in y if t==1]))
# oversampler = OverSampler(ratio = ratio-1)
smote = SMOTE(k=3, ratio = ratio-1)
# The sampler's data and class bookkeeping are assigned by hand here and
# resample() is called directly.
smote.x = X_new
smote.y = y
smote.minc = 1
smote.maxc = -1
# Per-label sample counts
smote.ucd ={1: len([tg for tg in y if tg==1]), -1: len([tg for tg in y if tg==-1])}
ret_X, ret_y = smote.resample()
# overX, overy = oversampler.resample()

# Shuffle samples and labels together so the pairs stay aligned, then write
# the shuffled values back in place.
combined = zip(ret_X, ret_y)
random.shuffle(combined)
ret_X[:], ret_y[:] = zip(*combined)
print 'shuffled??\n', ret_y
print 'training and predicting...'
# clf = SVC(kernel='linear', C=1, probability=True)
from unbalanced_dataset.over_sampling import SMOTE # Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply the random under-sampling sm = SMOTE(kind='svm') X_resampled, y_resampled = sm.fit_transform(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)