def test_cnn_fit_sample_with_object():
    """Check that a KNN estimator and the integer 1 yield the same resampling."""
    # Ground truth shared by both ways of specifying the neighbours.
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # First pass uses an explicit estimator, second pass the plain integer.
    for neighbours in (KNeighborsClassifier(n_neighbors=1), 1):
        sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                            n_neighbors=neighbours)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
def fit(self, X, y=None):
    """Under-sample with condensed nearest neighbours, then fit with weights.

    ``X`` must expose ``toarray()``; it is densified before resampling.
    After resampling, every sample labelled 1 receives the weight
    ``1 / mean(y)`` and every other sample the weight 1. The parent
    ``fit`` is then called with those sample weights and its result returned.
    """
    # Scores noted next to this code during earlier experiments. The first
    # two result sets carry no sampler name other than the heading
    # 'Random under-sampling' -- which configuration produced which is not
    # stated here.
    #   (unnamed)                          Accuracy: 0.939693267481
    #                                      Precision: 0.238095238095
    #                                      Recall: 0.897435897436
    #   (unnamed)                          Accuracy: 0.962568234988
    #                                      Precision: 0.324468085106
    #                                      Recall: 0.782051282051
    #   SMOTE(ratio=ratio, kind='borderline1')
    #                                      Accuracy: 0.971146347803
    #                                      Precision: 0.372093023256
    #                                      Recall: 0.615384615385
    #   SMOTE(ratio=ratio, kind='borderline2')
    #                                      Accuracy: 0.965427605927
    #                                      Precision: 0.333333333333
    #                                      Recall: 0.705128205128
    #   SMOTE(ratio=ratio, kind='svm', class_weight='auto')
    #                                      Accuracy: 0.972186119054
    #                                      Precision: 0.395683453237
    #                                      Recall: 0.705128205128
    sampler = CondensedNearestNeighbour(size_ngh=51, n_seeds_S=51)
    X_res, y_res = sampler.fit_sample(X.toarray(), y)

    # Samples labelled 1 are up-weighted by the inverse of the label mean.
    sample_weights = np.array(
        [1 / y_res.mean() if label == 1 else 1 for label in y_res])
    return super(RandomForestClassifier, self).fit(
        X_res, y_res, sample_weight=sample_weights)
def random_instance_selection(dfZ, x, blackbox, dataset):
    """Build a random neighbourhood of ``x`` and condense it.

    The neighbourhood returned by ``random_neighborhood`` is labelled with
    ``blackbox.predict``, reduced with ``CondensedNearestNeighbour`` and
    turned back into a frame with ``build_df2explain``.

    Returns
    -------
    tuple
        ``(dfZ, Z)``: the rebuilt frame and the condensed array.
    """
    # Only the array part of the neighbourhood is used; the frame is
    # rebuilt from the condensed array below.
    _, neighbourhood = random_neighborhood(dfZ, x, blackbox, dataset)
    labels = blackbox.predict(neighbourhood)

    sampler = CondensedNearestNeighbour(return_indices=True)
    condensed, _, _ = sampler.fit_sample(neighbourhood, labels)

    return build_df2explain(blackbox, condensed, dataset), condensed
def test_cnn_fit_sample_with_object():
    """Resample with a KNN estimator, then with ``n_neighbors=1``; both must match."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    # Estimator object first, then the equivalent integer setting.
    neighbour_specs = (KNeighborsClassifier(n_neighbors=1), 1)
    for spec in neighbour_specs:
        sampler = CondensedNearestNeighbour(random_state=RND_SEED,
                                            n_neighbors=spec)
        X_resampled, y_resampled = sampler.fit_sample(X, Y)
        assert_array_equal(X_resampled, expected_X)
        assert_array_equal(y_resampled, expected_y)
def test_cnn_fit_sample():
    """Compare the default resampling against the stored ``.npy`` ground truth."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` folder next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_cnn_fit_sample():
    """Resample ``X``/``Y`` and check the result against the saved arrays."""
    # Load the expected output from the ``data`` folder beside this module.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'cnn_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'cnn_y.npy'))

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_cnn_fit_sample():
    """Check the default resampling against the inline ground truth."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def test_cnn_fit_sample_with_indices():
    """Check samples, labels and returned indices against the stored arrays."""
    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Reference arrays live in the ``data`` folder next to this test module.
    data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data')
    expected_X = np.load(os.path.join(data_dir, 'cnn_x.npy'))
    expected_y = np.load(os.path.join(data_dir, 'cnn_y.npy'))
    expected_idx = np.load(os.path.join(data_dir, 'cnn_idx.npy'))

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_cnn_fit_sample_with_indices():
    """Resample with ``return_indices=True`` and compare with the saved arrays."""
    # Load the expected output from the ``data`` folder beside this module.
    here = os.path.dirname(os.path.abspath(__file__))
    expected_X = np.load(os.path.join(here, 'data', 'cnn_x.npy'))
    expected_y = np.load(os.path.join(here, 'data', 'cnn_y.npy'))
    expected_idx = np.load(os.path.join(here, 'data', 'cnn_idx.npy'))

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_cnn_fit_sample_with_indices():
    """Check samples, labels and returned indices against inline ground truth."""
    expected_X = np.array([[-0.10903849, -0.12085181],
                           [0.01936241, 0.17799828],
                           [0.05230552, 0.09043907],
                           [-1.25020462, -0.40402054],
                           [0.70524765, 0.39816382],
                           [0.35831463, 1.33483198],
                           [-0.284881, -0.62730973],
                           [0.03394306, 0.03986753],
                           [-0.01252787, 0.34102657],
                           [0.15198585, 0.12512646]])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
def test_cnn_fit_sample():
    """Resample ``X``/``Y`` and compare with the hard-coded expected subset."""
    sampler = CondensedNearestNeighbour(random_state=RND_SEED)
    X_resampled, y_resampled = sampler.fit_sample(X, Y)

    # Expected condensed samples and their labels.
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
def train_stage(df_path, cb_path):
    """Run 15-fold cross-validation with condensed-NN under-sampling.

    The CSV at ``df_path`` is loaded, the ``ID_code`` and ``target``
    columns are removed from the features, and for each stratified fold
    the training part is under-sampled with ``CondensedNearestNeighbour``
    before being handed to ``fit_cb``. Whatever ``fit_cb`` returns for the
    validation rows is accumulated into an out-of-fold vector, whose
    ROC AUC against ``target`` is printed at the end.

    Parameters
    ----------
    df_path : path of the training CSV; it must contain ``ID_code`` and
        ``target`` columns.
    cb_path : forwarded unchanged to ``fit_cb``.

    Returns
    -------
    int
        Always 0.
    """
    print('Load Train Data.')
    df = pd.read_csv(df_path)
    print('\nShape of Train Data: {}'.format(df.shape))

    y_df = np.array(df['target'])
    df_ids = np.array(df.index)
    df.drop(['ID_code', 'target'], axis=1, inplace=True)

    # Build the feature matrix once instead of on every fold access.
    features = df.values
    cb_cv_result = np.zeros(df.shape[0])

    # ``random_state`` only has a meaning when ``shuffle=True``; passing it
    # together with ``shuffle=False`` is rejected by recent scikit-learn, so
    # it is omitted. The folds themselves are unchanged.
    skf = StratifiedKFold(n_splits=15, shuffle=False)
    sm = CondensedNearestNeighbour(random_state=42, n_jobs=3)

    print('\nModel Fitting...')
    for counter, (fit_idx, val_idx) in enumerate(skf.split(df_ids, y_df)):
        print('\nFold {}'.format(counter + 1))
        X_fit, y_fit = features[fit_idx], y_df[fit_idx]
        X_val, y_val = features[val_idx], y_df[val_idx]

        # Only the training part is under-sampled; validation stays intact.
        X_fit, y_fit = sm.fit_sample(X_fit, y_fit)

        print('CatBoost')
        cb_cv_result[val_idx] += fit_cb(X_fit, y_fit, X_val, y_val, counter,
                                        cb_path, name='cb')

        # Release the fold arrays before the next iteration allocates new ones.
        del X_fit, X_val, y_fit, y_val
        gc.collect()

    auc_cb = round(roc_auc_score(y_df, cb_cv_result), 4)
    print('Catboost VAL AUC: {}'.format(auc_cb))
    return 0
def test_cnn_fit_sample_with_indices():
    """Resample with ``return_indices=True`` and check all three outputs."""
    sampler = CondensedNearestNeighbour(return_indices=True,
                                        random_state=RND_SEED)
    X_resampled, y_resampled, idx_under = sampler.fit_sample(X, Y)

    # Expected condensed samples, their labels and their original positions.
    expected_X = np.array([
        [-0.10903849, -0.12085181],
        [0.01936241, 0.17799828],
        [0.05230552, 0.09043907],
        [-1.25020462, -0.40402054],
        [0.70524765, 0.39816382],
        [0.35831463, 1.33483198],
        [-0.284881, -0.62730973],
        [0.03394306, 0.03986753],
        [-0.01252787, 0.34102657],
        [0.15198585, 0.12512646],
    ])
    expected_y = np.array([0, 0, 1, 1, 1, 2, 2, 2, 2, 2])
    expected_idx = np.array([4, 11, 17, 12, 19, 9, 5, 7, 14, 18])

    assert_array_equal(X_resampled, expected_X)
    assert_array_equal(y_resampled, expected_y)
    assert_array_equal(idx_under, expected_idx)
scores = cross_validate(enn_pipe_rf, X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision')) scores['test_roc_auc'].mean(), scores['test_average_precision'].mean() # (0.9248526844001812, 0.6883592815252976) ######### Condensed Nearest Neighbor ######### from imblearn.under_sampling import CondensedNearestNeighbour # opposite of ENN; iteratively adds points to the data that are misclassified by KNN cnn = CondensedNearestNeighbour() X_train_cnn, y_train_cnn = cnn.fit_sample(X_train, y_train) print(X_train_cnn.shape) print(np.bincount(y_train_cnn)) ### Pipeline method cnn_pipe = make_imb_pipeline(CondensedNearestNeighbour(), LogisticRegression()) scores = cross_validate(cnn_pipe, X_train, y_train, cv=10, scoring=('roc_auc', 'average_precision')) pd.DataFrame(scores)[['test_roc_auc', 'test_average_precision']].mean() cnn_pipe_rf = make_imb_pipeline(CondensedNearestNeighbour(),
def balance_cnn(input):
    """Under-sample a ``(features, labels)`` pair with condensed nearest neighbours.

    ``input`` is unpacked into exactly two items; the resampled pair is
    returned as a tuple ``(X_res, y_res)``.
    """
    features, labels = input
    sampler = CondensedNearestNeighbour(random_state=42)
    resampled_features, resampled_labels = sampler.fit_sample(features, labels)
    return resampled_features, resampled_labels
# Plot limits: half a unit of padding around the first two columns of X.
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

# Initial scatter plot of the raw data, coloured by class.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

# Normalise the four iris measurements column-wise (``normalize`` with
# axis=0) and use the species index as the target.
x = iris.data[:, 0:4]
y = iris.target
X_normalized = normalize(x, axis=0)
x_train, x_test, y_train, y_test = train_test_split(X_normalized, y,
                                                    test_size=0.20)

# Condense the TRAINING part only. Fitting the sampler on the full data set
# would put the held-out rows into the 1-NN training set and make the
# evaluation below meaningless. ``idx_resampled`` therefore indexes into
# ``x_train``.
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(x_train, y_train)

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(x_test)

print(confusion_matrix(y_test, y_pred))
target_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print(classification_report(y_test, y_pred, target_names=target_names))
import numpy as np import matplotlib.pyplot as plt from matplotlib.colors import ListedColormap from sklearn import neighbors, datasets from itertools import product from sklearn.neighbors import DistanceMetric from imblearn.under_sampling import CondensedNearestNeighbour iris = datasets.load_iris() X = iris.data[:, :2] y = iris.target cnn = CondensedNearestNeighbour() X_cnn, y_cnn = cnn.fit_sample(X, y) # Create color maps cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF']) cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF']) metrics = ['euclidean', 'mahalanobis'] n_neighbors = [1, 3] datasets = [{ "X": X, "y": y, "cnn": False }, { "X": X_cnn, "y": y_cnn, "cnn": True
# Method3:(optional) --> income2 = pd.get_dummies(income_raw)['>50K']

# Report how many features exist after one-hot encoding.
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Show the encoded feature names.
print(encoded)

# -----------------
# @Raafat: Some techniques to deal with imbalanced data, each tried on the
# first 300 rows only.
# --> under-sampling
from imblearn.under_sampling import CondensedNearestNeighbour

cnn = CondensedNearestNeighbour(random_state=42)
X_res, y_res = cnn.fit_sample(features_final[0:300], income[0:300])
print('not Resampled dataset shape {}'.format(income[0:300].value_counts()))
print('cnn Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_sample(features_final[0:300], income[0:300])
print('rus Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

from imblearn.under_sampling import TomekLinks

tl = TomekLinks(random_state=42)
X_res, y_res = tl.fit_sample(features_final[0:300], income[0:300])
print('tl Resampled dataset shape {}'.format(pd.Series(y_res).value_counts()))

# --> over-sampling
from imblearn.over_sampling import SMOTE
# Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour() X_resampled, y_resampled = cnn.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
from matplotlib.colors import ListedColormap
from sklearn import neighbors
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Four-colour maps: light shades for regions, bold shades for points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#FFFFE0'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#9B870C'])

# ``enumerate`` gives the position directly. Looking it up with
# ``X.index(data)`` is a linear search that compares arrays with ``==`` and
# can raise "truth value of an array is ambiguous".
for yind, data in enumerate(X):
    yt = y[yind]
    X_train, X_test, y_train, y_test = train_test_split(
        data, yt, test_size=0.2, random_state=1, stratify=yt)

    cnn = CondensedNearestNeighbour()
    Xc, yc = cnn.fit_sample(data, yt)
    # NOTE(review): this splits the ORIGINAL data/yt again with the same
    # seed, so X_test_cnn equals X_test. Presumably Xc/yc were intended --
    # confirm before changing, since stratifying on yc can fail when a
    # class keeps a single sample.
    X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(
        data, yt, test_size=0.2, random_state=1, stratify=yt)

    clf1 = neighbors.KNeighborsClassifier(n_neighbors=1)
    clf1.fit(X_train, y_train)
    pred1 = clf1.predict(X_test)
    pred_cnn1 = clf1.predict(X_test_cnn)

    # Mesh over the original data, padded by one unit, step 0.02.
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                         np.arange(y_min, y_max, 0.02))

    # Same mesh construction over the condensed data.
    x_minc, x_maxc = Xc[:, 0].min() - 1, Xc[:, 0].max() + 1
    y_minc, y_maxc = Xc[:, 1].min() - 1, Xc[:, 1].max() + 1
    xxc, yyc = np.meshgrid(np.arange(x_minc, x_maxc, 0.02),
                           np.arange(y_minc, y_maxc, 0.02))
from imblearn.under_sampling import CondensedNearestNeighbour ## Generate the dataset #X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], # n_informative=3, n_redundant=1, flip_y=0, # n_features=20, n_clusters_per_class=1, # n_samples=200, random_state=10) # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour(return_indices=True) #Xtrain_new, Ytrain_new, idx_resampled = cnn.fit_sample(Xtrain[selectList].iloc[:100,:], Ytrain.iloc[:100]) from imblearn.combine import SMOTETomek smote_tomek = SMOTETomek(random_state=0) Xtrain_new, Ytrain_new, _ = cnn.fit_sample(Xtrain[selectList].iloc[:400, :], Ytrain.iloc[:400]) Xtrain_new = pd.DataFrame(Xtrain_new) Ytrain_new = pd.DataFrame(Ytrain_new) for i in range(800, Xtrain.shape[0], 400): #Xtrain.shape[0] 36279 X_resampled, y_resampled, _ = cnn.fit_sample( Xtrain[selectList].iloc[(i - 400):i, :], Ytrain.iloc[(i - 400):i]) Xtrain_new = pd.concat([ Xtrain_new.reset_index(drop=True), pd.DataFrame(X_resampled).reset_index(drop=True) ], axis=0) Ytrain_new = pd.concat([ Ytrain_new.reset_index(drop=True), pd.DataFrame(y_resampled).reset_index(drop=True) ],
import matplotlib.pyplot as plt
from pylab import subplot, title
from matplotlib.colors import ListedColormap
from imblearn.under_sampling import CondensedNearestNeighbour

# Three synthetic 2-D data sets of 150 samples each, stacked into one problem.
X1, y1 = make_blobs(n_samples=150, centers=4, n_features=2, random_state=21)
X2, y2 = make_gaussian_quantiles(mean=(2, 2), cov=3., n_samples=150,
                                 n_features=2, n_classes=3, random_state=9)
X3, y3 = make_gaussian_quantiles(mean=(5, 5), cov=5., n_samples=150,
                                 n_features=2, n_classes=2, random_state=15)
X = concatenate([X1, X2, X3])
y = concatenate([y1, y2, y3])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=35)

# A fixed random_state makes the condensed set identical on every run.
cnn = CondensedNearestNeighbour(random_state=0)
X_res1, y_res1 = cnn.fit_sample(X, y)

# Second split, needed for the accuracy figures of the condensed data.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_res1, y_res1, test_size=0.25, random_state=35)

h = .02
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF', '#8B008B'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF', '#8B008B'])

# clf1 is trained on the original split, clf2 on the condensed split.
clf1 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf2 = KNeighborsClassifier(n_neighbors=1, weights='uniform')
clf1.fit(X_train, y_train)
clf2.fit(X_train1, y_train1)

pred1 = clf1.predict(X_test)
pred2 = clf2.predict(X_test1)
def balance_cnn(self):
    """Under-sample ``self.vec``/``self.target`` and pass the result to ``split_data``."""
    sampler = CondensedNearestNeighbour(random_state=42)
    resampled_vec, resampled_target = sampler.fit_sample(self.vec, self.target)
    # Explicitly routed through Classification_JCL rather than ``self``.
    Classification_JCL.split_data(self, resampled_vec, resampled_target)
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import collections

data = pd.read_csv("creditcard.csv")

# Replace the raw 'Amount' column by a standardised copy and drop 'Time'.
d1 = np.array(data['Amount'])
data['normAmount'] = StandardScaler().fit_transform(d1.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)

# Every column except 'Class' is a feature; 'Class' alone is the target.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Sampling: condense the data set before splitting it.
cnn = CondensedNearestNeighbour(random_state=1)
X_sampled, y_sampled = cnn.fit_sample(X, y.values.ravel())
print("sampled data size", collections.Counter(y_sampled))

X_train, X_test, y_train, y_test = train_test_split(
    X_sampled, y_sampled, test_size=0.3, random_state=0)

X_train_sampled_df = pd.DataFrame(X_train)
y_train_sampled_df = pd.DataFrame(y_train)
X_test_sampled_df = pd.DataFrame(X_test)
y_test_sampled_df = pd.DataFrame(y_test)

# Random forest trained on the condensed training part.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train_sampled_df, y_train_sampled_df.values.ravel())
y_pred = clf.predict(X_test_sampled_df)
print("predicted")
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 18 16:03:54 2018

@author: Student
"""
import matplotlib.pyplot as plt
# ``sklearn.cross_validation`` no longer exists; ``train_test_split`` lives
# in ``sklearn.model_selection``, which this file already depends on.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd

# First six columns are the features, the seventh is the label.
dataset = pd.read_csv('car_datacat.csv')
x = dataset.iloc[:, 0:6].values
y = dataset.iloc[:, 6].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Condense the TRAINING part only. Fitting the sampler on the full data set
# would place the held-out rows in the 1-NN training set and invalidate the
# confusion matrix below. ``idx_resampled`` therefore indexes into
# ``x_train``.
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(x_train, y_train)

clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_resampled, y_resampled)
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
except Exception as e: print('an issue with {}, rate: {}, variant: {}'.format( dataset, rate, variant)) print(e) if CNN_FLAG: variant = 'CNN' print('>> {}, rate: {}, variant: {}'.format( dataset, rate, variant)) try: dataset_size = X_train.shape[0] coreset_size = max(int(dataset_size * rate / 100), 1) startTime = datetime.now() cnn = CondensedNearestNeighbour(random_state=SEED) observers, total_labels = cnn.fit_sample(X_train, y_train) observers, total_labels = fix_rate(X_train, y_train, observers, total_labels, coreset_size) interm = (datetime.now() - startTime).total_seconds() startTime = datetime.now() try: neigh = KNeighborsClassifier(n_neighbors=5) except: neigh = KNeighborsClassifier(n_neighbors=1) neigh.fit(observers, total_labels) y_pred = neigh.predict(X_test) final = (datetime.now() - startTime).total_seconds() inds = get_indices_(y_test, y_pred) CNN_RES = fill_table(CNN_RES, inds, j, interm, final) except Exception as e:
##############################################################################
### Tomek links
tl = TomekLinks(return_indices=True)
X_resampled, y_resampled, idx_resampled = tl.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

# The confusion matrix is taken right after fitting, before ``tree`` is
# refitted by the next section.
tl_tree = tree.fit(X_resampled, y_resampled)
tl_ = confusion_matrix(y_test, tl_tree.predict(X_test))

###############################################################################
### Condensed Nearest Neighbor
cnn = CondensedNearestNeighbour(return_indices=True)
X_resampled, y_resampled, idx_resampled = cnn.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)

cnn_tree = tree.fit(X_resampled, y_resampled)
cnn_ = confusion_matrix(y_test, cnn_tree.predict(X_test))

###############################################################################
### One-sided selection
oss = OneSidedSelection(return_indices=True)
X_resampled, y_resampled, idx_resampled = oss.fit_sample(X_train, y_train)

plot_(X_resampled, y_resampled, remove=False)
plot_(X_resampled, y_resampled, remove=True)
def draw_cnn(k, metric):
    """Condense the global point set, fit a k-NN on it and plot the result.

    The module-level ``mapped_colors`` rows are read as ``x``, ``y`` and
    ``color``. The points are condensed with ``CondensedNearestNeighbour``,
    split 67/33, and a ``KNeighborsClassifier`` is trained on the training
    part. The error on the test part and the share of points kept (relative
    to the module-level ``total_points``) are printed, and the decision
    regions are drawn with the original and the condensed points on top.

    Parameters
    ----------
    k : number of neighbours, used for both the sampler and the classifier.
    metric : distance metric name passed to ``KNeighborsClassifier``;
        ``'mahalanobis'`` additionally receives the covariance of the
        condensed points as ``metric_params``.
    """
    names = ['x', 'y', 'color']
    df = pd.DataFrame(mapped_colors, columns=names)
    # print(df.head())

    # Positional slice of the first two columns ('x' and 'y'). ``.iloc`` is
    # the supported spelling of what ``.ix[:, 0:2]`` did here; ``.ix`` has
    # been removed from pandas.
    originalX = np.array(df.iloc[:, 0:2])
    originaly = np.array(df['color'])

    cnn = CondensedNearestNeighbour(n_neighbors=k, return_indices=True)
    X_resampled, y_resampled, idx_resampled = cnn.fit_sample(
        originalX, originaly)
    X = X_resampled
    y = y_resampled

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    if metric == 'mahalanobis':
        # The covariance is estimated from all condensed points.
        knn = KNeighborsClassifier(
            n_neighbors=k,
            metric=metric,
            metric_params={'V': np.cov(np.transpose(X))})
    else:
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric)

    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    err = 1 - accuracy_score(y_test, pred)
    print('\nThe error is ' + str(err * 100))
    print('\nPercentage points left after CNN: ' +
          str(len(idx_resampled) / total_points * 100))

    h = .02  # mesh step size
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    # All white: the original points show only through their black edges.
    cmap_black = ListedColormap(['#FFFFFF', '#FFFFFF', '#FFFFFF'])

    # Mesh over the condensed points, padded by one unit on each side.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    plt.scatter(originalX[:, 0], originalX[:, 1], c=originaly,
                cmap=cmap_black, edgecolor='k', s=20)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i)" % k)
def test_cnn_fit_sample_with_wrong_object():
    """A string passed as ``n_neighbors`` must make ``fit_sample`` raise ValueError."""
    invalid_neighbours = 'rnd'
    sampler = CondensedNearestNeighbour(n_neighbors=invalid_neighbours,
                                        random_state=RND_SEED)
    with raises(ValueError, match="has to be a int or an "):
        sampler.fit_sample(X, Y)
# Generate the dataset X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=5000, random_state=10) # Instanciate a PCA object for the sake of easy visualisation pca = PCA(n_components=2) # Fit and transform x to visualise inside a 2D feature space X_vis = pca.fit_transform(X) # Apply Condensed Nearest Neighbours cnn = CondensedNearestNeighbour() X_resampled, y_resampled = cnn.fit_sample(X, y) X_res_vis = pca.transform(X_resampled) # Two subplots, unpack the axes array immediately f, (ax1, ax2) = plt.subplots(1, 2) ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5, edgecolor=almost_black, facecolor=palette[2], linewidth=0.15) ax1.set_title('Original set') ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1], label="Class #0", alpha=.5, edgecolor=almost_black, facecolor=palette[0], linewidth=0.15) ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
score = clf.predict_proba(X_test)
evaluate(y_test, score)

# Tomek's links

# Edited data set using nearest neighbours
print("######################## ENN ########################")
enn = EditedNearestNeighbours(random_state=0)
X_res, y_res = enn.fit_sample(X_train, y_train)
# Shapes before and after resampling, then the sum of the kept labels.
print(X_train.shape)
print(X_res.shape)
print(np.sum(y_res))

clf = SVC(probability=True)
clf.fit(X_res, y_res)
score = clf.predict_proba(X_test)
evaluate(y_test, score)

# Condensed nearest neighbours and derived algorithms
print("######################## CNN ########################")
cnn = CondensedNearestNeighbour(random_state=0)
X_res, y_res = cnn.fit_sample(X_train, y_train)
# Shapes before and after resampling, then the sum of the kept labels.
print(X_train.shape)
print(X_res.shape)
print(np.sum(y_res))

clf = SVC(probability=True)
clf.fit(X_res, y_res)
score = clf.predict_proba(X_test)
evaluate(y_test, score)

pass