def test_desknn(knn_methods, voting):
    """Integration test: DES-KNN with the given kNN backend and voting
    scheme reproduces the known accuracy on the shared test split."""
    pool, X_dsel, y_dsel, X_test, y_test = setup_classifiers()
    model = DESKNN(pool, knn_classifier=knn_methods, voting=voting)
    model.fit(X_dsel, y_dsel)
    score = model.score(X_test, y_test)
    assert np.isclose(score, 0.9787234042553191)
def test_desknn_proba(knn_methods):
    """Integration test: soft-voting DES-KNN probability outputs match
    the stored reference array."""
    pool, X_dsel, y_dsel, X_test, y_test = setup_classifiers()
    model = DESKNN(pool, knn_classifier=knn_methods, voting='soft')
    model.fit(X_dsel, y_dsel)
    reference = np.load(
        'deslib/tests/expected_values/desknn_proba_integration.npy')
    assert np.allclose(model.predict_proba(X_test), reference)
def __init__(self, pool_classifiers, X_train, y_train, X_test, y_test,
             mode=0, neighbourhood=13):
    """Store the classifier pool, the data splits and the configuration.

    TODO: pass DESKNN parameters explicitly (hard-coded or through this
    constructor) instead of relying on the library defaults.
    """
    self.pool_classifiers = pool_classifiers
    self.X_train = X_train
    self.y_train = y_train
    self.X_test = X_test
    self.y_test = y_test
    self.mode = mode
    self.neighbourhood = neighbourhood
    # DSEL splits are filled in later by a sampling step.
    self.X_dsel = []
    self.y_dsel = []
    self.desknn = DESKNN(pool_classifiers)
def initialize_ds(pool_classifiers, X, y, k=5):
    """Instantiate and fit the standard battery of DS techniques.

    Returns an index-aligned pair (fitted_models, display_names).
    """
    techniques = [
        ('KNORA-U', KNORAU(pool_classifiers, k=k)),
        ('KNORA-E', KNORAE(pool_classifiers, k=k)),
        ('OLA', OLA(pool_classifiers, k=k)),
        ('LCA', LCA(pool_classifiers, k=k)),
        ('MLA', MLA(pool_classifiers, k=k)),
        ('DESKNN', DESKNN(pool_classifiers, k=k)),
        ('MCB', MCB(pool_classifiers, k=k)),
        ('RANK', Rank(pool_classifiers, k=k)),
        ('KNOP', KNOP(pool_classifiers, k=k)),
        ('META-DES', METADES(pool_classifiers, k=k)),
    ]
    names = [name for name, _ in techniques]
    list_ds = [model for _, model in techniques]
    # fit the ds techniques
    for model in list_ds:
        model.fit(X, y)
    return list_ds, names
class DESNEW:
    """Experimental DES-KNN wrapper comparing training/selection-set
    strategies (optionally SMOTE-balanced) on imbalanced binary problems.

    Modes (dispatched in return_metrics):
        0 - train the pool on the raw training data, score on the test set
        1 - train the pool on SMOTE-resampled training data, score on test
        2 - train on raw data, build local DSEL regions from the test set
            and report the best region's metrics
        3 - like 2, but the pool is trained on SMOTE-resampled data
    """

    def __init__(self, pool_classifiers, X_train, y_train, X_test, y_test,
                 mode=0, neighbourhood=13):
        self.mode = mode
        self.neighbourhood = neighbourhood
        self.pool_classifiers = pool_classifiers
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # Candidate DSEL regions, filled in by _test_sample.
        self.X_dsel = []
        self.y_dsel = []
        # TODO: expose DESKNN parameters (hard-coded or via this
        # constructor) instead of relying on library defaults.
        self.desknn = DESKNN(pool_classifiers)

    def return_metrics(self):
        """Run the experiment selected by self.mode.

        Returns a metrics dict with keys 'acc', 'f1', 'gmean', 'prec',
        'recall' -- or 0 when modes 2/3 produced no candidate region.
        """
        if self.mode == 0:
            self._fit_base_classifiers(self.X_train, self.y_train)
            metrics = self.fit_predict_return_simple(self.X_test, self.y_test)
        elif self.mode == 1:
            X_train1, y_train1 = SMOTE(random_state=42).fit_resample(
                self.X_train, self.y_train)
            self._fit_base_classifiers(X_train1, y_train1)
            metrics = self.fit_predict_return_simple(self.X_test, self.y_test)
        elif self.mode == 2:
            self._fit_base_classifiers(self.X_train, self.y_train)
            self._test_sample(self.X_test, self.y_test)
            metrics = self.fit_predict_return_score(self.X_dsel, self.y_dsel)
        else:
            X_train1, y_train1 = SMOTE(random_state=42).fit_resample(
                self.X_train, self.y_train)
            self._fit_base_classifiers(X_train1, y_train1)
            self._test_sample(self.X_test, self.y_test)
            metrics = self.fit_predict_return_score(self.X_dsel, self.y_dsel)
        return metrics

    def fit_predict_return_simple(self, X_test, y_test):
        """Fit DES-KNN on (X_test, y_test) and score it on the same data.

        NOTE(review): fitting the selection step on the evaluation data
        leaks test information -- presumably intentional for this
        experiment, but confirm.
        """
        metrics_names = ['acc', 'f1', 'gmean', 'prec', 'recall']
        self.desknn.fit(X_test, y_test)
        y_pred = self.desknn.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        gmean = geometric_mean_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        return dict(zip(metrics_names, [acc, f1, gmean, prec, recall]))

    def fit_predict_return_score(self, X_dsel, y_dsel):
        """Fit/score DES-KNN on each candidate DSEL region and return the
        metrics dict of the region with the highest g-mean (0 if there
        are no regions)."""
        metrics = []
        metrics_names = ['acc', 'f1', 'gmean', 'prec', 'recall']
        for X_d, y_d in zip(X_dsel, y_dsel):
            self.desknn.fit(X_d, y_d)
            y_pred = self.desknn.predict(X_d)
            acc = accuracy_score(y_d, y_pred)
            f1 = f1_score(y_d, y_pred)
            gmean = geometric_mean_score(y_d, y_pred)
            prec = precision_score(y_d, y_pred)
            recall = recall_score(y_d, y_pred)
            metrics.append(
                dict(zip(metrics_names, [acc, f1, gmean, prec, recall])))
        if not metrics:
            # _test_sample generated no candidate region.
            return 0
        gmeans = [m['gmean'] for m in metrics]
        return metrics[int(np.argmax(gmeans))]

    def _test_sample(self, X_test, y_test):
        """Build candidate DSEL regions around minority-class test samples.

        For each minority sample, take its self.neighbourhood nearest
        neighbours in the whole test set; when the neighbourhood holds a
        usable but non-dominant share of minority samples, append the
        region SMOTE-oversampled with a ratio derived from the distance
        to the nearest minority neighbour. Assumes binary classification.
        """
        unique_vals, counts = np.unique(y_test, return_counts=True)
        minority_class = unique_vals[np.argmin(counts)]
        majority_class = unique_vals[np.argmax(counts)]
        minority_class_indices = np.where(y_test == minority_class)
        # Test-set instances belonging to the minority class.
        minority_samples = X_test[minority_class_indices]
        neigh = NearestNeighbors(n_neighbors=self.neighbourhood,
                                 metric="euclidean")
        # Neighbours are searched over the entire test set.
        neigh.fit(X_test)
        distances, indices = neigh.kneighbors(minority_samples)
        for i in range(len(indices)):
            y_i = indices[i]
            uni_val, cnts = np.unique(y_test[y_i], return_counts=True)
            # NOTE(review): hard-coded 13 mirrors the default
            # neighbourhood size; should probably be self.neighbourhood.
            if cnts[0] < 13:
                # NOTE(review): indexing cnts by the class *label* only
                # works when labels are exactly 0/1 -- confirm.
                if (cnts[minority_class] > 2
                        and cnts[minority_class] < cnts[majority_class]):
                    # Skip index 0: the nearest neighbour is the sample
                    # itself.
                    for j in range(1, len(y_i)):
                        # BUGFIX: test the label of the j-th *neighbour*
                        # (y_i[j]), not of the j-th test sample; the
                        # distance below refers to that neighbour.
                        if y_test[y_i[j]] == minority_class:
                            distance = distances[i][j]
                            # Homographic distance -> SMOTE sampling
                            # ratio, clipped to [0.75, 1].
                            ratio = 1.5 * distance / (distance + 1)
                            if ratio < 0.75:
                                sm = SMOTE(sampling_strategy=0.75,
                                           random_state=42, k_neighbors=2)
                            elif ratio > 1:
                                sm = SMOTE(sampling_strategy=1,
                                           random_state=42, k_neighbors=2)
                            else:
                                sm = SMOTE(sampling_strategy=ratio,
                                           random_state=42, k_neighbors=2)
                            X_resampled, y_resampled = sm.fit_resample(
                                X_test[y_i], y_test[y_i])
                            self.X_dsel.append(X_resampled)
                            self.y_dsel.append(y_resampled)
                            # Only the nearest minority neighbour is used.
                            break
            else:
                # Homogeneous neighbourhood: keep the region as-is.
                # NOTE(review): the original flattened source made this
                # else-attachment ambiguous -- confirm it belongs to the
                # cnts[0] < 13 test.
                self.X_dsel.append(X_test[y_i])
                self.y_dsel.append(y_test[y_i])

    def estimate_distance(self):
        """Planned algorithm (not implemented yet):

        - check only samples from the minority class(es);
        - for k neighbours (k > 10) compute the distance (Euclidean,
          Mahalanobis) to the nearest same-class sample;
        - oversample the region with SMOTE, deriving the sampling density
          from the homographic function f(x) = x / (x + 1) of the distance;
        - if no other minority instance lies inside the region, skip it
          (possible noise);
        - otherwise compute precision, recall, g-mean and F1 score.
        """
        pass

    def _fit_base_classifiers(self, X_train, y_train):
        """Fit every classifier of the pool on the given training data."""
        for clf in self.pool_classifiers:
            clf.fit(X_train, y_train)
from sklearn.model_selection import RepeatedStratifiedKFold from sklearn.metrics import accuracy_score from scipy.stats import rankdata from scipy.stats import ttest_ind from tabulate import tabulate from copy import deepcopy import matplotlib.pyplot as plt import matplotlib.ticker as mticker from sklearn.preprocessing import normalize import os state = 2404 clf_pool = { "My DES_KNN": DES_KNN(random_state=state), "DES_KNN": DESKNN(random_state=state), "KNORA-U": KNORAU(), "KNORA-E": KNORAE(), "ADABoost": AdaBoostClassifier(), } def test(clf_pool, data, method=None): dataset = "./datasets/" + data dataset = np.genfromtxt("%s.csv" % (dataset), delimiter=",") X = dataset[:, :-1] y = dataset[:, -1].astype(int) n_splits = 5 n_repeats = 3 rskf = RepeatedStratifiedKFold(n_splits=n_splits,
def predict(self, X):
    """Predict class labels for X.

    Oversamples the stored selection set (previous_X/previous_y) with the
    configured oversampler, fits the configured DES method on it, and
    delegates prediction to that method (or to the single ensemble member
    when the ensemble has fewer than two classifiers).
    """
    # Check is fit had been called
    check_is_fitted(self, "classes_")
    # Input validation
    X = check_array(X)
    if X.shape[1] != self.X_.shape[1]:
        raise ValueError("number of features does not match")
    X_dsel = self.previous_X
    y_dsel = self.previous_y
    unique, counts = np.unique(y_dsel, return_counts=True)
    # Cap k_neighbors below the size of the first class so the
    # oversamplers have enough same-class neighbours.
    # NOTE(review): counts[0] counts the *first* unique label, which is
    # not necessarily the minority class -- confirm.
    k_neighbors = 5
    if counts[0] - 1 < 5:
        k_neighbors = counts[0] - 1
    # Oversample the selection set; k_neighbors <= 0 disables it.
    if self.oversampler == "SMOTE" and k_neighbors > 0:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_dsel, y_dsel = smote.fit_resample(X_dsel, y_dsel)
    elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
        try:
            svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
            X_dsel, y_dsel = svmSmote.fit_resample(X_dsel, y_dsel)
        except ValueError:
            # SVMSMOTE can fail on degenerate class layouts; fall back
            # to the unresampled selection set.
            pass
    elif self.oversampler == "borderline1" and k_neighbors > 0:
        borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-1')
        X_dsel, y_dsel = borderlineSmote1.fit_resample(X_dsel, y_dsel)
    elif self.oversampler == "borderline2" and k_neighbors > 0:
        borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                           k_neighbors=k_neighbors,
                                           kind='borderline-2')
        X_dsel, y_dsel = borderlineSmote2.fit_resample(X_dsel, y_dsel)
    elif self.oversampler == "ADASYN" and k_neighbors > 0:
        try:
            adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
            X_dsel, y_dsel = adasyn.fit_resample(X_dsel, y_dsel)
        # ADASYN raises when it cannot generate samples; keep the raw
        # selection set in that case.
        except RuntimeError:
            pass
        except ValueError:
            pass
    elif self.oversampler == "SLS" and k_neighbors > 0:
        sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
        X_dsel, y_dsel = sls.sample(X_dsel, y_dsel)
    # Select the dynamic-selection method; KNORAE is the default.
    if self.desMethod == "KNORAE":
        des = KNORAE(self.ensemble_, random_state=42)
    elif self.desMethod == "KNORAU":
        des = KNORAU(self.ensemble_, random_state=42)
    elif self.desMethod == "KNN":
        des = DESKNN(self.ensemble_, random_state=42)
    elif self.desMethod == "Clustering":
        des = DESClustering(self.ensemble_, random_state=42)
    else:
        des = KNORAE(self.ensemble_, random_state=42)
    if len(self.ensemble_) < 2:
        # Dynamic selection needs at least two competing classifiers.
        prediction = self.ensemble_[0].predict(X)
    else:
        des.fit(X_dsel, y_dsel)
        prediction = des.predict(X)
    return prediction