Example #1
0
def test_desknn(knn_methods, voting):
    """Integration check: DESKNN built with the given kNN backend and
    voting scheme must reproduce the reference accuracy on the test split."""
    pool, X_dsel, y_dsel, X_test, y_test = setup_classifiers()

    model = DESKNN(pool, knn_classifier=knn_methods, voting=voting)
    model.fit(X_dsel, y_dsel)

    score = model.score(X_test, y_test)
    assert np.isclose(score, 0.9787234042553191)
Example #2
0
def test_desknn_proba(knn_methods):
    """Integration check: soft-voting DESKNN probabilities must match the
    stored reference array."""
    pool, X_dsel, y_dsel, X_test, y_test = setup_classifiers()

    model = DESKNN(pool, knn_classifier=knn_methods, voting='soft')
    model.fit(X_dsel, y_dsel)

    predicted = model.predict_proba(X_test)
    reference = np.load(
        'deslib/tests/expected_values/desknn_proba_integration.npy')
    assert np.allclose(predicted, reference)
Example #3
0
 def __init__(self, pool_classifiers, X_train, y_train, X_test, y_test, mode=0, neighbourhood=13):
     """Store the classifier pool, the data splits and the configuration.

     mode and neighbourhood are kept for later use by the enclosing class;
     X_dsel / y_dsel start empty and are presumably filled later.
     DESKNN is created with default parameters only.
     """
     self.mode = mode
     self.neighbourhood = neighbourhood
     self.pool_classifiers = pool_classifiers
     self.X_train = X_train
     self.y_train = y_train
     self.X_test = X_test
     self.y_test = y_test
     self.X_dsel = []
     self.y_dsel = []
     # TODO: pass the DESKNN parameters explicitly — either hard-coded here
     # or forwarded through this constructor.  (translated from Polish)
     self.desknn = DESKNN(pool_classifiers)
Example #4
0
def initialize_ds(pool_classifiers, X, y, k=5):
    """Instantiate and fit the dynamic-selection techniques under study.

    Returns a pair ``(list_ds, names)``: the fitted DS models and their
    matching display labels, in the same order.
    """
    list_ds = [
        KNORAU(pool_classifiers, k=k),
        KNORAE(pool_classifiers, k=k),
        OLA(pool_classifiers, k=k),
        LCA(pool_classifiers, k=k),
        MLA(pool_classifiers, k=k),
        DESKNN(pool_classifiers, k=k),
        MCB(pool_classifiers, k=k),
        Rank(pool_classifiers, k=k),
        KNOP(pool_classifiers, k=k),
        METADES(pool_classifiers, k=k),
    ]
    names = [
        'KNORA-U', 'KNORA-E', 'OLA', 'LCA', 'MLA', 'DESKNN', 'MCB', 'RANK',
        'KNOP', 'META-DES'
    ]

    # fit every technique on the DSEL data
    for technique in list_ds:
        technique.fit(X, y)

    return list_ds, names
Example #5
0
class DESNEW:
    """Experimental wrapper around DESKNN that evaluates it under four
    data-preparation strategies selected by ``mode``:

    0 — fit the pool on the raw train split, score DESKNN on the test split;
    1 — like 0 but the train split is SMOTE-oversampled first;
    2 — build local DSEL regions around minority test samples, report the
        best region by g-mean;
    3 — like 2 but with a SMOTE-oversampled train split.
    """

    def __init__(self, pool_classifiers, X_train, y_train, X_test, y_test, mode=0, neighbourhood=13):
        """Store the pool, the data splits and the configuration.

        mode: evaluation strategy (0-3, see class docstring).
        neighbourhood: k used by the NearestNeighbors search in _test_sample.
        """
        self.mode = mode
        self.neighbourhood = neighbourhood
        self.pool_classifiers = pool_classifiers
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.X_dsel = []
        self.y_dsel = []
        # TODO: pass the DESKNN parameters explicitly — either hard-coded
        # here or forwarded through this constructor. (translated from Polish)
        self.desknn = DESKNN(pool_classifiers)


    def return_metrics(self):
        """Run the strategy selected by ``self.mode`` and return its metrics.

        Returns a metric-name -> value dict (modes 0/1), the best per-region
        dict by g-mean (modes 2/3), or 0 when no DSEL region was produced.
        """
        if self.mode == 0:
            # Plain: raw train split, score on the test split.
            self._fit_base_classifiers(self.X_train, self.y_train)
            metrics = self.fit_predict_return_simple(self.X_test, self.y_test)

        elif self.mode == 1:
            # SMOTE-oversample the train split before fitting the pool.
            X_train1, y_train1 = SMOTE(random_state=42).fit_resample(self.X_train, self.y_train)
            self._fit_base_classifiers(X_train1, y_train1)
            metrics = self.fit_predict_return_simple(self.X_test, self.y_test)
            
        elif self.mode == 2:
            # Build per-minority-sample DSEL regions from the test split.
            self._fit_base_classifiers(self.X_train, self.y_train)
            self._test_sample(self.X_test, self.y_test)
            metrics = self.fit_predict_return_score(self.X_dsel, self.y_dsel)
        else:
            # Mode 3: SMOTE on the train split + DSEL regions from the test split.
            X_train1, y_train1 = SMOTE(random_state=42).fit_resample(self.X_train, self.y_train)
            self._fit_base_classifiers(X_train1, y_train1)
            self._test_sample(self.X_test, self.y_test)
            metrics = self.fit_predict_return_score(self.X_dsel, self.y_dsel)
        return metrics
        

    def fit_predict_return_simple(self, X_test, y_test):
        """Fit DESKNN on (X_test, y_test) and score it on the same data.

        Returns a dict with accuracy, F1, g-mean, precision and recall.

        NOTE(review): fitting and predicting on the same split measures
        resubstitution performance, not generalisation — confirm intended.
        """
        metrics_names = ['acc', 'f1', 'gmean', 'prec', 'recall']
        self.desknn.fit(X_test, y_test)
        y_pred = self.desknn.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        gmean = geometric_mean_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        metrics_results = dict(zip(metrics_names, [acc, f1, gmean, prec, recall]))
        return metrics_results

    def fit_predict_return_score(self, X_dsel, y_dsel):
        """Fit/score DESKNN on each DSEL region; return the best by g-mean.

        X_dsel / y_dsel are parallel lists of per-region arrays (built by
        _test_sample).  Returns the metrics dict of the region with the
        highest g-mean, or 0 when the lists are empty.  Each region is both
        fitted and evaluated on itself (see note above).
        """
        predictions = []
        metrics = []
        metrics_names = ['acc', 'f1', 'gmean', 'prec', 'recall']
        
        for X_d, y_d in zip(X_dsel, y_dsel):
            self.desknn.fit(X_d, y_d)
            y_pred = self.desknn.predict(X_d)
            predictions.append(y_pred)
            acc = accuracy_score(y_d, y_pred)
            f1 = f1_score(y_d, y_pred)
            gmean = geometric_mean_score(y_d, y_pred)
            prec = precision_score(y_d, y_pred)
            recall = recall_score(y_d, y_pred)
            metrics_results = dict(zip(metrics_names, [acc, f1, gmean, prec, recall]))
            metrics.append(metrics_results)
        
        # Superseded per-region scoring kept for reference:
        #for y_pred in predictions:
            #acc = accuracy_score(y_dsel, y_pred)
            #f1 = f1_score(y_dsel, y_pred)
            #gmean = geometric_mean_score(y_test, y_pred)
            #prec = precision_score(y_test, y_pred)
            #recall = recall_score(y_test, y_pred)
            
            #metrics_results = dict(zip(metrics_names, [acc, f1, gmean, prec, recall]))
            #metrics.append(metrics_results)
            
        gmeans = []

        for metrics_results in metrics:
            gmeans.append(metrics_results['gmean'])

        if not gmeans:
            # No region survived _test_sample's filters.
            metrics = 0
            return metrics
        else:
            idx = np.argmax(gmeans)
            return metrics[idx]
    
    
    def _test_sample(self, X_test, y_test):
        """Build SMOTE-augmented DSEL regions around minority test samples.

        For each minority-class instance, take its self.neighbourhood nearest
        neighbours in the test set; if the region is mixed enough, oversample
        it with SMOTE (ratio driven by distance) and append the result to
        self.X_dsel / self.y_dsel.  Assumes binary classification.
        """
        # sampling
        # binary classification
        unique_vals, counts = np.unique(y_test, return_counts=True)
        minority_class = unique_vals[np.argmin(counts)]
        majority_class = unique_vals[np.argmax(counts)]
        minority_class_indices = np.where(y_test == minority_class)
        # test-set instances belonging to the minority class
        minority_samples = X_test[minority_class_indices]
        
        neigh = NearestNeighbors(n_neighbors=self.neighbourhood, metric="euclidean")
        # k nearest neighbours searched over the whole test set
        neigh.fit(X_test)
        
        #for sample in minority_samples:
        distances, indices = neigh.kneighbors(minority_samples)
            
        for i in range(0, len(indices)):
                # min distance for ratio
                # neighbourhood that will be fed to SMOTE
            y_i = indices[i]
            uni_val, cnts = np.unique(y_test[y_i], return_counts=True)
            # NOTE(review): cnts[0] is the count of the lowest label value in
            # this neighbourhood, and cnts[minority_class]/cnts[majority_class]
            # index counts by *label value* — this is only valid when the
            # labels are exactly 0 and 1 and both occur here; confirm.
            if cnts[0]<13:
                if cnts[minority_class]>2 and cnts[minority_class] < cnts[majority_class]:
                    for j in range(1,len(y_i)):
                
                        # NOTE(review): y_test[j] indexes by the loop counter,
                        # not by the neighbour index y_i[j] — looks like a
                        # bug; confirm the intended behaviour.
                        if y_test[j] == minority_class:
                            distance = distances[i][j]
                            # homographic ramp mapping distance -> SMOTE ratio
                            dist_func = lambda x: 1.5*x / (x + 1)
                            ratio = dist_func(distance)
                            if ratio<0.75:
                                sm = SMOTE(sampling_strategy=0.75, random_state=42,k_neighbors=2)
                            elif ratio>1:
                                sm = SMOTE(sampling_strategy=1, random_state=42, k_neighbors=2)
                            else:
                                sm = SMOTE(sampling_strategy=ratio, random_state=42, k_neighbors=2)

                            X_resampled, y_resampled = sm.fit_resample(X_test[y_i], y_test[y_i])

                            self.X_dsel.append(X_resampled)
                            self.y_dsel.append(y_resampled)
                            break
                else:
                    # Region kept unchanged when it is too pure or too sparse
                    # to oversample safely.
                    self.X_dsel.append(X_test[y_i])
                    self.y_dsel.append(y_test[y_i])
                

                
        
    def estimate_distance(self):
        """Planned (unimplemented) distance-based sampling routine.

        Original design notes:
        #check only samples from minor class / classes
        #for given k neighbours (k>10) compute the distance (euclidean, Mahalonobis) from nearest same class sample
        #sample the region using SMOTE, computing density of sampling by homographic function f(x)=x/x+1 according to distance from given sample
        #if no other minor class instance inside the region -> break, cause of noise possibility
        #in other cases -> compute precision, recall, g-mean and f1 score
        """
        pass
    
    def _fit_base_classifiers(self, X_train, y_train):
        """Fit every classifier of the pool on the given training data."""
        # sampling
        for clf in self.pool_classifiers:
            clf.fit(X_train, y_train)
            
Example #6
0
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score
from scipy.stats import rankdata
from scipy.stats import ttest_ind
from tabulate import tabulate
from copy import deepcopy
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from sklearn.preprocessing import normalize
import os

# Fixed RNG seed shared by the classifiers that accept one, so repeated
# runs of the experiment are reproducible.
state = 2404

# Methods under comparison: the custom DES_KNN implementation against the
# reference DESKNN, the two KNORA variants and an AdaBoost baseline.
clf_pool = {
    "My DES_KNN": DES_KNN(random_state=state),
    "DES_KNN": DESKNN(random_state=state),
    "KNORA-U": KNORAU(),
    "KNORA-E": KNORAE(),
    "ADABoost": AdaBoostClassifier(),
}


def test(clf_pool, data, method=None):
    dataset = "./datasets/" + data
    dataset = np.genfromtxt("%s.csv" % (dataset), delimiter=",")
    X = dataset[:, :-1]
    y = dataset[:, -1].astype(int)

    n_splits = 5
    n_repeats = 3
    rskf = RepeatedStratifiedKFold(n_splits=n_splits,
Example #7
0
    def predict(self, X):
        """Predict class labels for X.

        The DSEL kept in previous_X / previous_y is optionally oversampled
        with the technique named by self.oversampler, the DES method named
        by self.desMethod is fitted on it, and that method predicts X.
        Falls back to the single base classifier when the ensemble has
        fewer than two members.
        """
        # Check is fit had been called
        check_is_fitted(self, "classes_")

        # Input validation
        X = check_array(X)
        if X.shape[1] != self.X_.shape[1]:
            raise ValueError("number of features does not match")

        # DSEL carried over from a previous call — presumably the prior
        # data chunk in a stream setting; verify against the caller.
        X_dsel = self.previous_X
        y_dsel = self.previous_y

        unique, counts = np.unique(y_dsel, return_counts=True)

        # Cap the oversampler's k_neighbors by the size of the first class.
        # NOTE(review): counts[0] is the count of the lowest label value,
        # not necessarily the minority class — confirm.
        k_neighbors = 5
        if counts[0] - 1 < 5:
            k_neighbors = counts[0] - 1

        # Oversample the DSEL; variants that can fail on degenerate inputs
        # deliberately fall back to the raw DSEL (best-effort).
        if self.oversampler == "SMOTE" and k_neighbors > 0:
            smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
            X_dsel, y_dsel = smote.fit_resample(X_dsel, y_dsel)
        elif self.oversampler == "svmSMOTE" and k_neighbors > 0:
            try:
                svmSmote = SVMSMOTE(random_state=42, k_neighbors=k_neighbors)
                X_dsel, y_dsel = svmSmote.fit_resample(X_dsel, y_dsel)
            except ValueError:
                pass
        elif self.oversampler == "borderline1" and k_neighbors > 0:
            borderlineSmote1 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-1')
            X_dsel, y_dsel = borderlineSmote1.fit_resample(X_dsel, y_dsel)
        elif self.oversampler == "borderline2" and k_neighbors > 0:
            borderlineSmote2 = BorderlineSMOTE(random_state=42,
                                               k_neighbors=k_neighbors,
                                               kind='borderline-2')
            X_dsel, y_dsel = borderlineSmote2.fit_resample(X_dsel, y_dsel)
        elif self.oversampler == "ADASYN" and k_neighbors > 0:
            try:
                adasyn = ADASYN(random_state=42, n_neighbors=k_neighbors)
                X_dsel, y_dsel = adasyn.fit_resample(X_dsel, y_dsel)
            except RuntimeError:
                pass
            except ValueError:
                pass
        elif self.oversampler == "SLS" and k_neighbors > 0:
            sls = Safe_Level_SMOTE(n_neighbors=k_neighbors)
            X_dsel, y_dsel = sls.sample(X_dsel, y_dsel)

        # Choose the dynamic-selection technique; unknown names fall back
        # to KNORA-E.
        if self.desMethod == "KNORAE":
            des = KNORAE(self.ensemble_, random_state=42)
        elif self.desMethod == "KNORAU":
            des = KNORAU(self.ensemble_, random_state=42)
        elif self.desMethod == "KNN":
            des = DESKNN(self.ensemble_, random_state=42)
        elif self.desMethod == "Clustering":
            des = DESClustering(self.ensemble_, random_state=42)
        else:
            des = KNORAE(self.ensemble_, random_state=42)

        # Dynamic selection needs at least two members; otherwise delegate
        # straight to the lone base classifier.
        if len(self.ensemble_) < 2:
            prediction = self.ensemble_[0].predict(X)
        else:
            des.fit(X_dsel, y_dsel)
            prediction = des.predict(X)

        return prediction