Ejemplo n.º 1
0
class TomekLinks:
    """Filter that drops both members of every isolated opposite-class pair.

    Only the first two distinct labels found in ``classes`` are considered;
    samples carrying any other label are never compared and never removed.
    """

    def __init__(self):
        """Create the filter with an empty parameter record."""
        self.filter = Filter(parameters={})

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Detect linked pairs and store the cleaned data in ``self.filter``.

        A pair (one sample of each class) is kept as a link when no other
        sample of either class lies at a Manhattan distance from the pair's
        midpoint that is less than or equal to the distance of the pair's
        own members from that midpoint.

        Args:
            data: Two-dimensional feature matrix (one row per sample).
            classes: One label per sample; at least two distinct labels are
                required, otherwise an ``IndexError`` is raised.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the removed row indices
            (ascending, as a column vector) and the cleaned data registered
            through ``set_cleanData``.
        """
        levels = list(set(classes))
        data = np.asarray(data)
        classes = np.array(classes)
        # For 1-D labels np.argwhere yields column vectors of shape (n, 1).
        class1Indxes = np.argwhere(classes == levels[0])
        class2Indxes = np.argwhere(classes == levels[1])
        tomekMatrix = np.ones((len(class1Indxes), len(class2Indxes)),
                              dtype=bool)
        for row, c1indx in enumerate(class1Indxes):
            # Everything that depends only on the class-1 sample is computed
            # once per row instead of once per pair.
            point1 = data[c1indx]
            others1 = data[np.delete(class1Indxes, row)]
            for column, c2indx in enumerate(class2Indxes):
                point2 = data[c2indx]
                meanPoint = (point1 + point2) / 2
                dist1 = np.sum(np.abs(others1 - meanPoint), axis=1)
                if np.any(dist1 <= np.sum(np.abs(point1 - meanPoint))):
                    tomekMatrix[row, column] = False
                    # Already rejected; the class-2 check cannot change it.
                    continue
                others2 = data[np.delete(class2Indxes, column)]
                dist2 = np.sum(np.abs(others2 - meanPoint), axis=1)
                if np.any(dist2 <= np.sum(np.abs(point2 - meanPoint))):
                    tomekMatrix[row, column] = False
        c1remove = class1Indxes[np.any(tomekMatrix, axis=1)]
        c2remove = class2Indxes[np.any(tomekMatrix, axis=0)]
        toRemove = np.concatenate((c1remove, c2remove))

        # The index array is a column vector, so it must be sorted along
        # axis 0; sorting along the default last axis (length 1) is a no-op.
        self.filter.rem_indx = np.sort(toRemove, axis=0)
        self.filter.set_cleanData(
            np.delete(data, self.filter.rem_indx, axis=0),
            np.delete(classes, self.filter.rem_indx, axis=0))
        return self.filter
Ejemplo n.º 2
0
 def __init__(self,
              nfolds: int = 10,
              agreementLevel: float = 0.7,
              ntrees: int = 500,
              seed: int = 0,
              n_jobs: int = -1):
     """Validate the agreement level and build the fold splitter and forest.

     Args:
         nfolds: Number of folds handed to ``KFold``.
         agreementLevel: Value that must lie within [0.5, 1].
         ntrees: Number of estimators of the random forest.
         seed: Random state shared by the splitter and the forest.
         n_jobs: Parallel jobs setting forwarded to the forest.

     Raises:
         ValueError: If ``agreementLevel`` is below 0.5 or above 1.
     """
     if agreementLevel > 1 or agreementLevel < 0.5:
         raise ValueError("Agreement Level must be between 0.5 and 1.")

     self.nfolds = nfolds
     self.agreementLevel = agreementLevel
     self.ntrees = ntrees
     self.seed = seed
     self.n_jobs = n_jobs

     # Shuffled folds; the same seed drives both splitter and classifier.
     self.k_fold = KFold(nfolds, shuffle=True, random_state=self.seed)
     forest_options = {
         "n_estimators": ntrees,
         "random_state": seed,
         "n_jobs": self.n_jobs,
     }
     self.clf = RandomForestClassifier(**forest_options)

     # Record the settings alongside the filter result.
     recorded = dict(nfolds=self.nfolds,
                     ntrees=self.ntrees,
                     agreementLevel=self.agreementLevel)
     self.filter = Filter(parameters=recorded)
Ejemplo n.º 3
0
class ENN:
    """Edited-nearest-neighbour style filter.

    A sample is flagged as noise when a k-nearest-neighbours classifier
    trained on every other sample predicts a label different from its own.
    """

    def __init__(self, neighbours: int = 3, n_jobs: int = -1):
        """Store the settings and create the result holder.

        Args:
            neighbours: Number of neighbours used by the classifier.
            n_jobs: Parallel jobs setting forwarded to the classifier.
        """
        self.neighbours = neighbours
        self.filter = Filter(parameters={"neighbours": self.neighbours})
        self.n_jobs = n_jobs

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Flag misclassified samples and store the cleaned data.

        Args:
            data: Two-dimensional feature matrix (one row per sample).
            classes: One label per sample.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the positions of the
            flagged samples (as produced by ``np.argwhere``) and the
            remaining samples registered through ``set_cleanData``.
        """
        # Coerce to arrays so plain sequences support the row reshaping and
        # the boolean-mask indexing used below.
        data = np.asarray(data)
        classes = np.asarray(classes)

        self.isNoise = np.zeros(len(classes), dtype=bool)
        self.clf = KNeighborsClassifier(n_neighbors=self.neighbours,
                                        algorithm='kd_tree',
                                        n_jobs=self.n_jobs)
        for indx in range(len(data)):
            # Leave-one-out: train on everything except the sample under test.
            self.clf.fit(np.delete(data, indx, axis=0),
                         np.delete(classes, indx, axis=0))
            pred = self.clf.predict(data[indx].reshape(1, -1))
            # predict returns a length-1 array; compare its single element.
            self.isNoise[indx] = pred[0] != classes[indx]

        self.filter.rem_indx = np.argwhere(self.isNoise)
        notNoise = np.invert(self.isNoise)
        self.filter.set_cleanData(data[notNoise], classes[notNoise])
        return self.filter
Ejemplo n.º 4
0
class DROPv1:
    """Instance-reduction filter driven by leave-one-out k-NN accuracy.

    Samples are visited in index order; a sample is dropped when removing it
    from the training pool does not lower the number of correctly classified
    remaining samples.
    """

    def __init__(self, num_neighbours: int = 1):
        """Store the neighbourhood size and create the result holder.

        Args:
            num_neighbours: Number of neighbours used by the classifier.
        """
        self.n_neigh = num_neighbours
        # The key is the parameter *name*; using the variable itself would
        # key the record by its value (e.g. ``{1: 1}``).
        self.filter = Filter(parameters={"num_neighbours": self.n_neigh})

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Select the samples to drop and store the cleaned data.

        Args:
            data: Two-dimensional feature matrix (one row per sample).
            classes: One label per sample.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the sorted indices of
            the dropped samples and the remaining samples registered through
            ``set_cleanData``.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        # Honour the configured neighbourhood size; a bare constructor would
        # silently fall back to the estimator's own default.
        self.clf = KNeighborsClassifier(n_neighbors=self.n_neigh)

        indxes = np.arange(len(classes))
        toRemove = np.array([], dtype='int64')

        # Baseline: number of samples classified correctly leave-one-out.
        preds = self._held_out_predictions(data, classes, indxes, toRemove)
        currentAcc = np.sum(preds == classes)

        # NOTE(review): the classifier needs enough training rows for the
        # configured neighbourhood; very small remaining pools may make the
        # estimator raise - confirm against expected dataset sizes.
        for indx in indxes:
            indxremoved = np.setdiff1d(indxes, toRemove)
            excluded = np.concatenate(([indx], toRemove))
            predsIn = self._held_out_predictions(data, classes, indxremoved,
                                                 excluded)
            newAcc = np.sum(predsIn == classes[indxremoved])
            if newAcc >= currentAcc:
                currentAcc = newAcc
                # toRemove only holds indices below indx, so the candidate
                # sits at position indx - len(toRemove) in indxremoved.
                if predsIn[indx - len(toRemove)] == classes[indx]:
                    # The candidate leaves the evaluated set, so its own
                    # correct prediction no longer counts.
                    currentAcc -= 1
                toRemove = np.concatenate((toRemove, [indx]))

        self.filter.rem_indx = np.sort(toRemove)
        self.filter.set_cleanData(
            np.delete(data, self.filter.rem_indx, axis=0),
            np.delete(classes, self.filter.rem_indx, axis=0))
        return self.filter

    def _held_out_predictions(self, data, classes, test_indxes, excluded):
        """Predict each test sample from a pool lacking it and ``excluded``."""
        preds = []
        for test_indx in test_indxes:
            dropped = np.concatenate(([test_indx], excluded))
            self.clf.fit(np.delete(data, dropped, axis=0),
                         np.delete(classes, dropped, axis=0))
            preds.append(self.clf.predict(data[test_indx].reshape(1, -1))[0])
        return np.array(preds)
Ejemplo n.º 5
0
class HARF:
    """Cross-validated random-forest noise filter.

    A sample is flagged as noise when the forest trained on the other folds
    assigns its recorded label a probability of at most
    ``1 - agreementLevel``.
    """

    def __init__(self,
                 nfolds: int = 10,
                 agreementLevel: float = 0.7,
                 ntrees: int = 500,
                 seed: int = 0,
                 n_jobs: int = -1):
        """Validate the settings and build the splitter and the forest.

        Args:
            nfolds: Number of folds handed to ``KFold``.
            agreementLevel: Value that must lie within [0.5, 1].
            ntrees: Number of estimators of the random forest.
            seed: Random state shared by the splitter and the forest.
            n_jobs: Parallel jobs setting forwarded to the forest.

        Raises:
            ValueError: If ``agreementLevel`` is below 0.5 or above 1.
        """
        if (agreementLevel < 0.5 or agreementLevel > 1):
            raise ValueError("Agreement Level must be between 0.5 and 1.")
        self.nfolds = nfolds
        self.agreementLevel = agreementLevel
        self.ntrees = ntrees
        self.seed = seed
        self.n_jobs = n_jobs
        self.k_fold = KFold(nfolds, shuffle=True, random_state=self.seed)
        self.clf = RandomForestClassifier(n_estimators=ntrees,
                                          random_state=seed,
                                          n_jobs=self.n_jobs)
        self.filter = Filter(
            parameters={
                "nfolds": self.nfolds,
                "ntrees": self.ntrees,
                "agreementLevel": self.agreementLevel
            })

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Flag low-agreement samples and store the cleaned data.

        Args:
            data: Two-dimensional feature matrix (one row per sample).
            classes: One label per sample.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the positions of the
            flagged samples (as produced by ``np.argwhere``) and the
            remaining samples registered through ``set_cleanData``.
        """
        # Coerce to arrays so fold index arrays and boolean masks can be used
        # for indexing regardless of the input container.
        data = np.asarray(data)
        classes = np.asarray(classes)

        self.splits = self.k_fold.split(data)
        self.isNoise = np.zeros(len(classes), dtype=bool)
        threshold = 1 - self.agreementLevel
        for train_indx, test_indx in self.splits:
            self.clf.fit(data[train_indx], classes[train_indx])
            probs = self.clf.predict_proba(data[test_indx])
            # predict_proba columns follow clf.classes_, so labels must be
            # translated to column positions instead of being used as
            # indices directly (which only works for labels 0..k-1 when
            # every class occurs in the training fold).
            column_of = {
                label: column
                for column, label in enumerate(self.clf.classes_)
            }
            self.isNoise[test_indx] = [
                # A label unseen during training has zero support.
                (prob[column_of[label]] if label in column_of else 0.0)
                <= threshold
                for prob, label in zip(probs, classes[test_indx])
            ]
        self.filter.rem_indx = np.argwhere(self.isNoise)
        notNoise = np.invert(self.isNoise)
        self.filter.set_cleanData(data[notNoise], classes[notNoise])
        return self.filter
Ejemplo n.º 6
0
class CNN:
    """Condensing filter built on a 1-nearest-neighbour classifier.

    A "store" of samples is grown until the classifier trained on it labels
    every sample left outside correctly; the samples left outside are the
    ones reported for removal.
    """

    def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
        """Store the settings and create the classifier and result holder.

        Args:
            max_neighbours: Stored on the instance; the classifier created
                here always uses a single neighbour.
            n_jobs: Parallel jobs setting forwarded to the classifier.
        """
        self.max_neighbours = max_neighbours
        self.filter = Filter(parameters={})
        self.n_jobs = n_jobs
        self.clf = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)

    def __call__(self, data: t.Sequence, classes: t.Sequence):
        """Build the store and register it as the cleaned data.

        Args:
            data: Two-dimensional feature matrix (one row per sample).
            classes: One label per sample.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the sorted list of
            indices left outside the store and the stored samples registered
            through ``set_cleanData``.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        self.isNoise = np.zeros(len(classes), dtype=bool)

        # Seed the store with the first sample and the first sample whose
        # label differs from it; everything in between starts in the bag.
        firstDifIndx = next((indx for indx, num in enumerate(classes)
                             if num != classes[0]), None)
        if firstDifIndx is None:
            # Single-class input: the first sample alone forms the store.
            inStore = [0]
            firstDifIndx = len(classes)
        else:
            inStore = [0, firstDifIndx]
        grabBag = list(range(1, firstDifIndx))

        # The classifier is refitted only when the store actually changes.
        self.clf.fit(data[inStore], classes[inStore])
        for indx in range(firstDifIndx + 1, len(classes)):
            pred = self.clf.predict(data[indx].reshape(1, -1))
            if pred[0] == classes[indx]:
                grabBag.append(indx)
            else:
                inStore.append(indx)
                self.clf.fit(data[inStore], classes[inStore])

        # Repeat full passes over the bag until one pass moves nothing.
        keepOn = True
        while keepOn:
            keepOn = False
            stillInBag = []
            # The bag is rebuilt rather than mutated in place, so no element
            # is skipped during a pass.
            for indx in grabBag:
                pred = self.clf.predict(data[indx].reshape(1, -1))
                if pred[0] != classes[indx]:
                    inStore.append(indx)
                    self.clf.fit(data[inStore], classes[inStore])
                    keepOn = True
                else:
                    stillInBag.append(indx)
            grabBag = stillInBag

        grabBag.sort()
        self.filter.rem_indx = grabBag
        inStore.sort()
        self.filter.set_cleanData(data[inStore], classes[inStore])
        return self.filter
Ejemplo n.º 7
0
 def __init__(self):
     """Create the result holder with an empty parameter record."""
     no_parameters = {}
     self.filter = Filter(parameters=no_parameters)
Ejemplo n.º 8
0
 def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
     """Store the settings and record ``max_neighbours`` with the filter.

     Args:
         max_neighbours: Value kept on the instance and recorded in the
             filter's parameters.
         n_jobs: Parallel jobs setting kept on the instance.
     """
     self.max_neighbours = max_neighbours
     recorded = dict(max_neighbours=self.max_neighbours)
     self.filter = Filter(parameters=recorded)
     self.n_jobs = n_jobs
Ejemplo n.º 9
0
 def __init__(self, neighbours: int = 3, n_jobs: int = -1):
     """Store the settings and record ``neighbours`` with the filter.

     Args:
         neighbours: Value kept on the instance and recorded in the
             filter's parameters.
         n_jobs: Parallel jobs setting kept on the instance.
     """
     self.neighbours = neighbours
     recorded = dict(neighbours=self.neighbours)
     self.filter = Filter(parameters=recorded)
     self.n_jobs = n_jobs
Ejemplo n.º 10
0
 def __init__(self, num_neighbours: int = 1):
     """Store the neighbourhood size and record it with the filter.

     Args:
         num_neighbours: Value kept on the instance as ``n_neigh`` and
             recorded in the filter's parameters.
     """
     self.n_neigh = num_neighbours
     # The key is the parameter *name*; using the variable itself would key
     # the record by its value (e.g. ``{1: 1}``).
     self.filter = Filter(parameters={"num_neighbours": self.n_neigh})
Ejemplo n.º 11
0
 def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
     """Store the settings and build the single-neighbour classifier.

     Args:
         max_neighbours: Value kept on the instance; it is not passed to
             the classifier created here.
         n_jobs: Parallel jobs setting forwarded to the classifier.
     """
     self.max_neighbours = max_neighbours
     no_parameters = {}
     self.filter = Filter(parameters=no_parameters)
     self.n_jobs = n_jobs
     classifier_options = {"n_neighbors": 1, "n_jobs": self.n_jobs}
     self.clf = KNeighborsClassifier(**classifier_options)