class TomekLinks:
    """Filter that removes opposite-class sample pairs forming a link.

    A pair made of one sample from each class is treated as a link when no
    other sample of either class lies at least as close (Manhattan
    distance) to the pair's midpoint as the pair members themselves. Every
    sample that takes part in at least one link is removed.

    Only two labels are compared: the first two elements of
    ``list(set(classes))``. When more than two labels are present, which
    two get picked follows set iteration order.
    """

    def __init__(self):
        """Create the result container; this filter has no parameters."""
        self.filter = Filter(parameters={})

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Run the filter.

        Args:
            data: 2-D collection of samples (rows) by features (columns).
            classes: 1-D collection of labels, one per row of ``data``.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the removed row
            indices (a sorted ``(k, 1)`` column) and the remaining rows
            passed to ``set_cleanData``.

        Raises:
            IndexError: if ``classes`` holds fewer than two distinct labels.
        """
        levels = list(set(classes))
        # Coerce up front so plain sequences work with fancy indexing below.
        data = np.asarray(data)
        classes = np.array(classes)

        first_idx = np.flatnonzero(classes == levels[0])
        second_idx = np.flatnonzero(classes == levels[1])

        # is_link[r, c] stays True while the pair (first_idx[r], second_idx[c])
        # has not been disproved as a link.
        is_link = np.ones((len(first_idx), len(second_idx)), dtype=bool)
        for row, i in enumerate(first_idx):
            # Depends on the row only, so it is built once per row.
            other_first = np.delete(first_idx, row)
            for column, j in enumerate(second_idx):
                midpoint = (data[i] + data[j]) / 2

                reach_first = np.sum(np.abs(data[i] - midpoint))
                dist_first = np.sum(np.abs(data[other_first] - midpoint),
                                    axis=1)
                if np.any(dist_first <= reach_first):
                    is_link[row, column] = False

                other_second = np.delete(second_idx, column)
                reach_second = np.sum(np.abs(data[j] - midpoint))
                dist_second = np.sum(np.abs(data[other_second] - midpoint),
                                     axis=1)
                if np.any(dist_second <= reach_second):
                    is_link[row, column] = False

        first_remove = first_idx[is_link.any(axis=1)]
        second_remove = second_idx[is_link.any(axis=0)]
        # Sort the flat index vector. Sorting the (k, 1) column directly
        # would act on its length-1 last axis and leave the order unchanged.
        to_remove = np.sort(np.concatenate((first_remove, second_remove)))

        # Keep the (k, 1) column layout that np.argwhere-based code produces.
        self.filter.rem_indx = to_remove.reshape(-1, 1)
        self.filter.set_cleanData(
            np.delete(data, to_remove, axis=0),
            np.delete(classes, to_remove, axis=0))
        return self.filter
def __init__(self,
             nfolds: int = 10,
             agreementLevel: float = 0.7,
             ntrees: int = 500,
             seed: int = 0,
             n_jobs: int = -1):
    """Validate the agreement level and build the fold splitter and forest.

    Args:
        nfolds: number of folds handed to ``KFold``.
        agreementLevel: threshold stored on the instance; must lie in
            the closed interval [0.5, 1].
        ntrees: number of estimators of the random forest.
        seed: random state shared by the splitter and the forest.
        n_jobs: parallelism setting forwarded to the forest.

    Raises:
        ValueError: if ``agreementLevel`` is below 0.5 or above 1.
    """
    too_low = agreementLevel < 0.5
    if too_low or agreementLevel > 1:
        raise ValueError("Agreement Level must be between 0.5 and 1.")

    self.nfolds, self.agreementLevel, self.ntrees = (nfolds, agreementLevel,
                                                     ntrees)
    self.seed, self.n_jobs = seed, n_jobs

    self.k_fold = KFold(self.nfolds, shuffle=True, random_state=self.seed)
    self.clf = RandomForestClassifier(n_estimators=self.ntrees,
                                      random_state=self.seed,
                                      n_jobs=self.n_jobs)

    # Record the hyper-parameters alongside the filter result.
    recorded = {
        "nfolds": self.nfolds,
        "ntrees": self.ntrees,
        "agreementLevel": self.agreementLevel
    }
    self.filter = Filter(parameters=recorded)
class ENN:
    """Filter that drops samples misclassified by their nearest neighbours.

    Each sample is predicted by a k-nearest-neighbours classifier fitted on
    all the other samples (leave-one-out); samples whose prediction differs
    from their label are flagged and removed.
    """

    def __init__(self, neighbours: int = 3, n_jobs: int = -1):
        """Store the settings and create the result container.

        Args:
            neighbours: number of neighbours used by the classifier.
            n_jobs: parallelism setting forwarded to the classifier.
        """
        self.neighbours = neighbours
        self.filter = Filter(parameters={"neighbours": self.neighbours})
        self.n_jobs = n_jobs

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Run the filter.

        Args:
            data: 2-D collection of samples (rows) by features (columns).
            classes: 1-D collection of labels, one per row of ``data``.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the flagged row indices
            (``np.argwhere`` layout) and the kept rows passed to
            ``set_cleanData``.
        """
        # Coerce up front: the boolean-mask indexing and ``reshape`` calls
        # below need arrays, while the signature advertises sequences.
        data = np.asarray(data)
        classes = np.asarray(classes)

        self.isNoise = np.zeros(len(classes), dtype=bool)
        self.clf = KNeighborsClassifier(n_neighbors=self.neighbours,
                                        algorithm='kd_tree',
                                        n_jobs=self.n_jobs)
        for indx in range(len(data)):
            # Leave-one-out: the sample under test must not vote for itself.
            self.clf.fit(np.delete(data, indx, axis=0),
                         np.delete(classes, indx, axis=0))
            pred = self.clf.predict(data[indx].reshape(1, -1))
            # Take the scalar out of the length-1 prediction instead of
            # relying on implicit array-to-scalar conversion on assignment.
            self.isNoise[indx] = pred[0] != classes[indx]

        self.filter.rem_indx = np.argwhere(self.isNoise)
        notNoise = np.invert(self.isNoise)
        self.filter.set_cleanData(data[notNoise], classes[notNoise])
        return self.filter
class DROPv1:
    """Greedy instance-reduction filter driven by leave-one-out accuracy.

    Samples are visited in index order. A sample is removed when the
    leave-one-out hit count of the remaining samples, computed without it
    in the training data, is at least as high as the current hit count.
    """

    def __init__(self, num_neighbours: int = 1):
        """Store the neighbour count and create the result container.

        Args:
            num_neighbours: number of neighbours used by the classifier.
        """
        self.n_neigh = num_neighbours
        # The key is the parameter *name*; the original used the parameter
        # value as the key.
        self.filter = Filter(parameters={"num_neighbours": self.n_neigh})

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Run the filter.

        Args:
            data: 2-D collection of samples (rows) by features (columns).
            classes: 1-D collection of labels, one per row of ``data``.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the sorted 1-D array of
            removed row indices and the remaining rows passed to
            ``set_cleanData``.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        # Honour the configured neighbour count (previously ignored in
        # favour of the classifier's own default).
        self.clf = KNeighborsClassifier(n_neighbors=self.n_neigh)

        # Baseline: leave-one-out hit count on the full data set.
        preds = []
        for indx in range(len(classes)):
            self.clf.fit(np.delete(data, indx, axis=0),
                         np.delete(classes, indx, axis=0))
            preds.append(self.clf.predict(data[indx].reshape(1, -1))[0])
        currentAcc = np.sum(np.array(preds) == classes)

        indxes = np.arange(len(classes))
        toRemove = np.array([], dtype='int64')
        for indx in indxes:
            # Samples still in play (the candidate itself included).
            remaining = np.setdiff1d(indxes, toRemove)
            predsIn = []
            for test_indx in remaining:
                # Train without the candidate, the held-out sample and
                # everything already removed.
                excluded = np.concatenate(([indx], [test_indx], toRemove))
                self.clf.fit(np.delete(data, excluded, axis=0),
                             np.delete(classes, excluded, axis=0))
                predsIn.append(
                    self.clf.predict(data[test_indx].reshape(1, -1))[0])
            predsIn = np.array(predsIn)
            newAcc = np.sum(predsIn == classes[remaining])

            if newAcc >= currentAcc:
                currentAcc = newAcc
                # Removed indices are all smaller than ``indx``, so the
                # candidate sits at ``indx - len(toRemove)`` in ``remaining``.
                if predsIn[indx - len(toRemove)] == classes[indx]:
                    # The candidate's own hit no longer counts once it is
                    # gone. (``--currentAcc`` was a double negation and
                    # changed nothing.)
                    currentAcc -= 1
                toRemove = np.concatenate((toRemove, [indx]))

        self.filter.rem_indx = np.sort(toRemove)
        self.filter.set_cleanData(
            np.delete(data, self.filter.rem_indx, axis=0),
            np.delete(classes, self.filter.rem_indx, axis=0))
        return self.filter
class HARF:
    """Noise filter based on out-of-fold random-forest class probabilities.

    The data is split into folds; for every fold a random forest is fitted
    on the other folds and predicts class probabilities for the held-out
    samples. A sample is flagged when the probability assigned to its own
    label is at most ``1 - agreementLevel``.
    """

    def __init__(self,
                 nfolds: int = 10,
                 agreementLevel: float = 0.7,
                 ntrees: int = 500,
                 seed: int = 0,
                 n_jobs: int = -1):
        """Validate the settings and build the splitter and classifier.

        Args:
            nfolds: number of folds handed to ``KFold``.
            agreementLevel: required agreement, in the interval [0.5, 1].
            ntrees: number of estimators of the random forest.
            seed: random state shared by the splitter and the forest.
            n_jobs: parallelism setting forwarded to the forest.

        Raises:
            ValueError: if ``agreementLevel`` is below 0.5 or above 1.
        """
        if agreementLevel < 0.5 or agreementLevel > 1:
            raise ValueError("Agreement Level must be between 0.5 and 1.")
        self.nfolds = nfolds
        self.agreementLevel = agreementLevel
        self.ntrees = ntrees
        self.seed = seed
        self.n_jobs = n_jobs
        self.k_fold = KFold(nfolds, shuffle=True, random_state=self.seed)
        self.clf = RandomForestClassifier(n_estimators=ntrees,
                                          random_state=seed,
                                          n_jobs=self.n_jobs)
        self.filter = Filter(
            parameters={
                "nfolds": self.nfolds,
                "ntrees": self.ntrees,
                "agreementLevel": self.agreementLevel
            })

    def __call__(self, data: t.Sequence, classes: t.Sequence) -> Filter:
        """Run the filter.

        Args:
            data: 2-D collection of samples (rows) by features (columns).
            classes: 1-D collection of labels, one per row of ``data``.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the flagged row indices
            (``np.argwhere`` layout) and the kept rows passed to
            ``set_cleanData``.
        """
        # Fancy indexing with fold index arrays needs real arrays.
        data = np.asarray(data)
        classes = np.asarray(classes)

        self.splits = self.k_fold.split(data)
        self.isNoise = np.zeros(len(classes), dtype=bool)
        for train_indx, test_indx in self.splits:
            self.clf.fit(data[train_indx], classes[train_indx])
            probs = self.clf.predict_proba(data[test_indx])

            # predict_proba columns follow ``clf.classes_``, so labels are
            # translated to columns explicitly instead of being used as
            # raw indices (which only works for labels 0..k-1 that all
            # occur in the training fold).
            column_of = {
                label: column
                for column, label in enumerate(self.clf.classes_)
            }
            # A label never seen in the training fold gets probability 0.
            own_prob = np.array([
                prob[column_of[label]] if label in column_of else 0.0
                for prob, label in zip(probs, classes[test_indx])
            ])
            self.isNoise[test_indx] = own_prob <= 1 - self.agreementLevel

        self.filter.rem_indx = np.argwhere(self.isNoise)
        notNoise = np.invert(self.isNoise)
        self.filter.set_cleanData(data[notNoise], classes[notNoise])
        return self.filter
class CNN:
    """Condensed-nearest-neighbour style reduction filter.

    A "store" of kept samples is grown from the first sample: any sample
    that a 1-nearest-neighbour classifier fitted on the store gets wrong is
    moved into the store. Samples that end up outside the store are
    reported as removed.
    """

    def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
        """Store the settings and build the 1-NN classifier.

        Args:
            max_neighbours: stored on the instance; not used by the
                classifier, which is fixed at one neighbour.
            n_jobs: parallelism setting forwarded to the classifier.
        """
        self.max_neighbours = max_neighbours
        self.filter = Filter(parameters={})
        self.n_jobs = n_jobs
        self.clf = KNeighborsClassifier(n_neighbors=1, n_jobs=self.n_jobs)

    def __call__(self, data: t.Sequence, classes: t.Sequence):
        """Run the filter.

        Args:
            data: 2-D collection of samples (rows) by features (columns).
            classes: 1-D collection of labels, one per row of ``data``.

        Returns:
            ``self.filter`` with ``rem_indx`` set to the sorted list of
            indices left outside the store and the stored rows passed to
            ``set_cleanData``.
        """
        data = np.asarray(data)
        classes = np.asarray(classes)
        # Kept for parity with the other filters; never updated here.
        self.isNoise = np.array([False] * len(classes))

        # The store starts with the first sample plus the first sample of
        # a different class; the same-class samples in between are
        # trivially classified correctly and go straight to the grab bag.
        firstDifIndx = next(
            (indx for indx, num in enumerate(classes) if num != classes[0]),
            None)
        if firstDifIndx is None:
            # Single-class input: one stored sample explains everything.
            inStore = [0]
            grabBag = list(range(1, len(classes)))
        else:
            inStore = [0, firstDifIndx]
            grabBag = list(range(1, firstDifIndx))

        # The classifier is refitted only when the store actually changes.
        self.clf.fit(data[inStore], classes[inStore])

        first_unseen = len(classes) if firstDifIndx is None else firstDifIndx + 1
        for indx in range(first_unseen, len(classes)):
            pred = self.clf.predict(data[indx].reshape(1, -1))
            if pred[0] == classes[indx]:
                grabBag.append(indx)
            else:
                inStore.append(indx)
                self.clf.fit(data[inStore], classes[inStore])

        # Re-scan the grab bag until a full pass moves nothing.
        keepOn = True
        while keepOn:
            keepOn = False
            # Iterate over a snapshot: removing from the list being
            # iterated would silently skip the following element.
            for indx in list(grabBag):
                pred = self.clf.predict(data[indx].reshape(1, -1))
                if pred[0] != classes[indx]:
                    inStore.append(indx)
                    grabBag.remove(indx)
                    self.clf.fit(data[inStore], classes[inStore])
                    keepOn = True

        self.filter.rem_indx = grabBag
        self.filter.rem_indx.sort()
        notNoise = inStore
        notNoise.sort()
        self.filter.set_cleanData(data[notNoise], classes[notNoise])
        return self.filter
def __init__(self):
    """Create the result container; no hyper-parameters are recorded."""
    no_parameters = {}
    self.filter = Filter(parameters=no_parameters)
def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
    """Store the settings and create the result container.

    Args:
        max_neighbours: stored on the instance and recorded in the
            filter's parameters.
        n_jobs: stored on the instance.
    """
    self.max_neighbours, self.n_jobs = max_neighbours, n_jobs
    recorded = {"max_neighbours": self.max_neighbours}
    self.filter = Filter(parameters=recorded)
def __init__(self, neighbours: int = 3, n_jobs: int = -1):
    """Store the settings and create the result container.

    Args:
        neighbours: stored on the instance and recorded in the filter's
            parameters.
        n_jobs: stored on the instance.
    """
    self.neighbours, self.n_jobs = neighbours, n_jobs
    recorded = {"neighbours": self.neighbours}
    self.filter = Filter(parameters=recorded)
def __init__(self, num_neighbours: int = 1):
    """Store the neighbour count and create the result container.

    Args:
        num_neighbours: stored on the instance as ``n_neigh`` and recorded
            in the filter's parameters under ``"num_neighbours"``.
    """
    self.n_neigh = num_neighbours
    # The key must be the parameter *name*. The unquoted form used the
    # parameter's value as the key (e.g. ``{1: 1}``).
    self.filter = Filter(parameters={"num_neighbours": self.n_neigh})
def __init__(self, max_neighbours: int = 5, n_jobs: int = -1):
    """Store the settings and build the 1-nearest-neighbour classifier.

    Args:
        max_neighbours: stored on the instance; the classifier itself is
            fixed at one neighbour.
        n_jobs: parallelism setting forwarded to the classifier.
    """
    self.max_neighbours, self.n_jobs = max_neighbours, n_jobs
    no_parameters = {}
    self.filter = Filter(parameters=no_parameters)
    self.clf = KNeighborsClassifier(n_jobs=self.n_jobs, n_neighbors=1)