def start(**kwargs):
    """Run the AMANDA (Dynamic) semi-supervised drift-adaptation loop.

    Trains an initial classifier on the first `initialLabeledData` labeled
    points, then processes the remaining stream either in fixed batches or
    one instance at a time (pool mode), dynamically choosing the core-support
    cutting percentage via `cuttingPercentage` (Hellinger-distance based).

    Returns a tuple:
        (method name, accuracy list, last X, last y,
         per-step X/y/Ut/yt/classifier/prediction histories)
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: {} as classifier and {} and Hellinger distance as dynamic CSE"
          .format(clfName, densityFunction))

    usePCA = False
    arrAcc = []
    # Histories kept for decision-boundary plotting.
    arrX, arrY, arrUt, arrYt, arrClf, arrPredicted = [], [], [], [], [], []

    initialDataLength = 0
    finalDataLength = initialLabeledData

    # ***** Box 1 ***** initial labeled data and first model
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    reset = True  # True: keep only the newest distribution (time-series like)

    if isBatchMode:
        for t in range(batches):
            # Slide the window forward by one batch.
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch

            # ***** Box 2 ***** load the next unlabeled batch (yt kept for eval)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # Record state for decision-boundary plots.
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)

            # Evaluate classification on this batch.
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** dynamic cutting percentage for this step
            excludingPercentage = cuttingPercentage(X, Ut, t)

            allInstances = []
            allLabels = []

            # ***** Box 5 ***** density estimation per class
            if reset:
                # Only the latest distribution (time-series like).  O(n^2 d)
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              densityFunction)
            else:
                # Past + current data (concept-drift like).
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  # O(n log(n) c)

            # ***** Box 6 ***** keep only core supports, retrain
            if reset:
                # NOTE(review): selection uses the true labels yt here while the
                # densities above used `predicted` — confirm this is intended.
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)  # O(n)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)
            clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    else:
        # Pool (instance-by-instance) mode.
        t = 0
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False  # pool mode accumulates past + current data
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)  # raw predictions; turned into accuracy below
            inst.append(Ut)
            labels.append(predicted)

            # Record state for decision-boundary plots.
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            # Once the pool is full, compact it and retrain.
            if len(inst) == poolSize:
                inst = np.array(inst)
                excludingPercentage = cuttingPercentage(X, inst, t)
                t += 1

                if reset:
                    # Only the latest distribution (time-series like).
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    # Past + current data (concept-drift like).
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)

                if reset:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        # Convert the per-instance predictions into per-chunk accuracies.
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return ("AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt,
            arrClf, arrPredicted)
def fit(self, dataValues, dataLabels=None):
    """Fit the AMANDA-style model on a labeled stream.

    Uses the first `self.initialLabeledData` points as labeled seed data,
    then classifies the rest in batches (or instance-by-instance pool mode),
    compacting the training set density-wise with a fixed
    `self.excludingPercentage` after each step.

    Stores the per-chunk accuracy list in `self.threshold_` and returns self.

    FIX: the pool-mode call to `compactingDataDensityBased2` referenced a bare
    `excludingPercentage` name that does not exist in this scope (everywhere
    else the attribute `self.excludingPercentage` is used), so filling the
    pool raised NameError.  It now uses `self.excludingPercentage`.
    """
    arrAcc = []
    classes = list(set(dataLabels))
    initialDataLength = 0
    # Convert "keep" percentage into "exclude" percentage (done once per fit).
    self.excludingPercentage = 1 - self.excludingPercentage
    finalDataLength = self.initialLabeledData
    reset = True  # True: keep only the newest distribution (time-series like)

    # ***** Box 1 ***** initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, self.usePCA)

    if self.isBatchMode:
        for t in range(self.batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + self.sizeOfBatch

            # ***** Box 2 ***** next batch (yt kept only for evaluation)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          self.usePCA)

            # ***** Box 3 ***** train and classify
            clf = classifiers.classifier(X, y, self.K, self.clfName)
            predicted = clf.predict(Ut)

            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4/5 ***** per-class densities on the chosen data mix
            allInstances = []
            allLabels = []
            if reset:
                # Only the latest distribution (time-series like).
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              self.densityFunction)
            else:
                # Past + current data (concept-drift like).
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, predicted])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              self.densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, self.excludingPercentage)

            # ***** Box 6 ***** keep only the selected core supports
            if reset:
                X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)
    else:
        # Pool (instance-by-instance) mode.
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, self.K, self.clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues),
                                                      self.usePCA)
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))
            arrAcc.append(predicted)  # raw predictions; scored below
            inst.append(Ut)
            labels.append(predicted)

            if len(inst) == self.poolSize:
                inst = np.asarray(inst)

                if reset:
                    # Only the latest distribution (time-series like).
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  self.densityFunction)
                else:
                    # Past + current data (concept-drift like).
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes,
                                                  self.densityFunction)

                # BUGFIX: was a bare `excludingPercentage` (NameError).
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, self.excludingPercentage)

                if reset:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)

                clf = classifiers.classifier(X, y, self.K, self.clfName)
                inst = []
                labels = []

        # Convert per-instance predictions into per-chunk accuracies.
        arrAcc = split_list(arrAcc, self.batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)

    # returns accuracy array and last selected points
    self.threshold_ = arrAcc
    return self
def fit(self, dataValues, dataLabels=None):
    """Fit the Bhattacharyya-score variant of the drift-adaptation model.

    Batch mode selects core supports per class with keep-percentages derived
    from Bhattacharyya scores between the previous and current class
    distributions; pool mode uses the fixed `self.excludingPercentage`.

    Stores the per-chunk accuracy list in `self.threshold_` and returns self.

    FIX: pool mode split `arrAcc` into a hard-coded 100 chunks; the sibling
    `fit` implementation uses `self.batches`, and 100 is only correct when
    `self.batches == 100`.  It now uses `self.batches` consistently.
    """
    arrAcc = []
    classes = list(set(dataLabels))
    initialDataLength = 0
    finalDataLength = self.initialLabeledData

    # ***** Box 1 ***** initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, self.usePCA)

    if self.isBatchMode:
        for t in range(self.batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + self.sizeOfBatch

            # ***** Box 2 ***** next batch (yt kept only for evaluation)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          self.usePCA)

            # ***** Box 3 ***** classify the new batch
            predicted = classifiers.classify(X, y, Ut, self.K, classes,
                                             self.clfName)

            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** densities of the new points per predicted class
            pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                          self.densityFunction)
            instancesXByClass, instancesUtByClass = util.unifyInstancesByClass(
                X, y, Ut, predicted, classes)

            # ***** Box 5 ***** per-class keep percentage from the
            # Bhattacharyya score between old and new class distributions
            keepPercentageByClass = util.getBhattacharyyaScoresByClass(
                instancesXByClass, instancesUtByClass, classes)
            selectedIndexes = util.compactingDataDensityBased3(
                pdfsByClass, keepPercentageByClass)

            # ***** Box 6 ***** keep only the selected core supports
            X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
    else:
        # Pool (instance-by-instance) mode.
        inst = []
        labels = []
        clf = classifiers.knn(X, y, self.K)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues),
                                                      self.usePCA)
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))
            arrAcc.append(predicted)  # raw predictions; scored below
            inst.append(Ut)
            labels.append(predicted)

            if len(inst) == self.poolSize:
                inst = np.asarray(inst)
                pdfsByClass = util.pdfByClass(inst, labels, classes,
                                              self.densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, self.excludingPercentage)
                X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                clf = classifiers.knn(X, y, self.K)
                inst = []
                labels = []

        # Convert per-instance predictions into per-chunk accuracies.
        # BUGFIX: was hard-coded `split_list(arrAcc, 100)`.
        arrAcc = split_list(arrAcc, self.batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)

    # returns accuracy array and last selected points
    self.threshold_ = arrAcc
    return self
def start(**kwargs):
    """Run the AMANDA (Fixed) core-support-extraction loop.

    Identical pipeline to the dynamic variant, but the cutting percentage is
    a fixed user-supplied value (`excludingPercentage`) instead of being
    recomputed each step.

    Returns a tuple:
        (method name, accuracy list, last X, last y,
         per-step X/y/Ut/yt/classifier/prediction histories)
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: {} as classifier and {} as core support extraction "
          "with cutting data method".format(clfName, densityFunction))

    usePCA = False
    arrAcc = []
    # Histories kept for decision-boundary plotting.
    arrX, arrY, arrUt, arrYt, arrClf, arrPredicted = [], [], [], [], [], []

    initialDataLength = 0
    # Convert "keep" percentage into "exclude" percentage.
    excludingPercentage = 1 - excludingPercentage
    finalDataLength = initialLabeledData
    reset = True  # True: keep only the newest distribution (time-series like)

    # ***** Box 1 ***** initial labeled data and first model
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)

    if isBatchMode:
        for t in range(batches):
            # Slide the window forward by one batch.
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch

            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # Record state for decision-boundary plots.
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            # classifies
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)

            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** per-class densities on the chosen data mix
            allInstances = []
            allLabels = []
            if reset:
                # Only the latest distribution (time-series like).  O(nmd)
                # NOTE(review): densities use the true labels yt here, unlike
                # the dynamic variant which uses `predicted` — confirm intended.
                pdfsByClass = util.pdfByClass(Ut, yt, classes, densityFunction)
            else:
                # Past + current data (concept-drift like).
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  # O(n log(n) c)

            # ***** Box 6 ***** keep only core supports, retrain
            if reset:
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
            else:
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)  # O(n)

            # training
            clf = classifiers.classifier(X, y, K, clfName)  # O(nd+kn)
    else:
        # Pool (instance-by-instance) mode.
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False  # pool mode accumulates past + current data
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)  # raw predictions; scored below
            inst.append(Ut)
            labels.append(predicted)

            # Record state for decision-boundary plots.
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            # Once the pool is full, compact it and retrain.
            if len(inst) == poolSize:
                inst = np.asarray(inst)

                if reset:
                    # Only the latest distribution (time-series like).
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    # Past + current data (concept-drift like).
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)

                if reset:
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        # Convert the per-instance predictions into per-chunk accuracies.
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return ("AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt,
            arrClf, arrPredicted)
def start(**kwargs):
    """Run the SVM-boundary-removal + density-based CSE pipeline.

    Batch mode: classifies each batch, removes points that are SVM support
    vectors (boundary points), then keeps the densest remainder as the next
    training set.  Pool mode: accumulates non-boundary instances one at a
    time and compacts the pool when it reaches `poolSize`.

    Returns (method name, accuracy list, last X, last y).

    FIX: in pool mode the membership test `indexUt in clfSVM.support_` used
    `indexUt = finalDataLength`, computed once before the loop.  The freshly
    appended sample's row in `np.vstack([inst, Ut])` is `len(inst)`, which
    changes as `inst` grows and is reset after each compaction — so after the
    first append the old code tested the wrong row.  The index is now
    computed per iteration.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    usePCA = kwargs["usePCA"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: SVM as classifier and boundary remover and {} as CSE "
          "with cutting data method".format(densityFunction))

    arrAcc = []
    initialDataLength = 0
    # Convert "keep" percentage into "exclude" percentage.
    excludingPercentage = 1 - excludingPercentage
    finalDataLength = initialLabeledData

    # ***** Box 1 ***** initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)

    if isBatchMode:
        for t in range(batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch

            # ***** Box 2 ***** next batch (yt kept only for evaluation)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # ***** Box 3 ***** SVM fitted for its support vectors; the
            # actual prediction comes from the configured classifier.
            clf = classifiers.svmClassifier(X, y)
            predicted = classifiers.classify(X, y, Ut, K, classes, clfName)

            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** drop boundary points before the next batch.
            # NOTE(review): clf.support_ indexes the training set (X), yet it
            # is applied to filter Ut — confirm removeBoundaryPoints expects
            # this; preserved as-is.
            Ut, predicted = util.removeBoundaryPoints(clf.support_, Ut,
                                                      predicted)

            # pdfs of each remaining point per predicted class
            pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                          densityFunction)

            # ***** Box 5 ***** keep the densest fraction
            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)

            # ***** Box 6 *****
            X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
    else:
        # Pool (instance-by-instance) mode.
        clf = classifiers.knn(X, y, K)
        inst = np.copy(X)
        labels = np.copy(y)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        for Ut, yt in zip(remainingX, remainingY):
            Ut = Ut.reshape(1, -1)
            predicted = clf.predict(Ut)
            arrAcc.append(predicted)  # raw predictions; scored below

            # Fit an SVM on pool + new point to see whether the new point
            # lands on the decision boundary.
            clfSVM = classifiers.svmClassifier(np.vstack([inst, Ut]),
                                               np.hstack([labels, predicted]))

            # BUGFIX: the new sample's row index in the stacked training set
            # is len(inst); the old code used a stale pre-loop constant.
            indexUt = len(inst)
            if indexUt in clfSVM.support_:
                # Boundary point: do not add it to the pool.
                pass
            else:
                inst = np.vstack([inst, Ut])
                labels = np.hstack([labels, predicted])

            # Once the pool is full, compact it and retrain.
            if len(inst) == poolSize:
                inst = np.asarray(inst)
                pdfsByClass = util.pdfByClass(inst, labels, classes,
                                              densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)
                X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                clf = classifiers.knn(X, y, K)
                inst = np.copy(X)
                labels = np.copy(y)

        # Convert per-instance predictions into per-chunk accuracies.
        # NOTE(review): chunk count is hard-coded to 100 here, unlike the
        # AMANDA variants which use `batches` — confirm before changing.
        arrAcc = split_list(arrAcc, 100)
        arrAcc = makeAccuracy(arrAcc, remainingY)

    # returns accuracy array and last selected points
    return "SVM removing boundary + GMM", arrAcc, X, y