import numpy as np

# Project-local modules from this repository.
import util
import classifiers
import metrics


def fit(self, dataValues, dataLabels=None):
    arrAcc = []
    classes = list(set(dataLabels))
    initialDataLength = 0
    finalDataLength = self.initialLabeledData

    # ***** Box 1 *****
    # Initial labeled data.
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, self.usePCA)

    for t in range(self.batches):
        initialDataLength = finalDataLength
        finalDataLength = finalDataLength + self.sizeOfBatch

        # ***** Box 2 *****
        Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                      finalDataLength, self.usePCA)

        # ***** Box 3 *****
        clf = classifiers.labelPropagation(X, y, self.K)
        predicted = clf.predict(Ut)

        # Evaluating classification.
        arrAcc.append(metrics.evaluate(yt, predicted))

        # ***** Box 4 *****
        # Best density model per class (GMM selected by BIC), estimated on
        # the current labeled set and applied to the newly arrived points.
        indexesByClass = util.slicingClusteredData(y, classes)
        bestModelSelectedByClass = util.loadBestModelByClass(X, indexesByClass)

        # ***** Box 5 *****
        # Keep the p% smallest Mahalanobis distances per class, based on the paper.
        predictedByClass = util.slicingClusteredData(predicted, classes)
        selectedIndexes = util.mahalanobisCoreSupportExtraction(
            Ut, predictedByClass, bestModelSelectedByClass, self.p)
        # Flatten the per-class index lists into a single index array.
        selectedIndexes = np.hstack(
            [selectedIndexes[i] for i in range(len(selectedIndexes))])

        # ***** Box 6 *****
        X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)

    # Per-batch accuracies from the run; the last selected points remain in X, y.
    self.threshold_ = arrAcc
    return self
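
# Usage sketch (hypothetical): fit() above is written as a method of a
# wrapper estimator whose constructor is assumed to set initialLabeledData,
# sizeOfBatch, batches, K, p and usePCA; the class name below is invented
# for illustration.
#
#     model = ComposeGMM(initialLabeledData=50, sizeOfBatch=100, batches=10,
#                        K=5, p=0.9, usePCA=False)
#     model.fit(dataValues, dataLabels)
#     model.threshold_   # per-batch accuracies recorded during fit()
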
def cuttingPercentageByClass(Xt_1, Xt, yt_1, yt, classes, t=None):
    """Per-class similarity between the previous batch (Xt_1, yt_1) and the
    current batch (Xt, yt), measured feature-wise with the Hellinger distance."""
    x = np.sqrt(2)  # upper bound of the unnormalized Hellinger distance
    reset = False
    similarityByClass = {}
    indexes_Xt_1_ByClass = util.slicingClusteredData(yt_1, classes)
    indexes_Xt_ByClass = util.slicingClusteredData(yt, classes)

    for c in classes:
        res = []
        for i in range(Xt_1.shape[1]):
            P = Xt_1[indexes_Xt_1_ByClass[c], i]
            Q = Xt[indexes_Xt_ByClass[c], i]
            bins = int(np.sqrt(len(indexes_Xt_1_ByClass[c])))
            # Shift both samples to be non-negative before histogramming.
            hP = np.histogram(P - np.min(P), bins=bins)
            hQ = np.histogram(Q - np.min(Q), bins=bins)
            # Compare the bin counts (hP[0]); the original code passed the
            # bin edges (hP[1]), which do not describe the distributions.
            res.append(hellinger(hP[0], hQ[0]))
        res = np.mean(res)

        # Map the mean distance (in [0, sqrt(2)]) to a similarity in [0, 1].
        similarity = 1 - (res / x)
        if similarity < 0:
            reset = True
        elif similarity > 0:
            reset = False
            similarity = 0.5 + ((res / x) / 10)
            if similarity > 0.9:
                similarity = 0.9
        similarityByClass.update({c: similarity})

    return similarityByClass, reset  # per-class similarity percentages
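
# The hellinger() helper used above is defined elsewhere in this repository.
# Below is a minimal sketch consistent with the sqrt(2) scaling in
# cuttingPercentageByClass, assuming it receives two non-negative histogram
# vectors of equal length.
def hellinger(p, q):
    # Normalize counts to probability vectors, then take the Euclidean
    # distance between their element-wise square roots. The result lies in
    # [0, sqrt(2)], matching the division by x = sqrt(2) above.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.sum() > 0:
        p = p / p.sum()
    if q.sum() > 0:
        q = q / q.sum()
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
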
def countInstances(datasetID, dataLabels):
    classes = list(set(dataLabels))
    inst = util.slicingClusteredData(dataLabels, classes)
    for i in range(len(inst)):
        print("{}: class {} -> {} instances.".format(datasetID, i, len(inst[i])))

def start(**kwargs):
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    batches = kwargs["batches"]
    p = kwargs["excludingPercentage"]
    K = kwargs["K_variation"]
    clfName = kwargs["clfName"]
    # Fixed choices for this variant.
    densityFunction = 'gmmBIC'
    distanceMetric = 'mahalanobis'
    print("METHOD: {} as classifier and GMM with BIC and Mahalanobis as "
          "core support extraction".format(clfName))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData

    # ***** Box 1 *****
    # Initial labeled data.
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.labelPropagation(X, y, K)

    # Starting the process.
    for t in range(batches):
        initialDataLength = finalDataLength
        finalDataLength = finalDataLength + sizeOfBatch

        # ***** Box 2 *****
        Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                      finalDataLength, usePCA)

        # For decision-boundary plots.
        arrClf.append(clf)
        arrX.append(X)
        arrY.append(y)
        arrUt.append(np.array(Ut))
        arrYt.append(yt)

        # Predict test data.
        predicted = clf.predict(Ut)
        arrPredicted.append(predicted)

        # Evaluating classification.
        arrAcc.append(metrics.evaluate(yt, predicted))

        # ***** Box 4 *****
        indexesByClass = util.slicingClusteredData(y, classes)
        bestModelSelectedByClass = util.loadBestModelByClass(X, indexesByClass)

        # ***** Box 5 *****
        predictedByClass = util.slicingClusteredData(predicted, classes)
        selectedIndexes = util.mahalanobisCoreSupportExtraction(
            Ut, predictedByClass, bestModelSelectedByClass, p)
        # Flatten the per-class index lists into a single index array.
        selectedIndexes = np.hstack(
            [selectedIndexes[i] for i in range(len(selectedIndexes))])

        # ***** Box 6 *****
        # Note: unlike fit() above, the selected core supports keep their
        # true labels (yt) rather than the predicted ones.
        X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)

        # Training data for the next batch.
        clf = classifiers.labelPropagation(X, y, K)

    return ("COMPOSE GMM", arrAcc, X, y, arrX, arrY, arrUt, arrYt,
            arrClf, arrPredicted)
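
if __name__ == "__main__":
    # Usage sketch on synthetic two-class Gaussian data (the data below is
    # invented for illustration; the original experiments use the repo's
    # benchmark loaders). 50 labeled points warm-start the model, then
    # 10 batches of 100 points arrive.
    rng = np.random.RandomState(0)
    n = 50 + 10 * 100
    X0 = rng.normal(0.0, 1.0, size=(n // 2, 2))
    X1 = rng.normal(3.0, 1.0, size=(n - n // 2, 2))
    data = np.vstack([X0, X1])
    labels = np.hstack([np.zeros(n // 2, dtype=int),
                        np.ones(n - n // 2, dtype=int)])
    order = rng.permutation(n)
    data, labels = data[order], labels[order]

    methodName, accs, *rest = start(dataValues=data, dataLabels=labels,
                                    initialLabeledData=50, sizeOfBatch=100,
                                    classes=[0, 1], batches=10,
                                    excludingPercentage=0.9, K_variation=5,
                                    clfName="Label Propagation")
    print(methodName, "mean accuracy: {:.3f}".format(np.mean(accs)))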