Example #1
0
def start(**kwargs):
    """Batch stream loop: Random Forest classifier + GMM density cutting.

    Trains on an initial labeled slice, then for each batch predicts labels,
    estimates per-class pdfs over the newly arrived points, keeps only the
    densest instances as the next training set, and records accuracy.

    Expected kwargs:
        dataValues, dataLabels: full stream of instances and true labels.
        initialLabeledDataPerc: fraction of one batch used as seed data.
        sizeOfBatch, batches: batch size and number of batches to process.
        usePCA: forwarded to util.loadLabeledData.
        classes: class label set, forwarded to util.pdfByClass2.
        excludingPercentage: fraction of data discarded per step.
        K_variation: accepted for interface compatibility but unused here.

    Returns:
        (arrAcc, X, y): per-batch accuracies and the last selected points.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledDataPerc = kwargs["initialLabeledDataPerc"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    usePCA = kwargs["usePCA"]
    classes = kwargs["classes"]
    batches = kwargs["batches"]
    excludingPercentage = kwargs["excludingPercentage"]

    print(
        "STARTING TEST with Random Forest as classifier and GMM as cutting data"
    )

    arrAcc = []
    initialDataLength = 0
    # Seed window is a fraction of one batch, not of the whole stream.
    finalDataLength = round(initialLabeledDataPerc * sizeOfBatch)
    # ***** Box 1 *****
    # Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    initialDataLength = finalDataLength
    finalDataLength = sizeOfBatch

    for t in range(batches):
        # ***** Box 2 ***** — next unlabeled batch (yt kept only for scoring)
        Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                      initialDataLength, finalDataLength,
                                      usePCA)

        # ***** Box 3 ***** — classify the batch with the current training set
        predicted = classifiers.randomForest(X, y, Ut)

        # ***** Box 4 *****
        # pdfs from each new point of each class, applied on new arrived points
        pdfsByClass = util.pdfByClass2(Ut, predicted, classes)

        # ***** Box 5 ***** — keep the densest (1 - excludingPercentage) points
        selectedIndexes = util.compactingDataDensityBased2(
            pdfsByClass, excludingPercentage)

        # ***** Box 6 ***** — selected points become the next training set
        X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)

        initialDataLength = finalDataLength
        finalDataLength += sizeOfBatch
        # Evaluating classification
        arrAcc.append(metrics.evaluate(yt, predicted))

    # returns accuracy array and last selected points
    return arrAcc, X, y
Example #2
0
    def fit(self, dataValues, dataLabels=None):
        """Stream-fit a label-propagation model with Mahalanobis
        core-support extraction; stores per-batch accuracies in
        ``self.threshold_`` and returns ``self``.
        """
        accuracies = []
        labelSet = list(set(dataLabels))
        windowStart = 0
        windowEnd = self.initialLabeledData

        # ***** Box 1 *****
        # Seed the process with the initial labeled window.
        X, y = util.loadLabeledData(dataValues, dataLabels, windowStart,
                                    windowEnd, self.usePCA)

        for _ in range(self.batches):
            windowStart = windowEnd
            windowEnd = windowStart + self.sizeOfBatch

            # ***** Box 2 ***** — next batch (true labels kept for scoring)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels, windowStart,
                                          windowEnd, self.usePCA)

            # ***** Box 3 ***** — classify with the current training set
            model = classifiers.labelPropagation(X, y, self.K)
            predicted = model.predict(Ut)
            # Evaluating classification
            accuracies.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** — best density model per class on old data
            indexesByClass = util.slicingClusteredData(y, labelSet)
            bestModelByClass = util.loadBestModelByClass(X, indexesByClass)

            # ***** Box 5 ***** — p% smallest distances per class (per paper)
            predictedByClass = util.slicingClusteredData(predicted, labelSet)
            perClassIndexes = util.mahalanobisCoreSupportExtraction(
                Ut, predictedByClass, bestModelByClass, self.p)
            # Flatten the per-class index groups into a single index array.
            merged = perClassIndexes[0]
            for i in range(1, len(perClassIndexes)):
                merged = np.hstack([merged, perClassIndexes[i]])

            # ***** Box 6 ***** — selected points become the next training set
            X, y = util.selectedSlicedData(Ut, predicted, merged)

        # returns accuracy array via the fitted attribute
        self.threshold_ = accuracies
        return self
Example #3
0
def makeAccuracy(arrAllAcc, arrTrueY):
    """Score each batch of predictions against the matching slice of
    true labels and return the list of per-batch accuracies.
    """
    scores = []
    cursor = 0
    for batchPredictions in arrAllAcc:
        flat = np.asarray(batchPredictions).flatten()
        upTo = cursor + len(flat)
        # True labels are consumed in order, one slice per batch.
        truth = arrTrueY[cursor:upTo]
        scores.append(metrics.evaluate(truth, flat))
        cursor = upTo
    return scores
Example #4
0
def start(**kwargs):
    """AMANDA (Dynamic): stream classifier with a dynamically recomputed
    density-based core-support extraction.

    Trains on an initial labeled window, then processes the stream either in
    fixed-size batches or instance-by-instance with a pool. Each step the cut
    percentage is re-estimated by ``cuttingPercentage`` (Hellinger-distance
    heuristic) and only the densest instances are kept as the next training
    set.

    Expected kwargs:
        dataValues, dataLabels, initialLabeledData, sizeOfBatch, classes,
        K_variation, batches, clfName, densityFunction, poolSize, isBatchMode.

    Returns:
        ("AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
        arrPredicted): per-batch accuracies, the final training set, and
        per-step snapshots kept for decision-boundary plotting.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print(
        "METHOD: {} as classifier and {} and Hellinger distance as dynamic CSE"
        .format(clfName, densityFunction))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData
    # ***** Box 1 *****
    # Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    reset = True
    if isBatchMode:
        for t in range(batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch
            # ***** Box 2 ***** — next batch (yt kept only for scoring)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 ***** — re-estimate the cut for this step
            excludingPercentage = cuttingPercentage(X, Ut, t)
            allInstances = []
            allLabels = []

            # ***** Box 5 *****
            if reset == True:
                # Considers only the last distribution (time-series like)
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              densityFunction)  #O(n^{2}d)
            else:
                # Considers the past and actual data (concept-drift like)
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  #O(n log(n) c)
            # ***** Box 6 *****
            if reset == True:
                # Considers only the last distribution (time-series like)
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)  #O(n)
            else:
                # Considers the past and actual data (concept-drift like)
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)

            # Retrain on the freshly selected core-support points.
            clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    else:
        t = 0
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False
        for Ut, yt in zip(remainingX, remainingY):
            # One instance at a time; its prediction feeds the pool as a
            # pseudo-label.
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)

            arrPredicted.append(predicted)

            # Once the pool fills up, extract core supports and retrain.
            if len(inst) == poolSize:
                inst = np.array(inst)
                excludingPercentage = cuttingPercentage(X, inst, t)
                t += 1
                if reset == True:
                    # Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    # Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)

                if reset == True:
                    # Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    # Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        # Regroup per-instance predictions into batches, then score them.
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
Example #5
0
    def fit(self, dataValues, dataLabels=None):
        """AMANDA-style stream fit with a fixed exclusion percentage.

        Processes the stream in batches (or instance-by-instance with a pool
        when ``self.isBatchMode`` is false), keeping the densest instances as
        the next training set. Per-batch accuracies end up in
        ``self.threshold_``.

        NOTE: inverts ``self.excludingPercentage`` in place on every call, so
        fitting the same estimator twice flips the cut back — TODO confirm
        this is intended.

        Bug fixed: the pool branch previously read a bare
        ``excludingPercentage`` (NameError); it now uses
        ``self.excludingPercentage`` like the batch branch.
        """
        arrAcc = []
        classes = list(set(dataLabels))
        initialDataLength = 0
        self.excludingPercentage = 1 - self.excludingPercentage
        finalDataLength = self.initialLabeledData
        # reset stays True here, so only the latest distribution is used.
        reset = True

        # ***** Box 1 *****
        # Initial labeled data
        X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                    finalDataLength, self.usePCA)
        if self.isBatchMode:
            for t in range(self.batches):
                initialDataLength = finalDataLength
                finalDataLength = finalDataLength + self.sizeOfBatch

                # ***** Box 2 ***** — next batch (yt kept only for scoring)
                Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                              initialDataLength,
                                              finalDataLength, self.usePCA)

                # ***** Box 3 ***** — classify with the current training set
                clf = classifiers.classifier(X, y, self.K, self.clfName)

                predicted = clf.predict(Ut)
                # Evaluating classification
                arrAcc.append(metrics.evaluate(yt, predicted))

                # ***** Box 4 *****
                # pdfs from each new point of each class on the arrived points
                allInstances = []
                allLabels = []
                if reset == True:
                    # Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                                  self.densityFunction)
                else:
                    # Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, Ut])
                    allLabels = np.hstack([y, predicted])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes,
                                                  self.densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, self.excludingPercentage)

                # ***** Box 6 *****
                if reset == True:
                    # Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(Ut, predicted,
                                                   selectedIndexes)
                else:
                    # Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)
        else:
            inst = []
            labels = []
            clf = classifiers.classifier(X, y, self.K, self.clfName)
            remainingX, remainingY = util.loadLabeledData(
                dataValues, dataLabels, finalDataLength, len(dataValues),
                self.usePCA)

            for Ut, yt in zip(remainingX, remainingY):
                # Single-instance prediction feeds the pool as a pseudo-label.
                predicted = clf.predict(Ut.reshape(1, -1))
                arrAcc.append(predicted)
                inst.append(Ut)
                labels.append(predicted)

                # Once the pool fills up, extract core supports and retrain.
                if len(inst) == self.poolSize:
                    inst = np.asarray(inst)
                    if reset == True:
                        # Considers only the last distribution
                        pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                      self.densityFunction)
                    else:
                        # Considers the past and actual data
                        allInstances = np.vstack([X, inst])
                        allLabels = np.hstack([y, labels])
                        pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                      classes,
                                                      self.densityFunction)

                    # FIX: was a bare `excludingPercentage` (NameError).
                    selectedIndexes = util.compactingDataDensityBased2(
                        pdfsByClass, self.excludingPercentage)

                    if reset == True:
                        # Considers only the last distribution
                        X, y = util.selectedSlicedData(inst, labels,
                                                       selectedIndexes)
                    else:
                        # Considers the past and actual data
                        X, y = util.selectedSlicedData(allInstances, allLabels,
                                                       selectedIndexes)

                    clf = classifiers.classifier(X, y, self.K, self.clfName)
                    inst = []
                    labels = []

            # Regroup per-instance predictions into batches, then score them.
            arrAcc = split_list(arrAcc, self.batches)
            arrAcc = makeAccuracy(arrAcc, remainingY)

        # returns accuracy array and last selected points
        self.threshold_ = arrAcc
        return self
Example #6
0
def start(**kwargs):
    """Sliding-window SSL baseline: no core-support extraction.

    In batch mode the classifier is simply retrained on the previous batch
    and its own predictions (self-training). In pool mode it is retrained
    each time the instance pool fills up.

    Expected kwargs: dataValues, dataLabels, initialLabeledData, sizeOfBatch,
    K_variation, batches, clfName, poolSize, isBatchMode. Extra keys
    (classes, excludingPercentage, densityFunction) are accepted for
    interface compatibility but unused by this baseline.

    Returns:
        ("Sliding SSL", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
        arrPredicted): per-batch accuracies, the final training set, and
        per-step snapshots kept for decision-boundary plotting.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    clfName = kwargs["clfName"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: Sliding {0} as classifier".format(clfName))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData
    # ***** Box 1 *****
    # Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)

    if isBatchMode:
        for t in range(batches):
            # sliding: refit on the current window before predicting
            clf.fit(X, y)

            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch

            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # Self-training: the batch and its predictions become the window.
            X, y = Ut, predicted
    else:
        inst = []
        labels = []
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)

        for Ut, yt in zip(remainingX, remainingY):
            # Single-instance prediction feeds the pool as a pseudo-label.
            predicted = clf.predict(Ut.reshape(1, -1))
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            # Retrain on the pool once it fills up.
            if len(inst) == poolSize:
                inst = np.asarray(inst)
                clf = classifiers.classifier(inst, labels, K, clfName)
                inst = []
                labels = []

        # Regroup per-instance predictions into batches, then score them.
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    return "Sliding SSL", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
Example #7
0
def start(**kwargs):
    """COMPOSE-style loop: label propagation + GMM(BIC)/Mahalanobis CSE.

    Trains a label-propagation model on an initial labeled window, then per
    batch: predicts, fits the best per-class density model on the old data,
    keeps the p% of new points with the smallest Mahalanobis distances per
    class, and retrains on them.

    Expected kwargs: dataValues, dataLabels, initialLabeledData, sizeOfBatch,
    classes, batches, excludingPercentage (used as p), K_variation, clfName.

    Returns:
        ("COMPOSE GMM", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
        arrPredicted): per-batch accuracies, the final training set, and
        per-step snapshots kept for decision-boundary plotting.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    batches = kwargs["batches"]
    p = kwargs["excludingPercentage"]
    K = kwargs["K_variation"]
    clfName = kwargs["clfName"]

    print("METHOD: {} as classifier and GMM with BIC and Mahalanobis as core support extraction".format(clfName))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData
    # ***** Box 1 *****
    # Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)
    clf = classifiers.labelPropagation(X, y, K)
    # Starting the process
    for t in range(batches):
        initialDataLength = finalDataLength
        finalDataLength = finalDataLength + sizeOfBatch
        # ***** Box 2 ***** — next batch (yt kept only for scoring)
        Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)

        # for decision boundaries plot
        arrClf.append(clf)
        arrX.append(X)
        arrY.append(y)
        arrUt.append(np.array(Ut))
        arrYt.append(yt)
        # predict test data
        predicted = clf.predict(Ut)
        arrPredicted.append(predicted)

        # Evaluating classification
        arrAcc.append(metrics.evaluate(yt, predicted))

        # ***** Box 4 ***** — best density model per class on old data
        indexesByClass = util.slicingClusteredData(y, classes)
        bestModelSelectedByClass = util.loadBestModelByClass(X, indexesByClass)

        # ***** Box 5 ***** — p% smallest distances per class (per paper)
        predictedByClass = util.slicingClusteredData(predicted, classes)
        selectedIndexes = util.mahalanobisCoreSupportExtraction(Ut, predictedByClass, bestModelSelectedByClass, p)
        # Flatten the per-class index groups into a single index array.
        stackedIndexes = selectedIndexes[0]
        for i in range(1, len(selectedIndexes)):
            stackedIndexes = np.hstack([stackedIndexes, selectedIndexes[i]])
        selectedIndexes = stackedIndexes

        X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
        # training data
        clf = classifiers.labelPropagation(X, y, K)

    return "COMPOSE GMM", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
Example #8
0
def start(**kwargs):
    """AMANDA (Fixed): stream classifier with a fixed-percentage
    density-based core-support extraction.

    Trains on an initial labeled window, then processes the stream either in
    fixed-size batches or instance-by-instance with a pool, keeping only the
    densest (1 - excludingPercentage) instances as the next training set.

    Expected kwargs:
        dataValues, dataLabels, initialLabeledData, sizeOfBatch, classes,
        K_variation, batches, excludingPercentage, clfName, densityFunction,
        poolSize, isBatchMode.

    Returns:
        ("AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf,
        arrPredicted): per-batch accuracies, the final training set, and
        per-step snapshots kept for decision-boundary plotting.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: {} as classifier and {} as core support extraction with cutting data method".format(clfName, densityFunction))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    # The parameter is given as "fraction to exclude"; the compaction helper
    # expects the fraction to keep.
    excludingPercentage = 1 - excludingPercentage
    finalDataLength = initialLabeledData
    reset = True

    # ***** Box 1 *****
    # Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    if isBatchMode:
        for t in range(batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch

            # ***** Box 2 ***** — next batch (yt kept only for scoring)
            Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            # classifies
            predicted = clf.predict(Ut)

            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 *****
            # pdfs from each new point of each class on the arrived points
            allInstances = []
            allLabels = []
            if reset == True:
                # Considers only the last distribution (time-series like)
                pdfsByClass = util.pdfByClass(Ut, yt, classes, densityFunction)  #O(nmd)
            else:
                # Considers the past and actual data (concept-drift like)
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes, densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, excludingPercentage)  #O(n log(n) c)

            # ***** Box 6 *****
            if reset == True:
                # Considers only the last distribution (time-series like)
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
            else:
                # Considers the past and actual data (concept-drift like)
                X, y = util.selectedSlicedData(allInstances, allLabels, selectedIndexes)  #O(n)

            # training
            clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    else:
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels, finalDataLength, len(dataValues), usePCA)
        reset = False

        for Ut, yt in zip(remainingX, remainingY):
            # Single-instance prediction feeds the pool as a pseudo-label.
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            # Once the pool fills up, extract core supports and retrain.
            if len(inst) == poolSize:
                inst = np.asarray(inst)
                if reset == True:
                    # Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(inst, labels, classes, densityFunction)
                else:
                    # Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels, classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, excludingPercentage)

                if reset == True:
                    # Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                else:
                    # Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels, selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        # Regroup per-instance predictions into batches, then score them.
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
    def fit(self, dataValues, dataLabels=None):
        """Stream fit using Bhattacharyya-scored per-class keep percentages.

        In batch mode, the keep percentage per class is derived from the
        Bhattacharyya score between old and new per-class distributions; in
        pool mode a fixed ``self.excludingPercentage`` is used. Per-batch
        accuracies end up in ``self.threshold_``.

        Fix: the pool branch previously hard-coded ``split_list(arrAcc, 100)``
        while every sibling implementation splits by the configured number of
        batches; it now uses ``self.batches``.
        """
        arrAcc = []
        classes = list(set(dataLabels))
        initialDataLength = 0
        finalDataLength = self.initialLabeledData

        # ***** Box 1 *****
        # Initial labeled data
        X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                    finalDataLength, self.usePCA)
        if self.isBatchMode:
            for t in range(self.batches):
                initialDataLength = finalDataLength
                finalDataLength = finalDataLength + self.sizeOfBatch
                # ***** Box 2 ***** — next batch (yt kept only for scoring)
                Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                              initialDataLength,
                                              finalDataLength, self.usePCA)

                # ***** Box 3 ***** — classify with the current training set
                predicted = classifiers.classify(X, y, Ut, self.K, classes,
                                                 self.clfName)
                # Evaluating classification
                arrAcc.append(metrics.evaluate(yt, predicted))

                # ***** Box 4 *****
                # pdfs from each new point of each class on the arrived points
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              self.densityFunction)
                instancesXByClass, instancesUtByClass = util.unifyInstancesByClass(
                    X, y, Ut, predicted, classes)

                # ***** Box 5 ***** — per-class keep percentage from the
                # Bhattacharyya score between old and new distributions
                keepPercentageByClass = util.getBhattacharyyaScoresByClass(
                    instancesXByClass, instancesUtByClass, classes)

                selectedIndexes = util.compactingDataDensityBased3(
                    pdfsByClass, keepPercentageByClass)
                # ***** Box 6 ***** — selected points become the training set
                X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
        else:
            inst = []
            labels = []
            clf = classifiers.knn(X, y, self.K)
            remainingX, remainingY = util.loadLabeledData(
                dataValues, dataLabels, finalDataLength, len(dataValues),
                self.usePCA)

            for Ut, yt in zip(remainingX, remainingY):
                # Single-instance prediction feeds the pool as a pseudo-label.
                predicted = clf.predict(Ut.reshape(1, -1))
                arrAcc.append(predicted)
                inst.append(Ut)
                labels.append(predicted)

                # Once the pool fills up, extract core supports and retrain.
                if len(inst) == self.poolSize:
                    inst = np.asarray(inst)
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  self.densityFunction)
                    selectedIndexes = util.compactingDataDensityBased2(
                        pdfsByClass, self.excludingPercentage)
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                    clf = classifiers.knn(X, y, self.K)
                    inst = []
                    labels = []

            # Regroup per-instance predictions into batches, then score them.
            # FIX: was a hard-coded 100; siblings use the batches attribute.
            arrAcc = split_list(arrAcc, self.batches)
            arrAcc = makeAccuracy(arrAcc, remainingY)

        # returns accuracy array and last selected points
        self.threshold_ = arrAcc
        return self
def start(**kwargs):
    """Stream classification using an SVM as boundary-point remover and a
    density function (e.g. GMM) as the data-cutting (CSE) step.

    Expected kwargs:
        dataValues, dataLabels   -- full data stream and its labels
        initialLabeledData       -- number of initially labeled instances
        sizeOfBatch, batches     -- batch geometry for batch mode
        usePCA                   -- whether util.loadLabeledData applies PCA
        classes                  -- list of class labels
        K_variation              -- K for the KNN classifier (single mode)
        excludingPercentage      -- fraction of each class to EXCLUDE per cut
        clfName                  -- classifier key for classifiers.classify
        densityFunction          -- density estimator name for util.pdfByClass
        poolSize                 -- pool size triggering compaction (single mode)
        isBatchMode              -- True: batch loop; False: one-by-one stream

    Returns:
        (methodName, arrAcc, X, y) -- method label, accuracy list, and the
        last selected (compacted) training points.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    #initialLabeledDataPerc = kwargs["initialLabeledDataPerc"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    usePCA = kwargs["usePCA"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print(
        "METHOD: SVM as classifier and boundary remover and {} as CSE with cutting data method"
        .format(densityFunction))

    arrAcc = []
    initialDataLength = 0
    # Caller passes the fraction to exclude; downstream compaction expects
    # the fraction to KEEP, so invert it once here.
    excludingPercentage = 1 - excludingPercentage
    finalDataLength = initialLabeledData  #round((initialLabeledDataPerc)*sizeOfBatch)
    # ***** Box 1 *****
    #Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    if isBatchMode:
        for t in range(batches):
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch
            # ***** Box 2 *****
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # ***** Box 3 *****
            # The SVM is fitted only to locate boundary (support) points;
            # the actual prediction is delegated to the configured classifier.
            clf = classifiers.svmClassifier(X, y)
            predicted = classifiers.classify(X, y, Ut, K, classes, clfName)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 *****
            # Remove SVM boundary points before density estimation so the
            # next batch is trained on the most representative instances.
            Ut, predicted = util.removeBoundaryPoints(clf.support_, Ut,
                                                      predicted)

            #pdfs from each new points from each class applied on new arrived points
            pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                          densityFunction)

            # ***** Box 5 *****
            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)

            # ***** Box 6 *****
            X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)
    else:
        clf = classifiers.knn(X, y, K)
        inst = np.copy(X)
        labels = np.copy(y)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)

        for Ut, yt in zip(remainingX, remainingY):
            Ut = Ut.reshape(1, -1)
            predicted = clf.predict(Ut)
            arrAcc.append(predicted)

            # BUG FIX: clfSVM.support_ holds row indices into the stacked
            # training matrix np.vstack([inst, Ut]); the new point Ut is
            # appended last, so its index is len(inst).  The original code
            # froze this index at finalDataLength, which went stale as soon
            # as the pool grew or was compacted.
            indexUt = len(inst)
            clfSVM = classifiers.svmClassifier(np.vstack([inst, Ut]),
                                               np.hstack([labels, predicted]))
            if indexUt not in clfSVM.support_:
                # Keep the new point only when it is NOT on the SVM boundary.
                inst = np.vstack([inst, Ut])
                labels = np.hstack([labels, predicted])

            # BUG FIX: use >= instead of == — with strict equality the pool
            # is never compacted if its initial size already exceeds poolSize.
            if len(inst) >= poolSize:
                pdfsByClass = util.pdfByClass(inst, labels, classes,
                                              densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)
                X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                clf = classifiers.knn(X, y, K)
                inst = np.copy(X)
                labels = np.copy(y)

        arrAcc = split_list(arrAcc, 100)
        arrAcc = makeAccuracy(arrAcc, remainingY)

    # returns accuracy array and last selected points
    return "SVM removing boundary + GMM", arrAcc, X, y