Beispiel #1
0
def start(**kwargs):
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print(
        "METHOD: {} as classifier and {} and Hellinger distance as dynamic CSE"
        .format(clfName, densityFunction))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData  #round((initialLabeledDataPerc)*sizeOfBatch)
    # ***** Box 1 *****
    #Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    reset = True
    if isBatchMode:
        for t in range(batches):
            #print("passo: ",t)
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch
            #print(initialDataLength)
            #print(finalDataLength)
            # ***** Box 2 *****
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 *****
            excludingPercentage = cuttingPercentage(X, Ut, t)
            #excludingPercentageByClass, reset = cuttingPercentageByClass(X, Ut, y, predicted, classes, t)
            allInstances = []
            allLabels = []

            # ***** Box 5 *****
            if reset == True:
                #Considers only the last distribution (time-series like)
                pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                              densityFunction)  #O(n^{2}d)
            else:
                #Considers the past and actual data (concept-drift like)
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes,
                                              densityFunction)

            selectedIndexes = util.compactingDataDensityBased2(
                pdfsByClass, excludingPercentage)  #O(n log(n) c)
            #selectedIndexes = util.compactingDataDensityBased(pdfsByClass, excludingPercentageByClass)
            #print(t, excludingPercentage)
            # ***** Box 6 *****
            if reset == True:
                #Considers only the last distribution (time-series like)
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)  #O(n)
            else:
                #Considers the past and actual data (concept-drift like)
                X, y = util.selectedSlicedData(allInstances, allLabels,
                                               selectedIndexes)

            clf = classifiers.classifier(X, y, K, clfName)  #O(nd+kn)
    else:
        t = 0
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)
        reset = False
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)

            arrPredicted.append(predicted)

            #new approach
            if len(inst) == poolSize:
                inst = np.array(inst)
                excludingPercentage = cuttingPercentage(X, inst, t)
                t += 1
                '''if excludingPercentage < 0:
                    #print("negative, reseting points")
                    excludingPercentage = 0.5 #default
                    reset = True
                else:
                    reset = False
                '''
                if reset == True:
                    #Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                  densityFunction)
                else:
                    #Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, excludingPercentage)

                if reset == True:
                    #Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(inst, labels,
                                                   selectedIndexes)
                else:
                    #Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Dynamic)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
Beispiel #2
0
def start(**kwargs):
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]

    print("METHOD: Sliding {0} as classifier".format(clfName))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData
    # ***** Box 1 *****
    #Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName)

    if isBatchMode:
        for t in range(batches):
            # sliding
            clf.fit(X, y)

            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + sizeOfBatch
            #print(initialDataLength)
            #print(finalDataLength)

            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)
            predicted = clf.predict(Ut)
            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))

            X, y = Ut, predicted
    else:
        inst = []
        labels = []
        remainingX, remainingY = util.loadLabeledData(dataValues, dataLabels,
                                                      finalDataLength,
                                                      len(dataValues), usePCA)

        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)

            if len(inst) == poolSize:
                inst = np.asarray(inst)
                clf = classifiers.classifier(inst, labels, K, clfName)
                inst = []
                labels = []

        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    return "Sliding SSL", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted
Beispiel #3
0
    def fit(self, dataValues, dataLabels=None):
        arrAcc = []
        classes = list(set(dataLabels))
        initialDataLength = 0
        self.excludingPercentage = 1 - self.excludingPercentage
        finalDataLength = self.initialLabeledData
        reset = True

        # ***** Box 1 *****
        #Initial labeled data
        X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                    finalDataLength, self.usePCA)
        if self.isBatchMode:
            for t in range(self.batches):
                #print("passo: ",t)
                initialDataLength = finalDataLength
                finalDataLength = finalDataLength + self.sizeOfBatch

                # ***** Box 2 *****
                Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                              initialDataLength,
                                              finalDataLength, self.usePCA)

                # ***** Box 3 *****
                clf = classifiers.classifier(X, y, self.K, self.clfName)

                predicted = clf.predict(Ut)
                # Evaluating classification
                arrAcc.append(metrics.evaluate(yt, predicted))

                # ***** Box 4 *****
                #pdfs from each new points from each class applied on new arrived points
                '''pdfsByClass = util.pdfByClass(Ut, predicted, classes, self.densityFunction)
                
                # ***** Box 5 *****
                selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, self.excludingPercentage)
                
                # ***** Box 6 *****
                X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)'''

                allInstances = []
                allLabels = []
                if reset == True:
                    #Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(Ut, predicted, classes,
                                                  self.densityFunction)
                else:
                    #Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, Ut])
                    allLabels = np.hstack([y, predicted])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                  classes,
                                                  self.densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(
                    pdfsByClass, self.excludingPercentage)

                # ***** Box 6 *****
                if reset == True:
                    #Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(Ut, predicted,
                                                   selectedIndexes)
                else:
                    #Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels,
                                                   selectedIndexes)
        else:
            inst = []
            labels = []
            clf = classifiers.classifier(X, y, self.K, self.clfName)
            remainingX, remainingY = util.loadLabeledData(
                dataValues, dataLabels, finalDataLength, len(dataValues),
                self.usePCA)

            for Ut, yt in zip(remainingX, remainingY):
                predicted = clf.predict(Ut.reshape(1, -1))
                arrAcc.append(predicted)
                inst.append(Ut)
                labels.append(predicted)

                if len(inst) == self.poolSize:
                    inst = np.asarray(inst)
                    #pdfsByClass = util.pdfByClass(inst, labels, classes, self.densityFunction)
                    #selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, self.excludingPercentage)
                    #X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                    if reset == True:
                        #Considers only the last distribution (time-series like)
                        pdfsByClass = util.pdfByClass(inst, labels, classes,
                                                      self.densityFunction)
                    else:
                        #Considers the past and actual data (concept-drift like)
                        allInstances = np.vstack([X, inst])
                        allLabels = np.hstack([y, labels])
                        pdfsByClass = util.pdfByClass(allInstances, allLabels,
                                                      classes,
                                                      self.densityFunction)

                    selectedIndexes = util.compactingDataDensityBased2(
                        pdfsByClass, excludingPercentage)

                    if reset == True:
                        #Considers only the last distribution (time-series like)
                        X, y = util.selectedSlicedData(inst, labels,
                                                       selectedIndexes)
                    else:
                        #Considers the past and actual data (concept-drift like)
                        X, y = util.selectedSlicedData(allInstances, allLabels,
                                                       selectedIndexes)

                    clf = classifiers.classifier(X, y, self.K, self.clfName)
                    inst = []
                    labels = []

            arrAcc = split_list(arrAcc, self.batches)
            arrAcc = makeAccuracy(arrAcc, remainingY)

        # returns accuracy array and last selected points
        self.threshold_ = arrAcc
        return self
Beispiel #4
0
def start(**kwargs):
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    K = kwargs["K_variation"]
    batches = kwargs["batches"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    excludingPercentage = kwargs["excludingPercentage"]
    clfName = kwargs["clfName"]
    densityFunction = kwargs["densityFunction"]
    poolSize = kwargs["poolSize"]
    isBatchMode = kwargs["isBatchMode"]
    
    print("METHOD: {} as classifier and {} as core support extraction with cutting data method".format(clfName, densityFunction))
    usePCA=False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    excludingPercentage = 1-excludingPercentage
    finalDataLength = initialLabeledData #round((initialLabeledDataPerc)*sizeOfBatch)
    reset = True

    # ***** Box 1 *****
    #Initial labeled data
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)
    clf = classifiers.classifier(X, y, K, clfName) #O(nd+kn)
    if isBatchMode:
        for t in range(batches):
            #print("passo: ",t)
            initialDataLength=finalDataLength
            finalDataLength=finalDataLength+sizeOfBatch
            #print(initialDataLength)
            #print(finalDataLength)
            
            Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(np.array(Ut))
            arrYt.append(yt)

            #classifies
            predicted = clf.predict(Ut)
            
            arrPredicted.append(predicted)
            # Evaluating classification
            arrAcc.append(metrics.evaluate(yt, predicted))
            
            # ***** Box 4 *****
            #pdfs from each new points from each class applied on new arrived points
            allInstances = []
            allLabels = []
            if reset == True:
                #Considers only the last distribution (time-series like)
                pdfsByClass = util.pdfByClass(Ut, yt, classes, densityFunction)#O(nmd)
            else:
                #Considers the past and actual data (concept-drift like)
                allInstances = np.vstack([X, Ut])
                allLabels = np.hstack([y, yt])
                pdfsByClass = util.pdfByClass(allInstances, allLabels, classes, densityFunction)
                
            selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, excludingPercentage)#O(n log(n) c)
        
            # ***** Box 6 *****
            if reset == True:
                #Considers only the last distribution (time-series like)
                X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
            else:
                #Considers the past and actual data (concept-drift like)
                X, y = util.selectedSlicedData(allInstances, allLabels, selectedIndexes)#O(n)

            #training
            clf = classifiers.classifier(X, y, K, clfName) #O(nd+kn)
    else:
        inst = []
        labels = []
        clf = classifiers.classifier(X, y, K, clfName)
        remainingX , remainingY = util.loadLabeledData(dataValues, dataLabels, finalDataLength, len(dataValues), usePCA)
        reset = False
        
        for Ut, yt in zip(remainingX, remainingY):
            predicted = clf.predict(Ut.reshape(1, -1))[0]
            arrAcc.append(predicted)
            inst.append(Ut)
            labels.append(predicted)

            # for decision boundaries plot
            arrClf.append(clf)
            arrX.append(X)
            arrY.append(y)
            arrUt.append(Ut)
            arrYt.append(yt)
            arrPredicted.append(predicted)
            
            if len(inst) == poolSize:
                inst = np.asarray(inst)
                '''pdfsByClass = util.pdfByClass(inst, labels, classes, densityFunction)
                selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, excludingPercentage)
                X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []'''
                if reset == True:
                    #Considers only the last distribution (time-series like)
                    pdfsByClass = util.pdfByClass(inst, labels, classes, densityFunction)
                else:
                    #Considers the past and actual data (concept-drift like)
                    allInstances = np.vstack([X, inst])
                    allLabels = np.hstack([y, labels])
                    pdfsByClass = util.pdfByClass(allInstances, allLabels, classes, densityFunction)

                selectedIndexes = util.compactingDataDensityBased2(pdfsByClass, excludingPercentage)

                if reset == True:
                    #Considers only the last distribution (time-series like)
                    X, y = util.selectedSlicedData(inst, labels, selectedIndexes)
                else:
                    #Considers the past and actual data (concept-drift like)
                    X, y = util.selectedSlicedData(allInstances, allLabels, selectedIndexes)

                clf = classifiers.classifier(X, y, K, clfName)
                inst = []
                labels = []
            
        arrAcc = split_list(arrAcc, batches)
        arrAcc = makeAccuracy(arrAcc, remainingY)
        arrYt = split_list(arrYt, batches)
        arrPredicted = split_list(arrPredicted, batches)

    # returns accuracy array and last selected points
    return "AMANDA (Fixed)", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted