Ejemplo n.º 1
0
    def fit(self, dataValues, dataLabels=None):
        """Run the batch-incremental semi-supervised classification loop.

        Parameters
        ----------
        dataValues : array-like
            Feature matrix for the whole data stream.
        dataLabels : array-like
            Label vector aligned with ``dataValues``. Despite the ``None``
            default (kept for interface compatibility), labels are required:
            the class set is derived from them and every batch is evaluated
            against them.

        Returns
        -------
        self : object
            Per-batch accuracies are stored in ``self.threshold_``.
        """
        if dataLabels is None:
            # set(None) would raise an opaque TypeError; fail clearly instead.
            raise ValueError("dataLabels is required to derive the class set")

        arrAcc = []
        classes = list(set(dataLabels))
        initialDataLength = 0
        finalDataLength = self.initialLabeledData

        # ***** Box 1 *****
        # Initial labeled data seeds the first core set.
        X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength,
                                    finalDataLength, self.usePCA)

        for t in range(self.batches):
            # Slide the window forward one batch.
            initialDataLength = finalDataLength
            finalDataLength = finalDataLength + self.sizeOfBatch

            # ***** Box 2 *****
            Ut, yt = util.loadLabeledData(dataValues, dataLabels,
                                          initialDataLength, finalDataLength,
                                          self.usePCA)

            # ***** Box 3 *****
            clf = classifiers.labelPropagation(X, y, self.K)

            predicted = clf.predict(Ut)
            # Evaluating classification against the true batch labels.
            arrAcc.append(metrics.evaluate(yt, predicted))

            # ***** Box 4 *****
            # Density models per class, fitted on the current core set and
            # later applied to the newly arrived points.
            indexesByClass = util.slicingClusteredData(y, classes)
            bestModelSelectedByClass = util.loadBestModelByClass(
                X, indexesByClass)

            # ***** Box 5 *****
            predictedByClass = util.slicingClusteredData(predicted, classes)
            # p% smallest distances per class, based on paper
            selectedIndexes = util.mahalanobisCoreSupportExtraction(
                Ut, predictedByClass, bestModelSelectedByClass, self.p)
            # Flatten the per-class index arrays in a single call instead of
            # the previous O(n^2) incremental-hstack loop.
            selectedIndexes = np.hstack(
                [selectedIndexes[i] for i in range(len(selectedIndexes))])

            # ***** Box 6 *****
            X, y = util.selectedSlicedData(Ut, predicted, selectedIndexes)

        # returns accuracy array and last selected points
        self.threshold_ = arrAcc
        return self
Ejemplo n.º 2
0
def cuttingPercentageByClass(Xt_1, Xt, yt_1, yt, classes, t=None):
    """Compare two consecutive batches class by class via a Hellinger-style
    histogram distance, returning ``(similarityByClass, reset)``.

    ``similarityByClass`` maps each class to a similarity clamped to at most
    0.9; ``reset`` is a flag derived from the raw similarity of the classes
    as they are iterated.
    """
    sqrt2 = np.sqrt(2)
    reset = False
    similarityByClass = {}

    prevIdxByClass = util.slicingClusteredData(yt_1, classes)
    currIdxByClass = util.slicingClusteredData(yt, classes)

    for c in classes:
        featureDistances = []
        for feat in range(Xt_1.shape[1]):
            P = Xt_1[prevIdxByClass[c], feat]
            Q = Xt[currIdxByClass[c], feat]

            nBins = int(np.sqrt(len(prevIdxByClass[c])))

            # Shift both samples so they start at zero before histogramming.
            # NOTE(review): hellinger() is fed the bin *edges* (index 1 of
            # np.histogram's return), not the counts at index 0 — confirm
            # this is intentional and not a [0]/[1] mix-up.
            histP = np.histogram(P + (-np.min(P)), bins=nBins)
            histQ = np.histogram(Q + (-np.min(Q)), bins=nBins)
            featureDistances.append(hellinger(histP[1], histQ[1]))

        meanDistance = np.mean(featureDistances)

        # This raw similarity only drives the reset flag; the stored value
        # is recomputed below on a different scale.
        rawSimilarity = 1 - (((100 * meanDistance) / sqrt2) / 100)
        # NOTE(review): reset reflects only the last class whose raw
        # similarity is nonzero — earlier True values can be overwritten.
        # Verify this last-class-wins behavior is intended.
        if rawSimilarity < 0:
            reset = True
        elif rawSimilarity > 0:
            reset = False

        # Rescale into (0.5, ...] and cap at 0.9.
        scaled = 0.5 + ((meanDistance / sqrt2) / 10)
        similarityByClass[c] = min(scaled, 0.9)

    return similarityByClass, reset  #percentage of similarity
Ejemplo n.º 3
0
def countInstances(datasetID, dataLabels):
    """Print, per class, how many instances carry that class label.

    Parameters
    ----------
    datasetID : str
        Identifier printed as a prefix on every output line.
    dataLabels : array-like
        Label vector; its distinct values define the classes.
    """
    classes = list(set(dataLabels))
    instancesByClass = util.slicingClusteredData(dataLabels, classes)
    # Iterate over the actual class labels instead of range(len(...)):
    # slicingClusteredData's result is keyed by class label everywhere else
    # in this file, so positional indexing breaks when labels are not 0..n-1.
    for c in classes:
        print("{}: class {} -> {} instances.".format(datasetID, c,
                                                     len(instancesByClass[c])))
Ejemplo n.º 4
0
def start(**kwargs):
    """Run the COMPOSE/GMM stream-classification experiment.

    Required kwargs: ``dataValues``, ``dataLabels``, ``initialLabeledData``,
    ``sizeOfBatch``, ``classes``, ``batches``, ``excludingPercentage``,
    ``K_variation``, ``clfName``.

    Returns
    -------
    tuple
        Method name, per-batch accuracy list, final core set ``(X, y)``,
        and the per-batch histories (arrX, arrY, arrUt, arrYt, arrClf,
        arrPredicted) kept for decision-boundary plotting.
    """
    dataValues = kwargs["dataValues"]
    dataLabels = kwargs["dataLabels"]
    initialLabeledData = kwargs["initialLabeledData"]
    sizeOfBatch = kwargs["sizeOfBatch"]
    classes = kwargs["classes"]
    batches = kwargs["batches"]
    p = kwargs["excludingPercentage"]
    K = kwargs["K_variation"]
    clfName = kwargs["clfName"]

    print("METHOD: {} as classifier and GMM with BIC and Mahalanobis as core support extraction".format(clfName))
    usePCA = False
    arrAcc = []
    arrX = []
    arrY = []
    arrUt = []
    arrYt = []
    arrClf = []
    arrPredicted = []
    initialDataLength = 0
    finalDataLength = initialLabeledData

    # ***** Box 1 *****
    # Initial labeled data seeds the first classifier.
    X, y = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)
    clf = classifiers.labelPropagation(X, y, K)

    # Starting the batch-incremental process.
    for t in range(batches):
        # Slide the window forward one batch.
        initialDataLength = finalDataLength
        finalDataLength = finalDataLength + sizeOfBatch
        # ***** Box 2 *****
        Ut, yt = util.loadLabeledData(dataValues, dataLabels, initialDataLength, finalDataLength, usePCA)

        # Snapshots kept for decision-boundary plots.
        arrClf.append(clf)
        arrX.append(X)
        arrY.append(y)
        arrUt.append(np.array(Ut))
        arrYt.append(yt)

        # Predict the unlabeled batch and record accuracy.
        predicted = clf.predict(Ut)
        arrPredicted.append(predicted)
        arrAcc.append(metrics.evaluate(yt, predicted))

        # ***** Box 4 *****
        indexesByClass = util.slicingClusteredData(y, classes)
        bestModelSelectedByClass = util.loadBestModelByClass(X, indexesByClass)

        # ***** Box 5 *****
        predictedByClass = util.slicingClusteredData(predicted, classes)
        selectedIndexes = util.mahalanobisCoreSupportExtraction(Ut, predictedByClass, bestModelSelectedByClass, p)
        # Flatten the per-class index arrays in a single call instead of an
        # O(n^2) incremental-hstack loop.
        selectedIndexes = np.hstack(
            [selectedIndexes[i] for i in range(len(selectedIndexes))])

        # ***** Box 6 *****
        # NOTE(review): the new core set is built from the TRUE labels yt,
        # while the fit()-style variant earlier in this file uses `predicted`.
        # Confirm using ground truth here is intentional — it leaks labels.
        X, y = util.selectedSlicedData(Ut, yt, selectedIndexes)
        # Retrain on the extracted core supports.
        clf = classifiers.labelPropagation(X, y, K)

    return "COMPOSE GMM", arrAcc, X, y, arrX, arrY, arrUt, arrYt, arrClf, arrPredicted