コード例 #1
0
    def train(self):
        output = open(self.outputFilename,'w')
        output_confusion = open("confusionMatrix.txt",'w')
        accuracyList = []
        precisionList = []
        recallList = []
        fMeasureList = []
        if self.flag == "-r":
            random = True
            distance = False
            diversity = False
            probability = False
        elif self.flag == "-d":
            random = False
            distance = True
            diversity = False
            probability = False
        elif self.flag == "-v":
            random = False
            distance = False
            diversity = True
            probability = False
        elif self.flag == "-p":
            random = False
            distance = False
            diversity = False
            probability = True
        for i in range(0,5):#run 5 times to obtain an average
            mySplitter = FileSplitter(self.inputFilename)
            if self.inputFilename == "shuffled_magic04.data": #declare a classifier with best parameters for dataset
                clf = svm.SVC(gamma=.01, C=100.0)
            elif self.inputFilename == "ionosphereShuffle.txt":
                clf = svm.SVC(gamma=0.10000000000000001, C=1.0)
            elif self.inputFilename == "sensorShuffled.data":
                clf = svm.SVC(gamma=.01, C=100.0)
            elif self.inputFilename == "mushroom.data":
                clf = svm.SVC(gamma=.10000000000000001, C=1.0)
            trainContent = []
            trainTracker = 0
            accuracyIteration = []
            precisionIteration = []
            recallIteration = []
            fMeasureIteration = []
            for j in range(0,10):
                trainTracker += 1
                if j == 0: 
                    trainContent.extend(mySplitter.removeInitialFive())
                elif (j == 1) or (j == 2):
                    remainingContent = mySplitter.getContent()
                    if self.inputFilename == "shuffled_magic04.data":
                        x2, y2 = Formatter.separate_magic(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "ionosphereShuffle.txt":
                        x2, y2 = Formatter.separate_ionosphere(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "sensorShuffled.data":
                        x2, y2 = Formatter.separate_sensor(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "mushroom.data":
                        x2, y2 = Formatter.separate_mushroom(remainingContent)
                        dec = clf.decision_function(x2)
                    if random == True:
                        trainContent.extend(mySplitter.removeRandom(.025))
                    elif distance == True:
                        trainContent.extend(mySplitter.removeClosest(dec,.025))
                    elif diversity ==  True:
                        trainContent.extend(mySplitter.removeBrinkers(dec,0.85,.025))
                    elif probability == True:
                        trainContent.extend(mySplitter.removeProbable(dec,.025))
                else:
                    remainingContent = mySplitter.getContent()
                    if self.inputFilename == "shuffled_magic04.data":
                        x2, y2 = Formatter.separate_magic(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "ionosphereShuffle.txt":
                        x2, y2 = Formatter.separate_ionosphere(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "sensorShuffled.data":
                        x2, y2 = Formatter.separate_sensor(remainingContent)
                        dec = clf.decision_function(x2)
                    elif self.inputFilename == "mushroom.data":
                        x2, y2 = Formatter.separate_mushroom(remainingContent)
                        dec = clf.decision_function(x2)
                    if random == True:
                        trainContent.extend(mySplitter.removeRandom(.1))
                    elif distance == True:
                        trainContent.extend(mySplitter.removeClosest(dec,.1))
                    elif diversity ==  True:
                        trainContent.extend(mySplitter.removeBrinkers(dec,0.85,.1))
                    elif probability == True:
                        trainContent.extend(mySplitter.removeProbable(dec,.1))
                
                if self.inputFilename == "shuffled_magic04.data":
                    x1,y1 = Formatter.separate_magic(trainContent) #attributes are stored in x, labels in y
                    predictContent = mySplitter.getTestContent()
                    x2,y2 = Formatter.separate_magic(predictContent)
                elif self.inputFilename == "ionosphereShuffle.txt":
                    x1,y1 = Formatter.separate_ionosphere(trainContent)
                    predictContent = mySplitter.getTestContent()
                    x2,y2 = Formatter.separate_ionosphere(predictContent)
                elif self.inputFilename == "sensorShuffled.data":
                    x1,y1 = Formatter.separate_sensor(trainContent)
                    predictContent = mySplitter.getTestContent()
                    x2,y2 = Formatter.separate_sensor(predictContent)
                elif self.inputFilename == "mushroom.data":
                    x1,y1 = Formatter.separate_mushroom(trainContent)
                    predictContent = mySplitter.getTestContent()
                    x2,y2 = Formatter.separate_mushroom(predictContent)

                clf.fit(x1,y1) #train the classifier with a labeled set
                prediction = clf.predict(x2) #predict the labels for the x2 set, and store them in a variable
                accuracy, precision, recall = Formatter.confusion_matrix(prediction, y2, output_confusion, j)
                fMeasure = Formatter.f_measure(precision,recall)
                accuracy *= 100
                precision *= 100
                recall *= 100
        
                print ("Accuracy: " + str(accuracy) + "%")
                print ("Precision: " + str(precision) + "%")
                print ("Recall: " + str(recall) + "%")
                print ("F-measure: " + str(fMeasure))
            
                fMeasureIteration.append(fMeasure)
                accuracyIteration.append(accuracy)
                precisionIteration.append(precision)
                recallIteration.append(recall)
            fMeasureList.append(fMeasureIteration)
            accuracyList.append(accuracyIteration)
            precisionList.append(precisionIteration)
            recallList.append(recallIteration)
        outputList = Formatter.takeAverage(fMeasureList)
        Formatter.writeHistogramOutput(outputList,output)
        output.close()
        output_confusion.close()