def emlimitateUnusedFeature(self, trainData, testData = None):
        """Remove features that are zero for every training instance.

        Scans each attribute (excluding the class attribute, assumed to be
        the last one) and removes any attribute whose value is 0 in all
        training instances; when testData is given it is kept in sync by
        applying the same removals.

        NOTE(review): the (misspelled) method name is kept unchanged so
        existing callers continue to work. Fix: removed the dead local
        `featureIndex`, which was assigned but never read.

        :param trainData: weka Instances used to decide which features are unused
        :param testData: optional weka Instances filtered identically
        :return: [filteredTrainData, filteredTestData] (second item may be None)
        """
        trainData.set_class_index(trainData.num_attributes() - 1)   # set class attribute
        filteredTrainData = trainData
        filteredTestData = testData

        attribute_index = 0

        # Only advance the index when the current attribute is kept: after a
        # removal the next attribute shifts into the current slot.
        while attribute_index < filteredTrainData.num_attributes() - 1:
            sampleCoverage = 0
            # count instances with a non-zero value for the current feature
            for instance_index in range(0, filteredTrainData.num_instances()):
                instance = filteredTrainData.get_instance(instance_index)
                value = instance.get_value(attribute_index)

                if value > 0:
                    sampleCoverage += 1
            if sampleCoverage == 0:
                # The Remove filter uses 1-based attribute indices.
                remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)])
                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filteredTestData:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", str(attribute_index+1)])
                    remove.set_inputformat(filteredTestData)
                    filteredTestData = remove.filter(filteredTestData)
            else:
                attribute_index += 1

        return [filteredTrainData, filteredTestData]
Exemple #2
0
    def attributeSelector(self, data, selectNum):
        """Keep only the selectNum attributes of data ranked highest by
        information gain (weka AttributeSelection + Ranker)."""
        ranker = "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N " + str(selectNum)
        selector = Filter(
            classname="weka.filters.supervised.attribute.AttributeSelection",
            options=["-S", ranker,
                     "-E", "weka.attributeSelection.InfoGainAttributeEval"])
        selector.set_inputformat(data)
        return selector.filter(data)
    def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
        """Drop every attribute whose name matches an entry in
        unusedFuncitonList (entries are treated as regex prefixes)."""
        result = data
        for name in unusedFuncitonList:
            # RemoveByName deletes attributes matching the -E expression.
            byName = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + name + ".*$"])
            byName.set_inputformat(result)
            result = byName.filter(result)
        return result
    def attributeSelector(self, data, selectNum):
        """Select the top selectNum attributes of data ranked by
        information gain, using weka's AttributeSelection filter.

        :param data: weka Instances with the class attribute set
        :param selectNum: number of attributes to keep (-N for the Ranker)
        :return: filtered weka Instances
        """
        attributeSelector = Filter(classname="weka.filters.supervised.attribute.AttributeSelection",\
                         options=["-S", "weka.attributeSelection.Ranker -T -1.7976931348623157E308 -N " + str(selectNum),\
                                   "-E", "weka.attributeSelection.InfoGainAttributeEval"])

        attributeSelector.set_inputformat(data)
        data = attributeSelector.filter(data)

        return data
Exemple #5
0
    def filterUnusedFeatureFromList(self, data, unusedFuncitonList):
        """Remove every attribute named in unusedFuncitonList from data.

        Each list entry is used as the start of a RemoveByName regex, so
        entries are expected to be (escaped) attribute-name prefixes.

        :param data: weka Instances to filter
        :param unusedFuncitonList: attribute-name prefixes to drop
        :return: filtered weka Instances
        """
        filteredData = data

        for attribute in unusedFuncitonList:
            # RemoveByName deletes attributes matching the -E expression.
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + attribute + ".*$"])
            remove.set_inputformat(filteredData)
            filteredData = remove.filter(filteredData)

        return filteredData
 def getSetDataBySetIndex(self, data, index):
     """Cut the index-th feature set (plus the class attribute) out of data.

     Set boundaries come from FeatureTable.getEachSetStartIndex(); the
     weka Remove filter with -V keeps (instead of removes) the range.
     """
     boundaries = FeatureTable().getEachSetStartIndex()
     first = boundaries[index]
     last = boundaries[index + 1] - 1
     keep = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                   options=["-V", "-R", str(first) + "-" + str(last) + ",last"])
     keep.set_inputformat(data)
     return keep.filter(data)
Exemple #7
0
    def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile,
                                                      apiFile, indexInTable,
                                                      methodName,
                                                      databaseTable,
                                                      csvFilePath):
        """Drop unselected API features from the approach dataset, then
        evaluate the configured classifier on growing feature subsets
        (10, 20, ... then all) and append one CSV row of accuracies.

        :param ourApproahFile: ARFF file holding the full feature set
        :param apiFile: ARFF file fed to the database-driven API ranking
        :param indexInTable: algorithm index into self.algorithmTable
        :param methodName: label written as the first CSV column
        :param databaseTable: DB table holding the feature ranking
        :param csvFilePath: CSV output path ("" skips the title row)
        """
        outputStr = methodName + ","
        resultList = []
        # Get whole feature set of our approach
        filteredData = self.load_Arff(ourApproahFile)
        # Use this function to get selected API feature and save the unselected api in a list
        filterOutList = self.attribueSelectionBasedOnRankingInDatabase(
            apiFile, indexInTable, databaseTable, "")[1]

        # Remove unselected API
        for functionName in filterOutList:
            # escape "()" and "$" so the name is matched literally by the regex
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            remove = Filter(
                classname="weka.filters.unsupervised.attribute.RemoveByName",
                options=["-E", "^" + functionName + ".*$"])
            remove.set_inputformat(filteredData)
            filteredData = remove.filter(filteredData)
        featureNum = filteredData.num_attributes() - 1
        print "featureNum: " + str(featureNum)
        if csvFilePath != "":
            self.writeTenScaledTitleManual(featureNum, csvFilePath)
            #print "i:" + str(i)
            #print "functionName:" + functionName
            #print "featureNum: " + str(filteredData.num_attributes() - 1)
        for attributeStr in filteredData.attributes():
            print(attributeStr)
        # Run ten scaled generation and evaluation
        step = 10
        while step < featureNum:
            roundData = self.attributeSelector(filteredData, step)
            classifier = self.algorithmPicker(roundData, indexInTable)
            evaluation = self.evaluation(classifier, roundData)
            #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
            step += 10

        # final evaluation with the complete (filtered) feature set
        classifier = self.algorithmPicker(filteredData, indexInTable)
        evaluation = self.evaluation(classifier, filteredData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # Write out to CSV file
        for item in resultList:
            outputStr += item + ","
        outputStr = outputStr[0:-1] + "\n"
        self.writeToPath(csvFilePath, outputStr)
Exemple #8
0
    def getSetDataBySetIndex(self, data, index):
        """Return only the index-th feature set (plus the class attribute).

        Set boundaries come from FeatureTable.getEachSetStartIndex();
        the Remove filter's -R indices are 1-based and -V inverts the
        removal so the listed range is kept instead.
        """
        # cut feature set out
        featureTable = FeatureTable()
        startIndexList = featureTable.getEachSetStartIndex()

        start = startIndexList[index]
        end = startIndexList[index + 1] - 1
        remove = Filter(
            classname="weka.filters.unsupervised.attribute.Remove",
            options=["-V", "-R",
                     str(start) + "-" + str(end) + ",last"])
        remove.set_inputformat(data)
        filteredData = remove.filter(data)
        return filteredData
Exemple #9
0
 def _pre_process_to_classification(self, dataset):
     """Turn the numeric class (last attribute) into a binary nominal one.

     MathExpression maps class values >0 to 1 and everything else to 0
     (the class is temporarily unset so the filter may modify it), then
     NumericToNominal converts that 0/1 attribute to a nominal type.
     """
     filter_data = Filter(classname = 'weka.filters.unsupervised.attribute.MathExpression', 
                          options = ['-unset-class-temporarily', '-E', "ifelse ( A>0, 1, 0 )", 
                                     '-V', '-R', 'last'])
     
     filter_data.set_inputformat(dataset)
     filtered = filter_data.filter(dataset)
     
     discretize_data = Filter(classname = 'weka.filters.unsupervised.attribute.NumericToNominal', 
                          options = ['-R', 'last'])
     
     discretize_data.set_inputformat(filtered)
     discretized = discretize_data.filter(filtered)
     
     return discretized
 def filterOutUnnecessaryAPIAndEvaluateOurApproach(self, ourApproahFile, apiFile, indexInTable, methodName, databaseTable, csvFilePath):
     """Drop unselected API features, then evaluate the configured
     classifier on growing feature subsets (10, 20, ... then all) and
     append one CSV row of accuracies to csvFilePath.
     """
     outputStr = methodName+","
     resultList = []
     # Get whole feature set of our approach
     filteredData = self.load_Arff(ourApproahFile)
     # Use this function to get selected API feature and save the unselected api in a list
     filterOutList = self.attribueSelectionBasedOnRankingInDatabase(apiFile, indexInTable, databaseTable, "")[1]
     
     # Remove unselected API
     for functionName in filterOutList:
         # escape "()" and "$" so the name is matched literally by the regex
         functionName = functionName.split("(")[0] + "\(\)"
         functionName = functionName.replace('$','\$')
         remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
         remove.set_inputformat(filteredData)
         filteredData = remove.filter(filteredData)
     featureNum = filteredData.num_attributes() - 1
     print "featureNum: " + str(featureNum)
     if csvFilePath != "":
         self.writeTenScaledTitleManual(featureNum, csvFilePath)
         #print "i:" + str(i)
         #print "functionName:" + functionName
         #print "featureNum: " + str(filteredData.num_attributes() - 1)
     for attributeStr in filteredData.attributes():
         print(attributeStr)
     # Run ten scaled generation and evaluation 
     step = 10 
     while step < featureNum:
         roundData = self.attributeSelector(filteredData, step)
         classifier = self.algorithmPicker(roundData, indexInTable)
         evaluation = self.evaluation(classifier, roundData)
         #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(roundData.num_attributes() - 1) + "/" + str(featureNum))
         resultList.append("{:.2f}".format(evaluation.percent_correct()))
         #csvFile.write("{:.2f}".format(evaluation.percent_correct()) +",")
         step += 10
     
     # final evaluation with the complete (filtered) feature set
     classifier = self.algorithmPicker(filteredData, indexInTable)
     evaluation = self.evaluation(classifier, filteredData)
     #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredData.num_attributes() - 1) + "/" + str(featureNum))
     resultList.append("{:.2f}".format(evaluation.percent_correct()))
     
     # Write out to CSV file
     for item in resultList:
         outputStr += item +","
     outputStr = outputStr[0:-1] + "\n"
     self.writeToPath(csvFilePath, outputStr)
Exemple #11
0
    def createTwoDatasets(self,
                          wholeDataPath,
                          trainingDataPercentage,
                          trainingPath,
                          testingPath,
                          shuffleSeed=43):
        """Shuffle an ARFF dataset and split it into train/test ARFF files.

        RemovePercentage with -V keeps the first trainingDataPercentage
        percent (training split); without -V it keeps the remainder
        (testing split). Both splits are written out with save_Arff.

        :param wholeDataPath: path of the full ARFF dataset
        :param trainingDataPercentage: percentage used for training
        :param trainingPath: output path for the training ARFF
        :param testingPath: output path for the testing ARFF
        :param shuffleSeed: seed for the Randomize filter
        """
        wholeData = self.load_Arff(wholeDataPath)
        randomize = Filter(
            classname="weka.filters.unsupervised.instance.Randomize",
            options=["-S", str(shuffleSeed)])
        randomize.set_inputformat(wholeData)
        wholeData = randomize.filter(wholeData)

        removePercentage = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(trainingDataPercentage), "-V"])
        removePercentage.set_inputformat(wholeData)
        trainingData = removePercentage.filter(wholeData)
        print "instances:" + str(trainingData.num_instances())

        removePercentage = Filter(
            classname="weka.filters.unsupervised.instance.RemovePercentage",
            options=["-P", str(trainingDataPercentage)])
        removePercentage.set_inputformat(wholeData)
        testingData = removePercentage.filter(wholeData)

        print "instances:" + str(testingData.num_instances())

        self.save_Arff(trainingData, trainingPath)
        self.save_Arff(testingData, testingPath)
Exemple #12
0
    def emlimitateUnusedFeature(self, trainData, testData=None):
        """Remove features that are zero in every training instance.

        Scans each attribute (excluding the class attribute, assumed to
        be the last one) and removes any attribute whose value is 0 in
        all training instances; testData, when given, is kept in sync.

        :return: [filteredTrainData, filteredTestData] (second may be None)
        """
        trainData.set_class_index(trainData.num_attributes() -
                                  1)  # set class attribute
        featureIndex = -1  # NOTE(review): unused local, kept as-is
        filteredTrainData = trainData
        filteredTestData = testData

        attribute_index = 0

        # Only advance when the current attribute is kept; after a removal
        # the next attribute shifts into the current slot.
        while attribute_index < filteredTrainData.num_attributes() - 1:
            sampleCoverage = 0
            #print attribute_index
            # check value for current feature in each instance
            for instance_index in range(0, filteredTrainData.num_instances()):
                instance = filteredTrainData.get_instance(instance_index)
                value = instance.get_value(attribute_index)

                if value > 0:
                    sampleCoverage += 1
            if sampleCoverage == 0:
                #print "found"
                remove = Filter(
                    classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", str(attribute_index + 1)
                             ])  #The index in this function start from 1
                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filteredTestData:
                    remove = Filter(
                        classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", str(attribute_index + 1)
                                 ])  #The index in this function start from 1
                    remove.set_inputformat(filteredTestData)
                    filteredTestData = remove.filter(filteredTestData)
            else:
                attribute_index += 1

        return [filteredTrainData, filteredTestData]
 def createTwoDatasets(self, wholeDataPath, trainingDataPercentage, trainingPath, testingPath, shuffleSeed = 43):
     """Shuffle an ARFF dataset and split it into train/test ARFF files.

     RemovePercentage with -V keeps the first trainingDataPercentage
     percent (training split); without -V it keeps the remainder
     (testing split). Both splits are saved with save_Arff.
     """
     wholeData = self.load_Arff(wholeDataPath)
     randomize = Filter(classname="weka.filters.unsupervised.instance.Randomize", options=["-S", str(shuffleSeed)])
     randomize.set_inputformat(wholeData)
     wholeData = randomize.filter(wholeData)
     
     removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage), "-V"])
     removePercentage.set_inputformat(wholeData)
     trainingData = removePercentage.filter(wholeData)
     print "instances:" + str(trainingData.num_instances())
     
     removePercentage = Filter(classname="weka.filters.unsupervised.instance.RemovePercentage", options=["-P", str(trainingDataPercentage)])
     removePercentage.set_inputformat(wholeData)
     testingData = removePercentage.filter(wholeData)
     
     print "instances:" + str(testingData.num_instances())
     
     self.save_Arff(trainingData, trainingPath)
     self.save_Arff(testingData, testingPath)
Exemple #14
0
from weka.core.converters import Loader
from weka.clusterers import Clusterer, ClusterEvaluation
from weka.filters import Filter
import weka.plot.clusterers as plc

# Demo script: cluster the iris data with SimpleKMeans (class attribute
# removed first), evaluate the clustering, and plot the assignments.
jvm.start()

# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute so clustering is unsupervised
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans with 3 clusters
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
Exemple #15
0
            writer.writerow(row)

# close csvfile
csvfile.close()

# start JVM
jvm.start()

# load CSV file (values quoted with '"', fields separated by ",")
loader = Loader(classname="weka.core.converters.CSVLoader", options=["-E", '"', "-F", ","])
data = loader.load_file(csvfilename)
#print(data)

# convert class (last attribute) from string to nominal
wfilter = Filter(classname="weka.filters.unsupervised.attribute.StringToNominal", options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content (first attribute) to string
wfilter = Filter(classname="weka.filters.unsupervised.attribute.NominalToString", options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline: ZeroR (majority class) under 10-fold cross-validation
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(zeror, data, 10, Random(1))
print("\nBaseline:\n" + evaluation.to_summary())
Exemple #16
0
csvfile.close()

# start JVM
jvm.start()

# load CSV file (values quoted with '"', fields separated by ",")
loader = Loader(classname="weka.core.converters.CSVLoader",
                options=["-E", '"', "-F", ","])
data = loader.load_file(csvfilename)
#print(data)

# convert class (last attribute) from string to nominal
wfilter = Filter(
    classname="weka.filters.unsupervised.attribute.StringToNominal",
    options=["-R", "last"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# convert content (first attribute) to string
wfilter = Filter(
    classname="weka.filters.unsupervised.attribute.NominalToString",
    options=["-C", "first"])
wfilter.set_inputformat(data)
data = wfilter.filter(data)

# set class attribute
data.set_class_index(data.num_attributes() - 1)

# generate baseline with ZeroR (majority-class classifier)
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
evaluation = Evaluation(data)
Exemple #17
0
# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# simulate the 10 train/test pairs of cross-validation;
# evl accumulates evaluation statistics across all folds
evl = Evaluation(data)
for i in xrange(1, 11):
    # create train set (-V inverts the fold selection: keep all but fold i)
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1", "-V"])
    remove.set_inputformat(data)
    train = remove.filter(data)

    # create test set (fold i only)
    remove = Filter(
        classname="weka.filters.supervised.instance.StratifiedRemoveFolds",
        options=["-N", "10", "-F", str(i), "-S", "1"])
    remove.set_inputformat(data)
    test = remove.filter(data)

    # train J48 on the nine folds and score it on the held-out fold
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    evl.test_model(cls, test)

print("Simulated CV accuracy: %0.1f%%" % (evl.percent_correct()))
    def attribueSelectionBasedOnRankingInDatabase(self, trainingData, indexInTable, databaseTable, csvFilePath, testingData = None):
        """Greedy backward feature elimination driven by a DB ranking.

        Reads the ranked feature list from databaseTable, then repeatedly
        removes features from the training (and optional testing) data —
        first the remainder modulo 10, then 10 at a time — re-evaluating
        the classifier after each chunk and remembering the best-scoring
        datasets. Appends one CSV row of accuracies (reversed, so the
        smallest feature set comes first) to csvFilePath.

        :param trainingData: weka Instances to prune
        :param indexInTable: algorithm index into self.algorithmTable
        :param databaseTable: DB table holding the ranked feature names
        :param csvFilePath: CSV output path for the accuracy row
        :param testingData: optional weka Instances kept in sync
        :return: [bestEvaluation, bestTrainingData, bestTestingData, resultList]
        """
        featureNum = trainingData.num_attributes() - 1
        outputStr = ""
        outputStr += databaseTable+","

        # select from database vector difference
        featureList3 = []
        wholefeatureList = []
        dbmgr = permissionMappingManager(databasePath)

        for row in dbmgr.query("select * from " + databaseTable):
            featureList3.append(row[0])
            wholefeatureList.append(row[0])
        #featureList3.reverse()
        
        bestRemainFilterList = []
        resultList = []
        # digit: leftover count so later removals come in clean chunks of 10
        digit = len(featureList3) % 10

        bestAccuracy = 0
        bestTrainingData = None
        bestTestingData = None
        bestEvaluation = None
        
        # baseline evaluation with the full feature set
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation
            
        print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        
        # drop the leftover (len % 10) lowest-ranked features first
        if digit > 0:
            for i in range(0, digit):
                # escape "()" and "$" so the name is matched literally
                functionName = featureList3.pop().split("(")[0] + "\(\)"
                functionName = functionName.replace('$','\$')
                #print "functionName:" + functionName
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)
                
                #print "i:" + str(i)
                #print "functionName:" + functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
            #self.printFunctionInfo(trainingData, trainingData.num_instances())
            
            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation
                
            print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            
        # then drop 10 features per round until at most 10 remain
        while trainingData.num_attributes() - 1 > 10:
            for i in range(0,10):
                functionName = featureList3.pop().split("(")[0] + "\(\)"
                functionName = functionName.replace('$','\$')
                #print "functionName:" + functionName
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)
                #print functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)
                
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
            
            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:
                
                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation
                #print "update feature number:" + str(len(bestRemainFilterList))
                
            print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # smallest feature set first in the CSV row
        resultList.reverse()
        
        # NOTE(review): fileteredfeatureList is computed but never used or
        # returned — presumably the list of removed features; confirm intent.
        fileteredfeatureList = []
        #print "bestRemainFilterList number:" + str(len(bestRemainFilterList))
        #print "wholefeatureList number:" + str(len(wholefeatureList))
        for item in wholefeatureList:
            if item not in bestRemainFilterList:
                fileteredfeatureList.append(item)
                
        #print "update fileteredfeatureList number:" + str(len(fileteredfeatureList))
        for item in resultList:
            outputStr += item +","
        outputStr = outputStr[0:-1] + "\n"
        
        print outputStr
        self.writeToPath(csvFilePath, outputStr)
        accuracyStr = "{:.2f}".format(bestAccuracy)  # NOTE(review): unused local
        #print fileteredfeatureList
        return [bestEvaluation, bestTrainingData, bestTestingData, resultList]
Exemple #19
0
    def getTenScaledResultsRankedByInfo(self,
                                        trainingData,
                                        indexInTable,
                                        csvFilePath,
                                        testingData=None):
        """Evaluate the classifier on information-gain-ranked feature
        subsets of size 10, 20, ... and finally the full set.

        For each step the top features are picked with attributeSelector,
        the unpicked features are removed from copies of the training and
        (optional) testing data to keep them synchronized, and the
        classifier is evaluated. Accuracies are appended as one CSV row.

        :param trainingData: weka Instances with class attribute set
        :param indexInTable: algorithm index into self.algorithmTable
        :param csvFilePath: CSV file receiving the accuracy row
        :param testingData: optional weka Instances kept in sync
        :return: [bestAccuracy, bestTrainData, bestTestData, resultList]
        """
        dbmgr = permissionMappingManager(databasePath)
        featureNum = trainingData.num_attributes() - 1

        # build escaped-name list for every attribute of the training data
        attributeIn = trainingData.attributes()
        attributeList = []
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$', '\$')
            #print functionName
            attributeList.append(functionName)

        outputStr = ""
        outputStr += "InfomationGain" + ","
        resultList = []
        bestAccuracy = 0
        bestTrainData = 0
        bestTestData = 0

        #for index in range(0, len(attributeList)-1):
        #    attributeList[index] = attributeList[index].split(" ")[1]
        #    print attributeList[index]

        # NOTE(review): csvFile is opened but never closed; outputStr is
        # written separately via writeToPath below — confirm intent.
        csvFile = open(csvFilePath, "a")
        csvFile.write(self.algorithmTable[indexInTable] + ",")

        step = 10
        while step < featureNum:
            # pick top features
            filteredTrainData = self.attributeSelector(trainingData, step)

            # check top feature informations
            APIList = []
            for item in filteredTrainData.attributes():
                #print str(item)
                functionName = str(item).split(" ")[1]
                #functionName = functionName.split("_")[0][1:]
                APIList.append(functionName)

            numberOfInstance = self.getNumOfInstance(trainingData)

            # Get those features that it doesn't pick
            filteredList = []
            attributeIn = filteredTrainData.attributes()
            for item in attributeIn:
                functionName = str(item).split(" ")[1]
                functionName = functionName.split("(")[0] + "\(\)"
                functionName = functionName.replace('$', '\$')
                filteredList.append(functionName)

            items = self.getItemsNotInTheList(attributeList, filteredList)
            #print len(items)
            #for item in items:
            #    print item
            # Re-process training data and make testing Data synchronized

            filteredTrainData = trainingData
            filterTestingData = testingData
            for attribute in items:
                remove = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.RemoveByName",
                    options=["-E", "^" + attribute + ".*$"])

                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filterTestingData:
                    remove.set_inputformat(filterTestingData)
                    filterTestingData = remove.filter(filterTestingData)
                #print attribute
                #print str(filteredTrainData.num_attributes() - 1)

            # Build classifier and evaluate it
            classifier = self.algorithmPicker(filteredTrainData, indexInTable)
            evaluation = self.evaluation(classifier, filteredTrainData,
                                         filterTestingData)
            #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))

            #Save best data and accuracy
            if evaluation.percent_correct() > bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainData = filteredTrainData
                if testingData:
                    bestTestData = filterTestingData
                #bestEvaluation = evaluation
            step += 10

        # final evaluation with the full feature set
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

        #Save best data and accuracy
        # NOTE(review): this branch stores the last loop's filteredTrainData,
        # not the full trainingData just evaluated — looks suspicious; confirm.
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData
            #bestEvaluation = evaluation

        for item in resultList:
            outputStr += item + ","
        outputStr = outputStr[0:-1] + "\n"
        self.writeToPath(csvFilePath, outputStr)
        return [bestAccuracy, bestTrainData, bestTestData, resultList]
    def getTenScaledResultsRankedByInfo(self, trainingData, indexInTable, csvFilePath, testingData = None):
        dbmgr = permissionMappingManager(databasePath)
        featureNum = trainingData.num_attributes() - 1
        
        attributeIn = trainingData.attributes()
        attributeList = []
        for item in attributeIn:
            functionName = str(item).split(" ")[1]
            functionName = functionName.split("(")[0] + "\(\)"
            functionName = functionName.replace('$','\$')
            #print functionName
            attributeList.append(functionName)
        
        
        outputStr = ""
        outputStr += "InfomationGain" + ","
        resultList = []
        bestAccuracy = 0
        bestTrainData = 0
        bestTestData = 0
        
        #for index in range(0, len(attributeList)-1):
        #    attributeList[index] = attributeList[index].split(" ")[1]
        #    print attributeList[index]
        

        csvFile = open(csvFilePath, "a")
        csvFile.write(self.algorithmTable[indexInTable]+",") 
        
        step = 10 
        while step < featureNum:
            # pick top features
            filteredTrainData = self.attributeSelector(trainingData, step)
            
            
            # check top feature informations
            APIList = []  
            for item in filteredTrainData.attributes():
                #print str(item)
                functionName = str(item).split(" ")[1]
                #functionName = functionName.split("_")[0][1:] 
                APIList.append(functionName)
                
            numberOfInstance = self.getNumOfInstance(trainingData)
            
            
                
            # Get those features that it doesn't pick
            filteredList = []
            attributeIn = filteredTrainData.attributes()
            for item in attributeIn:
                functionName = str(item).split(" ")[1]
                functionName = functionName.split("(")[0] + "\(\)"
                functionName = functionName.replace('$','\$')
                filteredList.append(functionName)

            items = self.getItemsNotInTheList(attributeList, filteredList)
            #print len(items)
            #for item in items:
            #    print item
            # Re-process training data and make testing Data synchronized

            filteredTrainData = trainingData
            filterTestingData = testingData
            for attribute in items:                
                remove = Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=["-E", "^" + attribute + ".*$"])

                remove.set_inputformat(filteredTrainData)
                filteredTrainData = remove.filter(filteredTrainData)
                if filterTestingData:
                    remove.set_inputformat(filterTestingData)
                    filterTestingData = remove.filter(filterTestingData)
                #print attribute
                #print str(filteredTrainData.num_attributes() - 1)

            # Build classifier and evaluate it   
            classifier = self.algorithmPicker(filteredTrainData, indexInTable)    
            evaluation = self.evaluation(classifier, filteredTrainData, filterTestingData)
            #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(filteredTrainData.num_attributes() - 1) + "/" + str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))
            
            #Save best data and accuracy
            if evaluation.percent_correct() > bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainData = filteredTrainData
                if testingData:
                    bestTestData = filterTestingData
                #bestEvaluation = evaluation
            step += 10
            
        
        
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        #print(self.algorithmTable[indexInTable] + ": " + "{:.2f}".format(evaluation.percent_correct()) + ", Feature select number:" + str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))
        
        #Save best data and accuracy
        if evaluation.percent_correct() > bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainData = filteredTrainData
            if testingData:
                bestTestData = filterTestingData
            #bestEvaluation = evaluation
        
        for item in resultList:
            outputStr += item +","
        outputStr = outputStr[0:-1] + "\n"
        self.writeToPath(csvFilePath, outputStr)
        return [bestAccuracy, bestTrainData, bestTestData, resultList]
Exemple #21
0
    def attribueSelectionBasedOnRankingInDatabase(self,
                                                  trainingData,
                                                  indexInTable,
                                                  databaseTable,
                                                  csvFilePath,
                                                  testingData=None):
        """Backward feature elimination driven by a ranking stored in a DB table.

        Loads a feature ranking from ``databaseTable`` (one feature per row,
        column 0), then repeatedly strips features popped from the END of that
        list -- first a remainder chunk so the count lines up on a multiple of
        10, then batches of 10 -- re-training and re-evaluating the classifier
        chosen by ``indexInTable`` after each batch.  One CSV row of accuracies
        is appended to ``csvFilePath`` and the best-performing data sets are
        kept.

        NOTE(review): the method name is misspelled ("attribue") but kept
        as-is since external callers depend on it.

        Parameters:
            trainingData: Weka Instances used for training; progressively
                filtered in place (rebound) as features are removed.
            indexInTable: index into ``self.algorithmTable`` selecting the
                classifier via ``self.algorithmPicker``.
            databaseTable: table holding the ranking; interpolated directly
                into the SQL string (assumed trusted, internal input).
            csvFilePath: path the accuracy row is written to.
            testingData: optional Weka Instances, filtered in lockstep with
                the training data so attribute layouts stay aligned.

        Returns:
            [bestEvaluation, bestTrainingData, bestTestingData, resultList]
            -- NOTE(review): sibling routines in this file return the best
            *accuracy* (a float) as the first element; confirm callers expect
            the Evaluation object here.
        """
        featureNum = trainingData.num_attributes() - 1  # excludes the class attribute
        outputStr = ""
        outputStr += databaseTable + ","

        # select from database vector difference
        # Load the ranked feature list; row[0] is presumably the feature name,
        # in the table's own ordering -- TODO confirm schema and sort order.
        featureList3 = []
        wholefeatureList = []
        dbmgr = permissionMappingManager(databasePath)

        for row in dbmgr.query("select * from " + databaseTable):
            featureList3.append(row[0])
            wholefeatureList.append(row[0])
        #featureList3.reverse()

        bestRemainFilterList = []
        resultList = []
        # Size of the first (remainder) removal chunk, so the subsequent
        # removals can proceed in even batches of 10.
        digit = len(featureList3) % 10

        bestAccuracy = 0
        bestTrainingData = None
        bestTestingData = None
        bestEvaluation = None

        # Baseline: evaluate on the full feature set before removing anything.
        # ``>=`` means later (smaller) feature sets win ties.
        classifier = self.algorithmPicker(trainingData, indexInTable)
        evaluation = self.evaluation(classifier, trainingData, testingData)
        if evaluation.percent_correct() >= bestAccuracy:
            bestAccuracy = evaluation.percent_correct()
            bestTrainingData = trainingData
            bestTestingData = testingData
            bestRemainFilterList = list(featureList3)
            bestEvaluation = evaluation

        print(self.algorithmTable[indexInTable] + ": " +
              "{:.2f}".format(evaluation.percent_correct()) +
              ", Feature select number:" +
              str(trainingData.num_attributes() - 1) + "/" + str(featureNum))
        resultList.append("{:.2f}".format(evaluation.percent_correct()))

        if digit > 0:
            # Remove the remainder chunk: pop the last ``digit`` entries of
            # the ranking, escaping each name for the RemoveByName regex.
            for i in range(0, digit):
                functionName = featureList3.pop().split("(")[0] + "\(\)"
                functionName = functionName.replace('$', '\$')
                #print "functionName:" + functionName
                remove = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.RemoveByName",
                    options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    # Keep the test set's attribute layout in sync.
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)

                #print "i:" + str(i)
                #print "functionName:" + functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)
            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)
            #self.printFunctionInfo(trainingData, trainingData.num_instances())

            # Re-evaluate after the remainder chunk has been removed.
            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:
                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation

            print(self.algorithmTable[indexInTable] + ": " +
                  "{:.2f}".format(evaluation.percent_correct()) +
                  ", Feature select number:" +
                  str(trainingData.num_attributes() - 1) + "/" +
                  str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # Keep removing batches of 10 features until at most 10 remain
        # (the class attribute is not counted).
        while trainingData.num_attributes() - 1 > 10:
            for i in range(0, 10):
                functionName = featureList3.pop().split("(")[0] + "\(\)"
                functionName = functionName.replace('$', '\$')
                #print "functionName:" + functionName
                remove = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.RemoveByName",
                    options=["-E", "^" + functionName + ".*$"])
                remove.set_inputformat(trainingData)
                trainingData = remove.filter(trainingData)
                if testingData:
                    remove.set_inputformat(testingData)
                    testingData = remove.filter(testingData)
                #print functionName
                #print "featureNum: " + str(filteredData.num_attributes() - 1)

            #for attributeStr in trainingData.attributes():
            #    print(attributeStr)

            classifier = self.algorithmPicker(trainingData, indexInTable)
            evaluation = self.evaluation(classifier, trainingData, testingData)
            if evaluation.percent_correct() >= bestAccuracy:

                bestAccuracy = evaluation.percent_correct()
                bestTrainingData = trainingData
                bestTestingData = testingData
                bestRemainFilterList = list(featureList3)
                bestEvaluation = evaluation
                #print "update feature number:" + str(len(bestRemainFilterList))

            print(self.algorithmTable[indexInTable] + ": " +
                  "{:.2f}".format(evaluation.percent_correct()) +
                  ", Feature select number:" +
                  str(trainingData.num_attributes() - 1) + "/" +
                  str(featureNum))
            resultList.append("{:.2f}".format(evaluation.percent_correct()))

        # Results were appended from the full set down to the smallest one;
        # reverse so the CSV row runs from fewest features to most.
        resultList.reverse()

        # Features NOT in the best-performing set.
        # NOTE(review): computed but never returned or written -- appears to
        # be kept only for the (commented-out) debug print below.
        fileteredfeatureList = []
        #print "bestRemainFilterList number:" + str(len(bestRemainFilterList))
        #print "wholefeatureList number:" + str(len(wholefeatureList))
        for item in wholefeatureList:
            if item not in bestRemainFilterList:
                fileteredfeatureList.append(item)

        #print "update fileteredfeatureList number:" + str(len(fileteredfeatureList))
        for item in resultList:
            outputStr += item + ","
        outputStr = outputStr[0:-1] + "\n"  # drop the trailing comma

        print outputStr
        self.writeToPath(csvFilePath, outputStr)
        accuracyStr = "{:.2f}".format(bestAccuracy)  # NOTE(review): unused local
        #print fileteredfeatureList
        return [bestEvaluation, bestTrainingData, bestTestingData, resultList]
Exemple #22
0
from weka.core.converters import Loader, Saver
from weka.core.dataset import Instances
from weka.filters import Filter

jvm.start()

# Load the nominal weather data set from the data directory.
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
arff_loader = Loader(classname="weka.core.converters.ArffLoader")
dataset = arff_loader.load_file(fname)

# Show the header (attribute declarations, no rows) of the raw data.
print(Instances.template_instances(dataset))

# Drop attribute number 3 with the unsupervised Remove filter
# (Weka's -R option is 1-based).
print("\nRemove attribute no 3")
remove_filter = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "3"])
remove_filter.set_inputformat(dataset)
reduced = remove_filter.filter(dataset)

# Show the header of the filtered data.
print(Instances.template_instances(reduced))

# Persist the filtered data set as a new ARFF file.
arff_saver = Saver(classname="weka.core.converters.ArffSaver")
arff_saver.save_file(reduced, data_dir + os.sep + "weather.nominal-filtered.arff")

jvm.stop()

Exemple #23
0
# Load the ARFF data set named by ``fname`` (defined earlier in the file)
# and use the last attribute as the class.
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# plot
# Scatter-plot petalwidth vs petallength (iris-style attribute names);
# NOTE(review): ``pld`` is presumably weka.plot.dataset -- confirm the
# import above this chunk.
pld.scatter_plot(
    data, data.get_attribute_by_name("petalwidth").get_index(),
    data.get_attribute_by_name("petallength").get_index(),
    wait=False)

# add classifier errors to dataset
# AddClassification appends J48's predicted class (-classification) and an
# error flag (-error) as extra attributes.
addcls = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.trees.J48", "-classification", "-error"])
addcls.set_inputformat(data)
filtered = addcls.filter(data)
print(filtered)

# build J48
# Evaluation is on the training data itself (resubstitution -- optimistic,
# no train/test split).
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(data)
evl = Evaluation(data)
evl.test_model(cls, data)

# plot classifier errors
# NOTE(review): ``plc`` is presumably weka.plot.classifiers -- confirm.
plc.plot_classifier_errors(evl.predictions(), wait=True)

jvm.stop()

Exemple #24
0
 def _normalize_dataset(self, dataset):
     """Run Weka's unsupervised Normalize filter over *dataset* and return the result."""
     normalizer = Filter(classname='weka.filters.unsupervised.attribute.Normalize',
                         options=[])
     normalizer.set_inputformat(dataset)
     return normalizer.filter(dataset)