コード例 #1
0
def PreprocessData(Data,option):
    """Split off instance IDs, set the class attribute, filter and weight a dataset.

    The steps run in this order:

    1. If ``option['idFlag']`` is true, the last attribute is treated as an
       instance ID: a copy of the dataset holding only that attribute is kept
       as ``IDs`` and the attribute is removed from ``Data``.
    2. The last remaining attribute becomes the class attribute.
    3. If ``option['rmClassFlag']`` is true, instances are filtered with
       ``RemoveWithValues`` on the last attribute; ``option['rmClass']`` is
       converted with ``str`` and handed to ``setNominalIndices`` unchanged.
    4. If ``option['weightFlag']`` is true, every remaining instance gets a
       weight that is inversely related to the size of its class (see the
       two helpers below).  Datasets reporting fewer than two classes are
       left unweighted.

    Parameters:
        Data: dataset object exposing the Weka ``Instances`` methods used
            below (``numAttributes``, ``setClassIndex``, ``numClasses``, ...).
        option: dict with the keys ``'idFlag'``, ``'rmClassFlag'``,
            ``'weightFlag'`` and, when ``'rmClassFlag'`` is true, ``'rmClass'``.

    Returns:
        A tuple ``(newData, IDs)``.  ``IDs`` is an empty list when
        ``option['idFlag']`` is false, otherwise the ID-only dataset.
    """
    IDs = []
    if option['idFlag']:
        # Attribute indices are passed as a 1-based index string, so the
        # attribute count addresses the last attribute.
        last_index = String(str(Data.numAttributes()))

        # Inverted selection: drop everything except the ID attribute.
        keep_id = AttributeRemove()
        keep_id.setInvertSelection(Boolean(True))
        keep_id.setAttributeIndices(last_index)
        keep_id.setInputFormat(Data)
        IDs = Filter.useFilter(Data, keep_id)

        # Plain selection: drop the ID attribute from the working dataset.
        drop_id = AttributeRemove()
        drop_id.setInvertSelection(Boolean(False))
        drop_id.setAttributeIndices(last_index)
        drop_id.setInputFormat(Data)
        Data = Filter.useFilter(Data, drop_id)

    # The dependent variable is the last attribute.
    Data.setClassIndex(Data.numAttributes() - 1)

    if option['rmClassFlag']:
        # Remove the instances carrying the requested class label.
        removewithvalues = RemoveWithValues()
        removewithvalues.setAttributeIndex(String('last'))
        removewithvalues.setNominalIndices(String(str(option['rmClass'])))
        removewithvalues.setInputFormat(Data)
        newData = Filter.useFilter(Data, removewithvalues)
    else:
        newData = Data

    if option['weightFlag']:
        # Weights are derived from the dataset that is actually returned, so
        # instances filtered out above no longer influence them.
        num_classes = newData.numClasses()
        if num_classes == 2:
            _weight_two_classes(newData)
        elif num_classes > 2:
            _weight_many_classes(newData)
    return newData, IDs


def _weight_two_classes(dataset):
    """Give each instance the relative frequency of the opposite class as weight."""
    total = dataset.numInstances()
    if total == 0:
        # No instances, hence no reference class value and nothing to weight.
        return

    # The numeric value of either class is unknown up front, so the first
    # instance serves as reference.  Which instance is picked does not matter:
    # choosing the other class swaps both the counts and the weights.
    reference = dataset.instance(0).classValue()
    is_reference = [dataset.instance(i).classValue() == reference
                    for i in range(total)]
    num_reference = is_reference.count(True)
    num_other = total - num_reference

    # NOTE: when only one class is present every weight becomes 0.0; this is
    # the behaviour of the original formula and is kept as is.
    weight_reference = num_other / float(total)
    weight_other = num_reference / float(total)

    for i in range(total):
        if is_reference[i]:
            dataset.instance(i).setWeight(weight_reference)
        else:
            dataset.instance(i).setWeight(weight_other)


def _weight_many_classes(dataset):
    """Weight instances by normalised inverse class frequency (more than two classes)."""
    counts = dataset.attributeStats(dataset.classIndex()).nominalCounts

    # Inverse frequency per class label index.  Labels without instances are
    # skipped: they would divide by zero and no instance needs their weight.
    weight_per_class = {}
    sum_weights = 0.0
    for label_index, count in enumerate(counts):
        if count > 0:
            inverse = 1.0 / count
            weight_per_class[label_index] = inverse
            sum_weights = sum_weights + inverse

    if not weight_per_class:
        # Every class is empty, so there is nothing to weight.
        return

    # Normalise so the per-class weights add up to 1.
    for label_index in list(weight_per_class.keys()):
        weight_per_class[label_index] = weight_per_class[label_index] / sum_weights

    for i in range(dataset.numInstances()):
        instance = dataset.instance(i)
        # classValue() yields the label index as a float; it hashes and
        # compares equal to the integer key used above.
        instance.setWeight(weight_per_class[instance.classValue()])
コード例 #2
0
    		usage()
    		return 1


        options = {'idFlag':True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0}
        # read the first dataset
        fn = inputList[0]
        fid = FileReader(fn)
	Data = Instances(fid)
        Data, IDs = PreprocessData(Data,options)
        # remove class label
        attributeremove = AttributeRemove()
        attributeremove.setInvertSelection(Boolean(False))  # remove class labels from dataset
        attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
        attributeremove.setInputFormat(Data)
        newData = Filter.useFilter(Data, attributeremove)
        # loop over input arff file
        cnt = Data.numAttributes() 
        for fnCnt in range(1,len(inputList)):
             fn = inputList[fnCnt]
             fid = FileReader(fn)
	     Data = Instances(fid)
             Data, IDs = PreprocessData(Data,options)
             # remove class label
             attributeremove = AttributeRemove()
	     attributeremove.setInvertSelection(Boolean(True))  # remove every attribute but the last one which is class label
	     attributeremove.setAttributeIndices(String(str(Data.numAttributes())))
	     attributeremove.setInputFormat(Data)
	     labels = Filter.useFilter(Data, attributeremove)
             attributeremove = AttributeRemove()
             attributeremove.setInvertSelection(Boolean(False))  # remove class labels from dataset