def _filter_last_attribute(data, keep_only_last):
    """Apply an AttributeRemove filter that targets the last attribute of *data*.

    keep_only_last -- when true the selection is inverted, so every attribute
                      except the last one is removed; when false only the last
                      attribute is removed.

    Returns the filtered dataset produced by Filter.useFilter.
    """
    attribute_filter = AttributeRemove()
    attribute_filter.setInvertSelection(Boolean(keep_only_last))
    # The attribute count, as a string, is the index of the last attribute.
    attribute_filter.setAttributeIndices(String(str(data.numAttributes())))
    attribute_filter.setInputFormat(data)
    return Filter.useFilter(data, attribute_filter)


def _weight_instances_by_class(Data, newData):
    """Set instance weights in *newData* so that rarer classes weigh more.

    Data    -- dataset whose class index is set; supplies the class attribute
               and the number of class labels.
    newData -- the (possibly class-filtered) dataset whose instances are
               weighted in place.  Must contain at least one instance.

    Two classes: each instance gets (count of the other class) / (total).
    More than two classes: each instance gets (1 / count of its class),
    normalised so that the per-class weights sum to one.
    """
    num_classes = Data.numClasses()
    # Taken from the dataset rather than from a particular instance, so no
    # specific row has to exist just to look the attribute up.
    class_attr = Data.classAttribute()
    instances = [newData.instance(i) for i in range(newData.numInstances())]

    if num_classes == 2:
        # The numeric code of the class values is not known up front, so the
        # first instance's class serves as the reference; the formula is
        # symmetric, so which class ends up as the reference does not matter.
        reference = instances[0].value(class_attr)
        matches_reference = [inst.value(class_attr) == reference
                             for inst in instances]
        n_reference = matches_reference.count(True)
        n_other = len(instances) - n_reference
        total = float(len(instances))
        weight_reference = n_other / total
        weight_other = n_reference / total
        for inst, is_reference in zip(instances, matches_reference):
            if is_reference:
                inst.setWeight(weight_reference)
            else:
                inst.setWeight(weight_other)

    elif num_classes > 2:
        # Count on newData (not Data) so a class removed by the rmClass filter
        # does not take part in the normalisation.
        counts = newData.attributeStats(Data.classIndex()).nominalCounts
        inverse_count = {}
        inverse_sum = 0.0
        for label_index, count in enumerate(counts):
            if count > 0:  # labels with no instances would divide by zero
                inverse = 1.0 / count
                inverse_count[label_index] = inverse
                inverse_sum = inverse_sum + inverse
        if not inverse_count:
            return
        for inst in instances:
            # The class value is looked up as a label index, exactly as the
            # per-class weights are keyed.  Values without an entry (for
            # example a missing class value) keep their current weight.
            inverse = inverse_count.get(inst.value(class_attr))
            if inverse is not None:
                inst.setWeight(inverse / inverse_sum)


def PreprocessData(Data, option):
    """Prepare a Weka dataset: split off IDs, set the class, filter and weight.

    Data   -- the dataset to preprocess.
    option -- dict with the keys:
              'idFlag'      : true if the last attribute is an ID column; it is
                              split off into its own dataset and removed from
                              Data.
              'rmClassFlag' : true if instances carrying the class label given
                              by 'rmClass' must be removed.
              'rmClass'     : nominal index of the class label to remove (only
                              read when 'rmClassFlag' is true).
              'weightFlag'  : true if instances should be weighted according
                              to the number of samples per class.

    The last remaining attribute is always set as the class attribute.

    Returns a tuple (newData, IDs).  IDs is the dataset holding only the ID
    attribute, or an empty list when 'idFlag' is false.
    """
    IDs = []
    if option['idFlag']:
        # Keep only the ID column first, then drop it from the working data.
        IDs = _filter_last_attribute(Data, True)
        Data = _filter_last_attribute(Data, False)

    # Set the class index - the index of the dependent variable.
    Data.setClassIndex(Data.numAttributes() - 1)

    if option['rmClassFlag']:
        remove_with_values = RemoveWithValues()
        remove_with_values.setAttributeIndex(String('last'))
        remove_with_values.setNominalIndices(String(str(option['rmClass'])))
        remove_with_values.setInputFormat(Data)
        newData = Filter.useFilter(Data, remove_with_values)
    else:
        newData = Data

    # Nothing to weight in an empty dataset; skipping also avoids a division
    # by zero when computing the class proportions.
    if option['weightFlag'] and newData.numInstances() > 0:
        _weight_instances_by_class(Data, newData)

    return newData, IDs
usage() return 1 options = {'idFlag':True, 'weightFlag': False, 'rmClassFlag': False, 'rmClass': 0} # read the first dataset fn = inputList[0] fid = FileReader(fn) Data = Instances(fid) Data, IDs = PreprocessData(Data,options) # remove class label attributeremove = AttributeRemove() attributeremove.setInvertSelection(Boolean(False)) # remove class labels from dataset attributeremove.setAttributeIndices(String(str(Data.numAttributes()))) attributeremove.setInputFormat(Data) newData = Filter.useFilter(Data, attributeremove) # loop over input arff file cnt = Data.numAttributes() for fnCnt in range(1,len(inputList)): fn = inputList[fnCnt] fid = FileReader(fn) Data = Instances(fid) Data, IDs = PreprocessData(Data,options) # remove class label attributeremove = AttributeRemove() attributeremove.setInvertSelection(Boolean(True)) # remove every attribute but the last one which is class label attributeremove.setAttributeIndices(String(str(Data.numAttributes()))) attributeremove.setInputFormat(Data) labels = Filter.useFilter(Data, attributeremove) attributeremove = AttributeRemove() attributeremove.setInvertSelection(Boolean(False)) # remove class labels from dataset