Example #1
0
 def prune(self):
     """Prune the subtree rooted at this node, working from the sons upward.

     Nothing happens for a leaf. Otherwise every son is pruned first and
     three estimated error figures are obtained:

     * the error of the distribution of this node's local model
       (i.e. this node turned into a leaf),
     * the error of the current subtree (``getEstimatedErrors``),
     * the error of the largest branch evaluated on ``m_train``; this is
       only computed when ``m_subtreeRaising`` is set and is otherwise
       taken to be infinite.

     If the leaf error does not exceed either of the other two by more
     than 0.1, the node becomes a leaf with a ``NoSplit`` model. Otherwise,
     if the largest-branch error does not exceed the subtree error by more
     than 0.1, this node takes over that branch's sons, model and leaf
     flag, its distributions are recomputed from ``m_train``, and it is
     pruned again.
     """
     if self.m_isLeaf:
         return

     # Sons first, so the estimates below describe already-pruned subtrees.
     for sonIdx in range(len(self.m_sons)):
         self.son(sonIdx).prune()

     biggest = self.localModel().distribution().maxBag()
     branchErr = float("inf")
     if self.m_subtreeRaising:
         branchErr = self.son(biggest).getEstimatedErrorsForBranch(
             self.m_train)
     leafErr = self.getEstimatedErrorsForDistribution(
         self.localModel().distribution())
     treeErr = self.getEstimatedErrors()

     def notBelow(lhs, rhs):
         # "lhs >= rhs" under the tolerance used by Utils.
         return Utils.gr(lhs, rhs) or Utils.equal(lhs, rhs)

     if notBelow(treeErr + 0.1, leafErr) and notBelow(branchErr + 0.1,
                                                       leafErr):
         # Collapse into a leaf.
         self.m_sons = None
         self.m_isLeaf = True
         self.m_localModel = NoSplit(self.localModel().distribution())
         return

     if notBelow(treeErr + 0.1, branchErr):
         # Replace this node by its largest branch, then prune the result.
         raised = self.son(biggest)
         self.m_sons = raised.m_sons
         self.m_localModel = raised.localModel()
         self.m_isLeaf = raised.m_isLeaf
         self.newDistribution(self.m_train)
         self.prune()
Example #2
0
 def handleNumericAttribute(self, trainInstances: Instances):
     """Look for the best binary split point on numeric attribute ``m_attIndex``.

     Instances are read in order up to the first one whose attribute value
     is missing; only that leading run is considered.
     NOTE(review): this presumably relies on the caller having sorted
     ``trainInstances`` by the attribute with missing values last - confirm.

     On success ``m_numSubsets``, ``m_splitPoint``, ``m_distribution``,
     ``m_infoGain`` and ``m_gainRatio`` are set. The method returns early,
     without setting ``m_numSubsets``, when there are too few instances,
     when no candidate split satisfies the minimum bag size, or when the
     best information gain is not positive.

     Args:
         trainInstances: the training instances to split.
     """
     nextIdx = 1
     lastIdx = 0
     splitIndex = -1
     self.m_distribution = Distribution(2, trainInstances.numClasses())

     # Put every instance with a known value into bag 1 and count them;
     # stop at the first missing value.
     firstMiss = 0
     for inst in trainInstances:
         if inst.isMissing(self.m_attIndex):
             break
         self.m_distribution.add(1, inst)
         firstMiss += 1

     # Minimum weight required in each bag: 10% of the known weight per
     # class, but at least m_minNoObj and at most 25.
     minSplit = 0.1 * self.m_distribution.total(
     ) / trainInstances.numClasses()
     if Utils.gr(self.m_minNoObj, minSplit) or Utils.equal(
             minSplit, self.m_minNoObj):
         minSplit = self.m_minNoObj
     elif Utils.gr(minSplit, 25):
         minSplit = 25

     # Not enough known instances to fill two bags.
     if Utils.gr(2 * minSplit, firstMiss):
         return

     defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)
     while nextIdx < firstMiss:
         # A split is only possible between two distinct attribute values.
         if trainInstances.instance(nextIdx - 1).value(
                 self.m_attIndex) + 1e-5 < trainInstances.instance(
                     nextIdx).value(self.m_attIndex):
             # Move the instances in [lastIdx, nextIdx) from bag 1 to bag 0.
             self.m_distribution.shiftRange(1, 0, trainInstances, lastIdx,
                                            nextIdx)
             if (Utils.gr(self.m_distribution.perBag(0), minSplit) or Utils.equal(self.m_distribution.perBag(0), minSplit))\
                     and (Utils.gr(self.m_distribution.perBag(1), minSplit) or Utils.equal(self.m_distribution.perBag(1), minSplit)):
                 currentInfoGain = self.infoGainCrit.splitCritValue(
                     self.m_distribution, self.m_sumOfWeights, defaultEnt)
                 if Utils.gr(currentInfoGain, self.m_infoGain):
                     self.m_infoGain = currentInfoGain
                     splitIndex = nextIdx - 1
                 # m_index counts the admissible candidate splits.
                 self.m_index += 1
             lastIdx = nextIdx
         nextIdx += 1

     # No admissible split at all.
     if self.m_index == 0:
         return

     if self.m_useMDLcorrection:
         # Penalise the gain by log2(number of candidates) / total weight.
         self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                              self.m_sumOfWeights)
     if Utils.gr(0, self.m_infoGain) or Utils.equal(0, self.m_infoGain):
         return

     self.m_numSubsets = 2
     # Split half-way between the two neighbouring attribute values.
     self.m_splitPoint = (
         trainInstances.instance(splitIndex + 1).value(self.m_attIndex) +
         trainInstances.instance(splitIndex).value(self.m_attIndex)) / 2
     # If rounding made the midpoint equal to the upper value, fall back to
     # the lower value.
     if self.m_splitPoint == trainInstances.instance(splitIndex + 1).value(
             self.m_attIndex):
         self.m_splitPoint = trainInstances.instance(splitIndex).value(
             self.m_attIndex)

     # Rebuild the two-bag distribution for the chosen split.
     self.m_distribution = Distribution(2, trainInstances.numClasses())
     self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
     self.m_distribution.addRange(1, trainInstances, splitIndex + 1,
                                  firstMiss)
     self.m_gainRatio = self.gainRatioCrit.splitCritValue(
         self.m_distribution, self.m_sumOfWeights, self.m_infoGain)
Example #3
0
 def splitCritValue(self,
                    bags: Distribution,
                    totalNoInst: float = None,
                    numerator: float = None):
     """Evaluate the gain-ratio criterion; behaviour depends on the arguments.

     * ``numerator`` given: returns ``numerator`` divided by
       ``self.splitEnt(bags, totalNoInst) / totalNoInst``, or 0 when that
       ``splitEnt`` value equals 0.
     * neither optional argument given: returns
       ``splitEnt(bags) / (oldEnt(bags) - newEnt(bags))``, or infinity when
       either term equals 0.
     * only ``totalNoInst`` given: returns a value built from ``lnFunc`` of
       the bag weights, of the weight not covered by ``bags`` and of
       ``totalNoInst``, divided by ``log(2)``.
     """
     if numerator is not None:
         splitInfo = self.splitEnt(bags, totalNoInst)
         if Utils.equal(splitInfo, 0):
             return 0
         splitInfo /= totalNoInst
         return numerator / splitInfo

     if totalNoInst is None:
         gain = self.oldEnt(bags) - self.newEnt(bags)
         if Utils.equal(gain, 0):
             return float('inf')
         splitInfo = self.splitEnt(bags)
         if Utils.equal(splitInfo, 0):
             return float('inf')
         return splitInfo / gain

     # Only totalNoInst was supplied.
     acc = 0
     uncovered = totalNoInst - bags.total()
     if Utils.gr(bags.total(), 0):
         for bagIdx in range(bags.numBags()):
             acc = acc - self.lnFunc(bags.perBag(bagIdx))
         acc = acc - self.lnFunc(uncovered)
         acc = acc + self.lnFunc(totalNoInst)
     return acc / math.log(2)
Example #4
0
 def selectModel(self, data: Instances, test: Instances = None):
     """Pick the split model to use for ``data``.

     ``test`` is ignored: when given, the call is simply repeated with
     ``data`` alone.

     A ``NoSplit`` model is returned when the total weight is below
     ``2 * m_minNoObj``, when all weight belongs to one class, when no
     attribute yields a valid ``C45Split``, or when the best gain ratio
     equals 0. Otherwise the returned model is the ``C45Split`` with the
     highest gain ratio among those whose information gain is at least
     the average gain (minus 1e-3).
     """
     if test is not None:
         return self.selectModel(data)

     classDist = Distribution(data)
     fallback = NoSplit(classDist)
     if Utils.gr(2 * self.m_minNoObj, classDist.total()) or Utils.equal(
             classDist.total(), classDist.perClass(classDist.maxClass())):
         return fallback

     # True unless some attribute is numeric or has fewer values than 30%
     # of the instances in m_allData (only checked when m_allData is set).
     allManyValued = self.m_allData is None or not any(
         attr.isNumeric() or Utils.gr(0.3 * self.m_allData.numInstances(),
                                      attr.numValues())
         for attr in data.enumerateAttributes())

     candidates = [None] * data.numAttributes()  # type: List[C45Split]
     totalWeight = data.sumOfWeight()
     gainSum = usable = 0
     for idx in range(data.numAttributes()):
         if idx == data.classIndex():
             candidates[idx] = None
             continue
         candidates[idx] = C45Split(idx, self.m_minNoObj, totalWeight,
                                    self.m_useMDLcorrection)
         candidates[idx].buildClassifer(data)
         if not candidates[idx].checkModel():
             continue
         if self.m_allData is not None:
             attr = data.attribute(idx)
             counted = attr.isNumeric() or (allManyValued or Utils.gr(
                 0.3 * self.m_allData.numInstances(), attr.numValues()))
             if not counted:
                 continue
         gainSum = gainSum + candidates[idx].infoGain()
         usable += 1

     if usable == 0:
         return fallback
     meanGain = gainSum / usable

     topRatio = 0
     winner = None
     for idx in range(data.numAttributes()):
         if idx == data.classIndex():
             continue
         split = candidates[idx]
         if not split.checkModel():
             continue
         if split.infoGain() >= meanGain - 1e-3 and Utils.gr(
                 split.gainRatio(), topRatio):
             winner = split
             topRatio = split.gainRatio()
     if Utils.equal(topRatio, 0):
         return fallback

     # Fold the instances with a missing value for the chosen attribute
     # into the winning model's distribution.
     winner.distribution().addInstWithUnknown(data, winner.attIndex())
     if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
         winner.setSplitPoint(self.m_allData)
     return winner
Example #5
0
 def splitCritValue(self,
                    bags: Distribution,
                    totalNoInst: float = None,
                    oldEnt: float = None):
     """Evaluate the information-gain criterion for ``bags``.

     Without ``totalNoInst`` the result is
     ``bags.total() / (oldEnt(bags) - newEnt(bags))``, or infinity when
     that difference equals 0.

     With ``totalNoInst`` the entropy difference is scaled by the fraction
     of ``totalNoInst`` covered by ``bags`` and divided by
     ``bags.total()``; 0 is returned when the scaled difference equals 0.
     A supplied ``oldEnt`` replaces the ``self.oldEnt(bags)`` term.
     """
     if totalNoInst is None:
         gain = self.oldEnt(bags) - self.newEnt(bags)
         if Utils.equal(gain, 0):
             return float("inf")
         return bags.total() / gain

     uncovered = totalNoInst - bags.total()
     uncoveredRate = uncovered / totalNoInst
     baseEnt = oldEnt if oldEnt is not None else self.oldEnt(bags)
     gain = (1 - uncoveredRate) * (baseEnt - self.newEnt(bags))
     return 0 if Utils.equal(gain, 0) else gain / bags.total()
Example #6
0
 def maxBag(self):
     """Return the index of the heaviest entry of ``m_perBag``.

     The running maximum starts at 0 and the comparison is "greater or
     equal" (per ``Utils``), so on ties the later index wins. Returns -1
     when no entry passes that comparison, e.g. when ``m_perBag`` is empty.
     """
     heaviest = 0
     heaviestIdx = -1
     for idx, weight in enumerate(self.m_perBag):
         if Utils.gr(weight, heaviest) or Utils.equal(weight, heaviest):
             heaviest, heaviestIdx = weight, idx
     return heaviestIdx
Example #7
0
 def check(self, minNoObj: float):
     """Return True when at least two entries of ``m_perBag`` reach ``minNoObj``.

     "Reach" means greater than or equal to, as decided by ``Utils``.
     Scanning stops as soon as the second such entry is found.
     """
     bigEnough = 0
     for weight in self.m_perBag:
         if not (Utils.gr(weight, minNoObj) or Utils.equal(weight, minNoObj)):
             continue
         bigEnough += 1
         if bigEnough > 1:
             return True
     return False
 def prune(self):
     """Prune the subtree rooted at this node, sons first.

     Nothing happens for a leaf. Otherwise every son is pruned, and if
     ``errorsForTree()`` is greater than or equal to ``errorsForLeaf()``
     (as decided by ``Utils``) the node is collapsed into a leaf: its sons
     are dropped and its local model is replaced by a ``NoSplit`` built
     from the current model's distribution.
     """
     if self.m_isLeaf:
         return

     for i in range(len(self.m_sons)):
         self.son(i).prune()

     # Evaluate each error figure once; both are reused in the comparison.
     treeErrors = self.errorsForTree()
     leafErrors = self.errorsForLeaf()
     if Utils.gr(treeErrors, leafErrors) or Utils.equal(
             treeErrors, leafErrors):
         self.m_sons = None
         # Must be True, not None: a falsy flag makes later prune() /
         # errorsForTree() calls treat this node as internal and iterate
         # over m_sons, which has just been set to None.
         self.m_isLeaf = True
         self.m_localModel = NoSplit(self.localModel().distribution())
Example #9
0
 def prob(self, classIndex: int, intIndex: int = None):
     """Return the relative weight of class ``classIndex``.

     Without ``intIndex`` the ratio is taken over the whole distribution
     (``m_perClass[classIndex] / totaL``), or 0 when ``totaL`` equals 0.
     With ``intIndex`` the ratio is taken inside that bag; when the bag's
     weight is not greater than 0 the whole-distribution value is
     returned instead.
     """
     if intIndex is None:
         if Utils.equal(self.totaL, 0):
             return 0
         return self.m_perClass[classIndex] / self.totaL

     if not Utils.gr(self.m_perBag[intIndex], 0):
         # Empty bag: fall back to the overall class proportion.
         return self.prob(classIndex)
     inBag = self.m_perClassPerBag[intIndex][classIndex]
     return inBag / self.m_perBag[intIndex]
Example #10
0
 def newDistribution(self, data: Instances):
     """Recompute the distributions of this subtree from ``data``.

     The local model's distribution is reset with ``data`` and ``data``
     becomes ``m_train``. For an internal node the data is split by the
     local model and each part is passed on to the matching son. For a
     leaf, ``m_isEmpty`` is cleared when ``data`` has non-zero weight.
     """
     self.localModel().resetDistribution(data)
     self.m_train = data

     if self.m_isLeaf:
         if not Utils.equal(data.sumOfWeight(), 0):
             self.m_isEmpty = False
         return

     parts = self.localModel().split(data)
     for sonIdx in range(len(self.m_sons)):
         self.son(sonIdx).newDistribution(parts[sonIdx])
 def errorsForTree(self):
     """Return the error count of the subtree rooted at this node.

     A leaf reports ``errorsForLeaf()``. For an internal node the errors
     of all branches are summed. A branch whose bag in the local model's
     distribution has zero weight contributes, from ``m_test``, the bag
     weight minus the weight of the local distribution's ``maxClass()``
     in that bag; every other branch contributes its son's
     ``errorsForTree()``.
     """
     if self.m_isLeaf:
         return self.errorsForLeaf()

     total = 0
     for branch in range(len(self.m_sons)):
         localDist = self.localModel().distribution()
         if not Utils.equal(localDist.perBag(branch), 0):
             total += self.son(branch).errorsForTree()
             continue
         # Branch with zero weight in the local model's distribution:
         # take the error count for this bag from m_test instead.
         total += self.m_test.perBag(branch) - self.m_test.perClassPerBag(
             branch, localDist.maxClass())
     return total
Example #12
0
    def buildTree(self,
                  data: Instances,
                  keepData: bool,
                  test: Instances = None):
        """Build the tree node for ``data`` and create its sons.

        A split model is chosen through ``m_toSelectModel``. When it has
        more than one subset the data is split and one son per subset is
        obtained from ``getNewTree``; otherwise the node becomes a leaf
        and is flagged empty when ``data`` carries no weight.

        Args:
            data: training instances for this node.
            keepData: when true, ``data`` is stored in ``m_train``.
            test: optional second instance set. When given it is passed to
                model selection, summarised in ``m_test`` and split
                alongside ``data``; otherwise ``m_test`` is set to None.
        """
        if keepData:
            self.m_train = data
        self.m_isLeaf = False
        self.m_isEmpty = False
        self.m_sons = None

        withTest = test is not None
        if withTest:
            self.m_localModel = self.m_toSelectModel.selectModel(data, test)
            self.m_test = Distribution(test, self.m_localModel)
        else:
            self.m_test = None
            self.m_localModel = self.m_toSelectModel.selectModel(data)

        if self.m_localModel.numSubsets() <= 1:
            # No usable split: this node is a leaf.
            self.m_isLeaf = True
            if Utils.equal(data.sumOfWeight(), 0):
                self.m_isEmpty = True
            return

        trainParts = self.m_localModel.split(data)
        testParts = self.m_localModel.split(test) if withTest else None
        self.m_sons = []
        for subset in range(self.m_localModel.numSubsets()):
            if withTest:
                son = self.getNewTree(trainParts[subset], testParts[subset])
            else:
                son = self.getNewTree(trainParts[subset])
            self.m_sons.append(son)
            # Drop the references to the partitions once the son exists.
            trainParts[subset] = None
            if withTest:
                testParts[subset] = None
Example #13
0
 def addInstWithUnknown(self, source: Instances, attIndex: int):
     """Add the instances of ``source`` whose ``attIndex`` value is missing.

     Each such instance adds its full weight to its class total and to
     ``totaL``, and is spread over the bags in proportion to the bag
     weights as they were before this call (uniformly when ``totaL``
     equalled 0).
     """
     bagCount = len(self.m_perBag)
     # Shares are fixed up front, so they are unaffected by the updates below.
     shares = [
         1 / bagCount if Utils.equal(self.totaL, 0) else bagWeight / self.totaL
         for bagWeight in self.m_perBag
     ]

     for inst in source:
         if not inst.isMissing(attIndex):
             continue
         cls = int(inst.classValue())
         instWeight = inst.weight()
         self.m_perClass[cls] = self.m_perClass[cls] + instWeight
         self.totaL = self.totaL + instWeight
         for bag in range(bagCount):
             part = shares[bag] * instWeight
             bagRow = self.m_perClassPerBag[bag]
             bagRow[cls] = bagRow[cls] + part
             self.m_perBag[bag] = self.m_perBag[bag] + part
Example #14
0
 def handleEnumeratedAttribute(self, instances: Instances):
     """Look for the best one-value split on attribute ``m_attIndex``.

     A distribution with one bag per attribute value is built from the
     instances whose value is not missing and stored in
     ``m_distribution``. For every value whose bag reaches ``m_minNoObj``
     a second distribution is derived via ``Distribution(dist, value)``;
     if it passes ``check(m_minNoObj)`` its information gain and gain
     ratio are computed. The candidate for value 0, or any candidate with
     a strictly greater gain ratio than the current ``m_gainRatio``, is
     recorded in ``m_gainRatio``, ``m_infoGain``, ``m_splitPoint`` and
     ``m_distribution``.
     """
     attIdx = self.m_attIndex
     valueCount = instances.attribute(attIdx).numValues()
     perValue = Distribution(valueCount, instances.numClasses())
     for inst in instances:
         if inst.isMissing(attIdx):
             continue
         perValue.add(int(inst.value(attIdx)), inst)
     self.m_distribution = perValue

     for value in range(valueCount):
         bagWeight = perValue.perBag(value)
         if not (Utils.gr(bagWeight, self.m_minNoObj)
                 or Utils.equal(bagWeight, self.m_minNoObj)):
             continue
         candidate = Distribution(perValue, value)
         if not candidate.check(self.m_minNoObj):
             continue
         self.m_numSubsets = 2
         infoGain = self.infoGainCrit.splitCritValue(candidate,
                                                     self.m_sumOfWeights)
         gainRatio = self.gainRatioCrit.splitCritValue(
             candidate, self.m_sumOfWeights, infoGain)
         if value == 0 or Utils.gr(gainRatio, self.m_gainRatio):
             self.m_gainRatio = gainRatio
             self.m_infoGain = infoGain
             self.m_splitPoint = value
             self.m_distribution = candidate
Example #15
0
 def getEstimatedErrorsForDistribution(self, theDistribution: Distribution):
     """Return the estimated error count for ``theDistribution``.

     The result is 0 for a distribution with zero total weight; otherwise
     it is the number of incorrect instances plus the extra amount
     returned by ``Utils.addErrs`` for the total, that number and
     ``m_CF``.
     """
     if not Utils.equal(theDistribution.total(), 0):
         wrong = theDistribution.numIncorrect()
         return wrong + Utils.addErrs(theDistribution.total(), wrong,
                                      self.m_CF)
     return 0