Example no. 1
0
 def splitCritValue(self,
                    bags: Distribution,
                    totalNoInst: float = None,
                    numerator: float = None):
     """Gain-ratio style split criterion, dispatching on which optional
     arguments were supplied (mirrors the overloaded Java originals).

     - neither extra given: split entropy / information gain (inverted
       ratio, so smaller is better; infinity when either term is ~0);
     - only totalNoInst given: split entropy corrected for instances
       with unknown attribute values;
     - both given: gain ratio using the precomputed *numerator*.
     """
     if totalNoInst is None and numerator is None:
         gain = self.oldEnt(bags) - self.newEnt(bags)
         if Utils.equal(gain, 0):
             return float('inf')
         split_ent = self.splitEnt(bags)
         if Utils.equal(split_ent, 0):
             return float('inf')
         return split_ent / gain
     if numerator is None:
         # Missing-value corrected split entropy.
         acc = 0
         unknown = totalNoInst - bags.total()
         if Utils.gr(bags.total(), 0):
             for bag in range(bags.numBags()):
                 acc = acc - self.lnFunc(bags.perBag(bag))
             acc = acc - self.lnFunc(unknown)
             acc = acc + self.lnFunc(totalNoInst)
     # convert natural log to log base 2
         return acc / math.log(2)
     # Both extras supplied: gain ratio with precomputed numerator.
     split_ent = self.splitEnt(bags, totalNoInst)
     if Utils.equal(split_ent, 0):
         return 0
     split_ent /= totalNoInst
     return numerator / split_ent
Example no. 2
0
 def splitEnt(self, bags: Distribution, totalnoInst: float = None):
     """Entropy of the split itself.

     Without *totalnoInst* this defers to the parent implementation;
     with it, the missing-value corrected variant is computed, treating
     the unknown-valued weight as an extra bag.
     """
     if totalnoInst is None:
         return super().splitEnt(bags)
     acc = 0
     missing = totalnoInst - bags.total()
     if Utils.gr(bags.total(), 0):
         for bag in range(bags.numBags()):
             acc = acc - self.lnFunc(bags.perBag(bag))
         acc = acc - self.lnFunc(missing)
         acc = acc + self.lnFunc(totalnoInst)
     # lnFunc works in natural log; rescale to bits.
     return acc / math.log(2)
Example no. 3
0
 def selectModel(self, data: Instances, test: Instances = None):
     """Select the best C4.5 split model for *data*.

     The *test* argument is accepted for interface compatibility but is
     ignored: when given, the call is forwarded without it.  Returns a
     NoSplit model when the data is too small, pure, or no candidate
     split has a positive gain ratio.
     """
     if test is not None:
         return self.selectModel(data)
     # Assume all attributes are many-valued nominals until one numeric
     # or few-valued attribute is seen.
     multiVal = True
     averageInfoGain = validModels = 0
     checkDistribution = Distribution(data)
     noSplitModel = NoSplit(checkDistribution)
     # Too little data for two branches, or all weight in one class:
     # do not split at all.
     if Utils.gr(2*self.m_minNoObj, checkDistribution.total()) or \
         Utils.equal(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass())):
         return noSplitModel
     if self.m_allData is not None:
         for attr in data.enumerateAttributes():
             if attr.isNumeric() or Utils.gr(
                     0.3 * self.m_allData.numInstances(), attr.numValues()):
                 multiVal = False
                 break
     currentModel = [None] * data.numAttributes()  #type:List[C45Split]
     sumOfWeights = data.sumOfWeight()
     # Build one candidate split per non-class attribute and accumulate
     # the average info gain over the admissible candidates.
     for i in range(data.numAttributes()):
         if i != data.classIndex():
             currentModel[i] = C45Split(i, self.m_minNoObj, sumOfWeights,
                                        self.m_useMDLcorrection)
             currentModel[i].buildClassifer(data)
             if currentModel[i].checkModel():
                 if self.m_allData is not None:
                     # Only count attributes that are numeric, or nominal
                     # with not too many values (unless all are such).
                     if data.attribute(i).isNumeric() or \
                         (multiVal or Utils.gr(0.3*self.m_allData.numInstances(), data.attribute(i).numValues())):
                         averageInfoGain = averageInfoGain + currentModel[
                             i].infoGain()
                         validModels += 1
                 else:
                     averageInfoGain = averageInfoGain + currentModel[
                         i].infoGain()
                     validModels += 1
         else:
             currentModel[i] = None
     if validModels == 0:
         return noSplitModel
     averageInfoGain = averageInfoGain / validModels
     minResult = 0
     # Among candidates with at-least-(roughly)-average info gain, pick
     # the one with the highest gain ratio.
     for i in range(data.numAttributes()):
         if i != data.classIndex() and currentModel[i].checkModel():
             if currentModel[i].infoGain() >= averageInfoGain-1e-3 and\
                 Utils.gr(currentModel[i].gainRatio(), minResult):
                 bestModel = currentModel[i]
                 minResult = currentModel[i].gainRatio()
     # minResult == 0 means no candidate was ever selected above, so
     # bestModel is guaranteed to be bound past this guard.
     if Utils.equal(minResult, 0):
         return noSplitModel
     # Spread instances with unknown attribute values across the bags.
     bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
     if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
         bestModel.setSplitPoint(self.m_allData)
     return bestModel
Example no. 4
0
 def splitCritValue(self, bags: Distribution, totalNoInst: float = None, oldEnt: float = None):
     """Information-gain split criterion (inverted, so smaller is better).

     Without *totalNoInst*: total weight divided by the information
     gain (infinity when the gain is ~0).  With it: the gain corrected
     for the fraction of unknown-valued instances, divided by the bag
     total; *oldEnt*, when given, replaces the recomputed old entropy.
     """
     if totalNoInst is None:
         gain = self.oldEnt(bags) - self.newEnt(bags)
         if Utils.equal(gain, 0):
             return float("inf")
         return bags.total() / gain
     # Fraction of the total weight whose attribute value is unknown.
     unknownRate = (totalNoInst - bags.total()) / totalNoInst
     if oldEnt is not None:
         gain = oldEnt - self.newEnt(bags)
     else:
         gain = self.oldEnt(bags) - self.newEnt(bags)
     gain = (1 - unknownRate) * gain
     if Utils.equal(gain, 0):
         return 0
     return gain / bags.total()
class PruneableClassifierTree(ClassifierTree):
    """Classifier tree pruned with reduced-error pruning on a hold-out set.

    The data is stratified into ``numSets`` folds; the first
    ``numSets - 1`` folds grow the tree and the last fold is the
    hold-out set used to decide which subtrees to collapse.
    """

    def __init__(self, toSelectLocModel: ModelSelection, pruneTree: bool,
                 num: int, cleanup: bool, seed: int):
        super().__init__(toSelectLocModel)
        self.pruneTheTree = pruneTree  # prune after building?
        self.numSets = num             # number of folds; last one is the prune set
        self.m_cleanup = cleanup       # drop training data after building?
        self.m_seed = seed             # seed for the stratified fold split

    def buildClassifier(self, data: Instances):
        """Build (and optionally prune) the tree from *data*."""
        data = Instances(data)
        data.deleteWithMissingClass()
        data.stratify(self.numSets)
        self.buildTree(
            data.trainCV(self.numSets, self.numSets - 1, self.m_seed),
            not self.m_cleanup, data.testCV(self.numSets, self.numSets - 1))
        if self.pruneTheTree:
            self.prune()
        if self.m_cleanup:
            self.cleanup(Instances(data, 0))

    def getNewTree(self, data: Instances, test: Instances = None):
        """Return a freshly built subtree for *data*/*test* sharing this
        tree's configuration."""
        newTree = PruneableClassifierTree(self.m_toSelectModel,
                                          self.pruneTheTree, self.numSets,
                                          self.m_cleanup, self.m_seed)
        newTree.buildTree(data, not self.m_cleanup, test)
        return newTree

    def buildTree(self,
                  data: Instances,
                  keepData: bool,
                  test: Instances = None):
        """Recursively build the tree from *data*, tracking the hold-out
        distribution of *test* at each node."""
        if keepData:
            self.m_train = data
        self.m_isLeaf = False
        self.m_isEmpty = False
        self.m_sons = None
        self.m_localModel = self.m_toSelectModel.selectModel(data, test)
        self.m_test = Distribution(test, self.m_localModel)
        if self.m_localModel.numSubsets() > 1:
            localTrain = self.m_localModel.split(data)
            localTest = self.m_localModel.split(test)
            self.m_sons = []
            # BUG FIX: the original looped over range(len(self.m_sons))
            # immediately after setting m_sons = [], so the loop body
            # never ran and no subtree was ever created.  Iterate over
            # the number of subsets of the local model instead.
            for i in range(self.m_localModel.numSubsets()):
                self.m_sons.append(self.getNewTree(localTrain[i],
                                                   localTest[i]))
                localTrain[i] = None  # release the split data early
                localTest[i] = None
        else:
            self.m_isLeaf = True
            if Utils.equal(data.sumOfWeight(), 0):
                self.m_isEmpty = True

    def prune(self):
        """Bottom-up reduced-error pruning: collapse a subtree into a
        leaf whenever the leaf does no worse on the hold-out set."""
        if not self.m_isLeaf:
            for i in range(len(self.m_sons)):
                self.son(i).prune()
            # Prune when errorsForTree() >= errorsForLeaf().
            if Utils.gr(self.errorsForTree(),
                        self.errorsForLeaf()) or Utils.equal(
                            self.errorsForTree(), self.errorsForLeaf()):
                self.m_sons = None
                # BUG FIX: the original assigned None here; a pruned node
                # must be flagged as a leaf, otherwise later traversals
                # treat it as internal and fail on m_sons being None.
                self.m_isLeaf = True
                self.m_localModel = NoSplit(self.localModel().distribution())

    def errorsForTree(self):
        """Hold-out-set error weight of the subtree rooted here."""
        errors = 0
        if self.m_isLeaf:
            return self.errorsForLeaf()
        for i in range(len(self.m_sons)):
            if Utils.equal(self.localModel().distribution().perBag(i), 0):
                # Empty training bag: count all hold-out weight in that
                # bag except the overall majority class as errors.
                errors += self.m_test.perBag(i) - self.m_test.perClassPerBag(
                    i,
                    self.localModel().distribution().maxClass())
            else:
                errors += self.son(i).errorsForTree()
        return errors

    def errorsForLeaf(self):
        """Hold-out-set error weight if this node predicted the training
        majority class as a leaf."""
        return self.m_test.total() - self.m_test.perClass(
            self.localModel().distribution().maxClass())
Example no. 6
0
class C45Split(ClassifierSplitModel):
    """A C4.5-style split on one attribute.

    Nominal attributes get one branch per value; numeric attributes get
    a binary split ``<= splitPoint`` / ``> splitPoint``.
    """

    def __init__(self, attIndex: int, minNoObj: int, sumOfWeights: float,
                 useMDLcorrection: bool):
        """Create an (unbuilt) split on attribute *attIndex*.

        attIndex -- index of the attribute the split tests
        minNoObj -- minimum number of objects per branch
        sumOfWeights -- total weight of the training instances
        useMDLcorrection -- apply the MDL correction to numeric splits
        """
        super().__init__()
        self.m_distribution = None  #type:Distribution
        self.m_numSubsets = 0  # stays 0 until a valid split is found
        self.m_attIndex = attIndex
        self.m_minNoObj = minNoObj
        self.m_sumOfWeights = sumOfWeights
        self.m_useMDLcorrection = useMDLcorrection
        self.m_splitPoint = float("inf")  # threshold for numeric splits
        self.m_infoGain = 0
        self.m_gainRatio = 0

    def buildClassifer(self, instances: Instances):
        """Find the best split of *instances* on the configured attribute.

        (The misspelled method name is kept because callers use it.)
        """
        self.m_numSubsets = 0
        self.m_splitPoint = float("inf")
        self.m_infoGain = 0
        self.m_gainRatio = 0

        # FIX: removed leftover debug print statements from both branches.
        if instances.attribute(self.m_attIndex).isNominal():
            # One branch per attribute value.
            self.m_complexityIndex = instances.attribute(
                self.m_attIndex).numValues()
            self.m_index = self.m_complexityIndex
            self.handleEnumeratedAttribute(instances)
        else:
            # Binary split: sort by the attribute, then scan thresholds.
            self.m_complexityIndex = 2
            self.m_index = 0
            instances.sort(instances.attribute(self.m_attIndex))
            self.handleNumericAttribute(instances)

    def handleEnumeratedAttribute(self, instances: Instances):
        """Evaluate the single possible split of a nominal attribute."""
        self.m_distribution = Distribution(self.m_complexityIndex,
                                           instances.numClasses())
        for inst in instances:
            if not inst.isMissing(self.m_attIndex):
                self.m_distribution.add(int(inst.value(self.m_attIndex)), inst)
        # Accept the split only if enough bags reach minNoObj weight.
        if self.m_distribution.check(self.m_minNoObj):
            self.m_numSubsets = self.m_complexityIndex
            self.m_infoGain = self.infoGainCrit.splitCritValue(
                self.m_distribution, self.m_sumOfWeights)
            self.m_gainRatio = self.gainRatioCrit.splitCritValue(
                self.m_distribution, self.m_sumOfWeights, self.m_infoGain)

    def handleNumericAttribute(self, trainInstances: Instances):
        """Scan candidate thresholds of the (pre-sorted) numeric attribute
        and keep the cut with the highest information gain."""
        nextIdx = 1  # renamed from `next` to avoid shadowing the builtin
        last = 0
        splitIndex = -1
        self.m_distribution = Distribution(2, trainInstances.numClasses())
        # All non-missing instances start in the upper (">") bag; the
        # data is sorted, so missing values come last and stop the scan.
        i = 0
        for inst in trainInstances:
            if inst.isMissing(self.m_attIndex):
                break
            self.m_distribution.add(1, inst)
            i += 1
        firstMiss = i
        # Minimum weight per bag: 10% of an average class, clamped
        # below by m_minNoObj and above by 25.
        minSplit = 0.1 * self.m_distribution.total(
        ) / trainInstances.numClasses()
        if Utils.gr(self.m_minNoObj, minSplit) or Utils.equal(
                minSplit, self.m_minNoObj):
            minSplit = self.m_minNoObj
        elif Utils.gr(minSplit, 25):
            minSplit = 25
        if Utils.gr(2 * minSplit, firstMiss):
            return  # not enough data for two viable bags
        defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)
        while nextIdx < firstMiss:
            # Only cut between genuinely different attribute values.
            if trainInstances.instance(nextIdx - 1).value(
                    self.m_attIndex) + 1e-5 < trainInstances.instance(
                        nextIdx).value(self.m_attIndex):
                # Move instances [last, nextIdx) into the lower bag.
                self.m_distribution.shiftRange(1, 0, trainInstances, last,
                                               nextIdx)
                if (Utils.gr(self.m_distribution.perBag(0), minSplit) or Utils.equal(self.m_distribution.perBag(0), minSplit))\
                        and (Utils.gr(self.m_distribution.perBag(1), minSplit) or Utils.equal(self.m_distribution.perBag(1), minSplit)):
                    currentInfoGain = self.infoGainCrit.splitCritValue(
                        self.m_distribution, self.m_sumOfWeights, defaultEnt)
                    if Utils.gr(currentInfoGain, self.m_infoGain):
                        self.m_infoGain = currentInfoGain
                        splitIndex = nextIdx - 1
                    self.m_index += 1
                last = nextIdx
            nextIdx += 1
        if self.m_index == 0:
            return  # no admissible cut point found
        if self.m_useMDLcorrection:
            # Penalize for having chosen among m_index candidate cuts.
            self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                                 self.m_sumOfWeights)
        if Utils.gr(0, self.m_infoGain) or Utils.equal(0, self.m_infoGain):
            return
        self.m_numSubsets = 2
        # Threshold is the midpoint between the best cut's neighbours,
        # snapped down to an actual value if the midpoint rounds up.
        self.m_splitPoint = (
            trainInstances.instance(splitIndex + 1).value(self.m_attIndex) +
            trainInstances.instance(splitIndex).value(self.m_attIndex)) / 2
        if self.m_splitPoint == trainInstances.instance(splitIndex + 1).value(
                self.m_attIndex):
            self.m_splitPoint = trainInstances.instance(splitIndex).value(
                self.m_attIndex)
        # Rebuild the final two-bag distribution around the chosen cut.
        self.m_distribution = Distribution(2, trainInstances.numClasses())
        self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
        self.m_distribution.addRange(1, trainInstances, splitIndex + 1,
                                     firstMiss)
        self.m_gainRatio = self.gainRatioCrit.splitCritValue(
            self.m_distribution, self.m_sumOfWeights, self.m_infoGain)

    def whichSubset(self, instance: Instance):
        """Branch index *instance* goes down, or -1 if its value is missing."""
        if instance.isMissing(self.m_attIndex):
            return -1
        if instance.attribute(self.m_attIndex).isNominal():
            return int(instance.value(self.m_attIndex))
        elif instance.value(self.m_attIndex) <= self.m_splitPoint:
            return 0
        return 1

    def rightSide(self, index: int, data: Instances):
        """Text describing branch *index* (e.g. " = red" or " <= 1.5")."""
        text = ""
        if data.attribute(self.m_attIndex).isNominal():
            text += " = " + data.attribute(self.m_attIndex).value(index)
        elif index == 0:
            text += " <= " + Utils.doubleToString(self.m_splitPoint, 6)
        else:
            text += " > " + Utils.doubleToString(self.m_splitPoint, 6)
        return text

    def leftSide(self, data: Instances):
        """Name of the attribute being tested."""
        return data.attribute(self.m_attIndex).name()

    def attIndex(self):
        """Index of the attribute this split tests."""
        return self.m_attIndex

    def splitPoint(self):
        """Threshold used for numeric splits."""
        return self.m_splitPoint

    def infoGain(self):
        """Information gain of the chosen split."""
        return self.m_infoGain

    def gainRatio(self):
        """Gain ratio of the chosen split."""
        return self.m_gainRatio

    def weights(self, instance: Instance):
        """Per-branch weights for an instance with a missing value
        (proportional to bag weights), or None when the value is known."""
        if instance.isMissing(self.m_attIndex):
            weights = []
            for i in range(self.m_numSubsets):
                weights.append(
                    self.m_distribution.perBag(i) /
                    self.m_distribution.total())
            return weights
        return None

    def setSplitPoint(self, allInstances: Instances):
        """Snap the numeric split point to the largest actual attribute
        value in *allInstances* that does not exceed it."""
        newSplitPoint = float("-inf")
        if allInstances.attribute(
                self.m_attIndex).isNumeric() and self.m_numSubsets > 1:
            for i in range(allInstances.numInstances()):
                instance = allInstances.instance(i)
                tempValue = instance.value(self.m_attIndex)
                if not Utils.isMissingValue(tempValue):
                    if tempValue > newSplitPoint and tempValue <= self.m_splitPoint:
                        newSplitPoint = tempValue
            self.m_splitPoint = newSplitPoint

    def resetDistribution(self, data: Instances):
        """Recompute the distribution over *data*, spreading instances
        with unknown attribute values across the bags."""
        insts = Instances(data, data.numInstances())
        for i in range(data.numInstances()):
            if self.whichSubset(data.instance(i)) > -1:
                insts.add(data.instance(i))
        newD = Distribution(insts, self)
        newD.addInstWithUnknown(data, self.m_attIndex)
        self.m_distribution = newD
 def oldEnt(self, bags: Distribution):
     """Entropy (in bits) of the class distribution before splitting."""
     classTerms = 0
     for cls in range(bags.numClasses()):
         classTerms = classTerms + self.lnFunc(bags.perClass(cls))
     # lnFunc works in natural log; rescale to base 2.
     return (self.lnFunc(bags.total()) - classTerms) / math.log(2)
 def splitEnt(self, bags: Distribution):
     """Entropy (in bits) of the split itself, over the bag weights."""
     bagTerms = 0
     for bag in range(bags.numBags()):
         bagTerms = bagTerms + self.lnFunc(bags.perBag(bag))
     # lnFunc works in natural log; rescale to base 2.
     return (self.lnFunc(bags.total()) - bagTerms) / math.log(2)
Example no. 9
0
 def getEstimatedErrorsForDistribution(self, theDistribution: Distribution):
     """Misclassified weight plus the pessimistic (confidence-based)
     error correction for *theDistribution*; 0 for an empty one."""
     total = theDistribution.total()
     if Utils.equal(total, 0):
         return 0
     incorrect = theDistribution.numIncorrect()
     return incorrect + Utils.addErrs(total, incorrect, self.m_CF)