def prune(self):
    """Prune this subtree in place using C4.5's pessimistic error estimates.

    Children are pruned bottom-up first; then this node is either collapsed
    to a leaf, kept as-is, or (when subtree raising is enabled) replaced by
    its largest branch, whichever has the lowest estimated error.
    """
    if self.m_isLeaf:
        return

    # Prune every child first (bottom-up pruning).
    for child_idx in range(len(self.m_sons)):
        self.son(child_idx).prune()

    largest_idx = self.localModel().distribution().maxBag()
    if self.m_subtreeRaising:
        errors_largest = self.son(largest_idx).getEstimatedErrorsForBranch(
            self.m_train)
    else:
        # Subtree raising disabled: make the branch option unbeatable-bad.
        errors_largest = float("inf")

    errors_leaf = self.getEstimatedErrorsForDistribution(
        self.localModel().distribution())
    errors_tree = self.getEstimatedErrors()

    # ">= within the project's float tolerance" (gr-or-equal).
    def at_least(a, b):
        return Utils.gr(a, b) or Utils.equal(a, b)

    # +0.1 is the original C4.5 slack favouring simpler structures.
    if at_least(errors_tree + 0.1, errors_leaf) and \
            at_least(errors_largest + 0.1, errors_leaf):
        # A leaf is no worse than the subtree or its best branch: collapse.
        self.m_sons = None
        self.m_isLeaf = True
        self.m_localModel = NoSplit(self.localModel().distribution())
        return

    if at_least(errors_tree + 0.1, errors_largest):
        # Subtree raising: promote the largest branch into this node,
        # redistribute the training data, and prune the raised subtree again.
        branch = self.son(largest_idx)
        self.m_sons = branch.m_sons
        self.m_localModel = branch.localModel()
        self.m_isLeaf = branch.m_isLeaf
        self.newDistribution(self.m_train)
        self.prune()
def handleNumericAttribute(self, trainInstances: Instances):
    """Find the best binary split point on a numeric attribute.

    Scans candidate cut points between consecutive distinct values
    (instances are assumed sorted on m_attIndex, missing values last),
    keeps the split with the highest info gain, then records the split
    point, the resulting two-bag distribution, and the gain ratio.

    BUGFIX: removed a leftover debug statement `print("dfalut", defaultEnt)`
    that polluted stdout on every call. Also renamed the local `next`,
    which shadowed the builtin.
    """
    next_idx = 1
    last = 0
    splitIndex = -1

    # Tally all instances with a known value into bag 1 of a fresh
    # two-bag distribution; `firstMiss` marks where missing values start.
    self.m_distribution = Distribution(2, trainInstances.numClasses())
    i = 0
    for inst in trainInstances:
        if inst.isMissing(self.m_attIndex):
            break
        self.m_distribution.add(1, inst)
        i += 1
    firstMiss = i

    # Minimum weight per branch: clamped to [m_minNoObj, 25].
    minSplit = 0.1 * self.m_distribution.total() / trainInstances.numClasses()
    if Utils.gr(self.m_minNoObj, minSplit) or Utils.equal(
            minSplit, self.m_minNoObj):
        minSplit = self.m_minNoObj
    elif Utils.gr(minSplit, 25):
        minSplit = 25
    if Utils.gr(2 * minSplit, firstMiss):
        # Not enough known-value weight for two viable branches.
        return

    # Entropy before splitting; candidate splits are scored against it.
    defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)

    while next_idx < firstMiss:
        # Only cut between genuinely distinct attribute values.
        if trainInstances.instance(next_idx - 1).value(self.m_attIndex) + 1e-5 < \
                trainInstances.instance(next_idx).value(self.m_attIndex):
            # Move instances [last, next_idx) from bag 1 into bag 0.
            self.m_distribution.shiftRange(1, 0, trainInstances, last, next_idx)
            if (Utils.gr(self.m_distribution.perBag(0), minSplit)
                    or Utils.equal(self.m_distribution.perBag(0), minSplit)) \
                    and (Utils.gr(self.m_distribution.perBag(1), minSplit)
                         or Utils.equal(self.m_distribution.perBag(1), minSplit)):
                currentInfoGain = self.infoGainCrit.splitCritValue(
                    self.m_distribution, self.m_sumOfWeights, defaultEnt)
                if Utils.gr(currentInfoGain, self.m_infoGain):
                    self.m_infoGain = currentInfoGain
                    splitIndex = next_idx - 1
                self.m_index += 1
            last = next_idx
        next_idx += 1

    if self.m_index == 0:
        return

    if self.m_useMDLcorrection:
        # MDL correction for testing multiple cut points.
        self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                             self.m_sumOfWeights)
    if Utils.gr(0, self.m_infoGain) or Utils.equal(0, self.m_infoGain):
        return

    self.m_numSubsets = 2
    self.m_splitPoint = (
        trainInstances.instance(splitIndex + 1).value(self.m_attIndex) +
        trainInstances.instance(splitIndex).value(self.m_attIndex)) / 2
    # Guard against the midpoint rounding up to the next distinct value.
    if self.m_splitPoint == trainInstances.instance(splitIndex + 1).value(
            self.m_attIndex):
        self.m_splitPoint = trainInstances.instance(splitIndex).value(
            self.m_attIndex)

    # Rebuild the final distribution around the chosen cut point.
    self.m_distribution = Distribution(2, trainInstances.numClasses())
    self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
    self.m_distribution.addRange(1, trainInstances, splitIndex + 1, firstMiss)
    self.m_gainRatio = self.gainRatioCrit.splitCritValue(
        self.m_distribution, self.m_sumOfWeights, self.m_infoGain)
def splitCritValue(self, bags: Distribution, totalNoInst: float = None,
                   numerator: float = None):
    """Compute a split-criterion value, dispatching on the optional args.

    Mirrors the original overloads:
      * neither extra arg: split-entropy / info-gain (smaller is better);
        returns inf when either term vanishes.
      * only ``totalNoInst``: split entropy in bits, with an extra
        pseudo-bag for unknown-value instances.
      * both args: gain ratio computed from the supplied info-gain
        ``numerator``; 0 when the split entropy vanishes.
    """
    if totalNoInst is None and numerator is None:
        gain = self.oldEnt(bags) - self.newEnt(bags)
        if Utils.equal(gain, 0):
            return float('inf')  # degenerate split: nothing gained
        split_ent = self.splitEnt(bags)
        if Utils.equal(split_ent, 0):
            return float('inf')
        return split_ent / gain

    if numerator is None:
        # Split entropy including the unknown-value bag, converted to bits.
        unknown = totalNoInst - bags.total()
        acc = 0
        if Utils.gr(bags.total(), 0):
            for bag in range(bags.numBags()):
                acc = acc - self.lnFunc(bags.perBag(bag))
            acc = acc - self.lnFunc(unknown)
            acc = acc + self.lnFunc(totalNoInst)
        return acc / math.log(2)

    # Gain ratio from a precomputed info-gain numerator.
    split_ent = self.splitEnt(bags, totalNoInst)
    if Utils.equal(split_ent, 0):
        return 0
    split_ent /= totalNoInst
    return numerator / split_ent
def selectModel(self, data: Instances, test: Instances = None):
    """Select the best C4.5 split model for the data, or a NoSplit leaf.

    Builds a candidate C45Split per attribute, averages the info gain of
    the "useful" candidates, then returns the candidate with the best
    gain ratio among those whose info gain is at least (average - 1e-3).
    Falls back to a NoSplit model when no useful split exists.

    BUGFIX: candidates failing checkModel() are set to None in the first
    loop; the second loop previously dereferenced them unconditionally
    (`currentModel[i].checkModel()`), raising AttributeError. A None
    guard is now applied first.
    """
    if test is not None:
        # Test data plays no role in model selection.
        return self.selectModel(data)

    multiVal = True
    averageInfoGain = validModels = 0
    checkDistribution = Distribution(data)
    noSplitModel = NoSplit(checkDistribution)

    # Too little weight to split, or all weight already in one class.
    if Utils.gr(2 * self.m_minNoObj, checkDistribution.total()) or \
            Utils.equal(checkDistribution.total(),
                        checkDistribution.perClass(checkDistribution.maxClass())):
        return noSplitModel

    if self.m_allData is not None:
        # Data is "multi-valued" only if every attribute is nominal with
        # many values relative to the full dataset size.
        for attr in data.enumerateAttributes():
            if attr.isNumeric() or Utils.gr(
                    0.3 * self.m_allData.numInstances(), attr.numValues()):
                multiVal = False
                break

    currentModel = [None] * data.numAttributes()  # type: List[C45Split]
    sumOfWeights = data.sumOfWeight()
    for i in range(data.numAttributes()):
        if i != data.classIndex():
            currentModel[i] = C45Split(i, self.m_minNoObj, sumOfWeights,
                                       self.m_useMDLcorrection)
            currentModel[i].buildClassifer(data)
            if currentModel[i].checkModel():
                if self.m_allData is not None:
                    if data.attribute(i).isNumeric() or \
                            (multiVal or Utils.gr(
                                0.3 * self.m_allData.numInstances(),
                                data.attribute(i).numValues())):
                        averageInfoGain = averageInfoGain + \
                            currentModel[i].infoGain()
                        validModels += 1
                else:
                    averageInfoGain = averageInfoGain + \
                        currentModel[i].infoGain()
                    validModels += 1
            else:
                currentModel[i] = None

    if validModels == 0:
        return noSplitModel
    averageInfoGain = averageInfoGain / validModels

    minResult = 0
    for i in range(data.numAttributes()):
        # Guard against entries set to None above (failed checkModel);
        # the class attribute's slot is also None.
        if i != data.classIndex() and currentModel[i] is not None \
                and currentModel[i].checkModel():
            if currentModel[i].infoGain() >= averageInfoGain - 1e-3 and \
                    Utils.gr(currentModel[i].gainRatio(), minResult):
                bestModel = currentModel[i]
                minResult = currentModel[i].gainRatio()

    if Utils.equal(minResult, 0):
        return noSplitModel
    # Distribute missing-value instances over the winning split's bags.
    bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
    if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
        bestModel.setSplitPoint(self.m_allData)
    return bestModel
def splitCritValue(self, bags: Distribution, totalNoInst: float = None,
                   oldEnt: float = None):
    """Information-gain-based split criterion (smaller values are better).

    With no extra args: total weight divided by the raw info gain
    (inf when the gain vanishes). With ``totalNoInst`` (and optionally a
    precomputed ``oldEnt``): gain corrected for the unknown-value rate,
    divided by the bags' total weight (0 when the gain vanishes).
    """
    if totalNoInst is None:
        gain = self.oldEnt(bags) - self.newEnt(bags)
        if Utils.equal(gain, 0):
            return float("inf")  # worthless split: criterion is maximal
        return bags.total() / gain

    # Variant with unknown-value correction.
    unknown = totalNoInst - bags.total()
    unknownRate = unknown / totalNoInst
    base = oldEnt if oldEnt is not None else self.oldEnt(bags)
    gain = (1 - unknownRate) * (base - self.newEnt(bags))
    if Utils.equal(gain, 0):
        return 0
    return gain / bags.total()
def maxBag(self):
    """Return the index of the bag with the greatest total weight.

    Uses the project's tolerant >= comparison, so ties go to the LAST
    equally-heavy bag (preserved from the original). Returns -1 only if
    no bag reaches weight 0 under the tolerant comparison.

    FIX: renamed the local accumulator, which shadowed the builtin `max`.
    """
    best_weight = 0
    best_index = -1
    for i in range(len(self.m_perBag)):
        if Utils.gr(self.m_perBag[i], best_weight) or Utils.equal(
                self.m_perBag[i], best_weight):
            best_weight = self.m_perBag[i]
            best_index = i
    return best_index
def check(self, minNoObj: float):
    """Return True iff at least two bags each carry at least minNoObj weight.

    Uses the project's tolerant >= comparison; exits as soon as the
    second qualifying bag is found.
    """
    qualifying = 0
    for bag_weight in self.m_perBag:
        if Utils.gr(bag_weight, minNoObj) or Utils.equal(bag_weight, minNoObj):
            qualifying += 1
            if qualifying > 1:
                return True
    return False
def prune(self):
    """Prune the subtree against the held-out test distribution.

    Children are pruned first (bottom-up); this node is then collapsed to
    a leaf whenever the leaf's error on the test data is no worse than
    the subtree's.

    BUGFIX: the collapsed node previously set `self.m_isLeaf = None`.
    None is falsy, so the node still looked like an internal node while
    `m_sons` was None, crashing any later traversal. It is now True,
    consistent with the other prune() in this file.
    """
    if not self.m_isLeaf:
        for i in range(len(self.m_sons)):
            self.son(i).prune()
        if Utils.gr(self.errorsForTree(), self.errorsForLeaf()) or Utils.equal(
                self.errorsForTree(), self.errorsForLeaf()):
            self.m_sons = None
            self.m_isLeaf = True
            self.m_localModel = NoSplit(self.localModel().distribution())
def prob(self, classIndex: int, intIndex: int = None):
    """Probability of classIndex, overall or within bag intIndex.

    With intIndex given, an empty bag falls back to the overall class
    probability. Overall probability is 0 when total weight is 0.
    """
    if intIndex is None:
        if Utils.equal(self.totaL, 0):
            return 0
        return self.m_perClass[classIndex] / self.totaL
    if Utils.gr(self.m_perBag[intIndex], 0):
        return (self.m_perClassPerBag[intIndex][classIndex] /
                self.m_perBag[intIndex])
    # Empty bag: fall back to the bag-independent estimate.
    return self.prob(classIndex)
def newDistribution(self, data: Instances):
    """Recompute the distributions stored in this subtree from `data`."""
    self.localModel().resetDistribution(data)
    self.m_train = data
    if self.m_isLeaf:
        # A leaf that receives non-zero weight is no longer empty.
        if not Utils.equal(data.sumOfWeight(), 0):
            self.m_isEmpty = False
        return
    # Internal node: push each split subset down to its child.
    subsets = self.localModel().split(data)
    for child_idx in range(len(self.m_sons)):
        self.son(child_idx).newDistribution(subsets[child_idx])
def errorsForTree(self):
    """Weighted misclassification count of this subtree on the test data."""
    if self.m_isLeaf:
        return self.errorsForLeaf()
    total = 0
    for bag in range(len(self.m_sons)):
        if Utils.equal(self.localModel().distribution().perBag(bag), 0):
            # No training weight reached this branch: every test instance
            # outside the training majority class counts as an error.
            total += self.m_test.perBag(bag) - self.m_test.perClassPerBag(
                bag, self.localModel().distribution().maxClass())
        else:
            total += self.son(bag).errorsForTree()
    return total
def buildTree(self, data: Instances, keepData: bool, test: Instances = None):
    """Recursively build the decision tree, optionally tracking a test set.

    A node becomes a leaf when the selected model produces at most one
    subset; a leaf with zero total weight is additionally marked empty.
    """
    if keepData:
        self.m_train = data
    self.m_isLeaf = False
    self.m_isEmpty = False
    self.m_sons = None

    if test is None:
        self.m_test = None
        self.m_localModel = self.m_toSelectModel.selectModel(data)
        if self.m_localModel.numSubsets() > 1:
            subsets = self.m_localModel.split(data)
            self.m_sons = []
            for i in range(self.m_localModel.numSubsets()):
                self.m_sons.append(self.getNewTree(subsets[i]))
                subsets[i] = None  # release the subset as soon as it is used
        else:
            self.m_isLeaf = True
            if Utils.equal(data.sumOfWeight(), 0):
                self.m_isEmpty = True
        return

    # Variant with a parallel test set.
    self.m_localModel = self.m_toSelectModel.selectModel(data, test)
    self.m_test = Distribution(test, self.m_localModel)
    if self.m_localModel.numSubsets() > 1:
        trainSubsets = self.m_localModel.split(data)
        testSubsets = self.m_localModel.split(test)
        self.m_sons = []
        for i in range(self.m_localModel.numSubsets()):
            self.m_sons.append(
                self.getNewTree(trainSubsets[i], testSubsets[i]))
            trainSubsets[i] = None
            testSubsets[i] = None
    else:
        self.m_isLeaf = True
        if Utils.equal(data.sumOfWeight(), 0):
            self.m_isEmpty = True
def addInstWithUnknown(self, source: Instances, attIndex: int):
    """Spread instances whose attIndex value is missing across all bags.

    Each missing-value instance's weight is divided among the bags in
    proportion to the bags' current weights (uniformly if total weight
    is 0); per-class totals are updated accordingly.
    """
    num_bags = len(self.m_perBag)
    # Bag proportions are fixed before any instance is added (the loop
    # below never changes m_perBag relative to these snapshots' intent).
    if Utils.equal(self.totaL, 0):
        probs = [1 / num_bags for _ in range(num_bags)]
    else:
        probs = [self.m_perBag[j] / self.totaL for j in range(num_bags)]

    for inst in source:
        if not inst.isMissing(attIndex):
            continue
        class_idx = int(inst.classValue())
        weight = inst.weight()
        self.m_perClass[class_idx] = self.m_perClass[class_idx] + weight
        self.totaL = self.totaL + weight
        for j in range(num_bags):
            share = probs[j] * weight
            self.m_perClassPerBag[j][class_idx] = \
                self.m_perClassPerBag[j][class_idx] + share
            self.m_perBag[j] = self.m_perBag[j] + share
def handleEnumeratedAttribute(self, instances: Instances):
    """Pick the best one-value-vs-rest binary split on an enumerated attribute.

    Tallies all known-value instances per attribute value, then for each
    sufficiently heavy value builds the corresponding two-bag distribution
    and keeps the one with the highest gain ratio.
    """
    numAttValues = instances.attribute(self.m_attIndex).numValues()
    fullDist = Distribution(numAttValues, instances.numClasses())
    for inst in instances:
        if not inst.isMissing(self.m_attIndex):
            fullDist.add(int(inst.value(self.m_attIndex)), inst)
    self.m_distribution = fullDist

    for value in range(numAttValues):
        # Candidate bag must itself reach the minimum-object threshold.
        if not (Utils.gr(fullDist.perBag(value), self.m_minNoObj)
                or Utils.equal(fullDist.perBag(value), self.m_minNoObj)):
            continue
        binaryDist = Distribution(fullDist, value)
        if not binaryDist.check(self.m_minNoObj):
            continue
        self.m_numSubsets = 2
        currIG = self.infoGainCrit.splitCritValue(binaryDist,
                                                  self.m_sumOfWeights)
        currGR = self.gainRatioCrit.splitCritValue(binaryDist,
                                                   self.m_sumOfWeights,
                                                   currIG)
        # value == 0 unconditionally seeds the running best (original rule).
        if value == 0 or Utils.gr(currGR, self.m_gainRatio):
            self.m_gainRatio = currGR
            self.m_infoGain = currIG
            self.m_splitPoint = value
            self.m_distribution = binaryDist
def getEstimatedErrorsForDistribution(self, theDistribution: Distribution):
    """Pessimistic error estimate for a leaf holding this distribution.

    Returns 0 for an empty distribution; otherwise the observed incorrect
    weight plus the confidence-based correction from Utils.addErrs.
    """
    total = theDistribution.total()
    if Utils.equal(total, 0):
        return 0
    incorrect = theDistribution.numIncorrect()
    return incorrect + Utils.addErrs(total, incorrect, self.m_CF)