def updateRanges(self, instance: Instance, ranges: List[List[float]]):
    """Widen per-attribute ranges so that they also cover ``instance``.

    The instance is read through its sparse accessors (``numValues``,
    ``index``, ``isMissingSparse``, ``valueSparse``).  Attribute positions
    skipped between two stored entries are treated as holding the value 0;
    positions after the last stored entry are not touched.

    :param instance: instance whose values extend the ranges
    :param ranges: one row per attribute, addressed through ``self.R_MIN``,
        ``self.R_MAX`` and ``self.R_WIDTH``; modified in place
    :return: the same ``ranges`` object that was passed in
    """
    lo, hi, width = self.R_MIN, self.R_MAX, self.R_WIDTH

    def stretch(row, value):
        # Move whichever bound ``value`` exceeds and refresh the width
        # right after each bound that actually moved.
        if value < row[lo]:
            row[lo] = value
            row[width] = row[hi] - row[lo]
        if value > row[hi]:
            row[hi] = value
            row[width] = row[hi] - row[lo]

    nextGap = 0
    for pos in range(instance.numValues()):
        stored = instance.index(pos)
        # Attributes skipped by the sparse representation count as 0.
        while nextGap < stored:
            stretch(ranges[nextGap], 0)
            nextGap += 1
        nextGap += 1  # step over the stored attribute itself
        if not instance.isMissingSparse(pos):
            stretch(ranges[stored], instance.valueSparse(pos))
    return ranges
def add(self, bagIndex: int, instance: Instance):
    """Add the weight of ``instance`` to the counters of one bag.

    The instance's class value, truncated with ``int``, selects the class
    slot.  The instance weight is added to the per-class-per-bag, per-bag
    and per-class counters and to the running total ``self.totaL``.

    :param bagIndex: index of the bag that receives the instance
    :param instance: instance supplying the class value and the weight
    """
    classSlot = int(instance.classValue())
    w = instance.weight()
    self.m_perClassPerBag[bagIndex][classSlot] += w
    self.m_perBag[bagIndex] += w
    self.m_perClass[classSlot] += w
    self.totaL += w
def whichSubset(self, instance: Instance):
    """Return the index of the subset ``instance`` falls into.

    :param instance: instance to assign
    :return: ``-1`` if the split attribute (``self.m_attIndex``) is missing;
        the attribute value as ``int`` for a nominal attribute; otherwise
        ``0`` when the value is ``<= self.m_splitPoint`` and ``1`` when it
        is larger
    """
    att = self.m_attIndex
    if instance.isMissing(att):
        return -1
    if instance.attribute(att).isNominal():
        return int(instance.value(att))
    return 0 if instance.value(att) <= self.m_splitPoint else 1
def push(self, instance: Instance, copyInstance: bool = True):
    """Attach ``instance`` to the output format and enqueue it.

    ``None`` is ignored.  When the instance already belongs to a dataset it
    is optionally deep-copied first and then handed to
    ``self.copyValues(instance, False)``.

    :param instance: instance to enqueue, or ``None``
    :param copyInstance: deep-copy the instance before modifying it (only
        relevant when the instance has a dataset)
    """
    if instance is None:
        return
    belongsToDataset = instance.dataset() is not None
    if belongsToDataset:
        if copyInstance:
            instance = copy.deepcopy(instance)
        self.copyValues(instance, False)
    instance.setDataset(self.m_OutputFormat)
    self.m_OutputQueue.put(instance)
def addWeights(self, instance: Instance, weights: List):
    """Distribute ``instance`` over all bags according to ``weights``.

    For every bag ``b`` the amount ``instance.weight() * weights[b]`` is
    added to the per-class-per-bag, per-bag and per-class counters and to
    the running total ``self.totaL``.

    :param instance: instance supplying the class value and the weight
    :param weights: one factor per bag; indexed from 0 to
        ``len(self.m_perBag) - 1``
    """
    classSlot = int(instance.classValue())
    for bag in range(len(self.m_perBag)):
        share = instance.weight() * weights[bag]
        self.m_perClassPerBag[bag][classSlot] += share
        self.m_perBag[bag] += share
        self.m_perClass[classSlot] += share
        self.totaL += share
def updateStatsForIntervalEstimator(self, classifier: IntervalEstimator, classMissing: Instance, classValue: float):
    """Update region-size and coverage statistics from predicted intervals.

    The intervals come from ``classifier.predictIntervals`` at confidence
    level ``self.m_ConfLevel``.  If predictions are being stored, the
    intervals are attached to the most recently stored prediction.

    :param classifier: estimator that produces the intervals
    :param classMissing: instance passed to the estimator; its weight
        scales both statistics
    :param classValue: actual class value checked against the intervals
    """
    intervals = classifier.predictIntervals(classMissing, self.m_ConfLevel)
    if self.m_Predictions is not None:
        self.m_Predictions[-1].setPredictionIntervals(intervals)

    # Interval widths are accumulated relative to the observed target range.
    for interval in intervals:
        span = interval[1] - interval[0]
        self.m_TotalSizeOfRegions += classMissing.weight() * span / (self.m_MaxTarget - self.m_MinTarget)

    # Coverage is counted once, no matter how many intervals contain the value.
    covered = any(
        interval[1] >= classValue and interval[0] <= classValue
        for interval in intervals
    )
    if covered:
        self.m_TotalCoverage += classMissing.weight()
def regressionPrediction(self, transformedInstance: Instance, selectedAttributes: List[bool], coefficients: List[float]):
    """Evaluate a linear model on ``transformedInstance``.

    Coefficients are consumed in order, one per attribute that is selected
    and is not the class attribute (``self.m_ClassIndex``).  The coefficient
    following the last consumed one is added as a constant term.

    :param transformedInstance: instance providing the attribute values
    :param selectedAttributes: flag per attribute; only flagged attributes
        contribute
    :param coefficients: model coefficients followed by the constant term
    :return: the weighted sum plus the constant term
    """
    total = 0
    used = 0
    for att in range(transformedInstance.numAttributes()):
        if att == self.m_ClassIndex or not selectedAttributes[att]:
            continue
        total += coefficients[used] * transformedInstance.value(att)
        used += 1
    total += coefficients[used]
    return total
def updateRangesFirst(self, instance: Instance, numAtt: int, ranges: List[List]):
    """Reset ``ranges`` to zero and report whether ``instance`` has a value.

    Every cell of every row in ``ranges`` is overwritten with 0 in place.
    The stored (sparse) entries of the instance are then scanned until the
    first one that is not missing.

    :param instance: instance whose stored entries are inspected
    :param numAtt: not used by this implementation; kept so existing
        callers keep working
    :param ranges: table of range rows; zeroed in place
    :return: ``True`` if at least one stored entry is not missing,
        ``False`` otherwise (including when nothing is stored)
    """
    for row in ranges:
        row[:] = [0] * len(row)
    # The attribute index of each entry is irrelevant here, so it is no
    # longer fetched; only the missing flag matters.
    return any(
        not instance.isMissingSparse(pos)
        for pos in range(instance.numValues())
    )
def distributionForInstance(self,instance:Instance)->List[float]:
    """Derive a class distribution from ``self.classifyInstance``.

    For a nominal class the predicted label gets probability ``1.0`` (the
    distribution stays all zero when the prediction is a missing value).
    For a numeric or date class the prediction is stored in slot 0.  Any
    other class type yields the all-zero distribution.

    :param instance: instance to classify
    :return: list with ``instance.numClasses()`` entries
    """
    dist = [0] * instance.numClasses()
    classType = instance.classAttribute().type()
    if classType == Attribute.NOMINAL:
        label = self.classifyInstance(instance)
        if not Utils.isMissingValue(label):
            dist[int(label)] = 1.0
    elif classType in (Attribute.NUMERIC, Attribute.DATE):
        dist[0] = self.classifyInstance(instance)
    return dist
def copyStringValues(cls, inst:Instance, a0=None, a1=None, a2:AttributeLocator=None, a3:Instances=None, a4:AttributeLocator=None):
    """Copy string values of ``inst`` into a destination dataset.

    Two calling forms are distinguished by argument type:

    * ``(inst, destDataset, locator)`` - ``a0`` is an ``Instances`` and
      ``a1`` an ``AttributeLocator``.  The instance's own dataset is the
      source and the call is forwarded to ``copyStringValuesFromSrc``.
    * ``(inst, instSrcCompat, srcDataset, srcLoc, destDataset, destLoc)`` -
      any other arguments.  ``a0`` is a flag choosing whether instance
      indices follow the source locator (truthy) or the destination
      locator; ``a1``/``a3`` are the datasets and ``a2``/``a4`` their
      locators.

    :raises Exception: if the instance has no dataset, the attribute counts
        differ (short form), or the locators' index lists differ in length
        (long form)
    """
    if isinstance(a0, Instances) and isinstance(a1, AttributeLocator):
        if inst.dataset() is None:
            raise Exception("Instance has no dataset assigned!!")
        if inst.dataset().numAttributes() != a0.numAttributes():
            raise Exception(
                f"Src and Dest differ in # of attributes: {inst.dataset().numAttributes()} != {a0.numAttributes()}"
            )
        cls.copyStringValuesFromSrc(inst, True, inst.dataset(), a1, a0, a1)
        return

    # Long form: nothing to copy when source and destination are the same.
    if a1 == a3:
        return
    if len(a2.getAttributeIndices()) != len(a4.getAttributeIndices()):
        raise Exception(
            f"Src and Dest string indices differ in length: {len(a2.getAttributeIndices())} != {len(a4.getAttributeIndices())}"
        )
    if len(a2.getLocatorIndices()) != len(a4.getLocatorIndices()):
        raise Exception(
            f"Src and Dest locator indices differ in length: {len(a2.getLocatorIndices())} != {len(a4.getLocatorIndices())}"
        )
    for pos in range(len(a2.getAttributeIndices())):
        # The instance is laid out either like the source or like the
        # destination dataset, depending on the flag in a0.
        layout = a2 if a0 else a4
        instIndex = layout.getActualIndex(layout.getAttributeIndices()[pos])
        srcAtt = a1.attribute(a2.getActualIndex(a2.getAttributeIndices()[pos]))
        destAtt = a3.attribute(a4.getActualIndex(a4.getAttributeIndices()[pos]))
        if inst.isMissing(instIndex):
            continue
        newIndex = destAtt.addStringValue(srcAtt, int(inst.value(instIndex)))
        inst.setValue(instIndex, newIndex)
def convertInstance(self,instance:Instance):
    """Replace missing values of ``instance`` and push the result.

    When the instance has a missing value, a new ``Instance`` is built in
    which every missing nominal or numeric non-class attribute takes the
    matching entry of ``self.m_ModesAndMeans``; all other values are
    copied.  The new instance is pushed without copying; an instance with
    no missing value is pushed as is, with copying enabled.

    :param instance: instance to convert
    """
    hasMissing = instance.hasMissingValue()
    result = instance
    if hasMissing:
        inputFormat = self.getInputFormat
        filled = [0] * inputFormat().numAttributes()
        for j in range(instance.numAttributes()):
            replaceable = (
                instance.isMissing(j)
                and inputFormat().classIndex() != j
                and (inputFormat().attribute(j).isNominal() or inputFormat().attribute(j).isNumeric())
            )
            filled[j] = self.m_ModesAndMeans[j] if replaceable else instance.value(j)
        result = Instance(instance.weight(), filled)
    result.setDataset(instance.dataset())
    self.push(result, not hasMissing)
def convertInstanceNominal(self, instance: Instance):
    """Expand nominal attributes of ``instance`` into indicator columns.

    If ``self.m_needToTransform`` is false the instance is pushed unchanged
    (without copying).  Otherwise non-nominal attributes, the class
    attribute, and nominal attributes with at most two values (unless
    ``self.m_TransformAll`` is set) are copied as a single value.  Every
    other nominal attribute becomes ``numValues()`` columns: 1 in the column
    of the instance's value and 0 elsewhere, or the instance's (missing)
    value in every column when the attribute is missing.

    :param instance: instance to convert
    """
    if not self.m_needToTransform:
        self.push(instance, False)
        return

    out = [0] * self.outputFormatPeek().numAttributes()
    cursor = 0  # next free position in ``out``
    for j in range(self.getInputFormat().numAttributes()):
        att = self.getInputFormat().attribute(j)
        copiedAsIs = (
            not att.isNominal()
            or j == self.getInputFormat().classIndex()
            or (att.numValues() <= 2 and not self.m_TransformAll)
        )
        if copiedAsIs:
            out[cursor] = instance.value(j)
            cursor += 1
            continue
        if instance.isMissing(j):
            for k in range(att.numValues()):
                out[cursor + k] = instance.value(j)
        else:
            for k in range(att.numValues()):
                out[cursor + k] = 1 if k == int(instance.value(j)) else 0
        cursor += att.numValues()

    converted = Instance(instance.weight(), out)
    self.copyValues(converted, False, instance.dataset(), self.outputFormatPeek())
    self.push(converted)
def classifyInstance(self,instance:Instance):
    """Classify ``instance`` using ``self.distributionForInstance``.

    :param instance: instance to classify
    :return: for a nominal class, the index of the first largest strictly
        positive probability, or ``Utils.missingValue()`` if no entry is
        positive; for a numeric or date class, entry 0 of the distribution;
        ``Utils.missingValue()`` for any other class type
    :raises Exception: if the distribution is ``None``
    """
    dist = self.distributionForInstance(instance)
    if dist is None:
        raise Exception("Null distribution predicted")

    classType = instance.classAttribute().type()
    if classType == Attribute.NOMINAL:
        best, bestProb = 0, 0
        for idx, prob in enumerate(dist):
            if prob > bestProb:
                best, bestProb = idx, prob
        if bestProb > 0:
            return best
        return Utils.missingValue()
    if classType in (Attribute.NUMERIC, Attribute.DATE):
        return dist[0]
    return Utils.missingValue()
def distributionForInstance(self, instance: Instance, useLaplace: bool):
    """Return one probability per class for ``instance``.

    :param instance: instance to evaluate
    :param useLaplace: take the values from ``self.getProbsLaplace``
        instead of ``self.getProbs``
    :return: list with ``instance.numClasses()`` entries, each obtained by
        calling the chosen method with ``(classIndex, instance, 1)``
    """
    return [
        self.getProbsLaplace(c, instance, 1) if useLaplace else self.getProbs(c, instance, 1)
        for c in range(instance.numClasses())
    ]
def convertInstanceNumeric(self, instance: Instance):
    """Expand nominal attributes into ``numValues() - 1`` ordered columns.

    If ``self.m_needToTransform`` is false the instance is pushed unchanged
    (without copying).  Non-nominal attributes and the class attribute are
    copied as a single value.  For every other attribute the columns are
    filled with 1 up to the position at which ``self.m_Indices[j]`` holds
    the instance's value, and with 0 from there on; a missing attribute
    puts the instance's (missing) value in every column.

    :param instance: instance to convert
    """
    if not self.m_needToTransform:
        self.push(instance, False)
        return

    out = [0] * self.outputFormatPeek().numAttributes()
    cursor = 0  # next free position in ``out``
    for j in range(self.getInputFormat().numAttributes()):
        att = self.getInputFormat().attribute(j)
        if not att.isNominal() or j == self.getInputFormat().classIndex():
            out[cursor] = instance.value(j)
            cursor += 1
            continue
        if instance.isMissing(j):
            for k in range(att.numValues() - 1):
                out[cursor + k] = instance.value(j)
        else:
            k = 0
            # Columns before the value's position in the ordering get 1 ...
            while int(instance.value(j)) != self.m_Indices[j][k]:
                out[cursor + k] = 1
                k += 1
            # ... and the remaining columns get 0.
            while k < att.numValues() - 1:
                out[cursor + k] = 0
                k += 1
        cursor += att.numValues() - 1

    converted = Instance(instance.weight(), out)
    self.copyValues(converted, False, instance.dataset(), self.outputFormatPeek())
    self.push(converted)
def weights(self, instance: Instance):
    """Return the per-subset weights for an instance with a missing value.

    :param instance: instance to inspect
    :return: ``None`` when the split attribute (``self.m_attIndex``) is
        present; otherwise a list with ``self.m_numSubsets`` entries, each
        ``perBag(i) / total()`` of ``self.m_distribution``
    """
    if not instance.isMissing(self.m_attIndex):
        return None
    return [
        self.m_distribution.perBag(subset) / self.m_distribution.total()
        for subset in range(self.m_numSubsets)
    ]
def input(self, instance: Instance):
    """Feed one instance into the filter.

    A pending new batch first resets the output queue.  The instance is
    converted immediately when ``self.m_Indices`` is set or the class
    attribute of the input format is nominal; otherwise it is buffered.

    :param instance: instance to process
    :return: ``True`` if a copy of the instance was converted, ``False``
        if the instance was buffered
    :raises Exception: if no input format has been defined
    """
    if self.getInputFormat() is None:
        raise Exception("No input instance format defined")
    if self.m_NewBatch:
        self.resetQueue()
        self.m_NewBatch = False

    convertNow = (
        self.m_Indices is not None
        or self.getInputFormat().classAttribute().isNominal()
    )
    if not convertNow:
        self.bufferInput(instance)
        return False
    self.convertInstance(instance.copy())
    return True
def evaluationForSingleInstance(self, a0, instance:Instance, storePredictions:bool):
    """Evaluate one instance from a distribution or from a classifier.

    The behaviour depends on the type of ``a0``:

    * a list - treated as the predicted distribution.  Statistics are
      updated through ``updateStatsForClassifier`` (nominal class) or
      ``updateStatsForPredictor`` (otherwise) and, if requested and not
      discarded, the prediction is appended to ``self.m_Predictions``.
    * a ``Classifier`` - a deep copy of the instance with its class set to
      missing is classified, the resulting distribution is evaluated as
      above, and for non-nominal classes the interval and density
      statistics are updated or flagged as unavailable.

    :param a0: predicted distribution or classifier
    :param instance: instance holding the actual class value
    :param storePredictions: whether to record the prediction
    :return: the prediction (``Utils.missingValue()`` for a nominal class
        whose best probability is not positive); ``None`` when ``a0`` is
        neither a list nor a ``Classifier``
    """
    if isinstance(a0, List):
        if self.m_ClassIsNominal:
            pred = Utils.maxIndex(a0)
            if a0[int(pred)] <= 0:
                pred = Utils.missingValue()
            self.updateStatsForClassifier(a0, instance)
            makeRecord = lambda: NominalPrediction(instance.classValue(), a0, instance.weight())
        else:
            pred = a0[0]
            self.updateStatsForPredictor(pred, instance)
            makeRecord = lambda: NumericPrediction(instance.classValue(), pred, instance.weight())
        if storePredictions and not self.m_DiscardPredictions:
            if self.m_Predictions is None:
                self.m_Predictions = []
            self.m_Predictions.append(makeRecord())
        return pred

    if isinstance(a0, Classifier):
        classMissing = copy.deepcopy(instance)
        classMissing.setDataset(instance.dataset())
        # TODO: add the InputMappedClassifier case; until then the class
        # value is always blanked out directly.
        classMissing.setClassMissing()
        pred = self.evaluationForSingleInstance(
            a0.distributionForInstance(classMissing), instance, storePredictions)
        if self.m_ClassIsNominal:
            return pred
        if instance.classIsMissing() or Utils.isMissingValue(pred):
            return pred
        if isinstance(a0, IntervalEstimator):
            self.updateStatsForIntervalEstimator(a0, classMissing, instance.classValue())
        else:
            self.m_CoverageStatisticsAvailable = False
        if isinstance(a0, ConditionalDensityEstimator):
            self.updateStatsForConditionalDensityEstimator(a0, classMissing, instance.classValue())
        else:
            self.m_ComplexityStatisticsAvailable = False
        return pred

    return None
def distance(self, first: Instance, second: Instance, a0=None, a1=None):
    """Compute the distance between two instances.

    Calling forms, selected by the type of ``a0``:

    * ``distance(first, second)`` and ``distance(first, second, stats)`` -
      forwarded to ``self.distance`` with an infinite cut-off.
    * ``distance(first, second, cutoff[, stats])`` - ``a0`` is a number.
      Both instances are walked through their sparse accessors; the class
      attribute and attributes switched off in ``self.m_ActiveIndices`` are
      skipped, and an attribute stored in only one instance is compared
      against 0.

    Integer cut-offs are accepted as well as floats, and the accumulator
    starts at ``0.0`` so the result is a float even when no attribute is
    compared (assuming ``self.updateDistance`` returns floats).

    :param first: first instance
    :param second: second instance
    :param a0: ``None``, a ``PerformanceStats`` object, or a numeric cut-off
    :param a1: optional ``PerformanceStats`` whose ``incrCoordCount`` is
        called once per compared attribute
    :return: the accumulated distance, or ``float('inf')`` as soon as it
        exceeds the cut-off
    :raises TypeError: if ``a0`` is none of the supported types
    """
    if a0 is None:
        return self.distance(first, second, float("inf"), None)
    if not isinstance(a0, (int, float)):
        if isinstance(a0, PerformanceStats):
            return self.distance(first, second, float("inf"), a0)
        raise TypeError(
            "third argument must be a number, a PerformanceStats or None, "
            "got " + type(a0).__name__)

    cutoff = a0
    # Decided once: the answer cannot change while the loop is running.
    countCoords = a1 is not None and isinstance(a1, PerformanceStats)

    total = 0.0
    firstNumValues = first.numValues()
    secondNumValues = second.numValues()
    numAttributes = self.m_Data.numAttributes()
    classIndex = self.m_Data.classIndex()
    self.validate()

    p1 = p2 = 0
    while p1 < firstNumValues or p2 < secondNumValues:
        # An exhausted instance reports an index past every real attribute.
        firstI = first.index(p1) if p1 < firstNumValues else numAttributes
        secondI = second.index(p2) if p2 < secondNumValues else numAttributes

        if firstI == classIndex:
            p1 += 1
            continue
        if firstI < numAttributes and not self.m_ActiveIndices[firstI]:
            p1 += 1
            continue
        if secondI == classIndex:
            p2 += 1
            continue
        if secondI < numAttributes and not self.m_ActiveIndices[secondI]:
            p2 += 1
            continue

        if firstI == secondI:
            diff = self.difference(firstI, first.valueSparse(p1), second.valueSparse(p2))
            p1 += 1
            p2 += 1
        elif firstI > secondI:
            # Attribute stored only in ``second``.
            diff = self.difference(secondI, 0, second.valueSparse(p2))
            p2 += 1
        else:
            # Attribute stored only in ``first``.
            diff = self.difference(firstI, first.valueSparse(p1), 0)
            p1 += 1

        if countCoords:
            a1.incrCoordCount()
        total = self.updateDistance(total, diff)
        if total > cutoff:
            return float('inf')
    return total
def input(self, instance: Instance):
    """Feed one instance into the filter and push the reduced instance.

    A pending new batch first resets the output queue.  If the output
    format has no attributes nothing is produced.  When every input
    attribute is selected, a deep copy of the instance (detached from its
    dataset) is used; otherwise a new ``Instance`` is built from the values
    at ``self.m_SelectedAttributes``.

    :param instance: instance to process
    :return: ``True`` if an instance was pushed, ``False`` if the output
        format is empty
    :raises Exception: if no input format has been defined
    """
    if self.getInputFormat() is None:
        raise Exception("No input instance format defined")
    if self.m_NewBatch:
        self.resetQueue()
        self.m_NewBatch = False
    if self.getOutputFormat().numAttributes() == 0:
        return False

    keepsEverything = len(self.m_SelectedAttributes) == self.getInputFormat().numAttributes()
    if keepsEverything:
        reduced = copy.deepcopy(instance)
        reduced.setDataset(None)
    else:
        kept = [0] * self.getOutputFormat().numAttributes()
        for slot in range(len(self.m_SelectedAttributes)):
            kept[slot] = instance.value(self.m_SelectedAttributes[slot])
        reduced = Instance(instance.weight(), kept)

    self.copyValues(reduced, False, instance.dataset(), self.outputFormatPeek())
    self.push(reduced)
    return True
def copyStringValuesFromSrc(cls,instance:Instance,instSrcCompat:bool,srcDataset:Instances,srcLoc:AttributeLocator,
                            destDataset:Instances,destLoc:AttributeLocator):
    """Copy the located string values of ``instance`` into ``destDataset``.

    For every located attribute that is not missing in the instance, the
    value is added to the destination attribute through ``addStringValue``
    and the instance is updated with the index returned by that call.
    Nothing happens when source and destination dataset compare equal.

    :param instance: instance whose values are re-indexed in place
    :param instSrcCompat: ``True`` if the instance's attribute positions
        follow ``srcLoc``, ``False`` if they follow ``destLoc``
    :param srcDataset: dataset the values currently refer to
    :param srcLoc: locator for the attributes in ``srcDataset``
    :param destDataset: dataset receiving the values
    :param destLoc: locator for the attributes in ``destDataset``
    :raises Exception: if the locators disagree on the number of attribute
        indices or of locator indices
    """
    if srcDataset == destDataset:
        return

    srcAtts = srcLoc.getAttributeIndices()
    destAtts = destLoc.getAttributeIndices()
    if len(srcAtts) != len(destAtts):
        # The message is built from the lengths directly; lists have no
        # ``.length`` attribute.
        raise Exception("Src and Dest string indices differ in length: "
                        + str(len(srcAtts)) + " != " + str(len(destAtts)))

    srcLocs = srcLoc.getLocatorIndices()
    destLocs = destLoc.getLocatorIndices()
    if len(srcLocs) != len(destLocs):
        raise Exception("Src and Dest locator indices differ in length: "
                        + str(len(srcLocs)) + " != " + str(len(destLocs)))

    for srcAtt, destAtt in zip(srcAtts, destAtts):
        srcIndex = srcLoc.getActualIndex(srcAtt)
        destIndex = destLoc.getActualIndex(destAtt)
        instIndex = srcIndex if instSrcCompat else destIndex
        src = srcDataset.attribute(srcIndex)
        dest = destDataset.attribute(destIndex)
        if not instance.isMissing(instIndex):
            valIndex = dest.addStringValue(src, int(instance.value(instIndex)))
            instance.setValue(instIndex, valIndex)
def clusterProcessedInstance(self,instance:Instance,updateErrors:bool,useFastDistCalc:bool):
    """Find the cluster whose centroid is closest to ``instance``.

    :param instance: instance to assign
    :param updateErrors: also add ``dist * dist * weight`` of the winning
        distance to ``self.m_squaredErrors`` for the chosen cluster
    :param useFastDistCalc: pass the best distance found so far to the
        distance function as a third argument
    :return: index of the closest centroid (0 if no distance is finite)
    """
    nearest = 0
    nearestDist = float('inf')
    for cluster in range(self.NumClusters):
        centroid = self.m_ClusterCentroids.instance(cluster)
        if useFastDistCalc:
            candidate = self.m_DistanceFunction.distance(instance, centroid, nearestDist)
        else:
            candidate = self.m_DistanceFunction.distance(instance, centroid)
        if candidate < nearestDist:
            nearestDist = candidate
            nearest = cluster

    if updateErrors:
        self.m_squaredErrors[nearest] += nearestDist * (nearestDist * instance.weight())
    return nearest
def moveCentroid(self,centroidIndex:int,members:Instances,updateClusterInfo:bool,addToCentroidInstances:bool):
    """Compute the centroid of ``members``.

    Numeric attributes take the weighted mean of their non-missing values
    (a missing value if no weight is present).  Other attributes take the
    value with the largest accumulated weight, or a missing value when that
    weight is smaller than the weight of the missing entries.

    :param centroidIndex: cluster slot updated when ``updateClusterInfo``
        is set
    :param members: instances belonging to the cluster
    :param updateClusterInfo: store the missing weights and the nominal
        weight tables in ``self.m_ClusterMissingCounts`` and
        ``self.m_ClusterNominalCounts``
    :param addToCentroidInstances: append the centroid as a new
        ``Instance`` with weight 1.0 to ``self.m_ClusterCentroids``
    :return: list of centroid values, one per attribute
    """
    numAtts = members.numAttributes()
    vals = [0] * numAtts
    weightMissing = [0] * numAtts
    weightNonMissing = [0] * numAtts
    # One weight table per nominal attribute; empty for everything else.
    nominalDists = [
        [0] * members.attribute(j).numValues() if members.attribute(j).isNominal() else []
        for j in range(numAtts)
    ]

    for inst in members:
        for j in range(numAtts):
            if inst.isMissing(j):
                weightMissing[j] += inst.weight()
                continue
            weightNonMissing[j] += inst.weight()
            if members.attribute(j).isNumeric():
                vals[j] += inst.weight() * inst.value(j)
            else:
                nominalDists[j][int(inst.value(j))] += inst.weight()

    for j in range(numAtts):
        if members.attribute(j).isNumeric():
            if weightNonMissing[j] > 0:
                vals[j] /= weightNonMissing[j]
            else:
                vals[j] = Utils.missingValue()
            continue
        topWeight = float('-inf')
        topValue = -1
        for value, weight in enumerate(nominalDists[j]):
            if weight > topWeight:
                topWeight = weight
                topValue = value
        if topWeight < weightMissing[j]:
            vals[j] = Utils.missingValue()
        else:
            vals[j] = topValue

    if updateClusterInfo:
        for j in range(numAtts):
            self.m_ClusterMissingCounts[centroidIndex][j] = weightMissing[j]
            self.m_ClusterNominalCounts[centroidIndex][j] = nominalDists[j]
    if addToCentroidInstances:
        self.m_ClusterCentroids.add(Instance(1.0, vals))
    return vals
def makeInstance(self, tc: TwoClassStats, prob: float) -> Instance:
    """Build a 13-value instance describing ``tc`` at threshold ``prob``.

    Value order: true positives, false negatives, false positives, true
    negatives, false-positive rate, true-positive rate, precision, recall,
    fallout, F-measure, the fraction ``(TP + FP) / (TP + FP + TN + FN)``,
    the ratio of true positives to ``fraction * (TP + FN)`` (a missing
    value when that product is below 1), and finally ``prob``.

    :param tc: two-class statistics to convert
    :param prob: value stored in the last position
    :return: new ``Instance`` with weight 1.0
    """
    vals = [
        tc.getTruePositive(),
        tc.getFalseNegative(),
        tc.getFalsePositive(),
        tc.getTrueNegative(),
        tc.getFalsePositiveRate(),
        tc.getTruePositiveRate(),
        tc.getPrecision(),
        tc.getRecall(),
        tc.getFallout(),
        tc.getFMeasure(),
    ]
    predictedPositive = tc.getTruePositive() + tc.getFalsePositive()
    everything = (tc.getTruePositive() + tc.getFalsePositive()
                  + tc.getTrueNegative() + tc.getFalseNegative())
    fraction = predictedPositive / everything
    vals.append(fraction)

    expectedByChance = fraction * (tc.getTruePositive() + tc.getFalseNegative())
    if expectedByChance < 1:
        vals.append(Utils.missingValue())
    else:
        vals.append(tc.getTruePositive() / expectedByChance)

    vals.append(prob)
    return Instance(1.0, vals)
def updateStatsForClassifier(self,predictedDistribution:List,instance:Instance):
    """Update all nominal-class statistics for one prediction.

    For an instance with a missing class only ``self.m_MissingClass`` grows.
    Otherwise the margins, the information-theoretic sums, the numeric
    scores, the coverage statistics and the confusion matrix are updated.
    If no class has a strictly positive probability, the instance is
    counted as unclassified and nothing beyond the margins and
    ``self.m_WithClass`` is updated.

    Two corrections over the previous version:

    * the actual class is converted with ``int`` before it is used as an
      index, as the other methods reading ``classValue()`` do;
    * probabilities are floored at the smallest positive double instead of
      ``-inf``, which left zeros unchanged ahead of ``Utils.log2``.  The
      floor equals Java's ``Double.MIN_VALUE``; this code appears to be a
      port of Weka's ``Evaluation``, which uses that constant.

    :param predictedDistribution: predicted probability per class
    :param instance: instance holding the actual class and the weight
    """
    if instance.classIsMissing():
        self.m_MissingClass += instance.weight()
        return

    # Converted only after the missing check, so a missing class value is
    # never passed to int().
    actualClass = int(instance.classValue())
    weight = instance.weight()
    self.updateMargins(predictedDistribution, actualClass, weight)

    # Predicted class: first class with the largest strictly positive
    # probability.
    predictedClass = -1
    bestProb = 0
    for i in range(self.m_NumClasses):
        if predictedDistribution[i] > bestProb:
            predictedClass = i
            bestProb = predictedDistribution[i]
    self.m_WithClass += weight
    if predictedClass < 0:
        self.m_Unclassified += weight
        return

    minProb = 5e-324  # smallest positive double
    predictedProb = max(minProb, predictedDistribution[actualClass])
    priorProb = max(minProb, self.m_ClassPriors[actualClass] / self.m_ClassPriorsSum)
    if predictedProb >= priorProb:
        self.m_SumKBInfo += (Utils.log2(predictedProb) - Utils.log2(priorProb)) * weight
    else:
        self.m_SumKBInfo -= (Utils.log2(1 - predictedProb) - Utils.log2(1 - priorProb)) * weight
    self.m_SumSchemeEntropy -= Utils.log2(predictedProb) * weight
    self.m_SumPriorEntropy -= Utils.log2(priorProb) * weight
    self.updateNumericScores(predictedDistribution, self.makeDistribution(instance.classValue()), weight)

    # Coverage: walk the classes from the end of the sorted index list
    # until the accumulated probability reaches the confidence level.
    indices = Utils.stableSort(predictedDistribution)
    cumulative = 0
    regionSize = 0
    for rank in range(len(predictedDistribution) - 1, -1, -1):
        if cumulative >= self.m_ConfLevel:
            break
        cumulative += predictedDistribution[indices[rank]]
        regionSize += 1
        if actualClass == indices[rank]:
            self.m_TotalCoverage += weight
    self.m_TotalSizeOfRegions += weight * regionSize / (self.m_MaxTarget - self.m_MinTarget)

    self.m_ConfusionMatrix[actualClass][predictedClass] += weight
    if predictedClass != actualClass:
        self.m_Incorrect += weight
    else:
        self.m_Correct += weight
def updateStatsForPredictor(self,predictedValue:float,instance:Instance):
    """Update the numeric-class statistics for one prediction.

    An instance with a missing class only increases
    ``self.m_MissingClass``.  A missing prediction increases
    ``self.m_WithClass`` and ``self.m_Unclassified``.  Otherwise the
    weighted sums of the actual value, the prediction, their squares and
    their product are updated, followed by the numeric scores.

    :param predictedValue: predicted class value
    :param instance: instance holding the actual class and the weight
    """
    if instance.classIsMissing():
        self.m_MissingClass += instance.weight()
        return

    self.m_WithClass += instance.weight()
    if Utils.isMissingValue(predictedValue):
        self.m_Unclassified += instance.weight()
        return

    w = instance.weight()
    actual = instance.classValue()
    self.m_SumClass += w * actual
    self.m_SumSqrClass += w * actual * actual
    self.m_SumClassPredicted += w * actual * predictedValue
    self.m_SumPredicted += w * predictedValue
    self.m_SumSqrPredicted += w * predictedValue * predictedValue
    self.updateNumericScores(
        self.makeDistribution(predictedValue),
        self.makeDistribution(instance.classValue()),
        instance.weight(),
    )
def updateStatsForConditionalDensityEstimator(self,classifier:ConditionalDensityEstimator,classMissing:Instance,classValue:float):
    """Update the scheme and prior entropy sums from log densities.

    ``self.setNumericPriorsFromBuffer()`` is called first if no prior
    estimator exists yet.  Each log density is multiplied by the instance
    weight and divided by ``math.log(2)`` before being subtracted from the
    matching sum.

    :param classifier: estimator providing ``logDensity(instance, value)``
    :param classMissing: instance passed to the estimator; supplies the
        weight
    :param classValue: actual class value
    """
    if self.m_PriorEstimator is None:
        self.setNumericPriorsFromBuffer()

    schemeLogDensity = classifier.logDensity(classMissing, classValue)
    self.m_SumSchemeEntropy -= schemeLogDensity * classMissing.weight() / math.log(2)

    priorLogDensity = self.m_PriorEstimator.logDensity(classValue)
    self.m_SumPriorEntropy -= priorLogDensity * classMissing.weight() / math.log(2)
def process(self, toPredict: Instance, classifier: Classifier, evaluation: Evaluation):
    """Evaluate one instance and, if enabled, record its plot data.

    The instance is always evaluated through ``evaluation``.  When
    ``self.m_SaveForVisualization`` is set and ``self.m_PlotInstances``
    exists, a row is appended to the plot instances together with one
    entry each in ``self.m_PlotShapes`` and ``self.m_PlotSizes``.

    :param toPredict: instance to evaluate
    :param classifier: classifier producing the prediction
    :param evaluation: evaluation object collecting the statistics
    """
    probActual = probNext = pred = 0
    classMissing = copy.deepcopy(toPredict)
    classMissing.setDataset(toPredict.dataset())

    if toPredict.classAttribute().isNominal():
        # Probability distribution predicted for the instance.
        preds = classifier.distributionForInstance(classMissing)
        val = 0
        if sum(preds) == 0:
            # All probabilities are zero: the instance belongs to no class.
            pred = Utils.missingValue()
            probActual = Utils.missingValue()
        else:
            # The prediction is the index of the largest probability.
            pred = Utils.maxIndex(preds)
            if Utils.isMissingValue(toPredict.classIndex()):
                probActual = preds[Utils.maxIndex(preds)]
            else:
                # A present class value selects the slot; otherwise slot 0
                # is kept.
                if not Utils.isMissingValue(toPredict.classValue()):
                    val = int(toPredict.classValue())
                probActual = preds[val]
        # Largest probability among the classes other than ``val``.
        for i in range(toPredict.classAttribute().numValues()):
            if i != val and preds[i] > probNext:
                probNext = preds[i]
        evaluation.evaluationForSingleInstance(preds, toPredict, True)
    else:
        # Single-instance evaluation for a non-nominal class.
        pred = evaluation.evaluateModelOnceAndRecordPrediction(classifier, toPredict)

    if not self.m_SaveForVisualization:
        return
    if self.m_PlotInstances is None:
        return

    # Build the row stored for visualization.
    isNominal = toPredict.classAttribute().isNominal()
    row = [0] * self.m_PlotInstances.numAttributes()
    col = 0
    while col < self.m_PlotInstances.numAttributes():
        if col < toPredict.classIndex():
            # Values in front of the class are copied unchanged.
            row[col] = toPredict.value(col)
        elif col == toPredict.classIndex():
            if isNominal:
                # Margin between the actual and the next-best probability,
                # then the prediction, then the original class value.
                row[col] = probActual - probNext
                row[col + 1] = pred
                row[col + 2] = toPredict.value(col)
                col += 2
            else:
                row[col] = pred
                row[col + 1] = toPredict.value(col)
                col += 1
        else:
            # Values after the class are shifted by the inserted columns.
            row[col] = toPredict.value(col - (2 if isNominal else 1))
        col += 1
    self.m_PlotInstances.add(Instance(1.0, row))

    if isNominal:
        if toPredict.isMissing(toPredict.classIndex()) or Utils.isMissingValue(pred):
            self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
        elif pred != toPredict.classValue():
            self.m_PlotShapes.append(Plot2D.ERROR_SHAPE)
        else:
            self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)

        if self.m_pointSizeProportionalToMargin:
            self.m_PlotSizes.append(probActual - probNext)
        else:
            sizeAdj = 1 if pred != toPredict.classValue() else 0
            self.m_PlotSizes.append(Plot2D.DEFAULT_SHAPE_SIZE.value + sizeAdj)
    else:
        errd = None
        if not toPredict.isMissing(toPredict.classIndex()) and not Utils.isMissingValue(pred):
            errd = pred - toPredict.classValue()
            self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
        else:
            self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
        self.m_PlotSizes.append(errd)