def resetDistribution(self, data: Instances):
    """Recompute ``self.m_distribution`` from ``data``.

    Instances for which ``self.whichSubset`` returns a value greater than
    -1 are collected into a fresh ``Instances`` container with the same
    header as ``data``; a ``Distribution`` is built from them, and the
    complete ``data`` set is then handed to ``addInstWithUnknown`` together
    with ``self.m_attIndex``.

    :param data: the instances to derive the new distribution from.
    """
    total = data.numInstances()
    assigned = Instances(data, total)
    for position in range(total):
        candidate = data.instance(position)
        # whichSubset() > -1 means the model could place the instance.
        if self.whichSubset(candidate) > -1:
            assigned.add(candidate)

    rebuilt = Distribution(assigned, self)
    # presumably distributes the instances whose split attribute is
    # unknown -- confirm against Distribution.addInstWithUnknown
    rebuilt.addInstWithUnknown(data, self.m_attIndex)
    self.m_distribution = rebuilt
def kNearestNeighbours(self, target: Instance, kNN: int) -> Instances:
    """Return the ``kNN`` nearest neighbours of ``target`` by linear scan.

    Every instance of ``self.m_Instances`` is compared with ``target``
    using ``self.m_DistanceFunction``.  Candidates are kept in a ``MyHeap``
    of capacity ``kNN``; instances that tie with the current k-th distance
    are kept as well, so more than ``kNN`` neighbours may be returned.
    The distances of the returned neighbours are stored, in the same order,
    in ``self.m_Distances`` after ``postProcessDistances`` has been applied.

    :param target: the query instance.
    :param kNN: number of neighbours requested.
    :return: an ``Instances`` object holding the neighbours that were found.
    """
    stats = self.m_Stats
    if stats is not None:
        stats.searchStart()

    heap = MyHeap(kNN)
    numInstances = self.m_Instances.numInstances()
    firstkNN = 0
    for i in range(numInstances):
        candidate = self.m_Instances.instance(i)
        # Skip the query itself (e.g. hold-one-out evaluation).
        if target == candidate:
            continue
        if stats is not None:
            stats.incrPointCount()
        if firstkNN < kNN:
            # Heap not yet full: no cut-off distance can be applied.
            distance = self.m_DistanceFunction.distance(
                target, candidate, float("inf"), stats)
            # Identical points are skipped, except for the very last
            # instance, which is still added.
            if distance == 0 and self.m_SkipIdentical and i < numInstances - 1:
                continue
            heap.put(i, distance)
            firstkNN += 1
        else:
            # Heap full: the current k-th distance is passed as cut-off.
            temp = heap.peek()
            distance = self.m_DistanceFunction.distance(
                target, candidate, temp.distance, stats)
            if distance == 0 and self.m_SkipIdentical:
                continue
            if distance < temp.distance:
                heap.putBySubstitute(i, distance)
            elif distance == temp.distance:
                heap.putKthNearest(i, distance)

    found = heap.size() + heap.noOfKthNearest()
    neighbours = Instances(self.m_Instances, found)
    self.m_Distances = [0] * found
    indices = [0] * found

    # Both result arrays are filled from the back: first the ties with the
    # k-th distance, then the heap content in the order heap.get() yields.
    i = 1
    while heap.noOfKthNearest() > 0:
        h = heap.getKthNearest()
        indices[found - i] = h.index
        self.m_Distances[found - i] = h.distance
        i += 1
    while heap.size() > 0:
        h = heap.get()
        indices[found - i] = h.index
        self.m_Distances[found - i] = h.distance
        i += 1

    self.m_DistanceFunction.postProcessDistances(self.m_Distances)

    for index in indices:
        neighbours.add(self.m_Instances.instance(index))

    # Fix: the search was "started" a second time here instead of being
    # finished, so the statistics object never saw the end of a search.
    if stats is not None:
        stats.searchFinish()
    return neighbours
def saveVisibleInstances(self):
    """Write the instances of all plots to an ARFF file chosen by the user.

    The instances of the first plot are copied and the instances of every
    further plot are appended to that copy.  A save dialog then asks for
    the target file.  Nothing is written when there are no plots or when
    the dialog is cancelled.
    """
    plots = self.m_plot.m_plot2D.getPlots()
    # Nothing to save; also avoids an IndexError on plots[0] for an empty list.
    if plots is None or len(plots) == 0:
        return

    saveInsts = Instances(plots[0].getPlotInstances())
    for i in range(1, len(plots)):
        addInsts = plots[i].getPlotInstances()
        for j in range(addInsts.numInstances()):
            saveInsts.add(addInsts.instance(j))

    selection = QFileDialog.getSaveFileName(self, '保存文件', '/', 'Arff data files(*.arff)')
    path = selection[0]
    # A cancelled dialog yields an empty path; open('') would raise.
    if not path:
        return

    with open(path, 'w') as f:
        f.write(saveInsts.toArffString())
class SimpleKMeans(RandomizableClusterer):
    """k-means clusterer for nominal and numeric attributes.

    Missing values are replaced with a ``ReplaceMissingValues`` filter
    before clustering.  Centroids hold the weighted mean of numeric
    attributes and the most frequent value of nominal attributes.
    """

    # Maps a property name to the name of the setter method handling it.
    methodList = {"NumClusters":"setNumClusters","DontReplaceMissing":"setDontReplaceMissing",
                  "Seed":"setSeedDefault"}
    # Current property values as strings (shared at class level).
    propertyList = {"NumClusters":"2","DontReplaceMissing":"False","Seed":"10"}

    def __init__(self):
        super().__init__()
        self.NumClusters=2
        self.DontReplaceMissing=False
        self.m_MaxIterations=500
        self.m_Iterations=0
        self.m_PreserveOrder=False
        self.m_FastDistanceCalc=False
        self.Seed=10
        self.m_speedUpDistanceCompWithCanopies=False
        self.m_maxCanopyCandidates=100
        self.m_minClusterDensity=2
        self.m_periodicPruningRate=10000
        self.setSeed(self.Seed)
        self.m_ClusterNominalCounts=None    #type:List[List[List[float]]]
        self.m_ClusterMissingCounts=None    #type:List[List[float]]
        self.m_FullMeansOrMediansOrModes=None   #type:List[float]
        self.m_FullStdDevs=None #type:List[float]
        self.m_FullNominalCounts=None   #type:List[List[float]]
        self.m_FullMissingCounts=None   #type:List[float]
        self.m_ClusterCentroids=None    #type:Instances
        self.m_initialStartPoints=None  #type:Instances
        # Created by buildClusterer(); declared here so the attribute exists.
        self.m_ReplaceMissingFilter=None
        self.m_executionSlots=1
        self.m_ClusterSizes=[]  #type:List[float]
        self.m_squaredErrors=[] #type:List[float]
        self.m_DistanceFunction=EuclideanDistance()

    @staticmethod
    def _numericColumnWidth(value):
        """Return the column width needed for a numeric centroid value.

        The width is ``int(log10(|value|)) + 6``; values with a magnitude
        below 1 use a base of 1.  Zero and non-finite values (NaN/inf) also
        use the base of 1 instead of raising inside ``int()``.
        """
        magnitude=math.fabs(value)
        if magnitude == 0 or math.isnan(magnitude) or math.isinf(magnitude):
            width=1
        else:
            width=math.log(magnitude)/math.log(10)
            if width<0:
                width=1
        return int(width+6)

    def __str__(self):
        """Return a textual report of the clustering result.

        The report contains the iteration count, the within-cluster sum of
        squared errors (unless fast distance calculation is enabled), the
        initial starting points and a table of the final centroids next to
        the full-data means/modes.
        """
        if self.m_ClusterCentroids is None:
            return "No clusterer built yet!"

        centroids=self.m_ClusterCentroids
        numAtts=centroids.numAttributes()

        # --- determine the column widths --------------------------------
        maxAttWidth=0
        maxWidth=0
        for i in range(self.NumClusters):
            for j in range(numAtts):
                att=centroids.attribute(j)
                maxAttWidth=max(maxAttWidth,len(att.name()))
                if att.isNumeric():
                    maxWidth=max(maxWidth,
                                 self._numericColumnWidth(centroids.instance(i).value(j)))
        for i in range(numAtts):
            att=centroids.attribute(i)
            if att.isNominal():
                for j in range(centroids.numInstances()):
                    # A missing centroid value has no label; it is printed
                    # as "missing", whose width is covered further below.
                    if centroids.instance(j).isMissing(i):
                        continue
                    label=att.value(int(centroids.instance(j).value(i)))
                    maxWidth=max(maxWidth,len(label))
                for j in range(att.numValues()):
                    maxAttWidth=max(maxAttWidth,len(att.value(j)+"  "))
        for clusterSize in self.m_ClusterSizes:
            maxWidth=max(maxWidth,len("("+str(clusterSize)+")"))
        maxAttWidth+=2
        if maxAttWidth<len("Attribute")+2:
            maxAttWidth=len("Attribute")+2
        if maxWidth<len("Full Data"):
            maxWidth=len("Full Data")+1
        if maxWidth<len("missing"):
            maxWidth=len("missing")+1

        def cell(text):
            # Right-align text in a value column of width maxWidth + 1.
            return self.pad(text," ",maxWidth+1-len(text),True)

        # --- header -----------------------------------------------------
        out=["\nkMeans\n======\n"]
        out.append("\nNumber of iterations: " + str(self.m_Iterations))
        if not self.m_FastDistanceCalc:
            out.append('\n')
            out.append("Within cluster sum of squared errors: "+ str(sum(self.m_squaredErrors)))
        out.append("\n\nInitial starting points (random):\n")
        out.append('\n')
        for i in range(self.m_initialStartPoints.numInstances()):
            out.append("Cluster " + str(i) + ": " + str(self.m_initialStartPoints.instance(i))+"\n")
        out.append("\nMissing values globally replaced with mean/mode")
        out.append("\n\nFinal cluster centroids:\n")
        out.append(self.pad("Cluster#", " ", (maxAttWidth + (maxWidth * 2 + 2))- len("Cluster#"), True))
        out.append('\n')
        out.append(self.pad("Attribute", " ", maxAttWidth - len("Attribute"), False))
        out.append(cell("Full Data"))
        for i in range(self.NumClusters):
            out.append(cell(str(i)))
        out.append('\n')
        cSize="(" + str(sum(self.m_ClusterSizes)) + ")"
        out.append(self.pad(cSize, " ", maxAttWidth + maxWidth + 1 - len(cSize),True))
        for i in range(self.NumClusters):
            out.append(cell("(" + str(self.m_ClusterSizes[i]) + ")"))
        out.append('\n')
        numCentroids=centroids.numInstances()
        out.append(self.pad("", "=",maxAttWidth+ (maxWidth * (numCentroids + 1) + numCentroids + 1), True))
        out.append('\n')

        # --- one table row per attribute --------------------------------
        for i in range(numAtts):
            att=centroids.attribute(i)
            attName=att.name()
            out.append(attName)
            out.append(" "*(maxAttWidth-len(attName)))
            fullValue=self.m_FullMeansOrMediansOrModes[i]
            if att.isNominal():
                # -1 marks "missing is the most frequent value".
                if fullValue == -1:
                    text="missing"
                else:
                    text=att.value(int(fullValue))
            else:
                if math.isnan(fullValue):
                    text="missing"
                else:
                    text=Utils.doubleToString(fullValue, maxWidth, 4).strip()
            out.append(cell(text))
            for j in range(self.NumClusters):
                centroid=centroids.instance(j)
                if centroid.isMissing(i):
                    text="missing"
                elif att.isNominal():
                    text=att.value(int(centroid.value(i)))
                else:
                    text=Utils.doubleToString(centroid.value(i), maxWidth, 4).strip()
                out.append(cell(text))
            out.append('\n')
        out.append('\n\n')
        return "".join(out)

    def clusterInstance(self,instance:Instance):
        """Return the index of the cluster ``instance`` is assigned to.

        The instance is first passed through the missing-value filter
        created by ``buildClusterer``.
        """
        self.m_ReplaceMissingFilter.input(instance)
        self.m_ReplaceMissingFilter.batchFinished()
        inst=self.m_ReplaceMissingFilter.output()
        return self.clusterProcessedInstance(inst,False,True)

    def setNumClusters(self,value:str):
        """Set the number of clusters from its string representation.

        Values that are not integers or not positive are ignored; zero
        clusters would make ``buildClusterer`` fail with an IndexError.
        """
        try:
            val=int(value)
        except ValueError:
            return
        if val<=0:
            return
        self.NumClusters=val
        self.propertyList.update({"NumClusters":value})

    def setDontReplaceMissing(self,value:int):
        """Set the ``DontReplaceMissing`` flag; 0 means False, anything else True."""
        self.DontReplaceMissing=(value != 0)

    def pad(self,source:str,padChar:str,length:int,leftPad:bool):
        """Return ``source`` with ``length`` copies of ``padChar`` added.

        The padding is put in front of ``source`` when ``leftPad`` is true
        and behind it otherwise.  A non-positive ``length`` adds nothing.
        """
        padding=padChar*length
        if leftPad:
            return padding+source
        return source+padding

    def getCapabilities(self)->Capabilities:
        """Return the capabilities: no class, nominal/numeric attributes, missing values."""
        result=super().getCapabilities()
        result.disableAll()
        result.enable(CapabilityEnum.NO_CLASS)
        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)
        return result

    def buildClusterer(self,data:Instances):
        """Run k-means on ``data`` and store centroids, sizes and errors.

        ``self.NumClusters`` is reduced when fewer distinct starting points
        than requested exist or when clusters become empty.

        :param data: the training instances; they are copied before use.
        """
        self.getCapabilities().testWithFail(data)
        self.m_Iterations=0
        # Run the filter that replaces missing values: numeric attributes
        # get the mean, nominal ones their most frequent value.
        self.m_ReplaceMissingFilter=ReplaceMissingValues()
        instances=Instances(data)
        instances.setClassIndex(-1)
        self.m_ReplaceMissingFilter.setInputFormat(instances)
        instances=Filter.useFilter(instances,self.m_ReplaceMissingFilter)
        numAtts=instances.numAttributes()
        numInsts=instances.numInstances()
        # Per-cluster value frequencies of the attributes; three levels:
        # cluster -> attribute -> count per attribute value.
        self.m_ClusterNominalCounts=[[[] for i in range(numAtts)] for j in range(self.NumClusters)]
        # Per-cluster weight of missing values for every attribute.
        self.m_ClusterMissingCounts=[[0]*numAtts for i in range(self.NumClusters)]
        # Statistics of the whole data set, computed through slot 0.
        self.m_FullMeansOrMediansOrModes=self.moveCentroid(0,instances,True,False)
        self.m_FullMissingCounts=self.m_ClusterMissingCounts[0]
        self.m_FullNominalCounts=self.m_ClusterNominalCounts[0]
        sumofWeights=instances.sumOfWeight()
        for i in range(numAtts):
            if instances.attribute(i).isNumeric():
                if self.m_FullMissingCounts[i] == sumofWeights:
                    self.m_FullMeansOrMediansOrModes[i]=float('nan')
            else:
                counts=self.m_FullNominalCounts[i]
                if self.m_FullMissingCounts[i]>counts[Utils.maxIndex(counts)]:
                    # -1 marks "missing" as the most frequent value.
                    self.m_FullMeansOrMediansOrModes[i]=-1

        # --- choose distinct random instances as starting centroids -----
        self.m_ClusterCentroids=Instances(instances,self.NumClusters)
        clusterAssignments=[0]*numInsts
        self.m_DistanceFunction.setInstances(instances)
        random.seed(self.getSeed())
        seen=set()
        initInstances=instances
        for j in range(initInstances.numInstances()-1,-1,-1):
            instIndex=random.randint(0,j)
            hk=DecisionTableHashKey(initInstances.instance(instIndex),initInstances.numAttributes(),True)
            if hk not in seen:
                self.m_ClusterCentroids.add(initInstances.instance(instIndex))
                seen.add(hk)
            # Move the drawn instance out of the range still to be sampled.
            initInstances.swap(j,instIndex)
            if self.m_ClusterCentroids.numInstances() == self.NumClusters:
                break
        self.m_initialStartPoints=Instances(self.m_ClusterCentroids)
        self.NumClusters=self.m_ClusterCentroids.numInstances()

        converged=False
        tempI=[]    #type:List[Instances]
        self.m_squaredErrors=[0]*self.NumClusters
        self.m_ClusterNominalCounts=[[[] for i in range(numAtts)] for j in range(self.NumClusters)]
        self.m_ClusterMissingCounts=[[0]*numAtts for i in range(self.NumClusters)]
        # --- iterate: assign instances, then update the centroids -------
        while not converged:
            emptyClusterCount=0
            self.m_Iterations+=1
            converged=True
            if self.m_executionSlots<=1 or numInsts<2*self.m_executionSlots:
                for i in range(numInsts):
                    newC=self.clusterProcessedInstance(instances.instance(i),False,True)
                    if newC != clusterAssignments[i]:
                        converged=False
                    clusterAssignments[i]=newC
            self.m_ClusterCentroids=Instances(instances,self.NumClusters)
            # Fix: the member containers are rebuilt on every iteration.
            # Appending to one list kept the containers of earlier
            # iterations in front, so members piled up across iterations.
            tempI=[Instances(instances,0) for i in range(self.NumClusters)]
            for i in range(numInsts):
                tempI[clusterAssignments[i]].add(instances.instance(i))
            for i in range(self.NumClusters):
                if tempI[i].numInstances() == 0:
                    emptyClusterCount+=1
                else:
                    self.moveCentroid(i,tempI[i],True,True)
            if self.m_Iterations == self.m_MaxIterations:
                converged=True
            if emptyClusterCount>0:
                self.NumClusters-=emptyClusterCount
                if converged:
                    # Drop the empty clusters and compact the nominal
                    # counts so indices match the remaining centroids.
                    kept=[]
                    for k in range(len(tempI)):
                        if tempI[k].numInstances()>0:
                            index=len(kept)
                            for i in range(tempI[k].numAttributes()):
                                self.m_ClusterNominalCounts[index][i]=self.m_ClusterNominalCounts[k][i]
                            kept.append(tempI[k])
                    tempI=kept
            if not converged:
                self.m_ClusterNominalCounts=[[[] for i in range(numAtts)] for j in range(self.NumClusters)]

        # Final pass with exact distances to accumulate the squared errors.
        if not self.m_FastDistanceCalc:
            for i in range(numInsts):
                self.clusterProcessedInstance(instances.instance(i),True,False)
        self.m_ClusterSizes=[tempI[i].sumOfWeight() for i in range(self.NumClusters)]
        self.m_DistanceFunction.clean()

    def numberOfClusters(self):
        """Return the current number of clusters."""
        return self.NumClusters

    def setSeedDefault(self,value:str):
        """Set the random seed from its string representation.

        Non-integer values are ignored.  The seed is also handed to
        ``setSeed`` (as ``__init__`` does) because ``buildClusterer`` reads
        it back through ``getSeed``.
        """
        try:
            val=int(value)
        except ValueError:
            return
        self.Seed=val
        self.setSeed(val)
        self.propertyList.update({"Seed":value})

    def clusterProcessedInstance(self,instance:Instance,updateErrors:bool,useFastDistCalc:bool):
        """Return the index of the centroid closest to ``instance``.

        :param instance: an instance that already passed the missing-value filter.
        :param updateErrors: if true, add the weighted squared distance to
            ``self.m_squaredErrors`` of the chosen cluster.
        :param useFastDistCalc: if true, pass the best distance found so far
            to the distance function as cut-off value.
        """
        minDist=float('inf')
        bestCluster=0
        for i in range(self.NumClusters):
            centroid=self.m_ClusterCentroids.instance(i)
            if useFastDistCalc:
                dist=self.m_DistanceFunction.distance(instance,centroid,minDist)
            else:
                dist=self.m_DistanceFunction.distance(instance,centroid)
            if dist<minDist:
                minDist=dist
                bestCluster=i
        if updateErrors:
            # weighted squared distance: minDist^2 * weight
            minDist*=minDist*instance.weight()
            self.m_squaredErrors[bestCluster]+=minDist
        return bestCluster

    def moveCentroid(self,centroidIndex:int,members:Instances,updateClusterInfo:bool,addToCentroidInstances:bool):
        """Compute the centroid of ``members`` and return its values.

        Numeric attributes get the weighted mean of the non-missing values,
        nominal attributes the index of the value with the largest weight.
        An attribute becomes ``Utils.missingValue()`` when no non-missing
        numeric value exists, or when the missing weight exceeds the weight
        of the most frequent nominal value.

        :param centroidIndex: slot of the per-cluster statistics to update.
        :param members: the instances belonging to the cluster.
        :param updateClusterInfo: if true, store the missing and nominal
            counts under ``centroidIndex``.
        :param addToCentroidInstances: if true, append the centroid to
            ``self.m_ClusterCentroids``.
        :return: the list of centroid values, one per attribute.
        """
        numAtts=members.numAttributes()
        vals=[0]*numAtts
        nominalDists=[[] for i in range(numAtts)]
        weightMissing=[0]*numAtts
        weightNonMissing=[0]*numAtts
        for j in range(numAtts):
            if members.attribute(j).isNominal():
                nominalDists[j]=[0]*members.attribute(j).numValues()
        # Accumulate weighted sums and weighted value counts.
        for inst in members:
            for j in range(numAtts):
                if inst.isMissing(j):
                    weightMissing[j]+=inst.weight()
                else:
                    weightNonMissing[j]+=inst.weight()
                    if members.attribute(j).isNumeric():
                        vals[j]+=inst.weight()*inst.value(j)
                    else:
                        nominalDists[j][int(inst.value(j))]+=inst.weight()
        for j in range(numAtts):
            if members.attribute(j).isNumeric():
                if weightNonMissing[j]>0:
                    vals[j]/=weightNonMissing[j]
                else:
                    vals[j]= Utils.missingValue()
            else:
                # First value with the largest weight wins.
                bestWeight=float('-inf')
                bestIndex=-1
                for i in range(len(nominalDists[j])):
                    if nominalDists[j][i]>bestWeight:
                        bestWeight=nominalDists[j][i]
                        bestIndex=i
                if bestWeight < weightMissing[j]:
                    vals[j]= Utils.missingValue()
                else:
                    vals[j]=bestIndex
        if updateClusterInfo:
            for j in range(numAtts):
                self.m_ClusterMissingCounts[centroidIndex][j]=weightMissing[j]
                self.m_ClusterNominalCounts[centroidIndex][j]=nominalDists[j]
        if addToCentroidInstances:
            self.m_ClusterCentroids.add(Instance(1.0,vals))
        return vals
class ClassifierErrorsPlotInstances(AbstractPlotInstances):
    """Collects classifier predictions as plottable instances.

    For every processed instance a row is added to ``m_PlotInstances`` that
    contains the original values plus the prediction (and, for a nominal
    class, the prediction margin).  A shape and a size are recorded per
    row in ``m_PlotShapes`` and ``m_PlotSizes``.
    """

    def __init__(self):
        super().__init__()
        # Fix: this flag used to exist only after its setter had been
        # called, so process()/finishUp() raised AttributeError otherwise.
        self.m_pointSizeProportionalToMargin = False

    def initialize(self):
        """Reset the collected shapes/sizes and the configuration fields."""
        super().initialize()
        self.m_PlotShapes = []  #type:List[int]
        self.m_PlotSizes = []  #type:List[object]
        self.m_Classifier = None  #type:Classifier
        self.m_ClassIndex = -1
        self.m_Evaluation = None  #type:Evaluation
        self.m_SaveForVisualization = True
        self.m_MinimumPlotSizeNumeric = 30
        self.m_MaximumPlotSizeNumeric = 200

    def setClassifier(self, value: Classifier):
        """Set the classifier."""
        self.m_Classifier = value

    def setClassIndex(self, index: int):
        """Set the index of the class attribute."""
        self.m_ClassIndex = index

    def setPointSizeProportionalToMargin(self, b: bool):
        """Set whether point sizes for a nominal class follow the margin."""
        self.m_pointSizeProportionalToMargin = b

    def setEvaluation(self, value: Evaluation):
        """Set the evaluation object."""
        self.m_Evaluation = value

    def determineFormat(self):
        """Create the (empty) plot data set with the prediction attributes.

        In front of the class attribute a "predicted ..." attribute is
        inserted, preceded by "prediction margin" for a nominal class.
        Sets ``m_PlotInstances`` to None when visualization is disabled.
        """
        margin = None  #type:Attribute
        if not self.m_SaveForVisualization:
            self.m_PlotInstances = None
            return
        hv = []  #type:List[Attribute]
        classAt = self.m_Instances.attribute(self.m_ClassIndex)
        classIsNominal = classAt.isNominal()
        if classIsNominal:
            attVals = [classAt.value(i) for i in range(classAt.numValues())]
            predictedClass = Attribute("predicted " + classAt.name(), attVals)
            margin = Attribute("prediction margin")
        else:
            predictedClass = Attribute("predicted" + classAt.name())
        for i in range(self.m_Instances.numAttributes()):
            if i == self.m_Instances.classIndex():
                if classIsNominal:
                    hv.append(margin)
                hv.append(predictedClass)
            hv.append(self.m_Instances.attribute(i).copy())
        # Data set including the added prediction attribute(s).
        self.m_PlotInstances = Instances(
            self.m_Instances.relationName() + "_predicted", hv,
            self.m_Instances.numInstances())
        # The class moved right by the number of inserted attributes.
        if classIsNominal:
            self.m_PlotInstances.setClassIndex(self.m_ClassIndex + 2)
        else:
            self.m_PlotInstances.setClassIndex(self.m_ClassIndex + 1)

    def process(self, toPredict: Instance, classifier: Classifier,
                evaluation: Evaluation):
        """Predict one instance, record it in the evaluation and plot data.

        :param toPredict: the instance to classify.
        :param classifier: the classifier making the prediction.
        :param evaluation: the evaluation object that records the result.
        """
        probActual = probNext = pred = 0
        classMissing = copy.deepcopy(toPredict)
        classMissing.setDataset(toPredict.dataset())
        if toPredict.classAttribute().isNominal():
            # Probability distribution over the class values.
            preds = classifier.distributionForInstance(classMissing)
            # Index of the actual class; stays 0 if the class value is missing.
            val = 0
            if sum(preds) == 0:
                # All probabilities are 0: the instance belongs to no class.
                pred = Utils.missingValue()
                probActual = Utils.missingValue()
            else:
                # The prediction is the index with the largest probability.
                pred = Utils.maxIndex(preds)
                if not Utils.isMissingValue(toPredict.classIndex()):
                    # A non-missing class value marks a labelled instance.
                    if not Utils.isMissingValue(toPredict.classValue()):
                        val = int(toPredict.classValue())
                    probActual = preds[val]
                else:
                    probActual = preds[Utils.maxIndex(preds)]
            # Largest probability among all classes other than `val`.
            for i in range(toPredict.classAttribute().numValues()):
                if i != val and preds[i] > probNext:
                    probNext = preds[i]
            evaluation.evaluationForSingleInstance(preds, toPredict, True)
        else:
            # Evaluate the single instance.
            pred = evaluation.evaluateModelOnceAndRecordPrediction(
                classifier, toPredict)

        if not self.m_SaveForVisualization:
            return
        if self.m_PlotInstances is None:
            return

        # --- store the data for visualization ---------------------------
        isNominal = toPredict.classAttribute().isNominal()
        classIdx = toPredict.classIndex()
        values = [0] * self.m_PlotInstances.numAttributes()
        i = 0
        while i < self.m_PlotInstances.numAttributes():
            if i < classIdx:
                # Everything in front of the prediction is copied as is.
                values[i] = toPredict.value(i)
            elif i == classIdx:
                if isNominal:
                    # Margin between actual-class and runner-up probability.
                    values[i] = probActual - probNext
                    # The prediction.
                    values[i + 1] = pred
                    # The original class value.
                    values[i + 2] = toPredict.value(i)
                    i += 2
                else:
                    values[i] = pred
                    values[i + 1] = toPredict.value(i)
                    i += 1
            else:
                # Behind the class: shifted by the inserted attributes.
                if isNominal:
                    values[i] = toPredict.value(i - 2)
                else:
                    values[i] = toPredict.value(i - 1)
            i += 1
        self.m_PlotInstances.add(Instance(1.0, values))

        if isNominal:
            if toPredict.isMissing(classIdx) or Utils.isMissingValue(pred):
                self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
            elif pred != toPredict.classValue():
                self.m_PlotShapes.append(Plot2D.ERROR_SHAPE)
            else:
                self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
            if self.m_pointSizeProportionalToMargin:
                self.m_PlotSizes.append(probActual - probNext)
            else:
                # Wrong predictions are drawn one unit larger.
                sizeAdj = 0
                if pred != toPredict.classValue():
                    sizeAdj = 1
                self.m_PlotSizes.append(Plot2D.DEFAULT_SHAPE_SIZE.value +
                                        sizeAdj)
        else:
            # Numeric class: the size is the signed error, None if unknown.
            errd = None
            if not toPredict.isMissing(classIdx) and not Utils.isMissingValue(pred):
                errd = pred - toPredict.classValue()
                self.m_PlotShapes.append(Plot2D.CONST_AUTOMATIC_SHAPE)
            else:
                self.m_PlotShapes.append(Plot2D.MISSING_SHAPE)
            self.m_PlotSizes.append(errd)

    def createPlotData(self, name: str):
        """Return a ``PlotData2D`` for the collected data, or None.

        None is returned when visualization is disabled.

        :param name: the plot name; the relation name is appended to it.
        """
        if not self.m_SaveForVisualization:
            return None
        result = PlotData2D(self.m_PlotInstances)
        result.setShapeSize(self.m_PlotSizes)
        result.setShapeType(self.m_PlotShapes)
        result.setPlotName(name + " (" + self.m_Instances.relationName() +
                           ")")
        return result

    def finishUp(self):
        """Scale the point sizes once all instances have been processed.

        Scaling happens for a numeric class, or for a nominal class when
        sizes are proportional to the margin.
        """
        super().finishUp()
        if not self.m_SaveForVisualization:
            return
        if self.m_Instances.classAttribute().isNumeric(
        ) or self.m_pointSizeProportionalToMargin:
            self.scaleNumericPredictions()

    def scaleNumericPredictions(self):
        """Map the recorded errors/margins to integer plot sizes.

        Absolute values are scaled linearly from [minErr, maxErr] onto the
        range starting at ``m_MinimumPlotSizeNumeric``.  For a nominal
        class the input range is fixed to [0, 1].  Entries that are None or
        NaN get the minimum size.
        """
        minSize = self.m_MinimumPlotSizeNumeric
        sizeRange = self.m_MaximumPlotSizeNumeric - minSize + 1
        maxErr = float("-inf")
        minErr = float("inf")
        if self.m_Instances.classAttribute().isNominal():
            maxErr = 1
            minErr = 0
        else:
            for errd in self.m_PlotSizes:
                if errd is not None:
                    err = abs(errd)
                    if err < minErr:
                        minErr = err
                    if err > maxErr:
                        maxErr = err
        span = maxErr - minErr
        for i in range(len(self.m_PlotSizes)):
            errd = self.m_PlotSizes[i]
            # A NaN margin occurs when probActual is the missing value
            # (presumably NaN -- confirm Utils.missingValue); int(NaN) raises.
            if errd is None or math.isnan(errd) or not span > 0:
                self.m_PlotSizes[i] = minSize
            else:
                temp = ((abs(errd) - minErr) / span) * sizeRange
                self.m_PlotSizes[i] = int(temp) + minSize

    def cleanUp(self):
        """Release the classifier, the evaluation and the collected data."""
        super().cleanUp()
        self.m_Classifier = None
        self.m_PlotShapes = None
        self.m_PlotSizes = None
        self.m_Evaluation = None