def __init__(self, data: Instances):
    """Initialise evaluation statistics from the header of `data`.

    Stores a zero-instance copy of the header, zeroes every accumulator,
    and — for a nominal class — allocates the confusion matrix, the class
    name list, and the class priors (via setPriors). Finally selects the
    default set of metrics to display: every built-in metric except
    "coverage" and "region size".
    """
    self.m_Header = Instances(data, 0)          # header only, no instances
    self.m_NumClasses = data.numClasses()
    self.m_NumFolds = 1
    self.m_metricsToDisplay = []
    self.m_ClassIsNominal = data.classAttribute().isNominal()

    # Counters of weighted instances seen / skipped.
    self.m_WithClass = 0
    self.m_Unclassified = 0

    # Information-theoretic accumulators.
    self.m_SumKBInfo = 0
    self.m_SumSchemeEntropy = 0
    self.m_SumPriorEntropy = 0

    # Error accumulators (absolute / squared, scheme and prior).
    self.m_SumErr = self.m_SumAbsErr = self.m_SumSqrErr = self.m_SumPriorAbsErr = self.m_SumPriorSqrErr = 0

    self.m_ConfLevel = 0.95
    self.m_TotalCoverage = self.m_TotalSizeOfRegions = 0
    self.m_MissingClass = 0
    self.m_Incorrect = self.m_Correct = 0
    self.m_DiscardPredictions = False
    self.m_CoverageStatisticsAvailable = True
    self.m_ComplexityStatisticsAvailable = True

    # Regression accumulators.
    self.m_SumClass = self.m_SumSqrClass = self.m_SumPredicted = self.m_SumSqrPredicted = self.m_SumClassPredicted = 0

    self.m_Predictions = None  # type: List[Prediction]

    if self.m_ClassIsNominal:
        size = self.m_NumClasses
        self.m_ConfusionMatrix = [[0] * size for _ in range(size)]  # type: List[List[float]]
        self.m_ClassNames = [data.classAttribute().value(idx) for idx in range(size)]  # type: List[str]

    self.m_ClassPriors = [0] * self.m_NumClasses  # type: List[float]
    self.setPriors(data)
    self.m_MarginCounts = [0] * (self.k_MarginResolution + 1)

    # Display everything except the coverage statistics by default.
    self.m_metricsToDisplay = [
        metric.lower()
        for metric in self.BUILT_IN_EVAL_METRICS
        if metric.lower() not in ("coverage", "region size")
    ]
def buildClassifier(self, instances: Instances):
    """Build the 0-R model: majority class (nominal) or mean class value (numeric).

    :param instances: training data; class attribute must be set.
    :raises Exception: via testWithFail when capabilities are not met.
    """
    self.getCapabilities().testWithFail(instances)
    sumOfWeights = 0
    self.m_Class = instances.classAttribute()
    self.m_ClassValue = 0
    attrType = instances.classAttribute().type()
    if attrType == Attribute.NUMERIC:
        self.m_Counts = None
    elif attrType == Attribute.NOMINAL:
        # Laplace-style initialisation: one pseudo-count per class.
        self.m_Counts = [1] * instances.numClasses()
        sumOfWeights = instances.numClasses()
    for instance in instances:
        classValue = instance.classValue()
        if not Utils.isMissingValue(classValue):
            if instances.classAttribute().isNominal():
                # BUG FIX: classValue() returns a float; cast before list indexing
                # (floats are not valid list indices in Python).
                self.m_Counts[int(classValue)] += instance.weight()
            else:
                self.m_ClassValue += instance.weight() * classValue
            # BUG FIX: weight total must accrue for nominal classes too, so
            # Utils.normalize below divides by the true sum (as in Weka's ZeroR);
            # previously it only accrued in the numeric branch.
            sumOfWeights += instance.weight()
    if instances.classAttribute().isNumeric():
        if Utils.gr(sumOfWeights, 0):
            self.m_ClassValue /= sumOfWeights
    else:
        self.m_ClassValue = Utils.maxIndex(self.m_Counts)
        Utils.normalize(self.m_Counts, sumOfWeights)
def dumpLabel(self, index: int, data: Instances):
    """Return a printable label for bag `index`: the majority class name,
    the bag's total weight, and — when present — the misclassified weight,
    e.g. ``yes (12.0/3.0)``.
    """
    dist = self.m_distribution
    label = data.classAttribute().value(dist.maxClass(index))
    label += " (" + str(Utils.roundDouble(dist.perBag(index), 2))
    # Append the incorrectly classified weight only when it is non-zero.
    if Utils.gr(dist.numIncorrect(index), 0):
        label += "/" + str(Utils.roundDouble(dist.numIncorrect(index), 2))
    return label + ")"
def evaluateClustersWithRespectToClass(self, inst: Instances):
    """Evaluate the clustering against the class attribute of `inst`.

    Builds a cluster-by-class count matrix, finds the best cluster-to-class
    mapping via mapClasses, and appends the classes-to-clusters report
    (matrix, per-cluster assignment, error rate) to m_clusteringResult.
    Also fills m_classToCluster with the chosen class per cluster.
    """
    numClasses = inst.classAttribute().numValues()
    # counts[cluster][class] = instances of `class` assigned to `cluster`
    counts = [[0] * numClasses for _ in range(self.m_numClusters)]
    clusterTotals = [0] * self.m_numClusters
    # One class index per cluster plus, in the extra last slot, the number of
    # incorrectly clustered instances for that mapping.
    best = [0] * (self.m_numClusters + 1)
    current = [0] * (self.m_numClusters + 1)
    instances = copy.deepcopy(inst)
    instances.setClassIndex(inst.classIndex())
    i = 0
    for instance in instances:
        if self.m_clusterAssignments[i] >= 0:
            if not instance.classIsMissing():
                counts[int(self.m_clusterAssignments[i])][int(
                    instance.classValue())] += 1
                clusterTotals[int(self.m_clusterAssignments[i])] += 1
        i += 1
    numInstances = i
    best[self.m_numClusters] = float('inf')
    self.mapClasses(self.m_numClusters, 0, counts, clusterTotals, current,
                    best, 0)
    self.m_clusteringResult += "\n\nClass attribute: " + inst.classAttribute(
    ).name() + "\n"
    self.m_clusteringResult += "Classes to Clusters:\n"
    matrixString = self.toMatrixString(counts, clusterTotals,
                                       Instances(inst, 0))
    self.m_clusteringResult += matrixString + '\n'
    Cwidth = 1 + int(math.log(self.m_numClusters) / math.log(10))
    for i in range(self.m_numClusters):
        if clusterTotals[i] > 0:
            self.m_clusteringResult += "Cluster " + Utils.doubleToString(
                i, Cwidth, 0)
            self.m_clusteringResult += " <-- "
            if best[i] < 0:
                self.m_clusteringResult += "No class\n"
            else:
                self.m_clusteringResult += inst.classAttribute().value(
                    int(best[i])) + '\n'
    self.m_clusteringResult += "\nIncorrectly clustered instances :\t"\
        + str(best[self.m_numClusters])\
        + "\t" \
        + Utils.doubleToString((best[self.m_numClusters] / numInstances * 100.0), 8, 4) \
        + " %\n"
    # BUG FIX: the original assigned into an empty list by index
    # (self.m_classToCluster = []; self.m_classToCluster[i] = ...), which
    # raises IndexError on the first iteration. Build the list by value.
    self.m_classToCluster = [int(best[i]) for i in range(self.m_numClusters)]
def setInputFormat(self, instanceInfo: Instances):
    """Declare the input format of this filter.

    Requires a class attribute to be set on `instanceInfo`; configures the
    binary output format and resets the cached indices.

    :return: True when the class attribute is nominal, else False.
    :raises Exception: when no class index has been assigned.
    """
    super().setInputFormat(instanceInfo)
    if instanceInfo.classIndex() < 0:
        raise Exception("No class has been assigned to the instances")
    self.setOutputFormatBinary()
    self.m_Indices = None
    return True if instanceInfo.classAttribute().isNominal() else False
def forInstances(cls, data: Instances, multi: bool = False) -> 'Capabilities':
    """Derive the Capabilities needed to handle the given dataset.

    Inspects the class attribute, every non-class attribute, and the data
    itself (missing class values / missing attribute values), enabling the
    matching capability flags on a fresh Capabilities object.

    NOTE(review): `multi` is unused in this visible body — presumably
    reserved for multi-instance data; confirm against callers.
    """
    result = Capabilities(None)
    result.m_InterfaceDefinedCapabilities = set()
    # --- class attribute -------------------------------------------------
    if data.classIndex() == -1:
        result.enable(CapabilityEnum.NO_CLASS)
    else:
        if data.classAttribute().type() == Attribute.NOMINAL:
            if data.classAttribute().numValues() == 1:
                result.enable(CapabilityEnum.UNARY_CLASS)
            elif data.classAttribute().numValues() == 2:
                result.enable(CapabilityEnum.BINARY_CLASS)
            else:
                result.enable(CapabilityEnum.NOMINAL_CLASS)
        elif data.classAttribute().type() == Attribute.NUMERIC:
            result.enable(CapabilityEnum.NUMERIC_CLASS)
        elif data.classAttribute().type() == Attribute.STRING:
            result.enable(CapabilityEnum.STRING_CLASS)
        elif data.classAttribute().type() == Attribute.DATE:
            result.enable(CapabilityEnum.DATE_CLASS)
        else:
            raise Exception("Unknown class attribute type '" +
                            data.classAttribute().name() + "'!")
        # Flag missing class values as soon as one instance has one.
        for i in range(data.numInstances()):
            if data.instance(i).classIsMissing():
                result.enable(CapabilityEnum.MISSING_CLASS_VALUES)
                break
    # --- non-class attributes --------------------------------------------
    for i in range(data.numAttributes()):
        if i == data.classIndex():
            continue
        if data.attribute(i).type() == Attribute.NOMINAL:
            # NOTE(review): UNARY_ATTRIBUTES is enabled for *every* nominal
            # attribute here, unlike the class handling above where unary /
            # binary / nominal are mutually exclusive — confirm this is
            # intended and matches the upstream implementation.
            result.enable(CapabilityEnum.UNARY_ATTRIBUTES)
            if data.attribute(i).numValues() == 2:
                result.enable(CapabilityEnum.BINARY_ATTRIBUTES)
            elif data.attribute(i).numValues() > 2:
                result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        elif data.attribute(i).type() == Attribute.NUMERIC:
            result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        elif data.attribute(i).type() == Attribute.DATE:
            result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        elif data.attribute(i).type() == Attribute.STRING:
            result.enable(CapabilityEnum.STRING_ATTRIBUTES)
        else:
            raise Exception("Unknown attribute type '" +
                            data.attribute(i).name() + "'!")
    # --- missing attribute values ----------------------------------------
    # Scan each instance's non-class attributes; stop at the first missing
    # value found anywhere.
    missing = False
    for i in range(data.numInstances()):
        inst = data.instance(i)
        for n in range(data.numAttributes()):
            if n == inst.classIndex():
                continue
            if inst.isMissing(n):
                missing = True
                break
        if missing:
            result.enable(CapabilityEnum.MISSING_VALUES)
            break
    return result
def buildClassifier(self, data: Instances):
    """Build the instance-based classifier: store the training data (optionally
    windowed to the most recent WindowSize instances), count the usable
    attributes, hand the data to the nearest-neighbour search, and fit a
    ZeroR fallback model.
    """
    self.getCapabilities().testWithFail(data)
    cleaned = Instances(data)
    cleaned.deleteWithMissingClass()
    self.m_NumClasses = cleaned.numClasses()
    self.m_ClassType = cleaned.classAttribute().type()
    # Only the training instances themselves are stored.
    self.m_Train = Instances(cleaned, 0, cleaned.numInstances())
    # Keep only the most recent WindowSize instances when a window is set.
    if 0 < self.WindowSize < cleaned.numInstances():
        start = self.m_Train.numInstances() - self.WindowSize
        self.m_Train = Instances(self.m_Train, start, self.WindowSize)
    # Count attributes usable for distance computation (nominal or numeric,
    # excluding the class).
    usable = 0
    for idx in range(self.m_Train.numAttributes()):
        if idx == self.m_Train.classIndex():
            continue
        attr = self.m_Train.attribute(idx)
        if attr.isNominal() or attr.isNumeric():
            usable += 1
    self.m_NumAttributesUsed = usable
    self.m_NNSearch.setInstances(self.m_Train)
    self.m_kNNValid = False
    # Fallback model for when no neighbours are usable.
    self.m_defaultModel = ZeroR()
    self.m_defaultModel.buildClassifier(cleaned)
def toMatrixString(self, counts: List[List], clusterTotals: List,
                   inst: Instances):
    """Render the classes-to-clusters count matrix as text.

    :param counts: counts[cluster][class] occurrence matrix.
    :param clusterTotals: instances per cluster; empty clusters are skipped.
    :param inst: header whose class attribute supplies the row labels.
    :return: the formatted matrix string.
    """
    ms = ""
    maxval = 0
    for i in range(self.m_numClusters):
        for j in range(len(counts[0])):
            if counts[i][j] > maxval:
                maxval = counts[i][j]
    # Column width: digits of the largest count vs. of the cluster index.
    # BUG FIX: guard against math.log(0) (ValueError in Python, unlike
    # Java's Math.log) when every count is zero.
    Cwidth = 1 + max(int(math.log(max(maxval, 1)) / math.log(10)),
                     int(math.log(self.m_numClusters) / math.log(10)))
    ms += '\n'
    # Header row: one column per non-empty cluster.
    for i in range(self.m_numClusters):
        if clusterTotals[i] > 0:
            ms += " " + Utils.doubleToString(i, Cwidth, 0)
    ms += " <-- assigned to cluster\n"
    # One row per class value.
    for i in range(len(counts[0])):
        for j in range(self.m_numClusters):
            if clusterTotals[j] > 0:
                ms += " " + Utils.doubleToString(counts[j][i], Cwidth, 0)
        ms += " | " + inst.classAttribute().value(i) + "\n"
    return ms
def threadClassifierRun(self):
    """Worker-thread body for the Explorer's classify panel.

    Reads the UI selections (classifier, class attribute, test mode), builds
    and evaluates the classifier, streams the textual report into the result
    history, and finally emits the visualization data. Any exception is
    routed to the error-dialog signal; the start/stop buttons are reset on
    the way out.

    Test modes: 1 = cross-validation, 2 = evaluate on training data,
    3 = user-supplied test set.
    """
    try:
        self.m_CEPanel.addToHistory()
        inst = Instances(self.m_Instances)
        trainTimeStart = trainTimeElapsed = testTimeStart = testTimeElapsed = 0
        userTestStructure = None
        if self.m_SetTestFrame is not None:
            userTestStructure = deepcopy(
                self.m_SetTestFrame.getInstances())  # type:Instances
            userTestStructure.setClassIndex(self.m_TestClassIndex)
        # Defaults: output model, output per-class stats, output confusion
        # matrix, store predictions for visualization.
        # outputPredictionsText=None
        numFolds = 10
        classIndex = self.m_ClassCombo.currentIndex()
        inst.setClassIndex(classIndex)
        classifier = self.m_ClassifierEditor.getValue()  # type:Classifier
        name = time.strftime("%H:%M:%S - ")
        outPutResult = ""
        evaluation = None  # type:Evaluation
        grph = None
        # --- determine the test mode from the radio buttons ---------------
        if self.m_CVBut.isChecked():
            testMode = 1
            numFolds = int(self.m_CVText.text())
            if numFolds <= 1:
                raise Exception("Number of folds must be greater than 1")
        elif self.m_TrainBut.isChecked():
            testMode = 2
        elif self.m_TestSplitBut.isChecked():
            testMode = 3
            # if source is None:
            #     raise Exception("No user test set has been specified")
            if not inst.equalHeaders(userTestStructure):
                QMessageBox.critical(self.m_Explorer, "错误", "测试数据集属性不同")
        else:
            raise Exception("Unknown test mode")
        # --- result-history entry name derived from the module path --------
        cname = classifier.__module__
        if cname.startswith("classifiers."):
            name += cname[len("classifiers."):]
        else:
            name += cname
        cmd = classifier.__module__
        # if isinstance(classifier,OptionHandler):
        #     cmd+=" "+Utils.joinOptions(classifier.getOptions())
        plotInstances = ClassifierErrorsPlotInstances()
        # NOTE(review): testMode is never 4 in this body, so this always
        # passes `inst` — confirm whether mode 4 (percentage split?) was
        # intended to be supported here.
        plotInstances.setInstances(userTestStructure if testMode ==
                                   4 else inst)
        plotInstances.setClassifier(classifier)
        plotInstances.setClassIndex(inst.classIndex())
        plotInstances.setPointSizeProportionalToMargin(False)
        # --- run information header ----------------------------------------
        outPutResult += "=== Run information ===\n\n"
        outPutResult += "Scheme: " + cname
        # if isinstance(classifier,OptionHandler):
        #     o=classifier.getOptions()
        #     outPutResult+=" "+Utils.joinOptions(o)
        outPutResult += "\n"
        outPutResult += "Relation: " + inst.relationName() + '\n'
        outPutResult += "Instances: " + str(inst.numInstances()) + '\n'
        outPutResult += "Attributes: " + str(inst.numAttributes()) + '\n'
        if inst.numAttributes() < 100:
            for i in range(inst.numAttributes()):
                outPutResult += "              " + inst.attribute(
                    i).name() + '\n'
        else:
            outPutResult += "              [list of attributes omitted]\n"
        outPutResult += "Test mode: "
        if testMode == 1:
            outPutResult += str(numFolds) + "-fold cross-validation\n"
        elif testMode == 2:
            outPutResult += "evaluate on training data\n"
        elif testMode == 3:
            outPutResult += "user supplied test set: " + str(
                userTestStructure.numInstances()) + " instances\n"
        outPutResult += "\n"
        self.m_History.addResult(name, outPutResult)
        self.m_History.setSingle(name)
        # --- build on the full training set (modes 2 and 3) ----------------
        if testMode == 2 or testMode == 3:
            trainTimeStart = time.time()
            classifier.buildClassifier(inst)
            trainTimeElapsed = time.time() - trainTimeStart
            outPutResult += "=== Classifier model (full training set) ===\n\n"
            outPutResult += str(classifier) + "\n"
            outPutResult += "\nTime taken to build model: " + Utils.doubleToString(
                trainTimeElapsed, 2) + " seconds\n\n"
            self.m_History.updateResult(name, outPutResult)
            if isinstance(classifier, Drawable):
                grph = classifier.graph()
        print("==========update Compelte=================")
        # --- evaluate according to the test mode ---------------------------
        if testMode == 2:
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            testTimeStart = time.time()
            # TODO
            # if isinstance(classifier,BatchPredictor)
            # else:
            for jj in range(inst.numInstances()):
                plotInstances.process(inst.instance(jj), classifier,
                                      evaluation)
            testTimeElapsed = time.time() - testTimeStart
            outPutResult += "=== Evaluation on training set ===\n"
        elif testMode == 1:
            rnd = 1
            inst.randomize(rnd)
            if inst.attribute(classIndex).isNominal():
                inst.stratify(numFolds)
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            # One fresh classifier copy per fold; the shared `evaluation`
            # accumulates across folds.
            for fold in range(numFolds):
                train = inst.trainCV(numFolds, fold, rnd)
                evaluation = self.setupEval(evaluation, classifier, train,
                                            plotInstances, True)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                current = deepcopy(classifier)
                current.buildClassifier(train)
                test = inst.testCV(numFolds, fold)
                # TODO
                # if isinstance(classifier,BatchPredictor)
                # else:
                for jj in range(test.numInstances()):
                    plotInstances.process(test.instance(jj), current,
                                          evaluation)
            if inst.attribute(classIndex).isNominal():
                outPutResult += "=== Stratified cross-validation ===\n"
            else:
                outPutResult += "=== Cross-validation ===\n"
        elif testMode == 3:
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            plotInstances.setInstances(userTestStructure)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            # TODO
            # if isinstance(classifier,BatchPredictor)
            testTimeStart = time.time()
            for i in range(userTestStructure.numInstances()):
                instance = userTestStructure.instance(i)
                # if isinstance(classifier,BatchPredictor)
                # else
                plotInstances.process(instance, classifier, evaluation)
                # if isinstance(classifier,BatchPredictor)
            testTimeElapsed = time.time() - testTimeStart
            outPutResult += "=== Evaluation on test set ===\n"
        # --- summary / per-class / confusion-matrix sections ---------------
        if testMode != 1:
            mode = ""
            if testMode == 2:
                mode = "training data"
            elif testMode == 3:
                mode = "supplied test set"
            outPutResult += "\nTime taken to test model on " + mode + ": " + Utils.doubleToString(
                testTimeElapsed, 2) + " seconds\n\n"
        outPutResult += evaluation.toSummaryString(False) + '\n'
        self.m_History.updateResult(name, outPutResult)
        if inst.attribute(classIndex).isNominal():
            outPutResult += evaluation.toClassDetailsString() + '\n'
            outPutResult += evaluation.toMatrixString() + '\n'
        self.m_History.updateResult(name, outPutResult)
        Utils.debugOut(outPutResult)
        # --- hand plot data to the GUI thread via a signal ------------------
        if (plotInstances is not None and plotInstances.canPlot(False)):
            visName = name + " (" + inst.relationName() + ")"
            pl2d = plotInstances.getPlotData(cname)
            plotInstances.cleanUp()
            vv = []
            trainHeader = Instances(self.m_Instances, 0)
            trainHeader.setClassIndex(classIndex)
            vv.append(trainHeader)
            if grph is not None:
                vv.append(grph)
            if evaluation is not None and evaluation.predictions(
            ) is not None:
                vv.append(evaluation.predictions())
            vv.append(inst.classAttribute())
            self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
    except Exception as e:
        self.error_diglog_signal.emit(str(e))
    # Always restore the UI state (runs after success or after the handled
    # exception — the Python equivalent of the original finally block).
    self.mutex.lock()
    self.m_StartBut.setEnabled(True)
    self.m_StopBut.setEnabled(False)
    self.m_RunThread = None
    self.mutex.unlock()
    print("RunFinished")
def testInstances(self, data: Instances, *args):
    """Test whether this Capabilities object can handle the given dataset.

    Called either as testInstances(data) — which checks all attributes —
    or as testInstances(data, fromIndex, toIndex) to check a sub-range.
    On failure, m_FailReason is set to a CapabilityError describing the
    first violated constraint and False is returned.

    :return: True when the data is acceptable, else False.
    """
    # Zero-argument form: recurse over the full attribute range.
    if len(args) == 0:
        return self.testInstances(data, 0, data.numAttributes() - 1)
    fromIndex = args[0]
    toIndex = args[1]
    if self.doNotCheckCapabilities():
        return True
    # Warn (but do not fail) when no capabilities were ever declared.
    if len(self.m_Capabilities) == 0 or (len(self.m_Capabilities) == 1
                                         and self.handles(
                                             CapabilityEnum.NO_CLASS)):
        sys.stderr.write("No capabilities set!")
    if toIndex - fromIndex < 0:
        self.m_FailReason = CapabilityError("No attributes!")
        return False
    # Whether the class attribute falls inside the tested range.
    testClass = data.classIndex() > -1 and data.classIndex(
    ) >= fromIndex and data.classIndex() <= toIndex
    # --- per-attribute checks (class attribute handled separately) --------
    for i in range(fromIndex, toIndex + 1):
        att = data.attribute(i)
        if i == data.classIndex():
            continue
        if not self.testAttribute(att):
            return False
    # --- class attribute checks -------------------------------------------
    if not self.handles(
            CapabilityEnum.NO_CLASS) and data.classIndex() == -1:
        self.m_FailReason = CapabilityError("Class attribute not set!")
        return False
    # Handler claims to work without a class, but the data has one: it must
    # still support at least one class-attribute capability.
    if self.handles(CapabilityEnum.NO_CLASS) and data.classIndex() > -1:
        cap = self.getClassCapabilities()
        cap.disable(CapabilityEnum.NO_CLASS)
        iter = cap.capabilities()
        if len(iter) == 0:
            self.m_FailReason = CapabilityError(
                "Cannot handle any class attribute!")
            return False
    if testClass and not self.handles(CapabilityEnum.NO_CLASS):
        att = data.classAttribute()
        if not self.testAttribute(att, True):
            return False
        if not self.handles(CapabilityEnum.MISSING_CLASS_VALUES):
            # Missing class values are not supported at all.
            for i in range(data.numInstances()):
                if data.instance(i).classIsMissing():
                    self.m_FailReason = CapabilityError(
                        "Cannot handle missing class values!")
                    return False
        else:
            # Missing class values are allowed, but enough labelled
            # instances must remain.
            hasClass = 0
            for i in range(data.numInstances()):
                if not data.instance(i).classIsMissing():
                    hasClass += 1
            if hasClass < self.getMinimumNumberInstances():
                self.m_FailReason=CapabilityError("Not enough training instances with class labels (required: "\
                    + str(self.getMinimumNumberInstances())\
                    + ", provided: "\
                    + str(hasClass)\
                    + ")!")
                return False
    # --- missing attribute values ------------------------------------------
    missing = False
    for i in range(data.numInstances()):
        inst = data.instance(i)
        if not self.handles(CapabilityEnum.MISSING_VALUES):
            # TODO: add sparse-instance handling here
            # if isinstance(inst)
            #     pass
            # else
            for n in range(fromIndex, toIndex + 1):
                if n == inst.classIndex():
                    continue
                if inst.isMissing(n):
                    missing = True
                    break
        if missing:
            self.m_FailReason = CapabilityError(
                "Cannot handle missing values!")
            return False
    # --- minimum dataset size ----------------------------------------------
    if data.numInstances() < self.getMinimumNumberInstances():
        self.m_FailReason = CapabilityError(
            "Not enough training instances (required: " +
            str(self.getMinimumNumberInstances()) + ", provided: " +
            str(data.numInstances()) + ")!")
        return False
    # if self.handles(CapabilityEnum.ONLY_MULTIINSTANCE):
    #     if data.numAttributes() != 3:
    #         return False
    #     if not data.attribute(0).isNominal() or data.classIndex() != data.numAttributes()-1:
    #         return False
    #     owner=self.getOwner()
    #     if isinstance(owner,MultiInstanceCapabilitiesHandler):
    #         handler=owner
    #         cap=handler.getMultiInstanceCapabilities()
    #         if data.numInstances()>0 and data.attribute(1).numValues()>0:
    #             result=cap.testAttribute(data.attribute(1))
    return True