def forInstances(cls, data: Instances, multi: bool = False) -> 'Capabilities':
    """Build the Capabilities object describing what is needed to process ``data``.

    Inspects the class attribute (if any), every non-class attribute and the
    presence of missing values, enabling the matching capability for each.

    :param data: the dataset to inspect
    :param multi: accepted for interface compatibility; not used in this body
    :return: a fresh Capabilities instance with the detected capabilities enabled
    :raises Exception: if the class attribute or a regular attribute has a type
        that is not nominal, numeric, string or date
    """
    caps = Capabilities(None)
    caps.m_InterfaceDefinedCapabilities = set()

    # --- class attribute -------------------------------------------------
    if data.classIndex() == -1:
        caps.enable(CapabilityEnum.NO_CLASS)
    else:
        classType = data.classAttribute().type()
        if classType == Attribute.NOMINAL:
            # Unary / binary / general nominal are mutually exclusive here.
            labelCount = data.classAttribute().numValues()
            if labelCount == 1:
                caps.enable(CapabilityEnum.UNARY_CLASS)
            elif labelCount == 2:
                caps.enable(CapabilityEnum.BINARY_CLASS)
            else:
                caps.enable(CapabilityEnum.NOMINAL_CLASS)
        elif classType == Attribute.NUMERIC:
            caps.enable(CapabilityEnum.NUMERIC_CLASS)
        elif classType == Attribute.STRING:
            caps.enable(CapabilityEnum.STRING_CLASS)
        elif classType == Attribute.DATE:
            caps.enable(CapabilityEnum.DATE_CLASS)
        else:
            raise Exception("Unknown class attribute type '" +
                            data.classAttribute().name() + "'!")

        # One missing class value is enough to require the capability.
        if any(data.instance(row).classIsMissing()
               for row in range(data.numInstances())):
            caps.enable(CapabilityEnum.MISSING_CLASS_VALUES)

    # --- regular attributes ----------------------------------------------
    for attIdx in range(data.numAttributes()):
        if attIdx == data.classIndex():
            continue
        attType = data.attribute(attIdx).type()
        if attType == Attribute.NOMINAL:
            # Every nominal attribute enables UNARY; larger ones add more.
            caps.enable(CapabilityEnum.UNARY_ATTRIBUTES)
            valueCount = data.attribute(attIdx).numValues()
            if valueCount == 2:
                caps.enable(CapabilityEnum.BINARY_ATTRIBUTES)
            elif valueCount > 2:
                caps.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        elif attType == Attribute.NUMERIC:
            caps.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        elif attType == Attribute.DATE:
            caps.enable(CapabilityEnum.DATE_ATTRIBUTES)
        elif attType == Attribute.STRING:
            caps.enable(CapabilityEnum.STRING_ATTRIBUTES)
        else:
            raise Exception("Unknown attribute type '" +
                            data.attribute(attIdx).name() + "'!")

    # --- missing (non-class) values --------------------------------------
    for row in range(data.numInstances()):
        inst = data.instance(row)
        hasMissing = any(
            inst.isMissing(col)
            for col in range(data.numAttributes())
            if col != inst.classIndex())
        if hasMissing:
            caps.enable(CapabilityEnum.MISSING_VALUES)
            break

    return caps
def calculateRSquared(data: Instances, ssr: float):
    """Return the R^2 value of a regression, given its sum of squared residuals.

    R^2 is computed as ``1 - ssr / tss`` where ``tss`` is the total sum of
    squared deviations of the class value from its mean over ``data``.

    :param data: dataset whose class attribute is the regression target
    :param ssr: sum of squared residuals of the fitted model
    :return: the R^2 value; when ``tss`` is zero (constant class or no
        instances) the IEEE-754 result of the division is returned instead of
        raising: ``-inf`` for ``ssr > 0``, ``inf`` for ``ssr < 0`` and ``nan``
        otherwise
    """
    classIdx = data.classIndex()
    yMean = data.meanOrMode(classIdx)

    # Total sum of squares: deviation of each class value from the mean.
    tss = 0
    for i in range(data.numInstances()):
        deviation = data.instance(i).value(classIdx) - yMean
        tss += deviation * deviation

    if tss == 0:
        # A plain Python float divided by zero raises ZeroDivisionError;
        # return the floating-point limit explicitly so the result does not
        # depend on the numeric type of ``ssr``.
        if ssr > 0:
            return float("-inf")
        if ssr < 0:
            return float("inf")
        return float("nan")

    return 1 - ssr / tss
def removeClass(self, inst: Instances):
    """Return ``inst`` without its class attribute.

    :param inst: the dataset to strip
    :return: ``inst`` itself when no class index is set, otherwise the result
        of running a Remove filter over the (1-based) class column
    """
    classRemover = Remove()
    if inst.classIndex() < 0:
        # No class assigned: nothing to remove.
        return inst

    # The filter takes a 1-based index given as a string.
    classRemover.setAttributeIndices("" + str(inst.classIndex() + 1))
    classRemover.setInvertSelection(False)
    classRemover.setInputFormat(inst)
    return Filter.useFilter(inst, classRemover)
def evaluateClusterer(self, test: Instances, outputModel: bool):
    """Cluster every instance of ``test`` and append a summary to the result text.

    Stores the number of clusters, the per-instance assignments and the
    (normalised) log likelihood on ``self``, appends the per-cluster instance
    counts to ``self.m_clusteringResult`` and, when ``test`` has a class
    attribute, runs the classes-to-clusters evaluation.

    :param test: the instances to cluster
    :param outputModel: when true, the clusterer's string form is appended
        before the statistics
    :raises Exception: if ``test`` has a numeric class attribute
    """
    loglk = 0
    unclusteredInstances = 0
    cc = self.m_Clusterer.numberOfClusters()
    self.m_numClusters = cc
    instanceStats = [0] * cc
    hasClass = test.classIndex() >= 0
    clusterAssignments = []
    classRemover = None  # type: Filter

    testRaw = copy.deepcopy(test)
    testRaw.setClassIndex(test.classIndex())
    if hasClass:
        if testRaw.classAttribute().isNumeric():
            # Previously raised Exception(0): the counter was passed instead
            # of a message.
            raise Exception("ClusterEvaluation: Class must be nominal!")
        # The class column is removed before instances reach the clusterer.
        classRemover = Remove()
        classRemover.setAttributeIndices(str(testRaw.classIndex() + 1))
        classRemover.setInvertSelection(False)
        classRemover.setInputFormat(testRaw)

    for inst in testRaw:
        if classRemover is not None:
            classRemover.input(inst)
            classRemover.batchFinished()
            inst = classRemover.output()
        cnum = self.m_Clusterer.clusterInstance(inst)
        clusterAssignments.append(cnum)
        if cnum != -1:
            instanceStats[cnum] += 1
        else:
            # -1 marks an instance that was not assigned to any cluster.
            unclusteredInstances += 1

    sumNum = sum(instanceStats)
    if sumNum > 0:
        # Skip the normalisation when nothing was clustered (avoids 0 / 0).
        loglk /= sumNum
    self.m_logL = loglk
    self.m_clusterAssignments = list(clusterAssignments)

    # Field widths are the decimal digit counts; computed from the string
    # form because log(n)/log(10) is inexact (e.g. it yields 2.999... for
    # n == 1000) and log(0) raises.
    numInstFieldWidth = len(str(len(clusterAssignments)))
    clustFieldWidth = len(str(cc))

    if outputModel:
        self.m_clusteringResult += str(self.m_Clusterer)
    self.m_clusteringResult += "Clustered Instances\n\n"
    for i in range(cc):
        if instanceStats[i] > 0:
            self.m_clusteringResult += \
                Utils.doubleToString(i, clustFieldWidth, 0) \
                + " " \
                + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) \
                + "(" + Utils.doubleToString((instanceStats[i] / sumNum * 100), 3, 0) \
                + "%)\n"
    if unclusteredInstances > 0:
        self.m_clusteringResult += "\nUnclustered instances : " + str(
            unclusteredInstances)
    if hasClass:
        self.evaluateClustersWithRespectToClass(test)
def selectModel(self, data: Instances, test: Instances = None):
    """Select the C4.5 split model for ``data``.

    Builds one C45Split per non-class attribute, averages the information
    gain of the valid ones and picks the model with the highest gain ratio
    among those whose information gain is at least (almost) average.

    :param data: the training instances at this node
    :param test: ignored; when given, the call is forwarded as
        ``selectModel(data)``
    :return: the chosen split model, or a NoSplit model when no useful split
        exists
    """
    if test is not None:
        return self.selectModel(data)

    checkDistribution = Distribution(data)
    noSplitModel = NoSplit(checkDistribution)

    # Not enough instances to split, or all instances in one class.
    if Utils.gr(2 * self.m_minNoObj, checkDistribution.total()):
        return noSplitModel
    if Utils.equal(checkDistribution.total(),
                   checkDistribution.perClass(checkDistribution.maxClass())):
        return noSplitModel

    # multiVal stays True only if no attribute is numeric and none has fewer
    # values than 30% of the full dataset size.
    multiVal = True
    if self.m_allData is not None:
        multiVal = not any(
            attr.isNumeric() or
            Utils.gr(0.3 * self.m_allData.numInstances(), attr.numValues())
            for attr in data.enumerateAttributes())

    attCount = data.numAttributes()
    currentModel = [None] * attCount  #type:List[C45Split]
    sumOfWeights = data.sumOfWeight()
    averageInfoGain = 0
    validModels = 0
    for i in range(attCount):
        if i == data.classIndex():
            currentModel[i] = None
            continue
        split = C45Split(i, self.m_minNoObj, sumOfWeights,
                         self.m_useMDLcorrection)
        currentModel[i] = split
        split.buildClassifer(data)
        if not split.checkModel():
            continue
        if self.m_allData is not None:
            # Only numeric attributes, or nominal ones with few enough values
            # (unless every attribute is multi-valued), count for the average.
            contributes = data.attribute(i).isNumeric() or multiVal or \
                Utils.gr(0.3 * self.m_allData.numInstances(),
                         data.attribute(i).numValues())
            if not contributes:
                continue
        averageInfoGain = averageInfoGain + split.infoGain()
        validModels += 1

    if validModels == 0:
        return noSplitModel
    averageInfoGain = averageInfoGain / validModels

    # Best gain ratio among models with at least (almost) average info gain.
    minResult = 0
    for i in range(attCount):
        if i == data.classIndex():
            continue
        candidate = currentModel[i]
        if not candidate.checkModel():
            continue
        if candidate.infoGain() >= averageInfoGain - 1e-3:
            ratio = candidate.gainRatio()
            if Utils.gr(ratio, minResult):
                bestModel = candidate
                minResult = ratio

    if Utils.equal(minResult, 0):
        return noSplitModel

    # Distribute instances with an unknown split value over the branches.
    bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
    if self.m_allData is not None and \
            not self.m_doNotMakeSplitPointActualValue:
        bestModel.setSplitPoint(self.m_allData)
    return bestModel
def setInputFormat(self, instanceInfo: Instances):
    """Set the input format and prepare the binary output format.

    :param instanceInfo: header describing the incoming instances
    :return: True when the class attribute is nominal, False otherwise
    :raises Exception: if ``instanceInfo`` has no class index assigned
    """
    super().setInputFormat(instanceInfo)
    if instanceInfo.classIndex() < 0:
        raise Exception("No class has been assigned to the instances")
    self.setOutputFormatBinary()
    self.m_Indices = None
    return True if instanceInfo.classAttribute().isNominal() else False
def setInstances(self, inst: Instances):
    """Store ``inst`` and refresh the class combo box and start/stop buttons.

    The combo box lists every attribute as "(type)name". The class attribute
    is preselected, or the last attribute when no class index is set.

    :param inst: the instances to work on
    """
    self.m_Instances = inst
    labels = [
        "(" + Attribute.typeToStringShort(inst.attribute(idx)) + ")" +
        inst.attribute(idx).name()
        for idx in range(inst.numAttributes())
    ]

    combo = self.m_ClassCombo
    combo.clear()
    combo.addItems(labels)

    if not labels:
        # Nothing to classify: neither starting nor stopping makes sense.
        self.m_StartBut.setEnabled(False)
        self.m_StopBut.setEnabled(False)
        return

    classIdx = inst.classIndex()
    combo.setCurrentIndex(len(labels) - 1 if classIdx == -1 else classIdx)
    combo.setEnabled(True)
    # Start is available only while no run thread exists; stop is the inverse.
    idle = self.m_RunThread is None
    self.m_StartBut.setEnabled(idle)
    self.m_StopBut.setEnabled(not idle)
def setupEval(self, evaluation: Evaluation, classifier: Classifier,
              inst: Instances, plotInstances: ClassifierErrorsPlotInstances,
              onlySetPriors: bool):
    """Prepare ``evaluation`` (and optionally ``plotInstances``) for ``inst``.

    :param evaluation: the evaluation object whose priors are set from ``inst``
    :param classifier: classifier handed to ``plotInstances``
    :param inst: the instances used for the priors and the plot setup
    :param plotInstances: plot collector to configure; may be None
    :param onlySetPriors: when true, only the priors are set and
        ``plotInstances`` is left untouched
    :return: the same ``evaluation`` object
    """
    evaluation.setPriors(inst)
    if plotInstances is not None and not onlySetPriors:
        plotInstances.setInstances(inst)
        plotInstances.setClassifier(classifier)
        plotInstances.setClassIndex(inst.classIndex())
        plotInstances.setEvaluation(evaluation)
    return evaluation
def evaluateClustersWithRespectToClass(self, inst: Instances):
    """Evaluate the stored cluster assignments against the class attribute.

    Builds the cluster-by-class count matrix from ``self.m_clusterAssignments``,
    lets ``mapClasses`` find the class assigned to each cluster, appends the
    confusion matrix and the error summary to ``self.m_clusteringResult`` and
    stores the per-cluster class mapping in ``self.m_classToCluster``.

    :param inst: the clustered instances, with a class attribute set
    """
    numClusters = self.m_numClusters
    numClasses = inst.classAttribute().numValues()
    counts = [[0] * numClasses for _ in range(numClusters)]
    clusterTotals = [0] * numClusters
    # The extra last slot of ``best`` holds the error count of the mapping.
    best = [0] * (numClusters + 1)
    current = [0] * (numClusters + 1)

    instances = copy.deepcopy(inst)
    instances.setClassIndex(inst.classIndex())
    numInstances = 0
    for instance in instances:
        cluster = self.m_clusterAssignments[numInstances]
        # Unclustered instances and those without a class value are skipped.
        if cluster >= 0 and not instance.classIsMissing():
            counts[int(cluster)][int(instance.classValue())] += 1
            clusterTotals[int(cluster)] += 1
        numInstances += 1

    best[numClusters] = float('inf')
    self.mapClasses(numClusters, 0, counts, clusterTotals, current, best, 0)

    self.m_clusteringResult += "\n\nClass attribute: " + \
        inst.classAttribute().name() + "\n"
    self.m_clusteringResult += "Classes to Clusters:\n"
    matrixString = self.toMatrixString(counts, clusterTotals,
                                       Instances(inst, 0))
    self.m_clusteringResult += matrixString + '\n'

    # Digit count of the cluster total; taken from the string form because
    # log(n)/log(10) is inexact for powers of ten and log(0) raises.
    Cwidth = len(str(numClusters))
    for i in range(numClusters):
        if clusterTotals[i] > 0:
            self.m_clusteringResult += "Cluster " + Utils.doubleToString(
                i, Cwidth, 0)
            self.m_clusteringResult += " <-- "
            if best[i] < 0:
                self.m_clusteringResult += "No class\n"
            else:
                self.m_clusteringResult += inst.classAttribute().value(
                    int(best[i])) + '\n'

    # Guard the percentage against an empty dataset.
    if numInstances > 0:
        errorPercent = best[numClusters] / numInstances * 100.0
    else:
        errorPercent = 0.0
    self.m_clusteringResult += "\nIncorrectly clustered instances :\t" \
        + str(best[numClusters]) \
        + "\t" \
        + Utils.doubleToString(errorPercent, 8, 4) \
        + " %\n"

    # Previously an empty list was indexed by position, which raised
    # IndexError as soon as there was at least one cluster.
    self.m_classToCluster = [int(best[i]) for i in range(numClusters)]
def getCapabilities(self, data: Instances = None):
    """Return the capabilities, optionally narrowed to fit ``data``.

    :param data: when None, the general capabilities are returned (everything
        enabled, minimum number of instances 0); otherwise the general
        capabilities are restricted according to whether ``data`` has a class
    :return: the resulting Capabilities object
    """
    if data is None:
        general = Capabilities(self)
        general.enableAll()
        general.setMinimumNumberInstances(0)
        return general

    narrowed = self.getCapabilities()
    if data.classIndex() != -1:
        # A class is set, so the "no class" capability does not apply.
        narrowed.disable(CapabilityEnum.NO_CLASS)
        narrowed.disableDependency(CapabilityEnum.NO_CLASS)
        return narrowed

    # No class set: switch off every class capability except NO_CLASS.
    classCapabilities = narrowed.getClassCapabilities()
    for capability in classCapabilities.capabilities():
        if capability != CapabilityEnum.NO_CLASS:
            narrowed.disable(capability)
            narrowed.disableDependency(capability)
    return narrowed
def threadRun(self, filter: Filter):
    """Apply ``filter`` to a copy of the current instances and install the result.

    The class index used for filtering is the colouring index of the
    attribute visualisation panel. The stop button is enabled only while the
    filter is running, and ``self.m_RunThread`` is cleared when the run ends,
    whether it succeeded or raised.

    :param filter: the filter to apply; nothing is filtered when None
    :raises Exception: if the filtered dataset is None or has no attributes;
        exceptions raised by the filter itself propagate unchanged
    """
    try:
        if filter is None:
            return

        classIndex = self.m_AttVisualizePanel.getColoringIndex()
        cp = Instances(self.m_Instances)
        cp.setClassIndex(classIndex)

        self.m_StopBut.setEnabled(True)
        try:
            # Work on a copy so the caller's filter object stays untouched.
            filterCopy = deepcopy(filter)
            filterCopy.setInputFormat(cp)
            newInstances = Filter.useFilter(cp, filterCopy)
        finally:
            # Always release the stop button, even when the filter fails.
            self.m_StopBut.setEnabled(False)

        if newInstances is None or newInstances.numAttributes() < 1:
            raise Exception("Dataset is empty.")

        self.m_AttVisualizePanel.setColoringIndex(cp.classIndex())
        # Keep the data class-less if it was class-less before filtering.
        if self.m_Instances.classIndex() < 0:
            newInstances.setClassIndex(-1)
        self.m_Instances = newInstances
        self.setInstances(self.m_Instances)
    finally:
        # Mark the run as finished so a new one can be started after an error.
        self.m_RunThread = None
def calculateStdErrorOfCoef(data: Instances, selected: List[bool], ssr: float,
                            n: int, k: int):
    """Return the standard error of each regression coefficient.

    Builds the n-by-k design matrix from the selected non-class attributes
    (last column all ones for the intercept), then takes the square roots of
    the diagonal of ``mse * pinv(X^T X)`` with ``mse = ssr / (n - k)``.

    :param data: the dataset the model was fitted on
    :param selected: per-attribute flags telling which attributes are in the model
    :param ssr: sum of squared residuals
    :param n: number of instances
    :param k: number of coefficients including the intercept
    :return: list of ``k`` standard errors, intercept last
    """
    design = [[0] * k for _ in range(n)]

    # Copy each selected attribute into the next free column.
    nextColumn = 0
    for attIdx in range(data.numAttributes()):
        if data.classIndex() == attIdx or not selected[attIdx]:
            continue
        for row in range(n):
            design[row][nextColumn] = data.instance(row).value(attIdx)
        nextColumn += 1

    # Intercept column.
    for rowValues in design:
        rowValues[k - 1] = 1

    X = np.array(design)
    inverse = np.linalg.pinv(np.dot(X.T, X))
    mse = ssr / (n - k)
    covariance = mse * inverse
    return [math.sqrt(covariance[d][d]) for d in range(k)]
def setInputFormat(self, instanceInfo: Instances):
    """Set the input format and derive the output format of the selection.

    When every attribute is selected the input header is reused as output
    format; otherwise a new header containing copies of the selected
    attributes is built, with the class index remapped (or -1 if the class
    attribute is not among the selected ones).

    :param instanceInfo: header describing the incoming instances
    :return: always True
    """
    super().setInputFormat(instanceInfo)
    self.attributeIndices.setUpper(instanceInfo.numAttributes() - 1)
    self.m_SelectedAttributes = self.attributeIndices.getSelection()

    if len(self.m_SelectedAttributes) == instanceInfo.numAttributes():
        # Everything is kept: the output header is the input header.
        self.setOutputFormat(instanceInfo)
        self.initOutputLocators(self.getInputFormat(),
                                self.m_SelectedAttributes)
        return True

    keptAttributes = []
    newClassIndex = -1
    for position, attIdx in enumerate(self.m_SelectedAttributes):
        if attIdx == instanceInfo.classIndex():
            # Position of the class attribute within the reduced header.
            newClassIndex = position
        keptAttributes.append(instanceInfo.attribute(attIdx).copy())

    self.initInputLocators(self.getInputFormat(), self.m_SelectedAttributes)
    reducedHeader = Instances(instanceInfo.relationName(), keptAttributes, 0)
    reducedHeader.setClassIndex(newClassIndex)
    self.setOutputFormat(reducedHeader)
    return True
def threadClassifierRun(self):
    """Run the selected classifier evaluation and publish the textual report.

    Test modes, chosen from the radio buttons:
      1 - cross-validation (number of folds read from the text field)
      2 - evaluation on the training data
      3 - evaluation on the user supplied test set

    The report is built incrementally and pushed to the result history; plot
    data is emitted through ``history_add_visualize_signal``. Any exception is
    reported through ``error_diglog_signal``. Buttons and ``m_RunThread`` are
    always reset at the end, under the mutex.
    """
    try:
        self.m_CEPanel.addToHistory()
        inst = Instances(self.m_Instances)
        testTimeElapsed = 0
        userTestStructure = None
        if self.m_SetTestFrame is not None:
            userTestStructure = deepcopy(
                self.m_SetTestFrame.getInstances())  #type:Instances
            userTestStructure.setClassIndex(self.m_TestClassIndex)
        # Defaults assumed below: output model, output per-class stats,
        # output confusion matrix, store predictions for visualization.
        numFolds = 10
        classIndex = self.m_ClassCombo.currentIndex()
        inst.setClassIndex(classIndex)
        classifier = self.m_ClassifierEditor.getValue()  #type:Classifier
        name = time.strftime("%H:%M:%S - ")
        outPutResult = ""
        evaluation = None  #type:Evaluation
        grph = None

        if self.m_CVBut.isChecked():
            testMode = 1
            numFolds = int(self.m_CVText.text())
            if numFolds <= 1:
                raise Exception("Number of folds must be greater than 1")
        elif self.m_TrainBut.isChecked():
            testMode = 2
        elif self.m_TestSplitBut.isChecked():
            testMode = 3
            # Fail early instead of dereferencing a missing test set later or
            # evaluating against an incompatible one; the error reaches the
            # user through error_diglog_signal rather than a message box
            # opened from this worker thread.
            if userTestStructure is None:
                raise Exception("No user test set has been specified")
            if not inst.equalHeaders(userTestStructure):
                raise Exception("测试数据集属性不同")
        else:
            raise Exception("Unknown test mode")

        cname = classifier.__module__
        if cname.startswith("classifiers."):
            name += cname[len("classifiers."):]
        else:
            name += cname

        plotInstances = ClassifierErrorsPlotInstances()
        plotInstances.setInstances(inst)
        plotInstances.setClassifier(classifier)
        plotInstances.setClassIndex(inst.classIndex())
        plotInstances.setPointSizeProportionalToMargin(False)

        # ---- run information header -----------------------------------
        outPutResult += "=== Run information ===\n\n"
        outPutResult += "Scheme: " + cname
        outPutResult += "\n"
        outPutResult += "Relation: " + inst.relationName() + '\n'
        outPutResult += "Instances: " + str(inst.numInstances()) + '\n'
        outPutResult += "Attributes: " + str(inst.numAttributes()) + '\n'
        if inst.numAttributes() < 100:
            for i in range(inst.numAttributes()):
                outPutResult += " " + inst.attribute(i).name() + '\n'
        else:
            outPutResult += " [list of attributes omitted]\n"
        outPutResult += "Test mode: "
        if testMode == 1:
            outPutResult += str(numFolds) + "-fold cross-validation\n"
        elif testMode == 2:
            outPutResult += "evaluate on training data\n"
        elif testMode == 3:
            outPutResult += "user supplied test set: " + str(
                userTestStructure.numInstances()) + " instances\n"
        outPutResult += "\n"
        self.m_History.addResult(name, outPutResult)
        self.m_History.setSingle(name)

        # ---- model on the full training set ---------------------------
        if testMode == 2 or testMode == 3:
            trainTimeStart = time.time()
            classifier.buildClassifier(inst)
            trainTimeElapsed = time.time() - trainTimeStart
            outPutResult += "=== Classifier model (full training set) ===\n\n"
            outPutResult += str(classifier) + "\n"
            outPutResult += "\nTime taken to build model: " + Utils.doubleToString(
                trainTimeElapsed, 2) + " seconds\n\n"
            self.m_History.updateResult(name, outPutResult)
            if isinstance(classifier, Drawable):
                grph = classifier.graph()
            print("==========update Compelte=================")

        # ---- evaluation -----------------------------------------------
        if testMode == 2:
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            testTimeStart = time.time()
            # TODO: batch prediction path for BatchPredictor classifiers
            for jj in range(inst.numInstances()):
                plotInstances.process(inst.instance(jj), classifier,
                                      evaluation)
            testTimeElapsed = time.time() - testTimeStart
            outPutResult += "=== Evaluation on training set ===\n"
        elif testMode == 1:
            rnd = 1
            inst.randomize(rnd)
            if inst.attribute(classIndex).isNominal():
                inst.stratify(numFolds)
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            for fold in range(numFolds):
                train = inst.trainCV(numFolds, fold, rnd)
                # Only the priors are refreshed per fold.
                evaluation = self.setupEval(evaluation, classifier, train,
                                            plotInstances, True)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                # Each fold trains its own copy of the classifier.
                current = deepcopy(classifier)
                current.buildClassifier(train)
                test = inst.testCV(numFolds, fold)
                # TODO: batch prediction path for BatchPredictor classifiers
                for jj in range(test.numInstances()):
                    plotInstances.process(test.instance(jj), current,
                                          evaluation)
            if inst.attribute(classIndex).isNominal():
                outPutResult += "=== Stratified cross-validation ===\n"
            else:
                outPutResult += "=== Cross-validation ===\n"
        elif testMode == 3:
            evaluation = Evaluation(inst)
            evaluation = self.setupEval(evaluation, classifier, inst,
                                        plotInstances, False)
            plotInstances.setInstances(userTestStructure)
            evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
            plotInstances.setUp()
            # TODO: batch prediction path for BatchPredictor classifiers
            testTimeStart = time.time()
            for i in range(userTestStructure.numInstances()):
                instance = userTestStructure.instance(i)
                plotInstances.process(instance, classifier, evaluation)
            testTimeElapsed = time.time() - testTimeStart
            outPutResult += "=== Evaluation on test set ===\n"

        if testMode != 1:
            mode = ""
            if testMode == 2:
                mode = "training data"
            elif testMode == 3:
                mode = "supplied test set"
            outPutResult += "\nTime taken to test model on " + mode + ": " + Utils.doubleToString(
                testTimeElapsed, 2) + " seconds\n\n"

        # ---- summary, per-class details, confusion matrix --------------
        outPutResult += evaluation.toSummaryString(False) + '\n'
        self.m_History.updateResult(name, outPutResult)
        if inst.attribute(classIndex).isNominal():
            outPutResult += evaluation.toClassDetailsString() + '\n'
            outPutResult += evaluation.toMatrixString() + '\n'
        self.m_History.updateResult(name, outPutResult)
        Utils.debugOut(outPutResult)

        # ---- visualisation payload ------------------------------------
        if plotInstances is not None and plotInstances.canPlot(False):
            visName = name + " (" + inst.relationName() + ")"
            pl2d = plotInstances.getPlotData(cname)
            plotInstances.cleanUp()
            vv = []
            trainHeader = Instances(self.m_Instances, 0)
            trainHeader.setClassIndex(classIndex)
            vv.append(trainHeader)
            if grph is not None:
                vv.append(grph)
            if evaluation is not None and \
                    evaluation.predictions() is not None:
                vv.append(evaluation.predictions())
                vv.append(inst.classAttribute())
            self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
    except Exception as e:
        self.error_diglog_signal.emit(str(e))
    finally:
        # Always restore the UI state, even if reporting the error failed.
        self.mutex.lock()
        try:
            self.m_StartBut.setEnabled(True)
            self.m_StopBut.setEnabled(False)
            self.m_RunThread = None
        finally:
            self.mutex.unlock()
        print("RunFinished")
class LinearRegression(AbstractClassifier):
    """Linear regression with a ridge term and optional attribute selection.

    Attributes are standardised, regressed with ``pinv`` on the ridge-adjusted
    normal equations, and optionally pruned (M5 or greedy) using an Akaike
    style criterion. Nominal attributes are binarised and missing values
    replaced unless the checks are turned off.
    """

    SELECTION_M5 = 0  # default
    SELECTION_NONE = 1
    SELECTION_GREEDY = 2
    TAGS_SELECTION = [
        Tag(SELECTION_M5, "M5 method"),
        Tag(SELECTION_NONE, "No attribute selection"),
        Tag(SELECTION_GREEDY, "Greedy method")
    ]
    # Property name -> displayed value (or name of the tag list).
    propertyList = {
        "AttributeSelectionMethod": "TAGS_SELECTION",
        "Ridge": "1e-8"
    }
    # Property name -> name of the setter method that applies it.
    methodList = {
        "AttributeSelectionMethod": "setAttributeSelectionMethod",
        "Ridge": "setRidge"
    }

    def __init__(self):
        super().__init__()
        self.m_Coefficients = None  #type:List[float]
        self.m_SelectedAttributes = None  #type:List[bool]
        self.m_TransformedData = None  #type:Instances
        self.m_MissingFilter = None  #type:ReplaceMissingValues
        self.m_TransformFilter = None  #type:NominalToBinary
        self.m_ClassStdDev = 0
        self.m_ClassMean = 0
        self.m_ClassIndex = 0
        self.m_Means = None  #type:List[float]
        self.m_StdDevs = None  #type:List[float]
        self.outputAdditionalStats = False
        self.AttributeSelectionMethod = 0
        self.EliminateColinearAttributes = True
        self.m_checksTurnedOff = False
        self.Ridge = 1e-8
        self.Minimal = False
        self.m_ModelBuilt = False
        self.m_isZeroR = False
        self.m_df = 0
        self.m_RSquared = 0
        self.m_RSquaredAdj = 0
        self.m_FStat = 0
        self.m_StdErrorOfCoef = None  #type:List[float]
        self.m_TStats = None  #type:List[float]
        self.numDecimalPlaces = 4

    def __str__(self):
        """Return the model as text: the regression formula and, when
        ``outputAdditionalStats`` is set, the regression analysis table."""
        if not self.m_ModelBuilt:
            return "Linear Regression: No model built yet."
        if self.Minimal:
            return "Linear Regression: Model built."
        text = ""
        column = 0
        first = True
        text += "\nLinear Regression Model\n\n"
        text += self.m_TransformedData.classAttribute().name() + " =\n\n"
        for i in range(self.m_TransformedData.numAttributes()):
            if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                if not first:
                    text += " +\n"
                else:
                    first = False
                text += Utils.doubleToString(self.m_Coefficients[column], 12,
                                             self.numDecimalPlaces) + " * "
                text += self.m_TransformedData.attribute(i).name()
                column += 1
        # After the loop ``column`` points at the intercept.
        text += " +\n" + Utils.doubleToString(self.m_Coefficients[column], 12,
                                              self.numDecimalPlaces)
        if self.outputAdditionalStats:
            # Width of the variable column: longest selected name, at least
            # as wide as the "Variable" heading, plus padding.
            maxAttLength = 0
            for i in range(self.m_TransformedData.numAttributes()):
                if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                    if len(self.m_TransformedData.attribute(
                            i).name()) > maxAttLength:
                        maxAttLength = len(
                            self.m_TransformedData.attribute(i).name())
            maxAttLength += 3
            if maxAttLength < len("Variable") + 3:
                maxAttLength = len("Variable") + 3
            text += "\n\nRegression Analysis:\n\n" \
                + Utils.padRight("Variable", maxAttLength) \
                + " Coefficient SE of Coef t-Stat"
            column = 0
            for i in range(self.m_TransformedData.numAttributes()):
                if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                    text += "\n" + Utils.padRight(
                        self.m_TransformedData.attribute(i).name(),
                        maxAttLength)
                    text += Utils.doubleToString(self.m_Coefficients[column],
                                                 12, self.numDecimalPlaces)
                    text += " " + Utils.doubleToString(
                        self.m_StdErrorOfCoef[column], 12,
                        self.numDecimalPlaces)
                    text += " " + Utils.doubleToString(
                        self.m_TStats[column], 12, self.numDecimalPlaces)
                    column += 1
            text += Utils.padRight(
                "\nconst", maxAttLength + 1) + Utils.doubleToString(
                    self.m_Coefficients[column], 12, self.numDecimalPlaces)
            text += " " + Utils.doubleToString(self.m_StdErrorOfCoef[column],
                                               12, self.numDecimalPlaces)
            text += " " + Utils.doubleToString(self.m_TStats[column], 12,
                                               self.numDecimalPlaces)
            text += "\n\nDegrees of freedom = " + str(self.m_df)
            text += "\nR^2 value = " + Utils.doubleToString(
                self.m_RSquared, self.numDecimalPlaces)
            text += "\nAdjusted R^2 = " + Utils.doubleToString(
                self.m_RSquaredAdj, 5)
            text += "\nF-statistic = " + Utils.doubleToString(
                self.m_FStat, self.numDecimalPlaces)
        return text

    def getCapabilities(self):
        """Return the capabilities: nominal/numeric/date attributes with
        missing values, numeric or date class with missing class values."""
        result = super().getCapabilities()
        result.disableAll()
        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)
        result.enable(CapabilityEnum.NUMERIC_CLASS)
        result.enable(CapabilityEnum.DATE_CLASS)
        result.enable(CapabilityEnum.MISSING_CLASS_VALUES)
        return result

    def setAttributeSelectionMethod(self, value: str):
        """Select the attribute selection method by its position in
        ``TAGS_SELECTION`` (given as a string)."""
        index = int(value)
        self.AttributeSelectionMethod = self.TAGS_SELECTION[index].getID()

    def setRidge(self, value: str):
        """Set the ridge parameter from its string form.

        A value that cannot be parsed as a float is ignored and the current
        ridge is kept.
        """
        try:
            val = float(value)
            self.Ridge = val
            # NOTE(review): propertyList is defined on the class, so this
            # update is visible to every instance - confirm that is intended.
            self.propertyList.update({"Ridge": value})
        except ValueError:
            pass

    def buildClassifier(self, data: Instances):
        """Fit the regression model on ``data``.

        A dataset with exactly one instance yields a constant model that
        returns that instance's class value.

        :raises Exception: if additional statistics are requested on data
            with instance weights other than 1
        """
        self.m_ModelBuilt = False
        self.m_isZeroR = False
        if data.numInstances() == 1:
            # Single instance: the only coefficient is its class value.
            self.m_Coefficients = [data.instance(0).classValue()]
            self.m_SelectedAttributes = [False] * data.numAttributes()
            self.m_isZeroR = True
            return
        if not self.m_checksTurnedOff:
            self.getCapabilities().testWithFail(data)
            if self.outputAdditionalStats:
                ok = True
                for i in range(data.numInstances()):
                    if data.instance(i).weight() != 1:
                        ok = False
                        break
                if not ok:
                    raise Exception(
                        "Can only compute additional statistics on unweighted data"
                    )
            # Work on a copy: binarise nominals, then replace missing values.
            data = Instances(data)
            data.deleteWithMissingClass()
            self.m_TransformFilter = NominalToBinary()
            self.m_TransformFilter.setInputFormat(data)
            data = Filter.useFilter(data, self.m_TransformFilter)
            self.m_MissingFilter = ReplaceMissingValues()
            self.m_MissingFilter.setInputFormat(data)
            data = Filter.useFilter(data, self.m_MissingFilter)
            data.deleteWithMissingClass()
        else:
            self.m_TransformFilter = None
            self.m_MissingFilter = None
        self.m_ClassIndex = data.classIndex()
        self.m_TransformedData = data
        self.m_Coefficients = None
        self.m_SelectedAttributes = [False] * data.numAttributes()
        self.m_Means = [0] * data.numAttributes()
        self.m_StdDevs = [0] * data.numAttributes()
        for j in range(data.numAttributes()):
            if j != self.m_ClassIndex:
                self.m_SelectedAttributes[j] = True
                self.m_Means[j] = data.meanOrMode(j)
                self.m_StdDevs[j] = math.sqrt(data.variance(j))
                # Constant attributes cannot be standardised; drop them.
                if self.m_StdDevs[j] == 0:
                    self.m_SelectedAttributes[j] = False
        self.m_ClassStdDev = math.sqrt(
            data.variance(self.m_TransformedData.classIndex()))
        self.m_ClassMean = data.meanOrMode(self.m_TransformedData.classIndex())
        self.findBestModel()
        if self.outputAdditionalStats:
            # k = number of coefficients including the intercept.
            k = 1
            for i in range(data.numAttributes()):
                if i != data.classIndex():
                    if self.m_SelectedAttributes[i]:
                        k += 1
            self.m_df = self.m_TransformedData.numInstances() - k
            se = self.calculateSE(self.m_SelectedAttributes,
                                  self.m_Coefficients)
            self.m_RSquared = RegressionAnalysis.calculateRSquared(
                self.m_TransformedData, se)
            self.m_RSquaredAdj = RegressionAnalysis.calculateAdjRSquared(
                self.m_RSquared, self.m_TransformedData.numInstances(), k)
            self.m_FStat = RegressionAnalysis.calculateFStat(
                self.m_RSquared, self.m_TransformedData.numInstances(), k)
            self.m_StdErrorOfCoef = RegressionAnalysis.calculateStdErrorOfCoef(
                self.m_TransformedData, self.m_SelectedAttributes, se,
                self.m_TransformedData.numInstances(), k)
            self.m_TStats = RegressionAnalysis.calculateTStats(
                self.m_Coefficients, self.m_StdErrorOfCoef, k)
        if self.Minimal:
            self.m_TransformedData = None
            self.m_Means = None
            self.m_StdDevs = None
        else:
            # Keep only the header of the training data.
            self.m_TransformedData = Instances(data, 0)
        self.m_ModelBuilt = True

    def classifyInstance(self, instance: Instance):
        """Return the predicted class value for ``instance``, passing it
        through the training-time filters first when they were used."""
        transformedInstance = instance
        if not self.m_checksTurnedOff and not self.m_isZeroR:
            self.m_TransformFilter.input(transformedInstance)
            self.m_TransformFilter.batchFinished()
            transformedInstance = self.m_TransformFilter.output()
            self.m_MissingFilter.input(transformedInstance)
            self.m_MissingFilter.batchFinished()
            transformedInstance = self.m_MissingFilter.output()
        return self.regressionPrediction(transformedInstance,
                                         self.m_SelectedAttributes,
                                         self.m_Coefficients)

    def findBestModel(self):
        """Fit the full model, drop colinear attributes, then prune
        attributes according to ``AttributeSelectionMethod``."""
        numInstances = self.m_TransformedData.numInstances()
        self.m_Coefficients = self.doRegression(self.m_SelectedAttributes)
        # Refit for as long as a colinear attribute gets deselected.
        while self.EliminateColinearAttributes and self.deselectColinearAttributes(
                self.m_SelectedAttributes, self.m_Coefficients):
            self.m_Coefficients = self.doRegression(self.m_SelectedAttributes)
        numAttributes = 1
        for m_SelectedAttribute in self.m_SelectedAttributes:
            if m_SelectedAttribute:
                numAttributes += 1
        fullMSE = self.calculateSE(self.m_SelectedAttributes,
                                   self.m_Coefficients)
        akaike = (numInstances - numAttributes) + 2 * numAttributes
        currentNumAttributes = numAttributes
        improved = True
        if self.AttributeSelectionMethod == self.SELECTION_GREEDY:
            # Greedy: in each pass try removing every selected attribute and
            # keep the removals that lower the criterion.
            while improved:
                currentSelected = self.m_SelectedAttributes[:]
                improved = False
                currentNumAttributes -= 1
                for i in range(len(self.m_SelectedAttributes)):
                    if currentSelected[i]:
                        currentSelected[i] = False
                        currentCoeffs = self.doRegression(currentSelected)
                        currentMSE = self.calculateSE(currentSelected,
                                                      currentCoeffs)
                        currentAkaike = currentMSE / fullMSE * (
                            numInstances -
                            numAttributes) + 2 * currentNumAttributes
                        if currentAkaike < akaike:
                            improved = True
                            akaike = currentAkaike
                            self.m_SelectedAttributes = currentSelected[:]
                            self.m_Coefficients = currentCoeffs
                        currentSelected[i] = True
        elif self.AttributeSelectionMethod == self.SELECTION_M5:
            # M5: repeatedly try removing the attribute with the smallest
            # standardised coefficient.
            while improved:
                improved = False
                currentNumAttributes -= 1
                minSC = 0
                minAttr = -1
                coeff = 0
                for i in range(len(self.m_SelectedAttributes)):
                    if self.m_SelectedAttributes[i]:
                        SC = math.fabs(self.m_Coefficients[coeff] *
                                       self.m_StdDevs[i] / self.m_ClassStdDev)
                        if coeff == 0 or SC < minSC:
                            minSC = SC
                            minAttr = i
                        coeff += 1
                if minAttr >= 0:
                    self.m_SelectedAttributes[minAttr] = False
                    currentCoeffs = self.doRegression(
                        self.m_SelectedAttributes)
                    currentMSE = self.calculateSE(self.m_SelectedAttributes,
                                                  currentCoeffs)
                    currentAkaike = currentMSE / fullMSE * (
                        numInstances -
                        numAttributes) + 2 * currentNumAttributes
                    if currentAkaike < akaike:
                        improved = True
                        akaike = currentAkaike
                        self.m_Coefficients = currentCoeffs
                    else:
                        # Removal did not help: put the attribute back.
                        self.m_SelectedAttributes[minAttr] = True

    def calculateSE(self, selectedAttributes: List[bool],
                    coefficients: List[float]):
        """Return the sum of squared prediction errors over the
        transformed training data for the given model."""
        mse = 0
        for i in range(self.m_TransformedData.numInstances()):
            prediction = self.regressionPrediction(
                self.m_TransformedData.instance(i), selectedAttributes,
                coefficients)
            error = prediction - self.m_TransformedData.instance(
                i).classValue()
            mse += error * error
        return mse

    def regressionPrediction(self, transformedInstance: Instance,
                             selectedAttributes: List[bool],
                             coefficients: List[float]):
        """Return the linear prediction: the weighted sum of the selected
        attribute values plus the intercept (the last used coefficient)."""
        result = 0
        column = 0
        for j in range(transformedInstance.numAttributes()):
            if self.m_ClassIndex != j and selectedAttributes[j]:
                result += coefficients[column] * transformedInstance.value(j)
                column += 1
        result += coefficients[column]
        return result

    def deselectColinearAttributes(self, selectedAttributes: List[bool],
                                   coefficients: List[float]):
        """Deselect the attribute with the largest standardised coefficient
        above 1.5, if any.

        :return: True if an attribute was deselected, False otherwise
        """
        maxSC = 1.5
        maxAttr = -1
        coeff = 0
        for i in range(len(selectedAttributes)):
            if selectedAttributes[i]:
                SC = math.fabs(coefficients[coeff] * self.m_StdDevs[i] /
                               self.m_ClassStdDev)
                if SC > maxSC:
                    maxSC = SC
                    maxAttr = i
                coeff += 1
        if maxAttr >= 0:
            selectedAttributes[maxAttr] = False
            return True
        return False

    def doRegression(self, selectedAttributes: List[bool]) -> List:
        """Fit the ridge regression for the given attribute selection.

        :param selectedAttributes: per-attribute flags of the attributes to use
        :return: array of one coefficient per selected attribute (in the
            original attribute scale) followed by the intercept; with no
            attribute selected the result is just the class mean
        """
        numAttributes = 0
        for selectedAttribute in selectedAttributes:
            if selectedAttribute:
                numAttributes += 1

        # One slot per selected attribute plus the intercept in the last one.
        coefficients = np.zeros(numAttributes + 1)
        if numAttributes > 0:
            independentTransposed = np.zeros(
                (numAttributes, self.m_TransformedData.numInstances()))
            dependent = np.zeros(self.m_TransformedData.numInstances())
            for i in range(self.m_TransformedData.numInstances()):
                inst = self.m_TransformedData.instance(i)
                sqrt_weight = math.sqrt(inst.weight())
                index = 0
                for j in range(self.m_TransformedData.numAttributes()):
                    if j == self.m_ClassIndex:
                        dependent[i] = inst.classValue() * sqrt_weight
                    else:
                        if selectedAttributes[j]:
                            # Centre (and, with checks on, scale) the value.
                            value = inst.value(j) - self.m_Means[j]
                            if not self.m_checksTurnedOff:
                                value /= self.m_StdDevs[j]
                            independentTransposed[index][
                                i] = value * sqrt_weight
                            index += 1
            aTy = np.dot(independentTransposed, dependent)
            # NOTE(review): the Gram matrix is rounded to 2 decimals before
            # the ridge is added - confirm this precision loss is intended.
            aTa = np.around(
                np.dot(independentTransposed, independentTransposed.T), 2)
            ridge = self.getRidge()
            for i in range(numAttributes):
                aTa[i][i] += ridge
            coeffsWithoutIntercept = np.dot(aTy, np.linalg.pinv(aTa))
            if len(coeffsWithoutIntercept.shape) > 1:
                coeffsWithoutIntercept = coeffsWithoutIntercept[0]
            coefficients[:numAttributes] = coeffsWithoutIntercept

        # The intercept starts as the class mean in every case. Previously,
        # with no attribute selected, the intercept slot was left at 0, so
        # the model predicted 0 instead of the class mean.
        coefficients[-1] = self.m_ClassMean

        # Convert the coefficients back to the original attribute scale.
        column = 0
        for i in range(self.m_TransformedData.numAttributes()):
            if i != self.m_TransformedData.classIndex(
            ) and selectedAttributes[i]:
                if not self.m_checksTurnedOff:
                    coefficients[column] /= self.m_StdDevs[i]
                coefficients[-1] -= coefficients[column] * self.m_Means[i]
                column += 1
        return coefficients

    def getRidge(self):
        """Return the ridge parameter."""
        return self.Ridge
def buildClassifier(self, data: Instances):
    """Build the linear regression model from ``data``.

    A data set holding exactly one instance is handled without any fitting:
    the class value of that instance becomes the only coefficient and the
    model is flagged via ``m_isZeroR``.

    Otherwise, unless ``m_checksTurnedOff`` is set, the data is tested
    against the capabilities, copied, stripped of instances with a missing
    class and passed through the NominalToBinary and ReplaceMissingValues
    filters. Per-attribute means and standard deviations are recorded,
    attributes with zero standard deviation are deselected, and
    ``findBestModel`` is run. When ``outputAdditionalStats`` is set the
    regression statistics are computed afterwards.

    Raises:
        Exception: additional statistics were requested but at least one
            instance has a weight different from 1.
    """
    self.m_ModelBuilt = False
    self.m_isZeroR = False

    # Single instance: nothing to fit.
    if data.numInstances() == 1:
        self.m_Coefficients = [data.instance(0).classValue()]
        self.m_SelectedAttributes = [False] * data.numAttributes()
        self.m_isZeroR = True
        return

    if self.m_checksTurnedOff:
        self.m_TransformFilter = None
        self.m_MissingFilter = None
    else:
        self.getCapabilities().testWithFail(data)
        if self.outputAdditionalStats:
            hasWeightedInstance = any(
                data.instance(row).weight() != 1
                for row in range(data.numInstances()))
            if hasWeightedInstance:
                raise Exception(
                    "Can only compute additional statistics on unweighted data"
                )
        # Work on a copy so the caller's data set is left alone.
        data = Instances(data)
        data.deleteWithMissingClass()
        self.m_TransformFilter = NominalToBinary()
        self.m_TransformFilter.setInputFormat(data)
        data = Filter.useFilter(data, self.m_TransformFilter)
        self.m_MissingFilter = ReplaceMissingValues()
        self.m_MissingFilter.setInputFormat(data)
        data = Filter.useFilter(data, self.m_MissingFilter)
        data.deleteWithMissingClass()

    attributeCount = data.numAttributes()
    self.m_ClassIndex = data.classIndex()
    self.m_TransformedData = data
    self.m_Coefficients = None
    self.m_SelectedAttributes = [False] * attributeCount
    self.m_Means = [0] * attributeCount
    self.m_StdDevs = [0] * attributeCount
    for col in range(attributeCount):
        if col == self.m_ClassIndex:
            continue
        self.m_Means[col] = data.meanOrMode(col)
        self.m_StdDevs[col] = math.sqrt(data.variance(col))
        # Constant attributes (zero standard deviation) are not selected.
        self.m_SelectedAttributes[col] = self.m_StdDevs[col] != 0

    classColumn = self.m_TransformedData.classIndex()
    self.m_ClassStdDev = math.sqrt(data.variance(classColumn))
    self.m_ClassMean = data.meanOrMode(classColumn)

    self.findBestModel()

    if self.outputAdditionalStats:
        # k = number of selected non-class attributes plus one.
        k = 1 + sum(
            1 for col in range(data.numAttributes())
            if col != data.classIndex() and self.m_SelectedAttributes[col])
        rowCount = self.m_TransformedData.numInstances()
        self.m_df = rowCount - k
        se = self.calculateSE(self.m_SelectedAttributes,
                              self.m_Coefficients)
        self.m_RSquared = RegressionAnalysis.calculateRSquared(
            self.m_TransformedData, se)
        self.m_RSquaredAdj = RegressionAnalysis.calculateAdjRSquared(
            self.m_RSquared, rowCount, k)
        self.m_FStat = RegressionAnalysis.calculateFStat(
            self.m_RSquared, rowCount, k)
        self.m_StdErrorOfCoef = RegressionAnalysis.calculateStdErrorOfCoef(
            self.m_TransformedData, self.m_SelectedAttributes, se, rowCount,
            k)
        self.m_TStats = RegressionAnalysis.calculateTStats(
            self.m_Coefficients, self.m_StdErrorOfCoef, k)

    if self.Minimal:
        # Minimal mode drops the training data and its statistics.
        self.m_TransformedData = None
        self.m_Means = None
        self.m_StdDevs = None
    else:
        # Keep only the header (an empty copy) of the training data.
        self.m_TransformedData = Instances(data, 0)
    self.m_ModelBuilt = True
class NormalizableDistance():
    """Base class for distance functions that can normalise attribute values.

    For every attribute a ``[min, max, width]`` triple is kept in
    ``m_Ranges`` (indexed by ``R_MIN``, ``R_MAX`` and ``R_WIDTH``). The
    triples are used by :meth:`norm` to rescale numeric values before
    differences are taken. :meth:`updateDistance` is only a stub here;
    presumably subclasses override it to combine per-attribute differences.
    """

    R_MIN = 0
    R_MAX = 1
    R_WIDTH = 2

    def __init__(self, data: Instances = None):
        """Create the distance function, optionally bound to ``data``."""
        self.m_AttributeIndices = Range("first-last")
        self.m_DontNormalize = False
        self.m_Ranges = None  # type: List[List]
        self.m_ActiveIndices = None  # type: List
        # Always create the attributes tested elsewhere; initializeRanges()
        # checks ``m_Data is None`` and would otherwise hit an AttributeError
        # when no data was supplied.
        self.m_Data = None
        self.m_Validated = False
        if data is None:
            self.invalidate()
        else:
            self.setInstances(data)

    def invalidate(self):
        """Mark the cached ranges/indices as stale."""
        self.m_Validated = False

    def setInstances(self, inst: Instances):
        """Bind the distance function to ``inst`` and drop cached state."""
        self.m_Data = inst
        self.invalidate()

    def clean(self):
        """Replace the bound data by an empty copy of itself."""
        self.m_Data = Instances(self.m_Data, 0)

    def update(self, ins: Instance):
        """Widen the stored ranges so that they cover ``ins``."""
        # Make sure the ranges are initialised first.
        self.validate()
        self.m_Ranges = self.updateRanges(ins, self.m_Ranges)

    @overload
    def distance(self, first: Instance, second: Instance):
        ...

    @overload
    def distance(self, first: Instance, second: Instance,
                 stats: PerformanceStats):
        ...

    @overload
    def distance(self, first: Instance, second: Instance,
                 cutOffValue: float):
        ...

    @overload
    def distance(self, first: Instance, second: Instance,
                 cutOffValue: float, stats: PerformanceStats):
        ...

    def distance(self, first: Instance, second: Instance, a0=None, a1=None):
        """Compute the distance between ``first`` and ``second``.

        Args:
            first: first instance.
            second: second instance.
            a0: either a numeric cut-off value, a ``PerformanceStats``
                object, or ``None`` (no cut-off).
            a1: optional ``PerformanceStats`` object when ``a0`` is the
                cut-off.

        Returns:
            The accumulated distance, ``float('inf')`` as soon as the running
            distance exceeds the cut-off, or ``None`` when ``a0`` is of an
            unsupported type (kept from the previous behaviour).
        """
        if a0 is None or isinstance(a0, PerformanceStats):
            return self.distance(first, second, float("inf"), a0)
        # Integer cut-offs are accepted as well; previously only ``float``
        # was recognised and an ``int`` silently produced ``None``.
        if not isinstance(a0, (int, float)):
            return None

        distance = 0
        firstNumValues = first.numValues()
        secondNumValues = second.numValues()
        numAttributes = self.m_Data.numAttributes()
        classIndex = self.m_Data.classIndex()
        self.validate()

        # Merge-walk over the stored (sparse) values of both instances.
        p1 = p2 = 0
        while p1 < firstNumValues or p2 < secondNumValues:
            # An exhausted instance is given the sentinel index
            # ``numAttributes`` so that the other one keeps advancing.
            if p1 >= firstNumValues:
                firstI = numAttributes
            else:
                firstI = first.index(p1)
            if p2 >= secondNumValues:
                secondI = numAttributes
            else:
                secondI = second.index(p2)

            # Skip the class attribute and attributes that are not active.
            if firstI == classIndex:
                p1 += 1
                continue
            if firstI < numAttributes and not self.m_ActiveIndices[firstI]:
                p1 += 1
                continue
            if secondI == classIndex:
                p2 += 1
                continue
            if secondI < numAttributes and not self.m_ActiveIndices[secondI]:
                p2 += 1
                continue

            if firstI == secondI:
                diff = self.difference(firstI, first.valueSparse(p1),
                                       second.valueSparse(p2))
                p1 += 1
                p2 += 1
            elif firstI > secondI:
                # Value only stored in ``second``; ``first`` counts as 0.
                diff = self.difference(secondI, 0, second.valueSparse(p2))
                p2 += 1
            else:
                # Value only stored in ``first``; ``second`` counts as 0.
                diff = self.difference(firstI, first.valueSparse(p1), 0)
                p1 += 1

            if isinstance(a1, PerformanceStats):
                a1.incrCoordCount()
            distance = self.updateDistance(distance, diff)
            if distance > a0:
                return float('inf')
        return distance

    def updateDistance(self, currDist: float, diff: float) -> float:
        """Fold ``diff`` into ``currDist``; a stub in this class."""
        ...

    def difference(self, index: int, val1: float, val2: float):
        """Return the difference of two values of attribute ``index``.

        Nominal attributes yield 1 when either value is missing or the values
        differ, else 0. Numeric attributes yield the (optionally normalised)
        difference, with special handling of missing values. Every other
        attribute type yields 0.
        """
        attType = self.m_Data.attribute(index).type()
        if attType == Attribute.NOMINAL:
            if (Utils.isMissingValue(val1) or Utils.isMissingValue(val2)
                    or int(val1) != int(val2)):
                return 1
            return 0
        if attType != Attribute.NUMERIC:
            return 0

        missing1 = Utils.isMissingValue(val1)
        missing2 = Utils.isMissingValue(val2)
        if not (missing1 or missing2):
            if not self.m_DontNormalize:
                return self.norm(val1, index) - self.norm(val2, index)
            return val1 - val2

        if missing1 and missing2:
            # Both missing: the largest possible difference.
            if not self.m_DontNormalize:
                return 1
            return self.m_Ranges[index][self.R_WIDTH]

        # Exactly one value is missing: assume the missing one lies as far
        # away from the known one as the range allows.
        known = val1 if missing2 else val2
        if not self.m_DontNormalize:
            diff = self.norm(known, index)
            if diff < 0.5:
                diff = 1 - diff
            return diff
        toMax = self.m_Ranges[index][self.R_MAX] - known
        toMin = known - self.m_Ranges[index][self.R_MIN]
        if toMax > toMin:
            return toMax
        return toMin

    def norm(self, x: float, i: int):
        """Rescale ``x`` by the stored range of attribute ``i``.

        Returns 0 when the range is degenerate: zero width, or a non-finite
        width (nothing observed yet), which would otherwise produce NaN.
        """
        width = self.m_Ranges[i][self.R_WIDTH]
        # ``width != width`` is the NaN test; no extra import needed.
        if width == 0 or width != width or abs(width) == float('inf'):
            return 0
        return (x - self.m_Ranges[i][self.R_MIN]) / width

    def validate(self):
        """Initialise ranges and active indices if they are stale."""
        if not self.m_Validated:
            self.initialize()
            self.m_Validated = True

    def initialize(self):
        """Recompute the active attribute indices and the ranges."""
        self.initializeAttributeIndices()
        self.initializeRanges()

    def initializeAttributeIndices(self):
        """Build the per-attribute "is active" flags from the index range."""
        numAtt = self.m_Data.numAttributes()
        self.m_AttributeIndices.setUpper(numAtt - 1)
        self.m_ActiveIndices = [
            self.m_AttributeIndices.isInRange(i) for i in range(numAtt)
        ]

    def initializeRanges(self) -> List[List]:
        """Compute ``m_Ranges`` from the bound data and return it.

        Returns ``None`` when no data is bound.
        """
        if self.m_Data is None:
            self.m_Ranges = None
            return self.m_Ranges
        numAtt = self.m_Data.numAttributes()
        ranges = [[0] * 3 for _ in range(numAtt)]
        if self.m_Data.numInstances() <= 0:
            self.initializeRangesEmpty(numAtt, ranges)
        else:
            self.updateRangesFirst(self.m_Data.instance(0), numAtt, ranges)
            # Instance 0 is visited again on purpose: re-applying it is a
            # no-op once the table is seeded, and it keeps the result correct
            # for subclasses overriding updateRangesFirst without seeding.
            for i in range(self.m_Data.numInstances()):
                self.updateRanges(self.m_Data.instance(i), ranges)
        self.m_Ranges = ranges
        return self.m_Ranges

    def initializeRangesEmpty(self, numAtt: int, ranges: List[List]):
        """Fill ``ranges`` with the "nothing observed yet" sentinels.

        The maximum has to start at ``-inf`` (it used to be ``+inf``);
        otherwise no later value can ever exceed it and the range never
        becomes finite.
        """
        for j in range(numAtt):
            ranges[j][self.R_MIN] = float('inf')
            ranges[j][self.R_MAX] = float('-inf')
            ranges[j][self.R_WIDTH] = float('inf')

    def updateRangesFirst(self, instance: Instance, numAtt: int,
                          ranges: List[List]):
        """Seed ``ranges`` from the first instance.

        All entries are reset to 0 (the value of entries a sparse instance
        does not store). Every stored value then sets min and max of its
        attribute to that value; a stored missing value gets the "nothing
        observed yet" sentinels. Previously the table was only zeroed, which
        forced every range to include 0.

        Returns:
            True if the instance holds at least one non-missing stored value,
            False otherwise (same return contract as before).
        """
        for row in ranges:
            for j in range(len(row)):
                row[j] = 0
        sawValue = False
        for j in range(instance.numValues()):
            currIndex = instance.index(j)
            if instance.isMissingSparse(j):
                ranges[currIndex][self.R_MIN] = float('inf')
                ranges[currIndex][self.R_MAX] = float('-inf')
                ranges[currIndex][self.R_WIDTH] = float('inf')
            else:
                val = instance.valueSparse(j)
                ranges[currIndex][self.R_MIN] = val
                ranges[currIndex][self.R_MAX] = val
                ranges[currIndex][self.R_WIDTH] = 0
                sawValue = True
        return sawValue

    def updateRanges(self, instance: Instance, ranges: List[List[float]]):
        """Widen ``ranges`` in place so they cover ``instance``; return them."""
        numVals = instance.numValues()
        prevIndex = 0
        for j in range(numVals):
            currIndex = instance.index(j)
            # Attributes skipped by the sparse representation hold the
            # implicit value 0, so the range has to include 0.
            while prevIndex < currIndex:
                if 0 < ranges[prevIndex][self.R_MIN]:
                    ranges[prevIndex][self.R_MIN] = 0
                    ranges[prevIndex][self.R_WIDTH] = (
                        ranges[prevIndex][self.R_MAX] -
                        ranges[prevIndex][self.R_MIN])
                if 0 > ranges[prevIndex][self.R_MAX]:
                    ranges[prevIndex][self.R_MAX] = 0
                    ranges[prevIndex][self.R_WIDTH] = (
                        ranges[prevIndex][self.R_MAX] -
                        ranges[prevIndex][self.R_MIN])
                prevIndex += 1
            prevIndex += 1
            if not instance.isMissingSparse(j):
                val = instance.valueSparse(j)
                if val < ranges[currIndex][self.R_MIN]:
                    ranges[currIndex][self.R_MIN] = val
                    ranges[currIndex][self.R_WIDTH] = (
                        ranges[currIndex][self.R_MAX] -
                        ranges[currIndex][self.R_MIN])
                if val > ranges[currIndex][self.R_MAX]:
                    ranges[currIndex][self.R_MAX] = val
                    ranges[currIndex][self.R_WIDTH] = (
                        ranges[currIndex][self.R_MAX] -
                        ranges[currIndex][self.R_MIN])
        return ranges
class KNN(AbstractClassifier):
    """k-nearest-neighbour classifier with optional distance weighting.

    Training only stores the instances (optionally the last ``WindowSize``
    of them). A prediction looks up the ``kNN`` nearest stored instances via
    ``m_NNSearch`` and turns their class values into a distribution.
    """

    WEIGHT_NONE = 0
    WEIGHT_INVERSE = 1
    WEIGHT_SIMILARITY = 2
    TAGS_WEIGHTING = [Tag(WEIGHT_NONE, "No distance weighting"),
                      Tag(WEIGHT_INVERSE, "Weight by 1/distance"),
                      Tag(WEIGHT_SIMILARITY, "Weight by 1-distance")]
    propertyList = {"kNN": "1", "DistanceWeighting": "TAGS_WEIGHTING"}
    methodList = {"kNN": "setkNN", "DistanceWeighting": "setDistanceWeighting"}

    def __init__(self, k: int = None):
        """Create the classifier; ``k`` overrides the default of 1 neighbour."""
        super().__init__()
        self.m_NNSearch = LinearNNSearch()
        self.m_Train = None  # type: Instances
        self.initilize()
        if k is not None:
            self.setKNN(k)

    def __str__(self):
        """Describe the model, or say that none has been built yet."""
        if self.m_Train is None:
            return "IBk: No model built yet."
        if self.m_Train.numInstances() == 0:
            return "Warning: no training instances - ZeroR model used."
        # TODO: advanced options are not reported yet.
        result = "IB1 instance-based classifier\n" + "using " + str(self.kNN)
        if self.DistanceWeighting == self.WEIGHT_INVERSE:
            result += " inverse-distance-weighted"
        elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
            result += " similarity-weighted"
        result += " nearest neighbour(s) for classification\n"
        if self.WindowSize != 0:
            result += "using a maximum of " + str(self.WindowSize) \
                + " (windowed) training instances\n"
        return result

    def setkNN(self, value: str):
        """Set the number of neighbours from its string form.

        A value that cannot be parsed as an integer is ignored. The parsed
        value goes through :meth:`setKNN` so that ``m_kNNUpper`` and
        ``m_kNNValid`` stay in step with ``kNN`` (they used to be left
        stale).
        """
        try:
            val = int(value)
        except ValueError:
            return
        self.setKNN(val)
        self.propertyList.update({"kNN": value})

    def setDistanceWeighting(self, value: int):
        """Select the weighting scheme by its index in ``TAGS_WEIGHTING``."""
        self.DistanceWeighting = self.TAGS_WEIGHTING[value].getID()

    def initilize(self):
        """Reset all options to their defaults."""
        self.setKNN(1)
        # Number of training instances kept for classification; 0 keeps the
        # whole training set.
        self.WindowSize = 0
        self.DistanceWeighting = self.WEIGHT_NONE
        self.CrossValidate = False
        self.MEanSquared = False

    def setKNN(self, k: int):
        """Set the number of neighbours and invalidate dependent state."""
        self.kNN = k
        self.m_kNNUpper = k
        self.m_kNNValid = False

    def getKNN(self):
        """Return the number of neighbours."""
        return self.kNN

    def getCapabilities(self):
        """Return the capabilities of this classifier."""
        result = super().getCapabilities()
        result.disableAll()
        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)
        result.enable(CapabilityEnum.NOMINAL_CLASS)
        result.enable(CapabilityEnum.NUMERIC_CLASS)
        result.enable(CapabilityEnum.DATE_CLASS)
        result.enable(CapabilityEnum.MISSING_CLASS_VALUES)
        result.setMinimumNumberInstances(0)
        return result

    def buildClassifier(self, data: Instances):
        """Store the training instances and prepare the neighbour search."""
        self.getCapabilities().testWithFail(data)
        instances = Instances(data)
        instances.deleteWithMissingClass()
        self.m_NumClasses = instances.numClasses()
        self.m_ClassType = instances.classAttribute().type()
        # Training only keeps the instances themselves.
        self.m_Train = Instances(instances, 0, instances.numInstances())
        if self.WindowSize > 0 and instances.numInstances() > self.WindowSize:
            # Keep only the last ``WindowSize`` instances.
            self.m_Train = Instances(
                self.m_Train,
                self.m_Train.numInstances() - self.WindowSize,
                self.WindowSize)
        # Count the nominal/numeric non-class attributes; the count is used
        # to rescale distances in makeDistribution.
        self.m_NumAttributesUsed = 0
        for i in range(self.m_Train.numAttributes()):
            if i == self.m_Train.classIndex():
                continue
            att = self.m_Train.attribute(i)
            if att.isNominal() or att.isNumeric():
                self.m_NumAttributesUsed += 1
        self.m_NNSearch.setInstances(self.m_Train)
        self.m_kNNValid = False
        # Fallback model used when there are no training instances.
        self.m_defaultModel = ZeroR()
        self.m_defaultModel.buildClassifier(instances)

    def distributionForInstance(self, instance: Instance) -> List[float]:
        """Return the class distribution predicted for ``instance``."""
        if self.m_Train.numInstances() == 0:
            return self.m_defaultModel.distributionForInstance(instance)
        # More instances than the window allows: drop the oldest ones.
        if self.WindowSize > 0 \
                and self.m_Train.numInstances() > self.WindowSize:
            self.m_kNNValid = False
            while self.m_Train.numInstances() > self.WindowSize:
                self.m_Train.delete(0)
            # Instances were removed, so the search has to be refreshed.
            # The flag that guarded this call was never set, which meant the
            # refresh never happened.
            self.m_NNSearch.setInstances(self.m_Train)
        # TODO: choosing k by cross-validation (CrossValidate) is not
        # implemented.
        self.m_NNSearch.addInstanceInfo(instance)
        # Fetch the k neighbours and their distances.
        neighbours = self.m_NNSearch.kNearestNeighbours(instance, self.kNN)
        distances = self.m_NNSearch.getDistances()
        distribution = self.makeDistribution(neighbours, distances)
        return distribution

    def makeDistribution(self, neighbours: Instances,
                         distances: List) -> List[float]:
        """Turn the neighbours into a (normalised) class distribution.

        ``distances`` is rescaled in place, as before. For a nominal class
        each neighbour adds its weight to the slot of its class value; for a
        numeric class slot 0 accumulates the weighted class values.
        """
        distribution = [0] * self.m_NumClasses
        total = 0
        if self.m_ClassType == Attribute.NOMINAL:
            # Start every class with a small prior.
            prior = 1 / max(1, self.m_Train.numInstances())
            for i in range(self.m_NumClasses):
                distribution[i] = prior
            total = self.m_NumClasses / max(1, self.m_Train.numInstances())
        # Guard against a data set without usable attributes, which would
        # otherwise divide by zero.
        attributesUsed = max(1, self.m_NumAttributesUsed)
        for i in range(neighbours.numInstances()):
            current = neighbours.instance(i)
            distances[i] = distances[i] * distances[i]
            distances[i] = math.sqrt(distances[i] / attributesUsed)
            if self.DistanceWeighting == self.WEIGHT_INVERSE:
                # The small offset avoids a division by zero when the
                # neighbour is an exact match (distance 0).
                weight = 1 / (distances[i] + 0.001)
            elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
                weight = 1 - distances[i]
            else:
                weight = 1
            weight *= current.weight()
            if self.m_ClassType == Attribute.NOMINAL:
                distribution[int(current.classValue())] += weight
            elif self.m_ClassType == Attribute.NUMERIC:
                distribution[0] += current.classValue() * weight
            total += weight
        if total > 0:
            Utils.normalize(distribution, total)
        return distribution
def testInstances(self, data: Instances, *args): if len(args) == 0: return self.testInstances(data, 0, data.numAttributes() - 1) fromIndex = args[0] toIndex = args[1] if self.doNotCheckCapabilities(): return True if len(self.m_Capabilities) == 0 or (len(self.m_Capabilities) == 1 and self.handles( CapabilityEnum.NO_CLASS)): sys.stderr.write("No capabilities set!") if toIndex - fromIndex < 0: self.m_FailReason = CapabilityError("No attributes!") return False testClass = data.classIndex() > -1 and data.classIndex( ) >= fromIndex and data.classIndex() <= toIndex for i in range(fromIndex, toIndex + 1): att = data.attribute(i) if i == data.classIndex(): continue if not self.testAttribute(att): return False if not self.handles( CapabilityEnum.NO_CLASS) and data.classIndex() == -1: self.m_FailReason = CapabilityError("Class attribute not set!") return False if self.handles(CapabilityEnum.NO_CLASS) and data.classIndex() > -1: cap = self.getClassCapabilities() cap.disable(CapabilityEnum.NO_CLASS) iter = cap.capabilities() if len(iter) == 0: self.m_FailReason = CapabilityError( "Cannot handle any class attribute!") return False if testClass and not self.handles(CapabilityEnum.NO_CLASS): att = data.classAttribute() if not self.testAttribute(att, True): return False if not self.handles(CapabilityEnum.MISSING_CLASS_VALUES): for i in range(data.numInstances()): if data.instance(i).classIsMissing(): self.m_FailReason = CapabilityError( "Cannot handle missing class values!") return False else: hasClass = 0 for i in range(data.numInstances()): if not data.instance(i).classIsMissing(): hasClass += 1 if hasClass < self.getMinimumNumberInstances(): self.m_FailReason=CapabilityError("Not enough training instances with class labels (required: "\ + str(self.getMinimumNumberInstances())\ + ", provided: "\ + str(hasClass)\ + ")!") return False missing = False for i in range(data.numInstances()): inst = data.instance(i) if not self.handles(CapabilityEnum.MISSING_VALUES): #TODO 使用稀疏矩阵pass # 
if isinstance(inst) # pass #else for n in range(fromIndex, toIndex + 1): if n == inst.classIndex(): continue if inst.isMissing(n): missing = True break if missing: self.m_FailReason = CapabilityError( "Cannot handle missing values!") return False if data.numInstances() < self.getMinimumNumberInstances(): self.m_FailReason = CapabilityError( "Not enough training instances (required: " + str(self.getMinimumNumberInstances()) + ", provided: " + str(data.numInstances()) + ")!") return False # if self.handles(CapabilityEnum.ONLY_MULTIINSTANCE): # if data.numAttributes() != 3: # return False # if not data.attribute(0).isNominal() or data.classIndex() != data.numAttributes()-1: # return False # owner=self.getOwner() # if isinstance(owner,MultiInstanceCapabilitiesHandler): # handler=owner # cap=handler.getMultiInstanceCapabilities() # if data.numInstances()>0 and data.attribute(1).numValues()>0: # result=cap.testAttribute(data.attribute(1)) return True