Esempio n. 1
0
    def __init__(self,data:Instances):
        """Create an empty evaluation for the structure of ``data``.

        Stores a zero-instance copy of ``data`` as the header, zeroes every
        running statistic, allocates the confusion matrix and the class-name
        list when the class attribute is nominal, calls ``setPriors(data)``
        and selects all built-in metrics except "coverage" and "region size"
        for display.

        :param data: dataset whose class attribute drives the evaluation
        """
        self.m_Header=Instances(data,0)
        numClasses=data.numClasses()
        self.m_NumClasses=numClasses
        self.m_NumFolds=1
        self.m_metricsToDisplay=[]
        self.m_ClassIsNominal=data.classAttribute().isNominal()

        # Instance counters.
        self.m_WithClass=0
        self.m_Unclassified=0
        self.m_MissingClass=0
        self.m_Correct=0
        self.m_Incorrect=0

        # Information-theoretic accumulators.
        self.m_SumKBInfo=0
        self.m_SumSchemeEntropy=0
        self.m_SumPriorEntropy=0

        # Error accumulators.
        self.m_SumErr=0
        self.m_SumAbsErr=0
        self.m_SumSqrErr=0
        self.m_SumPriorAbsErr=0
        self.m_SumPriorSqrErr=0

        # Coverage / region-size bookkeeping.
        self.m_ConfLevel=0.95
        self.m_TotalCoverage=0
        self.m_TotalSizeOfRegions=0

        # Flags.
        self.m_DiscardPredictions=False
        self.m_CoverageStatisticsAvailable=True
        self.m_ComplexityStatisticsAvailable=True

        # Sums over actual and predicted class values.
        self.m_SumClass=0
        self.m_SumSqrClass=0
        self.m_SumPredicted=0
        self.m_SumSqrPredicted=0
        self.m_SumClassPredicted=0

        self.m_Predictions=None     #type:List[Prediction]
        if self.m_ClassIsNominal:
            # Square matrix with one row and one column per class.
            self.m_ConfusionMatrix=[[0]*numClasses for _ in range(numClasses)]       #type:List[List[float]]
            self.m_ClassNames=[data.classAttribute().value(idx)
                               for idx in range(numClasses)]        #type:List[str]
        self.m_ClassPriors=[0]*numClasses       #type:List[float]
        self.setPriors(data)
        self.m_MarginCounts=[0]*(self.k_MarginResolution+1)

        # Every built-in metric is shown by default except these two.
        hidden=("coverage","region size")
        self.m_metricsToDisplay.extend(
            metric.lower() for metric in self.BUILT_IN_EVAL_METRICS
            if metric.lower() not in hidden)
Esempio n. 2
0
 def buildClassifier(self, instances: Instances):
     """Fit the model from the class values of ``instances``.

     For a numeric class ``m_ClassValue`` becomes the weighted mean of the
     non-missing class values (left at 0 when the total weight is not
     greater than zero).  For a nominal class ``m_Counts`` holds one
     weighted count per class value, each starting at 1, ``m_ClassValue``
     becomes the index of the largest count and the counts are then
     normalised by the total weight.

     :param instances: training data; checked first with
         ``getCapabilities().testWithFail``
     """
     self.getCapabilities().testWithFail(instances)
     sumOfWeights = 0
     self.m_Class = instances.classAttribute()
     self.m_ClassValue = 0
     attrType = instances.classAttribute().type()
     if attrType == Attribute.NUMERIC:
         self.m_Counts = None
     elif attrType == Attribute.NOMINAL:
         # Each class starts with a count of 1; those initial counts are
         # part of the normalising total as well.
         self.m_Counts = [1] * instances.numClasses()
         sumOfWeights = instances.numClasses()
     for instance in instances:
         classValue = instance.classValue()
         if Utils.isMissingValue(classValue):
             continue
         if instances.classAttribute().isNominal():
             # The class value is used as a list position, so it must be an
             # int: a float value would raise TypeError as a list index.
             self.m_Counts[int(classValue)] += instance.weight()
         else:
             self.m_ClassValue += instance.weight() * classValue
         sumOfWeights += instance.weight()
     if instances.classAttribute().isNumeric():
         if Utils.gr(sumOfWeights, 0):
             self.m_ClassValue /= sumOfWeights
     else:
         self.m_ClassValue = Utils.maxIndex(self.m_Counts)
         Utils.normalize(self.m_Counts, sumOfWeights)
 def dumpLabel(self,index:int,data:Instances):
     """Return the label text for bag ``index`` of the distribution.

     The result is ``<class> (<perBag>)``, or ``<class> (<perBag>/<numIncorrect>)``
     when ``Utils.gr`` reports the incorrect count as greater than zero.
     Both numbers are passed through ``Utils.roundDouble(..., 2)``.

     :param index: bag index inside ``self.m_distribution``
     :param data: dataset whose class attribute supplies the class name
     """
     dist=self.m_distribution
     pieces=[data.classAttribute().value(dist.maxClass(index))]
     pieces.append(" ("+str(Utils.roundDouble(dist.perBag(index), 2)))
     if Utils.gr(dist.numIncorrect(index), 0):
         pieces.append("/"+str(Utils.roundDouble(dist.numIncorrect(index), 2)))
     pieces.append(")")
     return "".join(pieces)
Esempio n. 4
0
    def evaluateClustersWithRespectToClass(self, inst: Instances):
        """Compare the cluster assignments with the class labels of ``inst``.

        Counts, for every cluster, how many assigned instances carry each
        class value, hands those counts to ``mapClasses`` to fill ``best``,
        appends the classes-to-clusters matrix, the per-cluster class and the
        number/percentage of incorrectly clustered instances to
        ``self.m_clusteringResult`` and stores the per-cluster class index in
        ``self.m_classToCluster``.

        :param inst: dataset with the class index set; assumed to be in the
            same order as ``self.m_clusterAssignments`` (TODO confirm)
        """
        numClasses = inst.classAttribute().numValues()
        counts = [[0] * numClasses for _ in range(self.m_numClusters)]
        clusterTotals = [0] * self.m_numClusters
        # One slot per cluster plus a trailing slot that is reported below as
        # the number of incorrectly clustered instances.
        best = [0] * (self.m_numClusters + 1)
        current = [0] * (self.m_numClusters + 1)

        instances = copy.deepcopy(inst)
        instances.setClassIndex(inst.classIndex())
        numInstances = 0
        for instance in instances:
            assignment = self.m_clusterAssignments[numInstances]
            # Negative assignments and instances without a class value are
            # not counted.
            if assignment >= 0 and not instance.classIsMissing():
                cluster = int(assignment)
                counts[cluster][int(instance.classValue())] += 1
                clusterTotals[cluster] += 1
            numInstances += 1
        best[self.m_numClusters] = float('inf')
        self.mapClasses(self.m_numClusters, 0, counts, clusterTotals, current,
                        best, 0)
        self.m_clusteringResult += "\n\nClass attribute: " + inst.classAttribute(
        ).name() + "\n"
        self.m_clusteringResult += "Classes to Clusters:\n"
        matrixString = self.toMatrixString(counts, clusterTotals,
                                           Instances(inst, 0))
        self.m_clusteringResult += matrixString + '\n'
        Cwidth = 1 + int(math.log(self.m_numClusters) / math.log(10))
        for i in range(self.m_numClusters):
            if clusterTotals[i] > 0:
                self.m_clusteringResult += "Cluster " + Utils.doubleToString(
                    i, Cwidth, 0)
                self.m_clusteringResult += " <-- "
                if best[i] < 0:
                    self.m_clusteringResult += "No class\n"
                else:
                    self.m_clusteringResult += inst.classAttribute().value(
                        int(best[i])) + '\n'
        self.m_clusteringResult+="\nIncorrectly clustered instances :\t"\
                                  + str(best[self.m_numClusters])\
                                  + "\t" \
                                 + Utils.doubleToString((best[self.m_numClusters] / numInstances * 100.0), 8, 4) \
                                  + " %\n"
        # One entry per cluster; the trailing slot of best is not copied.
        # The list is built in one step because a fresh empty list cannot be
        # filled by index assignment.
        self.m_classToCluster = [
            int(best[i]) for i in range(self.m_numClusters)
        ]
Esempio n. 5
0
 def setInputFormat(self, instanceInfo: Instances):
     """Register the input structure and prepare the output format.

     :param instanceInfo: structure of the incoming instances
     :return: True when the class attribute is nominal, False otherwise
     :raises Exception: if no class index is set on ``instanceInfo``
     """
     super().setInputFormat(instanceInfo)
     classUnset = instanceInfo.classIndex() < 0
     if classUnset:
         raise Exception("No class has been assigned to the instances")
     self.setOutputFormatBinary()
     self.m_Indices = None
     return bool(instanceInfo.classAttribute().isNominal())
Esempio n. 6
0
 def forInstances(cls,
                  data: Instances,
                  multi: bool = False) -> 'Capabilities':
     """Build the capabilities that describe what ``data`` contains.

     Enables one class capability (or ``NO_CLASS`` when the class index is
     -1), one capability per attribute type present among the non-class
     attributes, and the missing-value capabilities when a missing class
     value or a missing non-class attribute value is found.

     :param data: dataset to inspect
     :param multi: not used by this implementation
     :return: a new ``Capabilities`` object with the detected capabilities
     :raises Exception: if an attribute has a type other than nominal,
         numeric, string or date
     """
     caps = Capabilities(None)
     caps.m_InterfaceDefinedCapabilities = set()

     # --- class attribute -------------------------------------------------
     if data.classIndex() == -1:
         caps.enable(CapabilityEnum.NO_CLASS)
     else:
         classAttr = data.classAttribute()
         classType = classAttr.type()
         if classType == Attribute.NOMINAL:
             labelCount = classAttr.numValues()
             if labelCount == 1:
                 caps.enable(CapabilityEnum.UNARY_CLASS)
             elif labelCount == 2:
                 caps.enable(CapabilityEnum.BINARY_CLASS)
             else:
                 caps.enable(CapabilityEnum.NOMINAL_CLASS)
         elif classType == Attribute.NUMERIC:
             caps.enable(CapabilityEnum.NUMERIC_CLASS)
         elif classType == Attribute.STRING:
             caps.enable(CapabilityEnum.STRING_CLASS)
         elif classType == Attribute.DATE:
             caps.enable(CapabilityEnum.DATE_CLASS)
         else:
             raise Exception("Unknown class attribute type '" +
                             classAttr.name() + "'!")
         # The scan stops at the first instance with a missing class value.
         if any(data.instance(row).classIsMissing()
                for row in range(data.numInstances())):
             caps.enable(CapabilityEnum.MISSING_CLASS_VALUES)

     # --- non-class attributes --------------------------------------------
     for col in range(data.numAttributes()):
         if col == data.classIndex():
             continue
         attr = data.attribute(col)
         attrType = attr.type()
         if attrType == Attribute.NOMINAL:
             # Any nominal attribute enables the unary capability; binary
             # or nominal is added on top depending on the value count.
             caps.enable(CapabilityEnum.UNARY_ATTRIBUTES)
             valueCount = attr.numValues()
             if valueCount == 2:
                 caps.enable(CapabilityEnum.BINARY_ATTRIBUTES)
             elif valueCount > 2:
                 caps.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
         elif attrType == Attribute.NUMERIC:
             caps.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
         elif attrType == Attribute.DATE:
             caps.enable(CapabilityEnum.DATE_ATTRIBUTES)
         elif attrType == Attribute.STRING:
             caps.enable(CapabilityEnum.STRING_ATTRIBUTES)
         else:
             raise Exception("Unknown attribute type '" +
                             attr.name() + "'!")

     # --- missing attribute values ------------------------------------------
     def hasMissingValue(row):
         # True as soon as one non-class attribute of row is missing.
         return any(
             row.isMissing(col) for col in range(data.numAttributes())
             if col != row.classIndex())

     if any(hasMissingValue(data.instance(pos))
            for pos in range(data.numInstances())):
         caps.enable(CapabilityEnum.MISSING_VALUES)
     return caps
Esempio n. 7
0
    def buildClassifier(self,data:Instances):
        """Store the training data and set up the neighbour search.

        Instances without a class value are dropped from a separate
        ``Instances`` object built from ``data``.  When ``WindowSize`` is
        positive and smaller than the number of remaining instances, only the
        last ``WindowSize`` instances are kept.  A ``ZeroR`` model is built on
        the class-filtered data and stored as ``m_defaultModel``.

        :param data: training data; checked first with
            ``getCapabilities().testWithFail``
        """
        self.getCapabilities().testWithFail(data)
        working=Instances(data)
        working.deleteWithMissingClass()

        self.m_NumClasses=working.numClasses()
        self.m_ClassType=working.classAttribute().type()
        # Only the sample set itself is stored.
        self.m_Train=Instances(working,0,working.numInstances())
        window=self.WindowSize
        if 0 < window < working.numInstances():
            firstKept=self.m_Train.numInstances()-window
            self.m_Train=Instances(self.m_Train,firstKept,window)

        # Count the nominal and numeric attributes, leaving out the class.
        train=self.m_Train
        self.m_NumAttributesUsed=sum(
            1 for idx in range(train.numAttributes())
            if idx != train.classIndex()
            and (train.attribute(idx).isNominal() or train.attribute(idx).isNumeric()))

        self.m_NNSearch.setInstances(self.m_Train)
        self.m_kNNValid=False
        self.m_defaultModel=ZeroR()
        self.m_defaultModel.buildClassifier(working)
Esempio n. 8
0
 def toMatrixString(self, counts: List[List], clusterTotals: List,
                    inst: Instances):
     """Render the classes-to-clusters count matrix as text.

     The first line lists the indices of all clusters with a positive
     total; each following line holds the counts of one class value in
     those clusters, followed by the class label.

     :param counts: ``counts[cluster][classValue]`` instance counts
     :param clusterTotals: number of counted instances per cluster
     :param inst: dataset whose class attribute supplies the labels
     :return: the formatted matrix
     """

     def digitCount(value):
         # Decimal digits of the integer part; anything below 1 still
         # needs one column.
         return len(str(int(value))) if value >= 1 else 1

     numClusters = self.m_numClusters
     numClasses = len(counts[0]) if counts else 0
     maxval = max((counts[i][j] for i in range(numClusters)
                   for j in range(numClasses)),
                  default=0)
     # Column width = digits of the widest number.  Counting digits
     # directly avoids log(0) when every count is zero and the rounding
     # error of log(x)/log(10) at powers of ten such as 1000.
     Cwidth = max(digitCount(maxval), digitCount(numClusters))

     ms = "\n"
     for i in range(numClusters):
         if clusterTotals[i] > 0:
             ms += " " + Utils.doubleToString(i, Cwidth, 0)
     ms += "  <-- assigned to cluster\n"
     for i in range(numClasses):
         for j in range(numClusters):
             if clusterTotals[j] > 0:
                 ms += " " + Utils.doubleToString(counts[j][i], Cwidth, 0)
         ms += " | " + inst.classAttribute().value(i) + "\n"
     return ms
Esempio n. 9
0
    def threadClassifierRun(self):
        """Run the selected classifier on the loaded data and report results.

        The test mode follows the checked button: 1 = cross-validation
        (``m_CVBut``), 2 = evaluate on training data (``m_TrainBut``),
        3 = user supplied test set (``m_TestSplitBut``).  Run information,
        the model description and the evaluation summary are written to
        ``m_History``; the objects gathered for visualisation are sent with
        ``history_add_visualize_signal``.

        Any ``Exception`` raised on the way is reported through
        ``error_diglog_signal`` instead of propagating.  The start/stop
        buttons and ``m_RunThread`` are reset at the end in every case.
        """
        try:
            self.m_CEPanel.addToHistory()
            inst = Instances(self.m_Instances)
            trainTimeStart = trainTimeElapsed = testTimeStart = testTimeElapsed = 0
            userTestStructure = None
            if self.m_SetTestFrame is not None:
                userTestStructure = deepcopy(
                    self.m_SetTestFrame.getInstances())  #type:Instances
                userTestStructure.setClassIndex(self.m_TestClassIndex)

            # Defaults: output model, output per-class stats, output
            # confusion matrix, store predictions for visualization.
            #outputPredictionsText=None
            numFolds = 10
            classIndex = self.m_ClassCombo.currentIndex()
            inst.setClassIndex(classIndex)
            classifier = self.m_ClassifierEditor.getValue()  #type:Classifier
            name = time.strftime("%H:%M:%S - ")
            outPutResult = ""
            evaluation = None  #type:Evaluation
            grph = None

            if self.m_CVBut.isChecked():
                testMode = 1
                numFolds = int(self.m_CVText.text())
                if numFolds <= 1:
                    raise Exception("Number of folds must be greater than 1")
            elif self.m_TrainBut.isChecked():
                testMode = 2
            elif self.m_TestSplitBut.isChecked():
                testMode = 3
                if userTestStructure is None:
                    raise Exception("No user test set has been specified")
                # Abort on incompatible headers.  Raising sends the message
                # through error_diglog_signal rather than opening a message
                # box from this method and evaluating on the mismatched
                # test set anyway.
                if not inst.equalHeaders(userTestStructure):
                    raise Exception("测试数据集属性不同")
            else:
                raise Exception("Unknown test mode")
            cname = classifier.__module__
            if cname.startswith("classifiers."):
                name += cname[len("classifiers."):]
            else:
                name += cname
            # if isinstance(classifier,OptionHandler):
            #     cmd+=" "+Utils.joinOptions(classifier.getOptions())
            plotInstances = ClassifierErrorsPlotInstances()
            # NOTE(review): testMode is only ever 1, 2 or 3 here, so this
            # always selects inst; the user test set is installed later in
            # the testMode == 3 branch.
            plotInstances.setInstances(userTestStructure if testMode ==
                                       4 else inst)
            plotInstances.setClassifier(classifier)
            plotInstances.setClassIndex(inst.classIndex())
            plotInstances.setPointSizeProportionalToMargin(False)
            outPutResult += "=== Run information ===\n\n"
            outPutResult += "Scheme:       " + cname

            # if isinstance(classifier,OptionHandler):
            #     o=classifier.getOptions()
            #     outPutResult+=" "+Utils.joinOptions(o)
            outPutResult += "\n"
            outPutResult += "Relation:     " + inst.relationName() + '\n'
            outPutResult += "Instances:    " + str(inst.numInstances()) + '\n'
            outPutResult += "Attributes:   " + str(inst.numAttributes()) + '\n'
            if inst.numAttributes() < 100:
                for i in range(inst.numAttributes()):
                    outPutResult += "              " + inst.attribute(
                        i).name() + '\n'
            else:
                outPutResult += "              [list of attributes omitted]\n"
            outPutResult += "Test mode:    "
            if testMode == 1:
                outPutResult += str(numFolds) + "-fold cross-validation\n"
            elif testMode == 2:
                outPutResult += "evaluate on training data\n"
            elif testMode == 3:
                outPutResult += "user supplied test set: " + str(
                    userTestStructure.numInstances()) + " instances\n"
            outPutResult += "\n"
            self.m_History.addResult(name, outPutResult)
            self.m_History.setSingle(name)

            # The full-data model is only built here for modes 2 and 3;
            # cross-validation builds one copy of the classifier per fold.
            if testMode == 2 or testMode == 3:
                trainTimeStart = time.time()
                classifier.buildClassifier(inst)
                trainTimeElapsed = time.time() - trainTimeStart
            outPutResult += "=== Classifier model (full training set) ===\n\n"
            outPutResult += str(classifier) + "\n"
            outPutResult += "\nTime taken to build model: " + Utils.doubleToString(
                trainTimeElapsed, 2) + " seconds\n\n"
            self.m_History.updateResult(name, outPutResult)
            if isinstance(classifier, Drawable):
                grph = classifier.graph()

            print("==========update Compelte=================")

            if testMode == 2:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                testTimeStart = time.time()
                #TODO
                # if isinstance(classifier,BatchPredictor)
                # else:
                for jj in range(inst.numInstances()):
                    plotInstances.process(inst.instance(jj), classifier,
                                          evaluation)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on training set ===\n"
            elif testMode == 1:
                rnd = 1
                inst.randomize(rnd)
                if inst.attribute(classIndex).isNominal():
                    inst.stratify(numFolds)
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                for fold in range(numFolds):
                    train = inst.trainCV(numFolds, fold, rnd)
                    evaluation = self.setupEval(evaluation, classifier, train,
                                                plotInstances, True)
                    evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                    # Train a copy so the selected classifier object itself
                    # is not fitted on a single fold.
                    current = deepcopy(classifier)
                    current.buildClassifier(train)
                    test = inst.testCV(numFolds, fold)
                    # TODO
                    # if isinstance(classifier,BatchPredictor)
                    # else:
                    for jj in range(test.numInstances()):
                        plotInstances.process(test.instance(jj), current,
                                              evaluation)
                if inst.attribute(classIndex).isNominal():
                    outPutResult += "=== Stratified cross-validation ===\n"
                else:
                    outPutResult += "=== Cross-validation ===\n"
            elif testMode == 3:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)

                plotInstances.setInstances(userTestStructure)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                # TODO
                # if isinstance(classifier,BatchPredictor)
                testTimeStart = time.time()
                for i in range(userTestStructure.numInstances()):
                    instance = userTestStructure.instance(i)
                    # if isinstance(classifier,BatchPredictor)
                    #else
                    plotInstances.process(instance, classifier, evaluation)
                # if isinstance(classifier,BatchPredictor)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on test set ===\n"
            if testMode != 1:
                mode = ""
                if testMode == 2:
                    mode = "training data"
                elif testMode == 3:
                    mode = "supplied test set"
                outPutResult += "\nTime taken to test model on " + mode + ": " + Utils.doubleToString(
                    testTimeElapsed, 2) + " seconds\n\n"
            outPutResult += evaluation.toSummaryString(False) + '\n'
            self.m_History.updateResult(name, outPutResult)
            if inst.attribute(classIndex).isNominal():
                outPutResult += evaluation.toClassDetailsString() + '\n'
                outPutResult += evaluation.toMatrixString() + '\n'
            self.m_History.updateResult(name, outPutResult)
            Utils.debugOut(outPutResult)

            if (plotInstances is not None and plotInstances.canPlot(False)):
                visName = name + " (" + inst.relationName() + ")"
                pl2d = plotInstances.getPlotData(cname)
                plotInstances.cleanUp()
                # Objects handed over for visualisation: header, optional
                # graph, then predictions plus the class attribute.
                vv = []
                trainHeader = Instances(self.m_Instances, 0)
                trainHeader.setClassIndex(classIndex)
                vv.append(trainHeader)
                if grph is not None:
                    vv.append(grph)
                if evaluation is not None and evaluation.predictions(
                ) is not None:
                    vv.append(evaluation.predictions())
                    vv.append(inst.classAttribute())
                self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
        except Exception as e:
            self.error_diglog_signal.emit(str(e))
        finally:
            # Reset the UI state even if something other than Exception
            # escapes, and never leave the mutex locked.
            self.mutex.lock()
            try:
                self.m_StartBut.setEnabled(True)
                self.m_StopBut.setEnabled(False)
                self.m_RunThread = None
            finally:
                self.mutex.unlock()
            print("RunFinished")
Esempio n. 10
0
    def testInstances(self, data: Instances, *args):
        """Check whether ``data`` can be handled with the current capabilities.

        Call as ``testInstances(data)`` to test all attributes, or as
        ``testInstances(data, fromIndex, toIndex)`` to restrict the attribute
        checks to that inclusive index range.  For the failures detected
        directly in this method the reason is stored in ``self.m_FailReason``
        as a ``CapabilityError``.

        :param data: dataset to test
        :param args: empty, or ``(fromIndex, toIndex)``
        :return: True if every check passes or ``doNotCheckCapabilities()``
            is true, False otherwise
        """
        if len(args) == 0:
            return self.testInstances(data, 0, data.numAttributes() - 1)
        fromIndex = args[0]
        toIndex = args[1]
        if self.doNotCheckCapabilities():
            return True
        # Only a warning: the tests below still run.
        if len(self.m_Capabilities) == 0 or (len(self.m_Capabilities) == 1
                                             and self.handles(
                                                 CapabilityEnum.NO_CLASS)):
            sys.stderr.write("No capabilities set!\n")
        if toIndex - fromIndex < 0:
            self.m_FailReason = CapabilityError("No attributes!")
            return False
        # The class attribute is only tested when it lies inside the range.
        testClass = data.classIndex() > -1 and data.classIndex(
        ) >= fromIndex and data.classIndex() <= toIndex

        # Non-class attributes.
        for i in range(fromIndex, toIndex + 1):
            att = data.attribute(i)
            if i == data.classIndex():
                continue
            if not self.testAttribute(att):
                return False

        # Class presence.
        if not self.handles(
                CapabilityEnum.NO_CLASS) and data.classIndex() == -1:
            self.m_FailReason = CapabilityError("Class attribute not set!")
            return False

        if self.handles(CapabilityEnum.NO_CLASS) and data.classIndex() > -1:
            cap = self.getClassCapabilities()
            cap.disable(CapabilityEnum.NO_CLASS)
            classCaps = cap.capabilities()
            if len(classCaps) == 0:
                self.m_FailReason = CapabilityError(
                    "Cannot handle any class attribute!")
                return False

        # Class attribute type and class labels.
        if testClass and not self.handles(CapabilityEnum.NO_CLASS):
            att = data.classAttribute()
            if not self.testAttribute(att, True):
                return False
            if not self.handles(CapabilityEnum.MISSING_CLASS_VALUES):
                for i in range(data.numInstances()):
                    if data.instance(i).classIsMissing():
                        self.m_FailReason = CapabilityError(
                            "Cannot handle missing class values!")
                        return False
            else:
                # Missing labels are allowed, but enough labelled instances
                # must remain.
                hasClass = 0
                for i in range(data.numInstances()):
                    if not data.instance(i).classIsMissing():
                        hasClass += 1
                if hasClass < self.getMinimumNumberInstances():
                    self.m_FailReason=CapabilityError("Not enough training instances with class labels (required: "\
                                                      + str(self.getMinimumNumberInstances())\
                                                      + ", provided: "\
                                                      + str(hasClass)\
                                                      + ")!")
                    return False

        # Missing attribute values.  The capability check does not depend on
        # the instance, so it is done once and the scan is skipped entirely
        # when missing values are supported.
        if not self.handles(CapabilityEnum.MISSING_VALUES):
            # TODO: sparse-instance handling is skipped
            # if isinstance(inst)
            #     pass
            #else
            for i in range(data.numInstances()):
                inst = data.instance(i)
                for n in range(fromIndex, toIndex + 1):
                    if n == inst.classIndex():
                        continue
                    if inst.isMissing(n):
                        self.m_FailReason = CapabilityError(
                            "Cannot handle missing values!")
                        return False

        # Minimum number of instances.
        if data.numInstances() < self.getMinimumNumberInstances():
            self.m_FailReason = CapabilityError(
                "Not enough training instances (required: " +
                str(self.getMinimumNumberInstances()) + ", provided: " +
                str(data.numInstances()) + ")!")
            return False
        # if self.handles(CapabilityEnum.ONLY_MULTIINSTANCE):
        #     if data.numAttributes() != 3:
        #         return False
        #     if not data.attribute(0).isNominal() or data.classIndex() != data.numAttributes()-1:
        #         return False
        #     owner=self.getOwner()
        #     if isinstance(owner,MultiInstanceCapabilitiesHandler):
        #         handler=owner
        #         cap=handler.getMultiInstanceCapabilities()
        #         if data.numInstances()>0 and data.attribute(1).numValues()>0:
        #             result=cap.testAttribute(data.attribute(1))
        return True