Example #1
 def forInstances(cls,
                  data: Instances,
                  multi: bool = False) -> 'Capabilities':
     result = Capabilities(None)
     result.m_InterfaceDefinedCapabilities = set()
     if data.classIndex() == -1:
         result.enable(CapabilityEnum.NO_CLASS)
     else:
         if data.classAttribute().type() == Attribute.NOMINAL:
             if data.classAttribute().numValues() == 1:
                 result.enable(CapabilityEnum.UNARY_CLASS)
             elif data.classAttribute().numValues() == 2:
                 result.enable(CapabilityEnum.BINARY_CLASS)
             else:
                 result.enable(CapabilityEnum.NOMINAL_CLASS)
         elif data.classAttribute().type() == Attribute.NUMERIC:
             result.enable(CapabilityEnum.NUMERIC_CLASS)
         elif data.classAttribute().type() == Attribute.STRING:
             result.enable(CapabilityEnum.STRING_CLASS)
         elif data.classAttribute().type() == Attribute.DATE:
             result.enable(CapabilityEnum.DATE_CLASS)
         else:
             raise Exception("Unknown class attribute type '" +
                             data.classAttribute().name() + "'!")
         for i in range(data.numInstances()):
             if data.instance(i).classIsMissing():
                 result.enable(CapabilityEnum.MISSING_CLASS_VALUES)
                 break
     for i in range(data.numAttributes()):
         if i == data.classIndex():
             continue
         if data.attribute(i).type() == Attribute.NOMINAL:
             result.enable(CapabilityEnum.UNARY_ATTRIBUTES)
             if data.attribute(i).numValues() == 2:
                 result.enable(CapabilityEnum.BINARY_ATTRIBUTES)
             elif data.attribute(i).numValues() > 2:
                 result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
         elif data.attribute(i).type() == Attribute.NUMERIC:
             result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
         elif data.attribute(i).type() == Attribute.DATE:
             result.enable(CapabilityEnum.DATE_ATTRIBUTES)
         elif data.attribute(i).type() == Attribute.STRING:
             result.enable(CapabilityEnum.STRING_ATTRIBUTES)
         else:
             raise Exception("Unknown attribute type '" +
                             data.attribute(i).name() + "'!")
     missing = False
     for i in range(data.numInstances()):
         inst = data.instance(i)
         for n in range(data.numAttributes()):
             if n == inst.classIndex():
                 continue
             if inst.isMissing(n):
                 missing = True
                 break
         if missing:
             result.enable(CapabilityEnum.MISSING_VALUES)
             break
     return result
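A minimal usage sketch for the helper above (hedged: `data` is assumed to be an Instances object loaded elsewhere, and the LinearRegression scheme from the later examples is used only for illustration). forInstances derives the capabilities a dataset demands; a scheme's own capabilities can then be tested against the data with testWithFail:

data.setClassIndex(data.numAttributes() - 1)             # treat the last attribute as the class
required = Capabilities.forInstances(data)               # capabilities the dataset demands
LinearRegression().getCapabilities().testWithFail(data)  # raises if the scheme cannot handle the data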
Example #2
def calculateRSquared(data: Instances, ssr: float):
    yMean = data.meanOrMode(data.classIndex())
    tss = 0
    for i in range(data.numInstances()):
        diff = data.instance(i).value(data.classIndex()) - yMean
        tss += diff * diff
    rsq = 1 - ssr / tss
    return rsq
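This computes the coefficient of determination R^2 = 1 - SSR/TSS, where TSS is the total sum of squares of the class values around their mean and ssr is the residual sum of squares. A hedged sketch of how it is typically driven, using names from the LinearRegression example below (`model` is assumed to be an already-built instance):

se = model.calculateSE(model.m_SelectedAttributes, model.m_Coefficients)  # residual sum of squares
r_squared = calculateRSquared(model.m_TransformedData, se)                # 1 - SSR/TSS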
Example #3
 def removeClass(self, inst: Instances):
     af = Remove()
     if inst.classIndex() < 0:
         retI = inst
     else:
         af.setAttributeIndices("" + str(inst.classIndex() + 1))
         af.setInvertSelection(False)
         af.setInputFormat(inst)
         retI = Filter.useFilter(inst, af)
     return retI
Example #4
    def evaluateClusterer(self, test: Instances, outputModel: bool):
        i = loglk = unclusteredInstances = 0
        cc = self.m_Clusterer.numberOfClusters()
        self.m_numClusters = cc
        instanceStats = [0] * cc
        hasClass = test.classIndex() >= 0
        clusterAssignments = []
        filter = None  #type:Filter

        testRaw = copy.deepcopy(test)
        testRaw.setClassIndex(test.classIndex())

        if hasClass:
            if testRaw.classAttribute().isNumeric():
                raise Exception("Class attribute must be nominal for cluster evaluation")
            filter = Remove()
            filter.setAttributeIndices(str(testRaw.classIndex() + 1))
            filter.setInvertSelection(False)
            filter.setInputFormat(testRaw)
        for inst in testRaw:
            if filter is not None:
                filter.input(inst)
                filter.batchFinished()
                inst = filter.output()
            cnum = self.m_Clusterer.clusterInstance(inst)
            clusterAssignments.append(cnum)
            if cnum != -1:
                instanceStats[cnum] += 1
        sumNum = sum(instanceStats)
        loglk /= sumNum
        self.m_logL = loglk
        self.m_clusterAssignments = []
        # for i in clusterAssignments:
        #     print(",",i,end="")
        # print()
        for i in range(len(clusterAssignments)):
            self.m_clusterAssignments.append(clusterAssignments[i])
        numInstFieldWidth = int(
            math.log(len(clusterAssignments)) / math.log(10) + 1)
        if outputModel:
            self.m_clusteringResult += str(self.m_Clusterer)
        self.m_clusteringResult += "Clustered Instances\n\n"
        clustFieldWidth = int((math.log(cc) / math.log(10)) + 1)
        for i in range(cc):
            if instanceStats[i] > 0:
                self.m_clusteringResult+= Utils.doubleToString(i, clustFieldWidth, 0) \
                                          +"      " \
                                          + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) \
                                          +"(" + Utils.doubleToString((instanceStats[i] / sumNum * 100), 3, 0) \
                                          +"%)\n"
        if unclusteredInstances > 0:
            self.m_clusteringResult += "\nUnclustered instances : " + str(
                unclusteredInstances)
        if hasClass:
            self.evaluateClustersWithRespectToClass(test)
Example #5
 def selectModel(self, data: Instances, test: Instances = None):
     if test is not None:
         return self.selectModel(data)
     multiVal = True
     averageInfoGain = validModels = 0
     checkDistribution = Distribution(data)
     noSplitModel = NoSplit(checkDistribution)
     if Utils.gr(2*self.m_minNoObj, checkDistribution.total()) or \
         Utils.equal(checkDistribution.total(), checkDistribution.perClass(checkDistribution.maxClass())):
         return noSplitModel
     if self.m_allData is not None:
         for attr in data.enumerateAttributes():
             if attr.isNumeric() or Utils.gr(
                     0.3 * self.m_allData.numInstances(), attr.numValues()):
                 multiVal = False
                 break
     currentModel = [None] * data.numAttributes()  #type:List[C45Split]
     sumOfWeights = data.sumOfWeight()
     for i in range(data.numAttributes()):
         if i != data.classIndex():
             currentModel[i] = C45Split(i, self.m_minNoObj, sumOfWeights,
                                        self.m_useMDLcorrection)
             currentModel[i].buildClassifer(data)
             if currentModel[i].checkModel():
                 if self.m_allData is not None:
                     if data.attribute(i).isNumeric() or \
                         (multiVal or Utils.gr(0.3*self.m_allData.numInstances(), data.attribute(i).numValues())):
                         averageInfoGain = averageInfoGain + currentModel[
                             i].infoGain()
                         validModels += 1
                 else:
                     averageInfoGain = averageInfoGain + currentModel[
                         i].infoGain()
                     validModels += 1
         else:
             currentModel[i] = None
     if validModels == 0:
         return noSplitModel
     averageInfoGain = averageInfoGain / validModels
     minResult = 0
     for i in range(data.numAttributes()):
         if i != data.classIndex() and currentModel[i].checkModel():
             if currentModel[i].infoGain() >= averageInfoGain-1e-3 and\
                 Utils.gr(currentModel[i].gainRatio(), minResult):
                 bestModel = currentModel[i]
                 minResult = currentModel[i].gainRatio()
     if Utils.equal(minResult, 0):
         return noSplitModel
     bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
     if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
         bestModel.setSplitPoint(self.m_allData)
     return bestModel
Example #6
 def setInputFormat(self, instanceInfo: Instances):
     super().setInputFormat(instanceInfo)
     if instanceInfo.classIndex() < 0:
         raise Exception("No class has been assigned to the instances")
     self.setOutputFormatBinary()
     self.m_Indices = None
     if instanceInfo.classAttribute().isNominal():
         return True
     return False
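This setInputFormat follows the two-step batch-filtering protocol used throughout these examples: configure the filter with the input header, then push the data through it. A hedged sketch of that pattern (`data` is assumed to be an Instances object with its class index already set; NominalToBinary stands in for any Filter subclass shown above):

f = NominalToBinary()                    # any Filter subclass shown in these examples would do
f.setInputFormat(data)                   # raises here if no class has been assigned
transformed = Filter.useFilter(data, f)  # batch-convert the whole dataset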
Example #7
 def setInstances(self, inst: Instances):
     self.m_Instances = inst
     attribNames = []
     for i in range(inst.numAttributes()):
         tp = "(" + Attribute.typeToStringShort(inst.attribute(i)) + ")"
         attribNames.append(tp + inst.attribute(i).name())
     self.m_ClassCombo.clear()
     self.m_ClassCombo.addItems(attribNames)
     if len(attribNames) > 0:
         if inst.classIndex() == -1:
             self.m_ClassCombo.setCurrentIndex(len(attribNames) - 1)
         else:
             self.m_ClassCombo.setCurrentIndex(inst.classIndex())
         self.m_ClassCombo.setEnabled(True)
         self.m_StartBut.setEnabled(self.m_RunThread is None)
         self.m_StopBut.setEnabled(self.m_RunThread is not None)
     else:
         self.m_StartBut.setEnabled(False)
         self.m_StopBut.setEnabled(False)
Example #8
 def setupEval(self, evaluation: Evaluation, classifier: Classifier,
               inst: Instances,
               plotInstances: ClassifierErrorsPlotInstances,
               onlySetPriors: bool):
     # if isinstance(classifier,InputMappedClassifier)...
     #else
     evaluation.setPriors(inst)
     if not onlySetPriors:
         if plotInstances is not None:
             plotInstances.setInstances(inst)
             plotInstances.setClassifier(classifier)
             plotInstances.setClassIndex(inst.classIndex())
             plotInstances.setEvaluation(evaluation)
     return evaluation
Example #9
    def evaluateClustersWithRespectToClass(self, inst: Instances):
        numClasses = inst.classAttribute().numValues()
        counts = [[0] * numClasses for i in range(self.m_numClusters)]
        clusterTotals = [0] * self.m_numClusters
        best = [0] * (self.m_numClusters + 1)
        current = [0] * (self.m_numClusters + 1)

        instances = copy.deepcopy(inst)
        instances.setClassIndex(inst.classIndex())
        i = 0
        for instance in instances:
            if self.m_clusterAssignments[i] >= 0:
                if not instance.classIsMissing():
                    counts[int(self.m_clusterAssignments[i])][int(
                        instance.classValue())] += 1
                    clusterTotals[int(self.m_clusterAssignments[i])] += 1
            i += 1
        numInstances = i
        best[self.m_numClusters] = float('inf')
        self.mapClasses(self.m_numClusters, 0, counts, clusterTotals, current,
                        best, 0)
        self.m_clusteringResult += "\n\nClass attribute: " + inst.classAttribute(
        ).name() + "\n"
        self.m_clusteringResult += "Classes to Clusters:\n"
        matrixString = self.toMatrixString(counts, clusterTotals,
                                           Instances(inst, 0))
        self.m_clusteringResult += matrixString + '\n'
        Cwidth = 1 + int(math.log(self.m_numClusters) / math.log(10))
        for i in range(self.m_numClusters):
            if clusterTotals[i] > 0:
                self.m_clusteringResult += "Cluster " + Utils.doubleToString(
                    i, Cwidth, 0)
                self.m_clusteringResult += " <-- "
                if best[i] < 0:
                    self.m_clusteringResult += "No class\n"
                else:
                    self.m_clusteringResult += inst.classAttribute().value(
                        int(best[i])) + '\n'
        self.m_clusteringResult+="\nIncorrectly clustered instances :\t"\
                                  + str(best[self.m_numClusters])\
                                  + "\t" \
                                 + Utils.doubleToString((best[self.m_numClusters] / numInstances * 100.0), 8, 4) \
                                  + " %\n"
        self.m_classToCluster = [0] * self.m_numClusters
        for i in range(self.m_numClusters):
            self.m_classToCluster[i] = int(best[i])
Example #10
 def getCapabilities(self, data: Instances = None):
     if data is None:
         result = Capabilities(self)
         result.enableAll()
         result.setMinimumNumberInstances(0)
         return result
     result = self.getCapabilities()
     if data.classIndex() == -1:
         classes = result.getClassCapabilities()
         iter = classes.capabilities()
         for item in iter:
             if item != CapabilityEnum.NO_CLASS:
                 result.disable(item)
                 result.disableDependency(item)
     else:
         result.disable(CapabilityEnum.NO_CLASS)
         result.disableDependency(CapabilityEnum.NO_CLASS)
     return result
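The data-aware overload narrows the class capabilities to what the supplied dataset actually needs. A hedged sketch of how it is usually paired with testWithFail (`clf` is assumed to be a classifier exposing this method and `data` an Instances object):

caps = clf.getCapabilities(data)  # capabilities restricted by the dataset's class setup
caps.testWithFail(data)           # raises if the data violates the remaining capabilities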
Example #11
 def threadRun(self, filter: Filter):
     if filter is not None:
         #addUndo
         classIndex = self.m_AttVisualizePanel.getColoringIndex()
         cp = Instances(self.m_Instances)
         cp.setClassIndex(classIndex)
         self.m_StopBut.setEnabled(True)
         filterCopy = deepcopy(filter)
         filterCopy.setInputFormat(cp)
         newInstances = Filter.useFilter(cp, filterCopy)
         self.m_StopBut.setEnabled(False)
         if newInstances is None or newInstances.numAttributes() < 1:
             raise Exception("Dataset is empty.")
         #addUndo
         self.m_AttVisualizePanel.setColoringIndex(cp.classIndex())
         if self.m_Instances.classIndex() < 0:
             newInstances.setClassIndex(-1)
         self.m_Instances = newInstances
         self.setInstances(self.m_Instances)
         self.m_RunThread = None
Example #12
def calculateStdErrorOfCoef(data: Instances, selected: List[bool], ssr: float,
                            n: int, k: int):
    array = [[0] * k for i in range(n)]
    column = 0
    for j in range(data.numAttributes()):
        if data.classIndex() != j and selected[j]:
            for i in range(n):
                array[i][column] = data.instance(i).value(j)
            column += 1
    for i in range(n):
        array[i][k - 1] = 1
    X = np.array(array)
    XtX = np.dot(X.T, X)
    inverse = np.linalg.pinv(XtX)
    mse = ssr / (n - k)
    cov = mse * inverse
    result = []
    for i in range(k):
        result.append(math.sqrt(cov[i][i]))
    return result
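The standard errors are the square roots of the diagonal of MSE * (X^T X)^{-1}, where X carries the selected attribute columns plus a final column of ones for the intercept, so k must count the selected attributes plus one. A hedged sketch of the call, with k computed the same way the LinearRegression example below does:

k = 1 + sum(1 for i in range(data.numAttributes())
            if i != data.classIndex() and selected[i])          # selected attributes + intercept
se_of_coef = calculateStdErrorOfCoef(data, selected, ssr, data.numInstances(), k)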
Example #13
 def setInputFormat(self, instanceInfo: Instances):
     super().setInputFormat(instanceInfo)
     self.attributeIndices.setUpper(instanceInfo.numAttributes() - 1)
     attributes = []
     outputClass = -1
     self.m_SelectedAttributes = self.attributeIndices.getSelection()
     if len(self.m_SelectedAttributes) == instanceInfo.numAttributes():
         self.setOutputFormat(instanceInfo)
         self.initOutputLocators(self.getInputFormat(),
                                 self.m_SelectedAttributes)
         return True
     for current in self.m_SelectedAttributes:
         if instanceInfo.classIndex() == current:
             outputClass = len(attributes)
         keep = instanceInfo.attribute(current).copy()
         attributes.append(keep)
     self.initInputLocators(self.getInputFormat(),
                            self.m_SelectedAttributes)
     outputFormat = Instances(instanceInfo.relationName(), attributes, 0)
     outputFormat.setClassIndex(outputClass)
     self.setOutputFormat(outputFormat)
     return True
Example #14
    def threadClassifierRun(self):
        try:
            self.m_CEPanel.addToHistory()
            inst = Instances(self.m_Instances)
            trainTimeStart = trainTimeElapsed = testTimeStart = testTimeElapsed = 0
            userTestStructure = None
            if self.m_SetTestFrame is not None:
                userTestStructure = deepcopy(
                    self.m_SetTestFrame.getInstances())  #type:Instances
                userTestStructure.setClassIndex(self.m_TestClassIndex)

            # defaults: output model, output per-class stats, output confusion matrix, store predictions for visualization
            #outputPredictionsText=None
            numFolds = 10
            classIndex = self.m_ClassCombo.currentIndex()
            inst.setClassIndex(classIndex)
            classifier = self.m_ClassifierEditor.getValue()  #type:Classifier
            name = time.strftime("%H:%M:%S - ")
            outPutResult = ""
            evaluation = None  #type:Evaluation
            grph = None

            if self.m_CVBut.isChecked():
                testMode = 1
                numFolds = int(self.m_CVText.text())
                if numFolds <= 1:
                    raise Exception("Number of folds must be greater than 1")
            elif self.m_TrainBut.isChecked():
                testMode = 2
            elif self.m_TestSplitBut.isChecked():
                testMode = 3
                # if source is None:
                #     raise Exception("No user test set has been specified")
                if not inst.equalHeaders(userTestStructure):
                    QMessageBox.critical(self.m_Explorer, "Error", "Train and test set attributes differ")
            else:
                raise Exception("Unknown test mode")
            cname = classifier.__module__
            if cname.startswith("classifiers."):
                name += cname[len("classifiers."):]
            else:
                name += cname
            cmd = classifier.__module__
            # if isinstance(classifier,OptionHandler):
            #     cmd+=" "+Utils.joinOptions(classifier.getOptions())
            plotInstances = ClassifierErrorsPlotInstances()
            plotInstances.setInstances(userTestStructure if testMode ==
                                       4 else inst)
            plotInstances.setClassifier(classifier)
            plotInstances.setClassIndex(inst.classIndex())
            plotInstances.setPointSizeProportionalToMargin(False)
            outPutResult += "=== Run information ===\n\n"
            outPutResult += "Scheme:       " + cname

            # if isinstance(classifier,OptionHandler):
            #     o=classifier.getOptions()
            #     outPutResult+=" "+Utils.joinOptions(o)
            outPutResult += "\n"
            outPutResult += "Relation:     " + inst.relationName() + '\n'
            outPutResult += "Instances:    " + str(inst.numInstances()) + '\n'
            outPutResult += "Attributes:   " + str(inst.numAttributes()) + '\n'
            if inst.numAttributes() < 100:
                for i in range(inst.numAttributes()):
                    outPutResult += "              " + inst.attribute(
                        i).name() + '\n'
            else:
                outPutResult += "              [list of attributes omitted]\n"
            outPutResult += "Test mode:    "
            if testMode == 1:
                outPutResult += str(numFolds) + "-fold cross-validation\n"
            elif testMode == 2:
                outPutResult += "evaluate on training data\n"
            elif testMode == 3:
                outPutResult += "user supplied test set: " + str(
                    userTestStructure.numInstances()) + " instances\n"
            outPutResult += "\n"
            self.m_History.addResult(name, outPutResult)
            self.m_History.setSingle(name)

            if testMode == 2 or testMode == 3:
                trainTimeStart = time.time()
                classifier.buildClassifier(inst)
                trainTimeElapsed = time.time() - trainTimeStart
            outPutResult += "=== Classifier model (full training set) ===\n\n"
            outPutResult += str(classifier) + "\n"
            outPutResult += "\nTime taken to build model: " + Utils.doubleToString(
                trainTimeElapsed, 2) + " seconds\n\n"
            self.m_History.updateResult(name, outPutResult)
            if isinstance(classifier, Drawable):
                grph = classifier.graph()

            print("==========update Compelte=================")

            if testMode == 2:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                testTimeStart = time.time()
                #TODO
                # if isinstance(classifier,BatchPredictor)
                # else:
                for jj in range(inst.numInstances()):
                    plotInstances.process(inst.instance(jj), classifier,
                                          evaluation)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on training set ===\n"
            elif testMode == 1:
                rnd = 1
                inst.randomize(rnd)
                if inst.attribute(classIndex).isNominal():
                    inst.stratify(numFolds)
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                for fold in range(numFolds):
                    train = inst.trainCV(numFolds, fold, rnd)
                    evaluation = self.setupEval(evaluation, classifier, train,
                                                plotInstances, True)
                    evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                    current = deepcopy(classifier)
                    current.buildClassifier(train)
                    test = inst.testCV(numFolds, fold)
                    # TODO
                    # if isinstance(classifier,BatchPredictor)
                    # else:
                    for jj in range(test.numInstances()):
                        plotInstances.process(test.instance(jj), current,
                                              evaluation)
                if inst.attribute(classIndex).isNominal():
                    outPutResult += "=== Stratified cross-validation ===\n"
                else:
                    outPutResult += "=== Cross-validation ===\n"
            elif testMode == 3:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)

                plotInstances.setInstances(userTestStructure)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                # TODO
                # if isinstance(classifier,BatchPredictor)
                testTimeStart = time.time()
                for i in range(userTestStructure.numInstances()):
                    instance = userTestStructure.instance(i)
                    # if isinstance(classifier,BatchPredictor)
                    #else
                    plotInstances.process(instance, classifier, evaluation)
                # if isinstance(classifier,BatchPredictor)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on test set ===\n"
            if testMode != 1:
                mode = ""
                if testMode == 2:
                    mode = "training data"
                elif testMode == 3:
                    mode = "supplied test set"
                outPutResult += "\nTime taken to test model on " + mode + ": " + Utils.doubleToString(
                    testTimeElapsed, 2) + " seconds\n\n"
            outPutResult += evaluation.toSummaryString(False) + '\n'
            self.m_History.updateResult(name, outPutResult)
            if inst.attribute(classIndex).isNominal():
                outPutResult += evaluation.toClassDetailsString() + '\n'
                outPutResult += evaluation.toMatrixString() + '\n'
            self.m_History.updateResult(name, outPutResult)
            Utils.debugOut(outPutResult)

            if (plotInstances is not None and plotInstances.canPlot(False)):
                visName = name + " (" + inst.relationName() + ")"
                pl2d = plotInstances.getPlotData(cname)
                plotInstances.cleanUp()
                vv = []
                trainHeader = Instances(self.m_Instances, 0)
                trainHeader.setClassIndex(classIndex)
                vv.append(trainHeader)
                if grph is not None:
                    vv.append(grph)
                if evaluation is not None and evaluation.predictions(
                ) is not None:
                    vv.append(evaluation.predictions())
                    vv.append(inst.classAttribute())
                self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
        except Exception as e:
            self.error_diglog_signal.emit(str(e))
        self.mutex.lock()
        self.m_StartBut.setEnabled(True)
        self.m_StopBut.setEnabled(False)
        self.m_RunThread = None
        self.mutex.unlock()
        print("RunFinished")
Example #15
class LinearRegression(AbstractClassifier):
    SELECTION_M5 = 0  #default
    SELECTION_NONE = 1
    SELECTION_GREEDY = 2
    TAGS_SELECTION = [
        Tag(SELECTION_M5, "M5 method"),
        Tag(SELECTION_NONE, "No attribute selection"),
        Tag(SELECTION_GREEDY, "Greedy method")
    ]
    propertyList = {
        "AttributeSelectionMethod": "TAGS_SELECTION",
        "Ridge": "1e-8"
    }
    methodList = {
        "AttributeSelectionMethod": "setAttributeSelectionMethod",
        "Ridge": "setRidge"
    }

    def __init__(self):
        super().__init__()
        self.m_Coefficients = None  #type:List[float]
        self.m_SelectedAttributes = None  #type:List[bool]
        self.m_TransformedData = None  #type:Instances
        self.m_MissingFilter = None  #type:ReplaceMissingValues
        self.m_TransformFilter = None  #type:NominalToBinary
        self.m_ClassStdDev = 0
        self.m_ClassMean = 0
        self.m_ClassIndex = 0
        self.m_Means = None  #type:List[float]
        self.m_StdDevs = None  #type:List[float]
        self.outputAdditionalStats = False
        self.AttributeSelectionMethod = 0
        self.EliminateColinearAttributes = True
        self.m_checksTurnedOff = False
        self.Ridge = 1e-8
        self.Minimal = False
        self.m_ModelBuilt = False
        self.m_isZeroR = False
        self.m_df = 0
        self.m_RSquared = 0
        self.m_RSquaredAdj = 0
        self.m_FStat = 0
        self.m_StdErrorOfCoef = None  #type:List[float]
        self.m_TStats = None  #type:List[float]
        self.numDecimalPlaces = 4

    def __str__(self):
        if not self.m_ModelBuilt:
            return "Linear Regression: No model built yet."
        if self.Minimal:
            return "Linear Regression: Model built."
        text = ""
        column = 0
        first = True
        text += "\nLinear Regression Model\n\n"
        text += self.m_TransformedData.classAttribute().name() + " =\n\n"
        for i in range(self.m_TransformedData.numAttributes()):
            if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                if not first:
                    text += " +\n"
                else:
                    first = False
                text += Utils.doubleToString(self.m_Coefficients[column], 12,
                                             self.numDecimalPlaces) + " * "
                text += self.m_TransformedData.attribute(i).name()
                column += 1
        text += " +\n" + Utils.doubleToString(self.m_Coefficients[column], 12,
                                              self.numDecimalPlaces)
        if self.outputAdditionalStats:
            maxAttLength = 0
            for i in range(self.m_TransformedData.numAttributes()):
                if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                    if len(self.m_TransformedData.attribute(
                            i).name()) > maxAttLength:
                        maxAttLength = len(
                            self.m_TransformedData.attribute(i).name())
            maxAttLength += 3
            if maxAttLength < len("Variable") + 3:
                maxAttLength = len("Variable") + 3
            text+="\n\nRegression Analysis:\n\n" \
                  + Utils.padRight("Variable", maxAttLength)\
                  + "  Coefficient     SE of Coef        t-Stat"
            column = 0
            for i in range(self.m_TransformedData.numAttributes()):
                if i != self.m_ClassIndex and self.m_SelectedAttributes[i]:
                    text += "\n" + Utils.padRight(
                        self.m_TransformedData.attribute(i).name(),
                        maxAttLength)
                    text += Utils.doubleToString(self.m_Coefficients[column],
                                                 12, self.numDecimalPlaces)
                    text += "   " + Utils.doubleToString(
                        self.m_StdErrorOfCoef[column], 12,
                        self.numDecimalPlaces)
                    text += "   " + Utils.doubleToString(
                        self.m_TStats[column], 12, self.numDecimalPlaces)
                    column += 1
            text += Utils.padRight(
                "\nconst", maxAttLength + 1) + Utils.doubleToString(
                    self.m_Coefficients[column], 12, self.numDecimalPlaces)
            text += "   " + Utils.doubleToString(self.m_StdErrorOfCoef[column],
                                                 12, self.numDecimalPlaces)
            text += "   " + Utils.doubleToString(self.m_TStats[column], 12,
                                                 self.numDecimalPlaces)
            text += "\n\nDegrees of freedom = " + str(self.m_df)
            text += "\nR^2 value = " + Utils.doubleToString(
                self.m_RSquared, self.numDecimalPlaces)
            text += "\nAdjusted R^2 = " + Utils.doubleToString(
                self.m_RSquaredAdj, 5)
            text += "\nF-statistic = " + Utils.doubleToString(
                self.m_FStat, self.numDecimalPlaces)
        return text

    def getCapabilities(self):
        result = super().getCapabilities()
        result.disableAll()
        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)
        result.enable(CapabilityEnum.NUMERIC_CLASS)
        result.enable(CapabilityEnum.DATE_CLASS)
        result.enable(CapabilityEnum.MISSING_CLASS_VALUES)
        return result

    def setAttributeSelectionMethod(self, value: str):
        index = int(value)
        self.AttributeSelectionMethod = self.TAGS_SELECTION[index].getID()

    def setRidge(self, value: str):
        try:
            val = float(value)
            self.Ridge = val
            self.propertyList.update({"Ridge": value})
        except ValueError:
            pass

    def buildClassifier(self, data: Instances):
        self.m_ModelBuilt = False
        self.m_isZeroR = False
        if data.numInstances() == 1:
            self.m_Coefficients = [data.instance(0).classValue()]
            self.m_SelectedAttributes = [False] * data.numAttributes()
            self.m_isZeroR = True
            return
        if not self.m_checksTurnedOff:
            self.getCapabilities().testWithFail(data)
            if self.outputAdditionalStats:
                ok = True
                for i in range(data.numInstances()):
                    if data.instance(i).weight() != 1:
                        ok = False
                        break
                if not ok:
                    raise Exception(
                        "Can only compute additional statistics on unweighted data"
                    )
            data = Instances(data)
            data.deleteWithMissingClass()
            self.m_TransformFilter = NominalToBinary()
            self.m_TransformFilter.setInputFormat(data)
            data = Filter.useFilter(data, self.m_TransformFilter)
            self.m_MissingFilter = ReplaceMissingValues()
            self.m_MissingFilter.setInputFormat(data)
            data = Filter.useFilter(data, self.m_MissingFilter)
            data.deleteWithMissingClass()
        else:
            self.m_TransformFilter = None
            self.m_MissingFilter = None
        self.m_ClassIndex = data.classIndex()
        self.m_TransformedData = data
        self.m_Coefficients = None
        self.m_SelectedAttributes = [False] * data.numAttributes()
        self.m_Means = [0] * data.numAttributes()
        self.m_StdDevs = [0] * data.numAttributes()
        for j in range(data.numAttributes()):
            if j != self.m_ClassIndex:
                self.m_SelectedAttributes[j] = True
                self.m_Means[j] = data.meanOrMode(j)
                self.m_StdDevs[j] = math.sqrt(data.variance(j))
                if self.m_StdDevs[j] == 0:
                    self.m_SelectedAttributes[j] = False
        self.m_ClassStdDev = math.sqrt(
            data.variance(self.m_TransformedData.classIndex()))
        self.m_ClassMean = data.meanOrMode(self.m_TransformedData.classIndex())
        self.findBestModel()
        if self.outputAdditionalStats:
            k = 1
            for i in range(data.numAttributes()):
                if i != data.classIndex():
                    if self.m_SelectedAttributes[i]:
                        k += 1
            self.m_df = self.m_TransformedData.numInstances() - k
            se = self.calculateSE(self.m_SelectedAttributes,
                                  self.m_Coefficients)
            self.m_RSquared = RegressionAnalysis.calculateRSquared(
                self.m_TransformedData, se)
            self.m_RSquaredAdj = RegressionAnalysis.calculateAdjRSquared(
                self.m_RSquared, self.m_TransformedData.numInstances(), k)
            self.m_FStat = RegressionAnalysis.calculateFStat(
                self.m_RSquared, self.m_TransformedData.numInstances(), k)
            self.m_StdErrorOfCoef = RegressionAnalysis.calculateStdErrorOfCoef(
                self.m_TransformedData, self.m_SelectedAttributes, se,
                self.m_TransformedData.numInstances(), k)
            self.m_TStats = RegressionAnalysis.calculateTStats(
                self.m_Coefficients, self.m_StdErrorOfCoef, k)
        if self.Minimal:
            self.m_TransformedData = None
            self.m_Means = None
            self.m_StdDevs = None
        else:
            self.m_TransformedData = Instances(data, 0)
        self.m_ModelBuilt = True

    def classifyInstance(self, instance: Instance):
        transformedInstance = instance
        if not self.m_checksTurnedOff and not self.m_isZeroR:
            self.m_TransformFilter.input(transformedInstance)
            self.m_TransformFilter.batchFinished()
            transformedInstance = self.m_TransformFilter.output()
            self.m_MissingFilter.input(transformedInstance)
            self.m_MissingFilter.batchFinished()
            transformedInstance = self.m_MissingFilter.output()
        return self.regressionPrediction(transformedInstance,
                                         self.m_SelectedAttributes,
                                         self.m_Coefficients)

    def findBestModel(self):
        numInstances = self.m_TransformedData.numInstances()
        self.m_Coefficients = self.doRegression(self.m_SelectedAttributes)
        while self.EliminateColinearAttributes and self.deselectColinearAttributes(
                self.m_SelectedAttributes, self.m_Coefficients):
            self.m_Coefficients = self.doRegression(self.m_SelectedAttributes)
        numAttributes = 1
        for m_SelectedAttribute in self.m_SelectedAttributes:
            if m_SelectedAttribute:
                numAttributes += 1
        fullMSE = self.calculateSE(self.m_SelectedAttributes,
                                   self.m_Coefficients)
        akaike = (numInstances - numAttributes) + 2 * numAttributes
        currentNumAttributes = numAttributes
        improved = True
        if self.AttributeSelectionMethod == self.SELECTION_GREEDY:
            while improved:
                currentSelected = self.m_SelectedAttributes[:]
                improved = False
                currentNumAttributes -= 1
                for i in range(len(self.m_SelectedAttributes)):
                    if currentSelected[i]:
                        currentSelected[i] = False
                        currentCoeffs = self.doRegression(currentSelected)
                        currentMSE = self.calculateSE(currentSelected,
                                                      currentCoeffs)
                        currentAkaike = currentMSE / fullMSE * (
                            numInstances -
                            numAttributes) + 2 * currentNumAttributes
                        if currentAkaike < akaike:
                            improved = True
                            akaike = currentAkaike
                            self.m_SelectedAttributes = currentSelected[:]
                            self.m_Coefficients = currentCoeffs
                        currentSelected[i] = True
        elif self.AttributeSelectionMethod == self.SELECTION_M5:
            while improved:
                improved = False
                currentNumAttributes -= 1
                minSC = 0
                minAttr = -1
                coeff = 0
                for i in range(len(self.m_SelectedAttributes)):
                    if self.m_SelectedAttributes[i]:
                        SC = math.fabs(self.m_Coefficients[coeff] *
                                       self.m_StdDevs[i] / self.m_ClassStdDev)
                        if coeff == 0 or SC < minSC:
                            minSC = SC
                            minAttr = i
                        coeff += 1
                if minAttr >= 0:
                    self.m_SelectedAttributes[minAttr] = False
                    currentCoeffs = self.doRegression(
                        self.m_SelectedAttributes)
                    currentMSE = self.calculateSE(self.m_SelectedAttributes,
                                                  currentCoeffs)
                    currentAkaike = currentMSE / fullMSE * (
                        numInstances -
                        numAttributes) + 2 * currentNumAttributes
                    if currentAkaike < akaike:
                        improved = True
                        akaike = currentAkaike
                        self.m_Coefficients = currentCoeffs
                    else:
                        self.m_SelectedAttributes[minAttr] = True

    def calculateSE(self, selectedAttributes: List[bool],
                    coefficients: List[float]):
        mse = 0
        for i in range(self.m_TransformedData.numInstances()):
            prediction = self.regressionPrediction(
                self.m_TransformedData.instance(i), selectedAttributes,
                coefficients)
            error = prediction - self.m_TransformedData.instance(
                i).classValue()
            mse += error * error
        return mse

    def regressionPrediction(self, transformedInstance: Instance,
                             selectedAttributes: List[bool],
                             coefficients: List[float]):
        result = 0
        column = 0
        for j in range(transformedInstance.numAttributes()):
            if self.m_ClassIndex != j and selectedAttributes[j]:
                result += coefficients[column] * transformedInstance.value(j)
                column += 1
        result += coefficients[column]
        return result

    def deselectColinearAttributes(self, selectedAttributes: List[bool],
                                   coefficients: List[float]):
        maxSC = 1.5
        maxAttr = -1
        coeff = 0
        for i in range(len(selectedAttributes)):
            if selectedAttributes[i]:
                SC = math.fabs(coefficients[coeff] * self.m_StdDevs[i] /
                               self.m_ClassStdDev)
                if SC > maxSC:
                    maxSC = SC
                    maxAttr = i
                coeff += 1
        if maxAttr >= 0:
            selectedAttributes[maxAttr] = False
            return True
        return False

    def doRegression(self, selectedAttributes: List[bool]) -> List:
        numAttributes = 0
        for selectedAttribute in selectedAttributes:
            if selectedAttribute:
                numAttributes += 1
        coefficients = [0] * (numAttributes + 1)
        if numAttributes > 0:
            independentTransposed = np.zeros(
                (numAttributes, self.m_TransformedData.numInstances()))
            dependent = np.zeros(self.m_TransformedData.numInstances())
            for i in range(self.m_TransformedData.numInstances()):
                inst = self.m_TransformedData.instance(i)
                sqrt_weight = math.sqrt(inst.weight())
                index = 0
                for j in range(self.m_TransformedData.numAttributes()):
                    if j == self.m_ClassIndex:
                        dependent[i] = inst.classValue() * sqrt_weight
                    else:
                        if selectedAttributes[j]:
                            value = inst.value(j) - self.m_Means[j]
                            if not self.m_checksTurnedOff:
                                value /= self.m_StdDevs[j]
                            independentTransposed[index][
                                i] = value * sqrt_weight
                            index += 1

            aTy = np.dot(independentTransposed, dependent)
            aTa = np.around(
                np.dot(independentTransposed, independentTransposed.T), 2)
            ridge = self.getRidge()
            for i in range(numAttributes):
                aTa[i][i] += ridge
            coeffsWithoutIntercept = np.dot(aTy, np.linalg.pinv(aTa))
            if len(coeffsWithoutIntercept.shape) > 1:
                coefficients = coeffsWithoutIntercept[0].copy()
            else:
                coefficients = coeffsWithoutIntercept.copy()
        coefficients = np.append(coefficients, self.m_ClassMean)
        column = 0
        for i in range(self.m_TransformedData.numAttributes()):
            if i != self.m_TransformedData.classIndex(
            ) and selectedAttributes[i]:
                if not self.m_checksTurnedOff:
                    coefficients[column] /= self.m_StdDevs[i]
                coefficients[-1] -= coefficients[column] * self.m_Means[i]
                column += 1
        return coefficients

    def getRidge(self):
        return self.Ridge
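A minimal end-to-end sketch of the class above (hedged: `train` is assumed to be a numeric-class Instances object loaded elsewhere; setRidge takes its value as a string because options are dispatched through propertyList/methodList):

train.setClassIndex(train.numAttributes() - 1)
lr = LinearRegression()
lr.setRidge("1e-6")                              # optional: override the default ridge of 1e-8
lr.buildClassifier(train)
print(lr)                                        # formatted model from __str__
prediction = lr.classifyInstance(train.instance(0))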
Example #16
 def buildClassifier(self, data: Instances):
     self.m_ModelBuilt = False
     self.m_isZeroR = False
     if data.numInstances() == 1:
         self.m_Coefficients = [data.instance(0).classValue()]
         self.m_SelectedAttributes = [False] * data.numAttributes()
         self.m_isZeroR = True
         return
     if not self.m_checksTurnedOff:
         self.getCapabilities().testWithFail(data)
         if self.outputAdditionalStats:
             ok = True
             for i in range(data.numInstances()):
                 if data.instance(i).weight() != 1:
                     ok = False
                     break
             if not ok:
                 raise Exception(
                     "Can only compute additional statistics on unweighted data"
                 )
         data = Instances(data)
         data.deleteWithMissingClass()
         self.m_TransformFilter = NominalToBinary()
         self.m_TransformFilter.setInputFormat(data)
         data = Filter.useFilter(data, self.m_TransformFilter)
         self.m_MissingFilter = ReplaceMissingValues()
         self.m_MissingFilter.setInputFormat(data)
         data = Filter.useFilter(data, self.m_MissingFilter)
         data.deleteWithMissingClass()
     else:
         self.m_TransformFilter = None
         self.m_MissingFilter = None
     self.m_ClassIndex = data.classIndex()
     self.m_TransformedData = data
     self.m_Coefficients = None
     self.m_SelectedAttributes = [False] * data.numAttributes()
     self.m_Means = [0] * data.numAttributes()
     self.m_StdDevs = [0] * data.numAttributes()
     for j in range(data.numAttributes()):
         if j != self.m_ClassIndex:
             self.m_SelectedAttributes[j] = True
             self.m_Means[j] = data.meanOrMode(j)
             self.m_StdDevs[j] = math.sqrt(data.variance(j))
             if self.m_StdDevs[j] == 0:
                 self.m_SelectedAttributes[j] = False
     self.m_ClassStdDev = math.sqrt(
         data.variance(self.m_TransformedData.classIndex()))
     self.m_ClassMean = data.meanOrMode(self.m_TransformedData.classIndex())
     self.findBestModel()
     if self.outputAdditionalStats:
         k = 1
         for i in range(data.numAttributes()):
             if i != data.classIndex():
                 if self.m_SelectedAttributes[i]:
                     k += 1
         self.m_df = self.m_TransformedData.numInstances() - k
         se = self.calculateSE(self.m_SelectedAttributes,
                               self.m_Coefficients)
         self.m_RSquared = RegressionAnalysis.calculateRSquared(
             self.m_TransformedData, se)
         self.m_RSquaredAdj = RegressionAnalysis.calculateAdjRSquared(
             self.m_RSquared, self.m_TransformedData.numInstances(), k)
         self.m_FStat = RegressionAnalysis.calculateFStat(
             self.m_RSquared, self.m_TransformedData.numInstances(), k)
         self.m_StdErrorOfCoef = RegressionAnalysis.calculateStdErrorOfCoef(
             self.m_TransformedData, self.m_SelectedAttributes, se,
             self.m_TransformedData.numInstances(), k)
         self.m_TStats = RegressionAnalysis.calculateTStats(
             self.m_Coefficients, self.m_StdErrorOfCoef, k)
     if self.Minimal:
         self.m_TransformedData = None
         self.m_Means = None
         self.m_StdDevs = None
     else:
         self.m_TransformedData = Instances(data, 0)
     self.m_ModelBuilt = True
Example #17
class NormalizableDistance():
    R_MIN = 0
    R_MAX = 1
    R_WIDTH = 2

    def __init__(self, data: Instances = None):
        self.m_AttributeIndices = Range("first-last")
        self.m_DontNormalize = False
        self.m_Ranges = None  #type:List[List]
        self.m_ActiveIndices = None  #type:List
        if data is None:
            self.invalidate()
        else:
            self.setInstances(data)

    def invalidate(self):
        self.m_Validated = False

    def setInstances(self, inst: Instances):
        self.m_Data = inst
        self.invalidate()

    def clean(self):
        self.m_Data = Instances(self.m_Data, 0)

    def update(self, ins: Instance):
        # initialize ranges lazily before the first update
        self.validate()
        self.m_Ranges = self.updateRanges(ins, self.m_Ranges)

    @overload
    def distance(self, first: Instance, second: Instance):
        ...

    @overload
    def distance(self, first: Instance, second: Instance,
                 stats: PerformanceStats):
        ...

    @overload
    def distance(self, first: Instance, second: Instance, cutOffValue: float):
        ...

    @overload
    def distance(self, first: Instance, second: Instance, cutOffValue: float,
                 stats: PerformanceStats):
        ...

    def distance(self, first: Instance, second: Instance, a0=None, a1=None):
        if a0 is None or isinstance(a0, PerformanceStats):
            return self.distance(first, second, float("inf"), a0)
        elif isinstance(a0, float):
            distance = 0
            firstNumValues = first.numValues()
            secondNumValues = second.numValues()
            numAttributes = self.m_Data.numAttributes()
            classIndex = self.m_Data.classIndex()
            self.validate()
            p1 = p2 = 0
            while p1 < firstNumValues or p2 < secondNumValues:
                if p1 >= firstNumValues:
                    firstI = numAttributes
                else:
                    firstI = first.index(p1)
                if p2 >= secondNumValues:
                    secondI = numAttributes
                else:
                    secondI = second.index(p2)
                if firstI == classIndex:
                    p1 += 1
                    continue
                if firstI < numAttributes and not self.m_ActiveIndices[firstI]:
                    p1 += 1
                    continue
                if secondI == classIndex:
                    p2 += 1
                    continue
                if secondI < numAttributes and not self.m_ActiveIndices[
                        secondI]:
                    p2 += 1
                    continue
                if firstI == secondI:
                    diff = self.difference(firstI, first.valueSparse(p1),
                                           second.valueSparse(p2))
                    p1 += 1
                    p2 += 1
                elif firstI > secondI:
                    diff = self.difference(secondI, 0, second.valueSparse(p2))
                    p2 += 1
                else:
                    diff = self.difference(firstI, first.valueSparse(p1), 0)
                    p1 += 1
                if isinstance(a1, PerformanceStats):
                    a1.incrCoordCount()
                distance = self.updateDistance(distance, diff)
                if distance > a0:
                    return float('inf')
            return distance

    def updateDistance(self, currDist: float, diff: float) -> float:
        ...

    def difference(self, index: int, val1: float, val2: float):
        if self.m_Data.attribute(index).type() == Attribute.NOMINAL:
            if Utils.isMissingValue(val1) or Utils.isMissingValue(
                    val2) or int(val1) != int(val2):
                return 1
            return 0
        elif self.m_Data.attribute(index).type() == Attribute.NUMERIC:
            if Utils.isMissingValue(val1) or Utils.isMissingValue(val2):
                if Utils.isMissingValue(val1) and Utils.isMissingValue(val2):
                    if not self.m_DontNormalize:
                        return 1
                    return self.m_Ranges[index][self.R_WIDTH]
                else:
                    if Utils.isMissingValue(val2):
                        diff = self.norm(
                            val1, index) if not self.m_DontNormalize else val1
                    else:
                        diff = self.norm(
                            val2, index) if not self.m_DontNormalize else val2
                    if not self.m_DontNormalize and diff < 0.5:
                        diff = 1 - diff
                    elif self.m_DontNormalize:
                        if (self.m_Ranges[index][self.R_MAX] - diff) > (
                                diff - self.m_Ranges[index][self.R_MIN]):
                            return self.m_Ranges[index][self.R_MAX] - diff
                        else:
                            return diff - self.m_Ranges[index][self.R_MIN]
                    return diff
            else:
                if not self.m_DontNormalize:
                    return self.norm(val1, index) - self.norm(val2, index)
                return val1 - val2
        else:
            return 0

    def norm(self, x: float, i: int):
        if self.m_Ranges[i][self.R_WIDTH] == 0:
            return 0
        return (x -
                self.m_Ranges[i][self.R_MIN]) / self.m_Ranges[i][self.R_WIDTH]

    def validate(self):
        if not self.m_Validated:
            self.initialize()
            self.m_Validated = True

    def initialize(self):
        self.initializeAttributeIndices()
        self.initializeRanges()

    def initializeAttributeIndices(self):
        self.m_AttributeIndices.setUpper(self.m_Data.numAttributes() - 1)
        self.m_ActiveIndices = []
        for i in range(self.m_Data.numAttributes()):
            self.m_ActiveIndices.append(self.m_AttributeIndices.isInRange(i))

    def initializeRanges(self) -> List[List]:
        if self.m_Data is None:
            self.m_Ranges = None
            return self.m_Ranges
        numAtt = self.m_Data.numAttributes()
        ranges = [[0] * 3 for i in range(numAtt)]
        if self.m_Data.numInstances() <= 0:
            self.initializeRangesEmpty(numAtt, ranges)
            self.m_Ranges = ranges
            return self.m_Ranges
        else:
            self.updateRangesFirst(self.m_Data.instance(0), numAtt, ranges)
        for i in range(self.m_Data.numInstances()):
            self.updateRanges(self.m_Data.instance(i), ranges)
        self.m_Ranges = ranges
        return self.m_Ranges

    def initializeRangesEmpty(self, numAtt: int, ranges: List[List]):
        for j in range(numAtt):
            ranges[j][self.R_MIN] = float('inf')
            ranges[j][self.R_MAX] = float('-inf')
            ranges[j][self.R_WIDTH] = float('inf')

    def updateRangesFirst(self, instance: Instance, numAtt: int,
                          ranges: List[List]):
        # seed the ranges from the first instance: attributes without an
        # observed value keep open (infinite) bounds, observed values collapse
        # min and max onto that value
        for j in range(numAtt):
            ranges[j][self.R_MIN] = float('inf')
            ranges[j][self.R_MAX] = float('-inf')
            ranges[j][self.R_WIDTH] = float('inf')
        numVals = instance.numValues()
        for j in range(numVals):
            currIndex = instance.index(j)
            if not instance.isMissingSparse(j):
                val = instance.valueSparse(j)
                ranges[currIndex][self.R_MIN] = val
                ranges[currIndex][self.R_MAX] = val
                ranges[currIndex][self.R_WIDTH] = 0

    def updateRanges(self, instance: Instance, ranges: List[List[float]]):
        numVals = instance.numValues()
        prevIndex = 0
        for j in range(numVals):
            currIndex = instance.index(j)
            while prevIndex < currIndex:
                if 0 < ranges[prevIndex][self.R_MIN]:
                    ranges[prevIndex][self.R_MIN] = 0
                    ranges[prevIndex][self.R_WIDTH] = ranges[prevIndex][
                        self.R_MAX] - ranges[prevIndex][self.R_MIN]
                if 0 > ranges[prevIndex][self.R_MAX]:
                    ranges[prevIndex][self.R_MAX] = 0
                    ranges[prevIndex][self.R_WIDTH] = ranges[prevIndex][
                        self.R_MAX] - ranges[prevIndex][self.R_MIN]
                prevIndex += 1
            prevIndex += 1
            if not instance.isMissingSparse(j):
                val = instance.valueSparse(j)
                if val < ranges[currIndex][self.R_MIN]:
                    ranges[currIndex][self.R_MIN] = val
                    ranges[currIndex][self.R_WIDTH] = ranges[currIndex][
                        self.R_MAX] - ranges[currIndex][self.R_MIN]
                if val > ranges[currIndex][self.R_MAX]:
                    ranges[currIndex][self.R_MAX] = val
                    ranges[currIndex][self.R_WIDTH] = ranges[currIndex][
                        self.R_MAX] - ranges[currIndex][self.R_MIN]
        return ranges
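updateDistance is left abstract above; a concrete distance only has to say how a per-attribute difference accumulates. A minimal Euclidean-style sketch (the subclass name and the squared accumulation are assumptions for illustration, not part of the code above):

class SquaredEuclideanDistance(NormalizableDistance):
    # accumulate squared per-attribute differences; callers that want the true
    # Euclidean distance would take the square root of the returned total
    def updateDistance(self, currDist: float, diff: float) -> float:
        return currDist + diff * diff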
Example #18
class KNN(AbstractClassifier):
    WEIGHT_NONE=0
    WEIGHT_INVERSE=1
    WEIGHT_SIMILARITY=2
    TAGS_WEIGHTING=[Tag(WEIGHT_NONE,"No distance weighting"),
                    Tag(WEIGHT_INVERSE,"Weight by 1/distance"),
                    Tag(WEIGHT_SIMILARITY,"Weight by 1-distance")]
    propertyList={"kNN":"1","DistanceWeighting":"TAGS_WEIGHTING"}
    methodList = {"kNN":"setkNN","DistanceWeighting":"setDistanceWeighting"}
    def __init__(self,k:int=None):
        super().__init__()
        self.m_NNSearch=LinearNNSearch()
        self.m_Train=None   #type:Instances
        self.initilize()
        if k is not None:
            self.setKNN(k)

    def __str__(self):
        if self.m_Train is None:
            return "IBk: No model built yet."
        if self.m_Train.numInstances() == 0:
            return "Warning: no training instances - ZeroR model used."
        # TODO: advanced options
        result="IB1 instance-based classifier\n" +"using " + str(self.kNN)
        if self.DistanceWeighting == self.WEIGHT_INVERSE:
            result+=" inverse-distance-weighted"
        elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
            result+= " similarity-weighted"
        result+=" nearest neighbour(s) for classification\n"
        if self.WindowSize != 0:
            result+="using a maximum of " + str(self.WindowSize) + " (windowed) training instances\n"
        return result

    def setkNN(self,value:str):
        #string-based property setter (see methodList); delegates to setKNN to keep k-related state in sync
        try:
            val=int(value)
            self.setKNN(val)
            self.propertyList.update({"kNN":value})
        except ValueError:
            pass    #ignore values that cannot be parsed as an integer

    def setDistanceWeighting(self,value:int):
        self.DistanceWeighting=self.TAGS_WEIGHTING[value].getID()


    def initialize(self):
        self.setKNN(1)
        #window of training instances used for classification; 0 means the whole training set
        self.WindowSize=0
        self.DistanceWeighting=self.WEIGHT_NONE
        self.CrossValidate=False
        self.MeanSquared=False

    def setKNN(self,k:int):
        self.kNN=k
        self.m_kNNUpper=k
        self.m_kNNValid=False

    def getKNN(self):
        return self.kNN

    def getCapabilities(self):
        result=super().getCapabilities()
        result.disableAll()
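        #enable only the attribute and class types this KNN/IBk port supports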

        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)

        result.enable(CapabilityEnum.NOMINAL_CLASS)
        result.enable(CapabilityEnum.NUMERIC_CLASS)
        result.enable(CapabilityEnum.DATE_CLASS)
        result.enable(CapabilityEnum.MISSING_CLASS_VALUES)

        result.setMinimumNumberInstances(0)
        return result

    def buildClassifier(self,data:Instances):
        self.getCapabilities().testWithFail(data)
        instances=Instances(data)
        instances.deleteWithMissingClass()

        self.m_NumClasses=instances.numClasses()
        self.m_ClassType=instances.classAttribute().type()
        self.m_Train=Instances(instances,0,instances.numInstances())
        #IBk is lazy: only the training instances are stored
        if self.WindowSize > 0 and instances.numInstances() > self.WindowSize:
            self.m_Train=Instances(self.m_Train,self.m_Train.numInstances()-self.WindowSize,self.WindowSize)
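        #count the non-class nominal/numeric attributes; used to renormalise distances in makeDistribution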
        self.m_NumAttributesUsed=0
        for i in range(self.m_Train.numAttributes()):
            if i != self.m_Train.classIndex() and (self.m_Train.attribute(i).isNominal() or  self.m_Train.attribute(i).isNumeric()):
                self.m_NumAttributesUsed+=1
        self.m_NNSearch.setInstances(self.m_Train)
        self.m_kNNValid=False
        self.m_defaultModel=ZeroR()
        self.m_defaultModel.buildClassifier(instances)


    def distributionForInstance(self,instance:Instance)->List[float]:
        if self.m_Train.numInstances() == 0:
            return self.m_defaultModel.distributionForInstance(instance)
        #if the window size is exceeded, drop the oldest training instances
        if self.WindowSize > 0 and self.m_Train.numInstances() > self.WindowSize:
            self.m_kNNValid=False
            deletedInstance=False
            while(self.m_Train.numInstances()>self.WindowSize):
                self.m_Train.delete(0)
                deletedInstance=True
            #rebuild the neighbour search structure only if instances were removed
            if deletedInstance is True:
                self.m_NNSearch.setInstances(self.m_Train)
        if not self.m_kNNValid and self.CrossValidate and self.m_kNNUpper>=1:
            #TODO cross-validation to select the best k is not implemented yet
            pass
        self.m_NNSearch.addInstanceInfo(instance)
        #get the k nearest neighbours and their distances
        neighbours=self.m_NNSearch.kNearestNeighbours(instance,self.kNN)
        distances=self.m_NNSearch.getDistances()
        distribution=self.makeDistribution(neighbours,distances)
        return distribution

    #build the class distribution from the k nearest neighbours
    def makeDistribution(self,neighbours:Instances,distances:List)->List[float]:
        distribution=[0]*self.m_NumClasses
        total=0
        if self.m_ClassType == Attribute.NOMINAL:
            for i in range(self.m_NumClasses):
                distribution[i]=1/max(1,self.m_Train.numInstances())
            total=self.m_NumClasses/max(1,self.m_Train.numInstances())
        for i in range(neighbours.numInstances()):
            current=neighbours.instance(i)
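            #convert the distance to a per-attribute root-mean-square value before weighting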
            distances[i]=distances[i]*distances[i]
            distances[i]=math.sqrt(distances[i]/self.m_NumAttributesUsed)
            if self.DistanceWeighting == self.WEIGHT_INVERSE:
                weight=1/(distances[i]+0.001)   #small constant avoids division by zero
            elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
                weight=1-distances[i]
            else:
                weight=1
            weight*=current.weight()
            if self.m_ClassType == Attribute.NOMINAL:
                distribution[int(current.classValue())]+=weight
            elif self.m_ClassType == Attribute.NUMERIC:
                distribution[0]+=current.classValue()*weight
            total+=weight
        if total > 0:
            Utils.normalize(distribution, total)
        return distribution
Ejemplo n.º 19
0
    def testInstances(self, data: Instances, *args):
        if len(args) == 0:
            return self.testInstances(data, 0, data.numAttributes() - 1)
        fromIndex = args[0]
        toIndex = args[1]
        if self.doNotCheckCapabilities():
            return True
        if len(self.m_Capabilities) == 0 or (len(self.m_Capabilities) == 1
                                             and self.handles(
                                                 CapabilityEnum.NO_CLASS)):
            sys.stderr.write("No capabilities set!\n")
        if toIndex - fromIndex < 0:
            self.m_FailReason = CapabilityError("No attributes!")
            return False
        testClass = data.classIndex() > -1 and data.classIndex(
        ) >= fromIndex and data.classIndex() <= toIndex
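        #check every attribute in the tested range; the class attribute is tested separately below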
        for i in range(fromIndex, toIndex + 1):
            att = data.attribute(i)
            if i == data.classIndex():
                continue
            if not self.testAttribute(att):
                return False
        if not self.handles(
                CapabilityEnum.NO_CLASS) and data.classIndex() == -1:
            self.m_FailReason = CapabilityError("Class attribute not set!")
            return False

        if self.handles(CapabilityEnum.NO_CLASS) and data.classIndex() > -1:
            cap = self.getClassCapabilities()
            cap.disable(CapabilityEnum.NO_CLASS)
            iter = cap.capabilities()
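            #apart from NO_CLASS, at least one class capability must remain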
            if len(iter) == 0:
                self.m_FailReason = CapabilityError(
                    "Cannot handle any class attribute!")
                return False
        if testClass and not self.handles(CapabilityEnum.NO_CLASS):
            att = data.classAttribute()
            if not self.testAttribute(att, True):
                return False
            if not self.handles(CapabilityEnum.MISSING_CLASS_VALUES):
                for i in range(data.numInstances()):
                    if data.instance(i).classIsMissing():
                        self.m_FailReason = CapabilityError(
                            "Cannot handle missing class values!")
                        return False
            else:
                hasClass = 0
                for i in range(data.numInstances()):
                    if not data.instance(i).classIsMissing():
                        hasClass += 1
                if hasClass < self.getMinimumNumberInstances():
                    self.m_FailReason=CapabilityError("Not enough training instances with class labels (required: "\
                                                      + str(self.getMinimumNumberInstances())\
                                                      + ", provided: "\
                                                      + str(hasClass)\
                                                      + ")!")
                    return False
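        #if the handler cannot deal with missing values, scan every instance for them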
        missing = False
        for i in range(data.numInstances()):
            inst = data.instance(i)
            if not self.handles(CapabilityEnum.MISSING_VALUES):
                #TODO handle sparse instances
                # if isinstance(inst)
                #     pass
                #else
                for n in range(fromIndex, toIndex + 1):
                    if n == inst.classIndex():
                        continue
                    if inst.isMissing(n):
                        missing = True
                        break
                if missing:
                    self.m_FailReason = CapabilityError(
                        "Cannot handle missing values!")
                    return False
        if data.numInstances() < self.getMinimumNumberInstances():
            self.m_FailReason = CapabilityError(
                "Not enough training instances (required: " +
                str(self.getMinimumNumberInstances()) + ", provided: " +
                str(data.numInstances()) + ")!")
            return False
        # if self.handles(CapabilityEnum.ONLY_MULTIINSTANCE):
        #     if data.numAttributes() != 3:
        #         return False
        #     if not data.attribute(0).isNominal() or data.classIndex() != data.numAttributes()-1:
        #         return False
        #     owner=self.getOwner()
        #     if isinstance(owner,MultiInstanceCapabilitiesHandler):
        #         handler=owner
        #         cap=handler.getMultiInstanceCapabilities()
        #         if data.numInstances()>0 and data.attribute(1).numValues()>0:
        #             result=cap.testAttribute(data.attribute(1))
        return True