コード例 #1
0
 def makeDistribution(self,neighbours:Instances,distances:List)->List[float]:
     """Turn a set of neighbours into a class distribution.

     For a nominal class every slot starts with a prior of
     ``1 / max(1, self.m_Train.numInstances())``; each neighbour then adds
     its (optionally distance-weighted) instance weight to the slot of its
     class. For a numeric class the weighted class values are summed in
     slot 0. The result is passed to ``Utils.normalize`` together with the
     accumulated total weight whenever that total is positive.

     Args:
         neighbours: the neighbouring instances.
         distances: one distance per neighbour. NOTE: the entries are
             rescaled in place (divided by ``m_NumAttributesUsed`` under
             the square root).

     Returns:
         The distribution list of length ``self.m_NumClasses``.
     """
     distribution=[0]*self.m_NumClasses
     total=0
     if self.m_ClassType == Attribute.NOMINAL:
         # Small uniform prior so that no class ends up with exactly zero mass.
         for i in range(self.m_NumClasses):
             distribution[i]=1/max(1,self.m_Train.numInstances())
         total=self.m_NumClasses/max(1,self.m_Train.numInstances())
     for i in range(neighbours.numInstances()):
         current=neighbours.instance(i)
         distances[i]=distances[i]*distances[i]
         distances[i]=math.sqrt(distances[i]/self.m_NumAttributesUsed)
         if self.DistanceWeighting == self.WEIGHT_INVERSE:
             # The 0.001 offset (as in Weka's IBk) keeps a neighbour at
             # distance 0 from raising ZeroDivisionError.
             weight=1/(distances[i]+0.001)
         elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
             weight=1-distances[i]
         else:
             weight=1
         weight*=current.weight()
         if self.m_ClassType == Attribute.NOMINAL:
             distribution[int(current.classValue())]+=weight
         elif self.m_ClassType == Attribute.NUMERIC:
             distribution[0]+=current.classValue()*weight
         total+=weight
     if total > 0:
         Utils.normalize(distribution, total)
     return distribution
コード例 #2
0
 def initialize(self, data: Instances, type: int, indices: List[int]):
     """Store the locator configuration and compute the index lists.

     Args:
         data: dataset the locator works on.
         type: attribute type to look for (stored as ``m_Type``).
         indices: attribute indices to consider; a deep copy is stored so
             later changes by the caller do not affect this object.
     """
     # presumably a header-only copy (capacity 0) of ``data`` — TODO confirm
     self.m_Data = Instances(data, 0)
     self.m_Type = type
     self.m_AllowedIndices = copy.deepcopy(indices)
     # The fields above are assigned before locate() runs; find() is called
     # afterwards, once with True and once with False.
     self.locate()
     self.m_Indices = self.find(True)
     self.m_LocatorIndices = self.find(False)
コード例 #3
0
 def setAttributes(self, inst: Instances, pos: int = -1):
     """Build the edit form with one labelled input widget per attribute.

     Nominal attributes get a combo box (an empty entry followed by the
     attribute's values), numeric attributes a line edit restricted by a
     ``QDoubleValidator``, and everything else a plain line edit. A row
     with submit and cancel buttons is appended at the bottom.

     Args:
         inst: dataset whose attributes define the form fields.
         pos: stored as ``m_InsertPos``; defaults to -1.
     """
     self.m_Instance = inst
     self.m_InsertPos = pos
     self.m_WidgetList = []

     form = QFormLayout()
     form.setLabelAlignment(Qt.AlignRight)
     form.setContentsMargins(20, 20, 20, 20)
     form.setSpacing(15)

     for index in range(inst.numAttributes()):
         attribute = inst.attribute(index)
         caption = QLabel(attribute.name())
         if attribute.isNominal():
             editor = QComboBox()
             editor.addItem("")
             editor.addItems(attribute.values())
         else:
             editor = QLineEdit()
             if attribute.isNumeric():
                 editor.setPlaceholderText("输入数字")
                 editor.setValidator(QDoubleValidator(self))
         self.m_WidgetList.append(editor)
         form.addRow(caption, editor)

     # Button row: submit triggers submitClick, cancel closes the widget.
     submit = QPushButton("提交")
     submit.clicked.connect(self.submitClick)
     cancel = QPushButton("取消")
     cancel.clicked.connect(self.close)
     buttonRow = QHBoxLayout()
     for button in (submit, cancel):
         buttonRow.addWidget(button)
     container = QWidget()
     container.setLayout(buttonRow)
     form.addRow(container)
     self.setLayout(form)
コード例 #4
0
 def determineFormat(self):
     """Create the (empty) plot dataset that carries prediction attributes.

     When ``m_SaveForVisualization`` is false, ``m_PlotInstances`` is set to
     None. Otherwise the attributes of ``m_Instances`` are copied and,
     directly in front of the class attribute, a "predicted" attribute is
     inserted (preceded by a "prediction margin" attribute when the class is
     nominal). The class index of the new dataset is shifted accordingly.
     """
     if not self.m_SaveForVisualization:
         self.m_PlotInstances = None
         return

     source = self.m_Instances
     classAt = source.attribute(self.m_ClassIndex)
     nominalClass = classAt.isNominal()

     margin = None  #type:Attribute
     if nominalClass:
         labels = [classAt.value(v) for v in range(classAt.numValues())]
         predictedClass = Attribute("predicted " + classAt.name(), labels)
         margin = Attribute("prediction margin")
     else:
         predictedClass = Attribute("predicted" + classAt.name())

     hv = []  #type:List[Attribute]
     for position in range(source.numAttributes()):
         if position == source.classIndex():
             # Add the prediction attributes right before the class.
             if nominalClass:
                 hv.append(margin)
             hv.append(predictedClass)
         hv.append(source.attribute(position).copy())

     self.m_PlotInstances = Instances(
         source.relationName() + "_predicted", hv, source.numInstances())
     # One inserted attribute for a numeric class, two for a nominal one.
     shift = 2 if nominalClass else 1
     self.m_PlotInstances.setClassIndex(self.m_ClassIndex + shift)
コード例 #5
0
ファイル: ThresholdCurve.py プロジェクト: ccreeper/weka-forpy
 def getROCArea(cls, tcurve: Instances):
     """Compute the area under the ROC curve from threshold-curve data.

     Args:
         tcurve: dataset whose relation name must equal
             ``cls.RELATION_NAME`` and which holds the true-positive and
             false-positive columns named by ``cls.TRUE_POS_NAME`` and
             ``cls.FALSE_POS_NAME``.

     Returns:
         The area divided by ``totalNeg * totalPos`` (both taken from the
         first row). NaN is returned for an empty or foreign dataset; when
         the divisor is zero the result is nan / inf / -inf depending on
         the sign of the accumulated area.
     """
     n = tcurve.numInstances()
     if cls.RELATION_NAME != tcurve.relationName() or n == 0:
         return float('nan')

     tpVals = tcurve.attributeToDoubleArray(
         tcurve.attribute(cls.TRUE_POS_NAME).index())
     fpVals = tcurve.attributeToDoubleArray(
         tcurve.attribute(cls.FALSE_POS_NAME).index())
     totalPos, totalNeg = tpVals[0], fpVals[0]

     area = 0
     cumNeg = 0
     for row in range(n):
         isLastRow = row == n - 1
         # Positives / negatives contributed by this threshold step; the
         # last row contributes whatever count is still left.
         cip = tpVals[row] if isLastRow else tpVals[row] - tpVals[row + 1]
         cin = fpVals[row] if isLastRow else fpVals[row] - fpVals[row + 1]
         area += cip * (cumNeg + (0.5 * cin))
         cumNeg += cin

     divisor = totalNeg * totalPos
     if divisor != 0:
         return area / divisor
     # Division by zero: emulate floating-point semantics explicitly.
     if area == 0:
         return float("nan")
     return float("inf") if area > 0 else float("-inf")
コード例 #6
0
    def __init__(self,data:Instances):
        """Initialise all evaluation statistics for the given dataset.

        Args:
            data: dataset providing the header, the number of classes, the
                class attribute and the input for ``setPriors``.
        """
        self.m_Header=Instances(data,0)
        self.m_NumClasses=data.numClasses()
        self.m_NumFolds=1
        self.m_metricsToDisplay=[]
        self.m_ClassIsNominal=data.classAttribute().isNominal()

        # Every running counter / accumulator starts at zero.
        for counter in ("m_WithClass","m_Unclassified","m_SumKBInfo",
                        "m_SumSchemeEntropy","m_SumPriorEntropy",
                        "m_SumErr","m_SumAbsErr","m_SumSqrErr",
                        "m_SumPriorAbsErr","m_SumPriorSqrErr",
                        "m_TotalCoverage","m_TotalSizeOfRegions",
                        "m_MissingClass","m_Incorrect","m_Correct",
                        "m_SumClass","m_SumSqrClass","m_SumPredicted",
                        "m_SumSqrPredicted","m_SumClassPredicted"):
            setattr(self,counter,0)

        self.m_ConfLevel=0.95
        self.m_DiscardPredictions=False
        self.m_CoverageStatisticsAvailable=True
        self.m_ComplexityStatisticsAvailable=True
        self.m_Predictions=None     #type:List[Prediction]

        if self.m_ClassIsNominal:
            # Square matrix indexed by two class values.
            self.m_ConfusionMatrix=[[0]*self.m_NumClasses
                                    for _ in range(self.m_NumClasses)]       #type:List[List[float]]
            self.m_ClassNames=[data.classAttribute().value(c)
                               for c in range(self.m_NumClasses)]        #type:List[str]

        self.m_ClassPriors=[0]*self.m_NumClasses       #type:List[float]
        self.setPriors(data)
        self.m_MarginCounts=[0]*(self.k_MarginResolution+1)

        # All built-in metrics are displayed except the two coverage ones.
        hidden=("coverage","region size")
        lowered=(metric.lower() for metric in self.BUILT_IN_EVAL_METRICS)
        self.m_metricsToDisplay.extend(
            name for name in lowered if name not in hidden)
コード例 #7
0
def calculateRSquared(data: Instances, ssr: float):
    """Return the coefficient of determination ``1 - ssr / tss``.

    ``tss`` is the total sum of squared deviations of the class values from
    the value returned by ``data.meanOrMode(classIndex)``.

    Args:
        data: dataset with the class index set.
        ssr: sum of squared residuals of the fitted model.

    Returns:
        The R^2 value. When ``tss`` is zero (constant class, or no
        instances) the division is undefined; instead of raising
        ZeroDivisionError the IEEE-754 style result is returned: nan when
        ``ssr`` is 0 or nan, -inf for positive ``ssr`` and inf for
        negative ``ssr``.
    """
    classIndex = data.classIndex()
    yMean = data.meanOrMode(classIndex)
    tss = 0
    for i in range(data.numInstances()):
        deviation = data.instance(i).value(classIndex) - yMean
        tss += deviation * deviation
    if tss == 0:
        if ssr == 0 or ssr != ssr:  # ssr != ssr is true only for nan
            return float("nan")
        return float("-inf") if ssr > 0 else float("inf")
    return 1 - ssr / tss
コード例 #8
0
ファイル: C45Split.py プロジェクト: ccreeper/weka-forpy
 def rightSide(self, index: int, data: Instances):
     """Return the right-hand side of the split test for branch ``index``.

     For a nominal split attribute this is ``" = <value>"``; otherwise it is
     ``" <= <split point>"`` for branch 0 and ``" > <split point>"`` for any
     other branch, with the split point formatted by
     ``Utils.doubleToString(..., 6)``.
     """
     attribute = data.attribute(self.m_attIndex)
     if attribute.isNominal():
         return " = " + attribute.value(index)
     comparison = " <= " if index == 0 else " > "
     return comparison + Utils.doubleToString(self.m_splitPoint, 6)
コード例 #9
0
 def setInputFormat(self, instanceInfo: Instances):
     """Register the input format and prepare the binary output format.

     Args:
         instanceInfo: structure of the incoming data; must have a class
             index assigned.

     Returns:
         True when the class attribute is nominal, False otherwise.

     Raises:
         Exception: if no class index is set on ``instanceInfo``.
     """
     super().setInputFormat(instanceInfo)
     hasClass = instanceInfo.classIndex() >= 0
     if not hasClass:
         raise Exception("No class has been assigned to the instances")
     self.setOutputFormatBinary()
     self.m_Indices = None
     return True if instanceInfo.classAttribute().isNominal() else False
コード例 #10
0
 def buildClassifier(self, data: Instances):
     """Build the tree on a copy of ``data`` and post-process it.

     The copy has its missing-class instances removed before the tree is
     built. Collapsing, pruning and cleanup then run in that order, each
     only when its corresponding flag is set.
     """
     training = Instances(data)
     training.deleteWithMissingClass()
     # Second argument of buildTree: true when subtree raising is on or
     # cleanup is off.
     keepData = self.m_subtreeRaising or not self.m_cleanup
     self.buildTree(training, keepData)
     if self.m_collapseTheTree:
         self.collapse()
     if self.m_pruneTheTree:
         self.prune()
     if self.m_cleanup:
         header = Instances(training, 0)
         self.cleanup(header)
コード例 #11
0
ファイル: Filter.py プロジェクト: ccreeper/weka-forpy
 def setOutputFormat(self, outputFormat: Instances = None):
     """Set (or clear) the output format and reset the output queue.

     Args:
         outputFormat: new output structure, or None to clear it. When
             given, its ``stringFreeStructure()`` is stored, the output
             locators are initialised from it and the relation name gets
             ``"-<class name of this filter>"`` appended.
     """
     if outputFormat is None:
         self.m_OutputFormat = None
     else:
         self.m_OutputFormat = outputFormat.stringFreeStructure()
         self.initOutputLocators(self.m_OutputFormat)
         baseName = outputFormat.relationName()
         self.m_OutputFormat.setRelationName(
             baseName + "-" + self.__class__.__name__)
     # A fresh queue is created in either case.
     self.m_OutputQueue = Queue()
コード例 #12
0
 def removeClass(self, inst: Instances):
     """Return ``inst`` without its class attribute.

     When no class index is set the dataset is returned unchanged;
     otherwise a ``Remove`` filter configured with the 1-based class index
     is applied through ``Filter.useFilter``.
     """
     remover = Remove()
     classIndex = inst.classIndex()
     if classIndex < 0:
         return inst
     remover.setAttributeIndices(str(classIndex + 1))
     remover.setInvertSelection(False)
     remover.setInputFormat(inst)
     return Filter.useFilter(inst, remover)
コード例 #13
0
ファイル: C45Split.py プロジェクト: ccreeper/weka-forpy
 def handleNumericAttribute(self, trainInstances: Instances):
     """Search the best binary split point on a numeric attribute.

     Instances are consumed up to the first one whose split attribute is
     missing (assumes the caller sorted ``trainInstances`` on that
     attribute with missing values last — TODO confirm). On success
     ``m_numSubsets``, ``m_splitPoint``, ``m_distribution``, ``m_infoGain``
     and ``m_gainRatio`` are updated; otherwise the method returns early.

     Args:
         trainInstances: the training data.
     """
     def atLeast(a, b):
         # "greater or equal" under the tolerance used by Utils
         return Utils.gr(a, b) or Utils.equal(a, b)

     nextIdx = 1
     lastIdx = 0
     splitIndex = -1
     numClasses = trainInstances.numClasses()

     # Put every instance with a known value into bag 1.
     self.m_distribution = Distribution(2, numClasses)
     firstMiss = 0
     for inst in trainInstances:
         if inst.isMissing(self.m_attIndex):
             break
         self.m_distribution.add(1, inst)
         firstMiss += 1

     # Minimum number of instances required per subset: 10% of the average
     # class size, clamped to the range [m_minNoObj, 25].
     minSplit = 0.1 * self.m_distribution.total() / numClasses
     if atLeast(self.m_minNoObj, minSplit):
         minSplit = self.m_minNoObj
     elif Utils.gr(minSplit, 25):
         minSplit = 25

     # Not enough instances for two subsets of the minimum size.
     if Utils.gr(2 * minSplit, firstMiss):
         return

     defaultEnt = self.infoGainCrit.oldEnt(self.m_distribution)
     while nextIdx < firstMiss:
         # A split is only possible between two distinct values.
         if trainInstances.instance(nextIdx - 1).value(
                 self.m_attIndex) + 1e-5 < trainInstances.instance(
                     nextIdx).value(self.m_attIndex):
             # Move the instances up to the candidate point into bag 0.
             self.m_distribution.shiftRange(1, 0, trainInstances, lastIdx,
                                            nextIdx)
             if atLeast(self.m_distribution.perBag(0), minSplit) \
                     and atLeast(self.m_distribution.perBag(1), minSplit):
                 currentInfoGain = self.infoGainCrit.splitCritValue(
                     self.m_distribution, self.m_sumOfWeights, defaultEnt)
                 if Utils.gr(currentInfoGain, self.m_infoGain):
                     self.m_infoGain = currentInfoGain
                     splitIndex = nextIdx - 1
                 self.m_index += 1
             lastIdx = nextIdx
         nextIdx += 1

     # No candidate split satisfied the size constraint.
     if self.m_index == 0:
         return
     if self.m_useMDLcorrection:
         self.m_infoGain = self.m_infoGain - (Utils.log2(self.m_index) /
                                              self.m_sumOfWeights)
     if atLeast(0, self.m_infoGain):
         return

     self.m_numSubsets = 2
     lowerValue = trainInstances.instance(splitIndex).value(self.m_attIndex)
     upperValue = trainInstances.instance(splitIndex + 1).value(
         self.m_attIndex)
     self.m_splitPoint = (upperValue + lowerValue) / 2
     # If rounding made the midpoint equal to the upper value, fall back to
     # the lower value.
     if self.m_splitPoint == upperValue:
         self.m_splitPoint = lowerValue

     # Rebuild the distribution for the chosen split.
     self.m_distribution = Distribution(2, numClasses)
     self.m_distribution.addRange(0, trainInstances, 0, splitIndex + 1)
     self.m_distribution.addRange(1, trainInstances, splitIndex + 1,
                                  firstMiss)
     self.m_gainRatio = self.gainRatioCrit.splitCritValue(
         self.m_distribution, self.m_sumOfWeights, self.m_infoGain)
コード例 #14
0
    def evaluateClusterer(self, test: Instances, outputModel: bool):
        """Cluster the test data and append a summary to the result text.

        Each instance (with the class attribute removed when a class is
        set) is assigned to a cluster by ``self.m_Clusterer``. The
        assignments are stored in ``m_clusterAssignments`` and per-cluster
        counts and percentages are appended to ``m_clusteringResult``. An
        assignment of -1 counts as unclustered. When a class is set,
        ``evaluateClustersWithRespectToClass`` is called at the end.

        Args:
            test: the data to cluster.
            outputModel: whether the string form of the clusterer is
                appended to the result text first.

        Raises:
            Exception: if the class attribute is numeric.
        """
        def fieldWidth(count):
            # Digits needed to print ``count``; guards against log(0).
            if count <= 0:
                return 1
            return int(math.log(count) / math.log(10) + 1)

        loglk = 0
        unclusteredInstances = 0
        cc = self.m_Clusterer.numberOfClusters()
        self.m_numClusters = cc
        instanceStats = [0] * cc
        hasClass = test.classIndex() >= 0
        clusterAssignments = []
        classRemover = None  #type:Filter

        testRaw = copy.deepcopy(test)
        testRaw.setClassIndex(test.classIndex())

        if hasClass:
            if testRaw.classAttribute().isNumeric():
                raise Exception("ClusterEvaluation: Class must be nominal!")
            # The class attribute must not be visible to the clusterer.
            classRemover = Remove()
            classRemover.setAttributeIndices(str(testRaw.classIndex() + 1))
            classRemover.setInvertSelection(False)
            classRemover.setInputFormat(testRaw)

        for inst in testRaw:
            if classRemover is not None:
                classRemover.input(inst)
                classRemover.batchFinished()
                inst = classRemover.output()
            cnum = self.m_Clusterer.clusterInstance(inst)
            clusterAssignments.append(cnum)
            if cnum != -1:
                instanceStats[cnum] += 1
            else:
                unclusteredInstances += 1

        sumNum = sum(instanceStats)
        # No log-likelihood is accumulated here, so this is 0 unless no
        # instance was clustered, in which case it is undefined (nan).
        loglk = loglk / sumNum if sumNum > 0 else float("nan")
        self.m_logL = loglk
        self.m_clusterAssignments = list(clusterAssignments)

        numInstFieldWidth = fieldWidth(len(clusterAssignments))
        if outputModel:
            self.m_clusteringResult += str(self.m_Clusterer)
        self.m_clusteringResult += "Clustered Instances\n\n"
        clustFieldWidth = fieldWidth(cc)
        for i in range(cc):
            if instanceStats[i] > 0:
                self.m_clusteringResult+= Utils.doubleToString(i, clustFieldWidth, 0) \
                                          +"      " \
                                          + Utils.doubleToString(instanceStats[i], numInstFieldWidth, 0) \
                                          +"(" + Utils.doubleToString((instanceStats[i] / sumNum * 100), 3, 0) \
                                          +"%)\n"
        if unclusteredInstances > 0:
            self.m_clusteringResult += "\nUnclustered instances : " + str(
                unclusteredInstances)
        if hasClass:
            self.evaluateClustersWithRespectToClass(test)
コード例 #15
0
ファイル: BinC45Split.py プロジェクト: ccreeper/weka-forpy
    def buildClassifer(self, instances: Instances):
        """Reset the split state and evaluate the split on ``m_attIndex``.

        Nominal attributes are handled by ``handleEnumeratedAttribute``;
        for any other attribute ``instances`` is first sorted on that
        attribute (in place) and then passed to ``handleNumericAttribute``.
        """
        self.m_numSubsets = 0
        self.m_splitPoint = float("inf")
        self.m_infoGain = 0
        self.m_gainRatio = 0

        if not instances.attribute(self.m_attIndex).isNominal():
            instances.sort(instances.attribute(self.m_attIndex))
            self.handleNumericAttribute(instances)
        else:
            self.handleEnumeratedAttribute(instances)
コード例 #16
0
ファイル: C45Split.py プロジェクト: ccreeper/weka-forpy
 def setSplitPoint(self, allInstances: Instances):
     """Snap the split point to the largest data value not exceeding it.

     Only applies to a numeric split attribute with more than one subset.
     The new split point is the greatest non-missing attribute value in
     ``allInstances`` that is ``<=`` the current split point (``-inf`` when
     there is none).
     """
     isNumericSplit = allInstances.attribute(self.m_attIndex).isNumeric() \
         and self.m_numSubsets > 1
     if not isNumericSplit:
         return
     best = float("-inf")
     for row in range(allInstances.numInstances()):
         candidate = allInstances.instance(row).value(self.m_attIndex)
         if Utils.isMissingValue(candidate):
             continue
         if best < candidate <= self.m_splitPoint:
             best = candidate
     self.m_splitPoint = best
コード例 #17
0
ファイル: Filter.py プロジェクト: ccreeper/weka-forpy
    def useFilter(cls, data: Instances, filter: 'Filter'):
        """Run every instance of ``data`` through ``filter``.

        All instances are fed to the filter, the batch is finished, and the
        filter's output is drained into the dataset returned by
        ``filter.getOutputFormat()``.

        Returns:
            That dataset, with every processed instance added.
        """
        for row in range(data.numInstances()):
            filter.input(data.instance(row))
        filter.batchFinished()
        newData = filter.getOutputFormat()
        Utils.debugOut("Queue size:", filter.m_OutputQueue.qsize())
        # Drain until the filter signals "no more output" with None.
        while True:
            processed = filter.output()
            if processed is None:
                break
            newData.add(processed)
        return newData
コード例 #18
0
    def setOutputFormatNumeric(self):
        """Determine the output format for data with a numeric class.

        Without ``m_Indices`` the output format is cleared. If no nominal
        attribute needs transforming, the input format is used unchanged.
        Otherwise each nominal non-class attribute with ``n`` values is
        replaced by ``n - 1`` attributes; the k-th of them (k = 1..n-1) is
        named ``"<name>=<v_k>,<v_k+1>,...,<v_n-1>"`` using the value order
        stored in ``m_Indices[j]``. The class index is shifted by ``n - 2``
        for every expanded attribute that precedes the class.
        """
        if self.m_Indices is None:
            self.setOutputFormat()
            return

        inputFormat = self.getInputFormat()
        self.m_needToTransform = False
        for i in range(inputFormat.numAttributes()):
            att = inputFormat.attribute(i)
            if att.isNominal() and (att.numValues() > 2
                                    or self.binaryAttributesNominal
                                    or self.m_TransformAll):
                self.m_needToTransform = True
                break
        if not self.m_needToTransform:
            self.setOutputFormat(inputFormat)
            return

        classIndex = inputFormat.classIndex()
        newClassIndex = classIndex
        newAtts = []
        for j in range(inputFormat.numAttributes()):
            att = inputFormat.attribute(j)
            if not att.isNominal() or j == classIndex:
                newAtts.append(att.copy())
                continue
            numValues = att.numValues()
            if j < classIndex:
                # One attribute becomes numValues - 1 attributes.
                newClassIndex += numValues - 2
            # k starts at 1: numValues - 1 new attributes are produced,
            # which is what the class-index shift and the weight spreading
            # below assume.
            for k in range(1, numValues):
                attributeName = att.name() + "=" + ",".join(
                    att.value(self.m_Indices[j][l])
                    for l in range(k, numValues))
                if self.binaryAttributesNominal:
                    a = Attribute(attributeName)
                else:
                    a = Attribute(attributeName, ["f", "t"])
                if self.getSpreadAttributeWeight():
                    a.setWeight(att.weight() / (numValues - 1))
                else:
                    a.setWeight(att.weight())
                newAtts.append(a)

        outputFormat = Instances(inputFormat.relationName(), newAtts, 0)
        outputFormat.setClassIndex(newClassIndex)
        self.setOutputFormat(outputFormat)
コード例 #19
0
 def kNearestNeighbours(self, target: Instance, kNN: int) -> Instances:
     """Linear search for the ``kNN`` nearest neighbours of ``target``.

     Instances equal to ``target`` are skipped. Ties with the current k-th
     distance are kept as additional neighbours. The distances of the
     returned neighbours are stored in ``self.m_Distances`` (after
     ``postProcessDistances``), in the same order as the returned
     instances.

     Args:
         target: the instance to find neighbours for.
         kNN: number of neighbours requested.

     Returns:
         A dataset holding the neighbours.
     """
     if self.m_Stats is not None:
         self.m_Stats.searchStart()
     heap = MyHeap(kNN)
     firstkNN = 0
     numInstances = self.m_Instances.numInstances()
     for i in range(numInstances):
         candidate = self.m_Instances.instance(i)
         if target == candidate:
             continue
         if self.m_Stats is not None:
             self.m_Stats.incrPointCount()
         if firstkNN < kNN:
             # Heap not full yet: no distance cut-off is possible.
             distance = self.m_DistanceFunction.distance(
                 target, candidate, float("inf"), self.m_Stats)
             if distance == 0 and self.m_SkipIdentical \
                     and i < numInstances - 1:
                 continue
             heap.put(i, distance)
             firstkNN += 1
         else:
             # The current k-th distance serves as cut-off value.
             temp = heap.peek()
             distance = self.m_DistanceFunction.distance(
                 target, candidate, temp.distance, self.m_Stats)
             if distance == 0 and self.m_SkipIdentical:
                 continue
             if distance < temp.distance:
                 heap.putBySubstitute(i, distance)
             elif distance == temp.distance:
                 heap.putKthNearest(i, distance)

     total = heap.size() + heap.noOfKthNearest()
     neighbours = Instances(self.m_Instances, total)
     self.m_Distances = [0] * total
     indices = [0] * total
     # Both loops fill the result arrays from the back towards the front.
     i = 1
     while heap.noOfKthNearest() > 0:
         h = heap.getKthNearest()
         indices[total - i] = h.index
         self.m_Distances[total - i] = h.distance
         i += 1
     while heap.size() > 0:
         h = heap.get()
         indices[total - i] = h.index
         self.m_Distances[total - i] = h.distance
         i += 1
     self.m_DistanceFunction.postProcessDistances(self.m_Distances)
     for index in indices:
         neighbours.add(self.m_Instances.instance(index))
     if self.m_Stats is not None:
         # Close the measurement opened by searchStart() above.
         # NOTE(review): assumes the stats class offers searchFinish(), as
         # Weka's PerformanceStats does — confirm.
         self.m_Stats.searchFinish()
     return neighbours
コード例 #20
0
ファイル: BinC45Split.py プロジェクト: ccreeper/weka-forpy
 def rightSide(self, index: int, data: Instances):
     """Return the right-hand side of the binary split test for a branch.

     Nominal attribute: ``" = <value>"`` for branch 0 and ``" != <value>"``
     otherwise, where the value is looked up at ``int(m_splitPoint)``.
     Other attributes: ``" <= <split point>"`` for branch 0 and
     ``" > <split point>"`` otherwise.
     """
     attribute = data.attribute(self.m_attIndex)
     firstBranch = index == 0
     if attribute.isNominal():
         relation = " = " if firstBranch else " != "
         return relation + attribute.value(int(self.m_splitPoint))
     relation = " <= " if firstBranch else " > "
     return relation + str(self.m_splitPoint)
コード例 #21
0
ファイル: C45Split.py プロジェクト: ccreeper/weka-forpy
 def resetDistribution(self, data: Instances):
     """Recompute ``m_distribution`` from ``data`` for this split.

     Instances that ``whichSubset`` can place (result > -1) build the new
     distribution; ``addInstWithUnknown`` is then called with the full
     dataset and the split attribute index.
     """
     assignable = Instances(data, data.numInstances())
     for row in range(data.numInstances()):
         candidate = data.instance(row)
         if self.whichSubset(candidate) > -1:
             assignable.add(candidate)
     rebuilt = Distribution(assignable, self)
     rebuilt.addInstWithUnknown(data, self.m_attIndex)
     self.m_distribution = rebuilt
コード例 #22
0
ファイル: VisualizePanel.py プロジェクト: ccreeper/weka-forpy
 def setUpComboBoxes(self, inst: Instances):
     """Fill the X, Y and colour combo boxes with the attribute names.

     Each entry reads ``"<prefix><name> (<short type>)"``. Afterwards X
     selects index 0, Y index 1 and colour the last attribute.
     """
     captions = []
     for i in range(inst.numAttributes()):
         attribute = inst.attribute(i)
         suffix = " (" + Attribute.typeToStringShort(attribute) + ")"
         captions.append(attribute.name() + suffix)

     self.m_XCombo.addItems(["X: " + caption for caption in captions])
     self.m_YCombo.addItems(["Y: " + caption for caption in captions])
     self.m_ColourCombo.addItems(
         ["Colour: " + caption for caption in captions])

     self.m_XCombo.setCurrentIndex(0)
     self.m_YCombo.setCurrentIndex(1)
     self.m_ColourCombo.setCurrentIndex(inst.numAttributes() - 1)
コード例 #23
0
ファイル: StringLocator.py プロジェクト: ccreeper/weka-forpy
 def copyStringValues(cls, inst:Instance, a0=None, a1=None, a2:AttributeLocator=None, a3:Instances=None, a4:AttributeLocator=None):
     """Copy string values of ``inst`` into a destination dataset.

     Two call forms are dispatched on the argument types:

     * ``(inst, destDataset: Instances, strAtts: AttributeLocator)`` uses
       the instance's own dataset as source and delegates to
       ``copyStringValuesFromSrc``.
     * ``(inst, instSrcCompat, srcDataset, srcLoc, destDataset, destLoc)``
       copies the values attribute by attribute; ``a0`` selects whether the
       instance is indexed with the source or the destination locator.

     Raises:
         Exception: if the instance has no dataset, or source and
             destination disagree in the number of attributes / indices.
     """
     if isinstance(a0,Instances) and isinstance(a1,AttributeLocator):
         source=inst.dataset()
         if source is None:
             raise Exception("Instance has no dataset assigned!!")
         if source.numAttributes() != a0.numAttributes():
             raise Exception("Src and Dest differ in # of attributes: "
                       + str(source.numAttributes()) + " != "
                       + str(a0.numAttributes()))
         cls.copyStringValuesFromSrc(inst,True,source,a1,a0,a1)
         return

     # Nothing to copy when source and destination dataset are the same.
     if a1 == a3:
         return
     srcIndices=a2.getAttributeIndices()
     destIndices=a4.getAttributeIndices()
     if len(srcIndices) != len(destIndices):
         raise Exception("Src and Dest string indices differ in length: "
                         + str(len(srcIndices)) + " != "
                         + str(len(destIndices)))
     srcLocators=a2.getLocatorIndices()
     destLocators=a4.getLocatorIndices()
     if len(srcLocators) != len(destLocators):
         raise Exception("Src and Dest locator indices differ in length: "
                         + str(len(srcLocators)) + " != "
                         + str(len(destLocators)))
     for pos in range(len(srcIndices)):
         srcPos=a2.getActualIndex(srcIndices[pos])
         destPos=a4.getActualIndex(destIndices[pos])
         instIndex=srcPos if a0 else destPos
         src=a1.attribute(srcPos)
         dest=a3.attribute(destPos)
         if inst.isMissing(instIndex):
             continue
         valIndex=dest.addStringValue(src, int(inst.value(instIndex)))
         inst.setValue(instIndex, valIndex)
コード例 #24
0
 def split(self,data:Instances)->List[Instances]:
     """Partition ``data`` into one dataset per subset of this split.

     An instance that ``whichSubset`` assigns goes to that subset. One
     that cannot be assigned (result -1) is added to every subset with a
     positive weight, its weight scaled by that subset's weight.

     Returns:
         A list of ``m_numSubsets`` datasets.
     """
     subsetCount=self.m_numSubsets

     # First pass: count the members of each subset to size the datasets.
     sizes=[0]*subsetCount
     for inst in data:
         target=self.whichSubset(inst)
         if target > -1:
             sizes[target]+=1
             continue
         shares=self.weights(inst)
         for j in range(subsetCount):
             if Utils.gr(shares[j], 0):
                 sizes[j]+=1

     instances=[Instances(data,size) for size in sizes]        #type:List[Instances]

     # Second pass: distribute the instances.
     for inst in data:
         target=self.whichSubset(inst)
         if target > -1:
             instances[target].add(inst)
             continue
         shares=self.weights(inst)
         for j in range(subsetCount):
             if Utils.gr(shares[j], 0):
                 bag=instances[j]
                 bag.add(inst)
                 bag.lastInstance().setWeight(float(shares[j]*inst.weight()))
     return instances
コード例 #25
0
 def selectModel(self, data: Instances, test: Instances = None):
     """Select the split model for ``data``.

     A ``C45Split`` is built for every non-class attribute. Among the valid
     models whose information gain is at least the average gain (minus
     1e-3), the one with the highest gain ratio is chosen. A ``NoSplit``
     model is returned when there are too few instances, when all
     instances belong to one class, or when no useful split exists.

     Args:
         data: the training data.
         test: ignored; when given, the call is forwarded without it.
     """
     if test is not None:
         return self.selectModel(data)

     checkDistribution = Distribution(data)
     noSplitModel = NoSplit(checkDistribution)
     tooFewInstances = Utils.gr(2*self.m_minNoObj, checkDistribution.total())
     if tooFewInstances or Utils.equal(
             checkDistribution.total(),
             checkDistribution.perClass(checkDistribution.maxClass())):
         return noSplitModel

     # multiVal stays True only if every attribute is nominal with many
     # values (at least 30% of the number of instances in m_allData).
     multiVal = True
     if self.m_allData is not None:
         multiVal = not any(
             attr.isNumeric() or Utils.gr(
                 0.3 * self.m_allData.numInstances(), attr.numValues())
             for attr in data.enumerateAttributes())

     numAttributes = data.numAttributes()
     classIndex = data.classIndex()
     currentModel = [None] * numAttributes  #type:List[C45Split]
     sumOfWeights = data.sumOfWeight()
     totalInfoGain = validModels = 0
     for i in range(numAttributes):
         if i == classIndex:
             continue
         candidate = C45Split(i, self.m_minNoObj, sumOfWeights,
                              self.m_useMDLcorrection)
         currentModel[i] = candidate
         candidate.buildClassifer(data)
         if not candidate.checkModel():
             continue
         countsForAverage = self.m_allData is None \
             or data.attribute(i).isNumeric() \
             or multiVal \
             or Utils.gr(0.3*self.m_allData.numInstances(),
                         data.attribute(i).numValues())
         if countsForAverage:
             totalInfoGain = totalInfoGain + candidate.infoGain()
             validModels += 1

     if validModels == 0:
         return noSplitModel
     averageInfoGain = totalInfoGain / validModels

     minResult = 0
     for i, candidate in enumerate(currentModel):
         if i == classIndex or not candidate.checkModel():
             continue
         if candidate.infoGain() >= averageInfoGain-1e-3 and \
                 Utils.gr(candidate.gainRatio(), minResult):
             bestModel = candidate
             minResult = candidate.gainRatio()
     if Utils.equal(minResult, 0):
         return noSplitModel

     bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
     if self.m_allData is not None and not self.m_doNotMakeSplitPointActualValue:
         bestModel.setSplitPoint(self.m_allData)
     return bestModel
コード例 #26
0
ファイル: ThresholdCurve.py プロジェクト: ccreeper/weka-forpy
 def getPRCArea(cls, tcurve: Instances):
     """Compute the area under the precision-recall curve.

     The rows are walked from the second-to-last to the first; each row
     adds ``precision * (recall - previous recall)``.

     Returns:
         NaN for an empty dataset or one whose relation name differs from
         ``cls.RELATION_NAME``; ``Utils.missingValue()`` when the area is
         exactly 0; the area otherwise.
     """
     n = tcurve.numInstances()
     if cls.RELATION_NAME != tcurve.relationName() or n == 0:
         return float('nan')
     precisionIndex = tcurve.attribute(cls.PRECISION_NAME).index()
     recallIndex = tcurve.attribute(cls.RECALL_NAME).index()
     precision = tcurve.attributeToDoubleArray(precisionIndex)
     recall = tcurve.attributeToDoubleArray(recallIndex)

     area = 0
     previousRecall = recall[n - 1]
     for row in reversed(range(n - 1)):
         area += precision[row] * (recall[row] - previousRecall)
         previousRecall = recall[row]
     return Utils.missingValue() if area == 0 else area
コード例 #27
0
 def dumpLabel(self,index:int,data:Instances):
     """Return the leaf label for branch ``index``.

     Format: ``"<majority class> (<weight in bag>[/<incorrect weight>])"``;
     the incorrect part is shown only when it is greater than 0. Both
     numbers are rounded with ``Utils.roundDouble(..., 2)``.
     """
     dist=self.m_distribution
     pieces=[data.classAttribute().value(dist.maxClass(index)),
             " (",
             str(Utils.roundDouble(dist.perBag(index), 2))]
     if Utils.gr(dist.numIncorrect(index), 0):
         pieces.append("/"+str(Utils.roundDouble(dist.numIncorrect(index), 2)))
     pieces.append(")")
     return "".join(pieces)
コード例 #28
0
    def evaluateClustersWithRespectToClass(self, inst: Instances):
        """Evaluate the cluster assignments against the class attribute.

        Builds the cluster-by-class count matrix from
        ``m_clusterAssignments``, lets ``mapClasses`` search a
        cluster-to-class mapping, appends the matrix, the mapping and the
        number of incorrectly clustered instances to ``m_clusteringResult``
        and stores the mapping in ``m_classToCluster``.

        Args:
            inst: the clustered data with its class index set; must line up
                with ``m_clusterAssignments``.
        """
        numClusters = self.m_numClusters
        numClasses = inst.classAttribute().numValues()
        counts = [[0] * numClasses for _ in range(numClusters)]
        clusterTotals = [0] * numClusters
        # The extra last slot of ``best`` holds the error count.
        best = [0] * (numClusters + 1)
        current = [0] * (numClusters + 1)

        instances = copy.deepcopy(inst)
        instances.setClassIndex(inst.classIndex())
        numInstances = 0
        for instance in instances:
            cluster = self.m_clusterAssignments[numInstances]
            if cluster >= 0 and not instance.classIsMissing():
                counts[int(cluster)][int(instance.classValue())] += 1
                clusterTotals[int(cluster)] += 1
            numInstances += 1

        best[numClusters] = float('inf')
        self.mapClasses(numClusters, 0, counts, clusterTotals, current,
                        best, 0)
        self.m_clusteringResult += "\n\nClass attribute: " + inst.classAttribute(
        ).name() + "\n"
        self.m_clusteringResult += "Classes to Clusters:\n"
        matrixString = self.toMatrixString(counts, clusterTotals,
                                           Instances(inst, 0))
        self.m_clusteringResult += matrixString + '\n'

        # Guard against log(0) when there are no clusters.
        Cwidth = 1
        if numClusters > 0:
            Cwidth = 1 + int(math.log(numClusters) / math.log(10))
        for i in range(numClusters):
            if clusterTotals[i] > 0:
                self.m_clusteringResult += "Cluster " + Utils.doubleToString(
                    i, Cwidth, 0)
                self.m_clusteringResult += " <-- "
                if best[i] < 0:
                    self.m_clusteringResult += "No class\n"
                else:
                    self.m_clusteringResult += inst.classAttribute().value(
                        int(best[i])) + '\n'

        # The percentage is undefined (nan) for an empty dataset.
        if numInstances > 0:
            errorPercent = best[numClusters] / numInstances * 100.0
        else:
            errorPercent = float("nan")
        self.m_clusteringResult+="\nIncorrectly clustered instances :\t"\
                                  + str(best[numClusters])\
                                  + "\t" \
                                  + Utils.doubleToString(errorPercent, 8, 4) \
                                  + " %\n"
        # Built as a new list: index assignment into an empty list raises
        # IndexError.
        self.m_classToCluster = [int(best[i]) for i in range(numClusters)]
コード例 #29
0
 def setOutputFormatNominal(self):
     """Determine the output format for data with a nominal class.

     If no nominal non-class attribute needs transforming, the input format
     is used unchanged. Otherwise every such attribute with more than two
     values (or every one, when ``m_TransformAll`` is set) is replaced by
     one indicator attribute per value, named ``"<name>=<value>"``. The
     class index is shifted by ``numValues - 1`` for each expanded
     attribute that precedes the class.
     """
     inputFormat = self.getInputFormat()
     classIndex = inputFormat.classIndex()

     self.m_needToTransform = False
     for i in range(inputFormat.numAttributes()):
         att = inputFormat.attribute(i)
         if att.isNominal() and i != classIndex and\
                 (att.numValues() > 2 or self.m_TransformAll or self.binaryAttributesNominal):
             self.m_needToTransform = True
             break
     if not self.m_needToTransform:
         self.setOutputFormat(inputFormat)
         return

     newClassIndex = classIndex
     newAtts = []
     for j in range(inputFormat.numAttributes()):
         att = inputFormat.attribute(j)
         if not att.isNominal() or j == classIndex:
             newAtts.append(att.copy())
             continue
         numValues = att.numValues()
         if numValues <= 2 and not self.m_TransformAll:
             # Attributes with at most two values stay a single attribute.
             if self.binaryAttributesNominal:
                 value = ""
                 if numValues == 2:
                     value = "=" + att.value(1)
                 a = Attribute(att.name() + value)
                 a.setWeight(att.weight())
                 newAtts.append(a)
             else:
                 newAtts.append(att.copy())
             continue
         if j < classIndex:
             newClassIndex += numValues - 1
         for k in range(numValues):
             attributeName = att.name() + "=" + att.value(k)
             if self.binaryAttributesNominal:
                 a = Attribute(attributeName)
             else:
                 # Without this branch no indicator attribute was created
                 # at all, although the class index had already been
                 # shifted for them.
                 a = Attribute(attributeName, ["f", "t"])
             if self.getSpreadAttributeWeight():
                 a.setWeight(att.weight() / numValues)
             else:
                 a.setWeight(att.weight())
             newAtts.append(a)

     outputFormat = Instances(inputFormat.relationName(), newAtts, 0)
     outputFormat.setClassIndex(newClassIndex)
     self.setOutputFormat(outputFormat)
コード例 #30
0
ファイル: PlotData2D.py プロジェクト: ccreeper/weka-forpy
 def __init__(self, insts: Instances):
     """Create plot data for ``insts`` with default shapes and bounds.

     Every point gets the default shape size. Its shape type is the
     automatic shape when the instance weight is non-negative and -2
     otherwise. ``determineBounds`` is called at the end.
     """
     count = insts.numInstances()

     self.m_maxX = 0
     self.m_minX = 0
     self.m_maxY = 0
     self.m_minY = 0
     self.m_maxC = 0
     self.m_minC = 0
     self.m_plotName = "new plot"
     self.m_plotInstances = insts
     self.m_xIndex = 0
     self.m_yIndex = 0
     self.m_cIndex = 0
     self.m_pointLookup = [[0] * 4 for _ in range(count)]
     self.m_connecctPoints = [False] * count
     self.m_alwaysDisplayPointsOfThisSize = -1
     self.m_displayAllPoints = False

     self.m_shapeSize = [Plot2D.DEFAULT_SHAPE_SIZE.value
                         for _ in range(count)]  #type:List[int]
     self.m_shapeType = [
         Plot2D.CONST_AUTOMATIC_SHAPE
         if self.m_plotInstances.instance(row).weight() >= 0 else -2
         for row in range(count)
     ]  #type:List[int]
     self.determineBounds()