Beispiel #1
0
 def setModel(self, data: Instances):
     """Fill the table widget with the contents of ``data``.

     The header consists of a leading "No." column followed by one column
     per attribute, labelled ``<1-based position>:<name>`` with the
     capitalised type name on a second line.  One row is created per
     instance: column 0 receives the 1-based row number, the remaining
     cells are filled by the table's ``setRawItem`` and the row height is
     set to 30.
     """
     table = self.m_Table
     table.setRowCount(data.numInstances())

     columnTitles = ["No."]
     for position in range(data.numAttributes()):
         attr = data.attribute(position)
         title = str(position + 1) + ":" + attr.name() + "\n"
         title += Attribute.typeToString(attr.type()).capitalize()
         columnTitles.append(title)

     table.setColumnCount(data.numAttributes() + 1)
     table.setHorizontalHeaderLabels(columnTitles)
     table.horizontalHeader().resizeSections(QHeaderView.ResizeToContents)

     for rowIndex in range(data.numInstances()):
         numberCell = QTableWidgetItem(str(rowIndex + 1))
         # The row-number cell carries no item flags at all.
         numberCell.setFlags(Qt.NoItemFlags)
         table.setItem(rowIndex, 0, numberCell)
         table.setRawItem(data, rowIndex)
         table.setRowHeight(rowIndex, 30)
Beispiel #2
0
 def getROCArea(cls, tcurve: Instances):
     """Compute the area under the ROC curve stored in ``tcurve``.

     ``tcurve`` must carry the relation name ``cls.RELATION_NAME`` and
     contain the columns named ``cls.TRUE_POS_NAME`` and
     ``cls.FALSE_POS_NAME``.  The first entry of each column is used as the
     total number of positives / negatives.

     Returns NaN when the relation name does not match or the curve has no
     rows.  When the product of the two totals is zero the result mimics a
     floating-point division by zero: NaN for a zero area, otherwise
     +inf / -inf depending on the sign of the accumulated area.
     """
     n = tcurve.numInstances()
     if cls.RELATION_NAME != tcurve.relationName() or n == 0:
         return float('nan')

     tpColumn = tcurve.attribute(cls.TRUE_POS_NAME).index()
     fpColumn = tcurve.attribute(cls.FALSE_POS_NAME).index()
     posCounts = tcurve.attributeToDoubleArray(tpColumn)
     negCounts = tcurve.attributeToDoubleArray(fpColumn)
     totalPos, totalNeg = posCounts[0], negCounts[0]

     area = 0
     negSoFar = 0
     lastRow = n - 1
     for i in range(n):
         if i == lastRow:
             # The final row contributes whatever counts are left.
             posStep, negStep = posCounts[i], negCounts[i]
         else:
             posStep = posCounts[i] - posCounts[i + 1]
             negStep = negCounts[i] - negCounts[i + 1]
         area += posStep * (negSoFar + (0.5 * negStep))
         negSoFar += negStep

     denominator = totalNeg * totalPos
     if denominator != 0:
         return area / denominator
     # Division by zero: reproduce IEEE semantics instead of raising.
     if area == 0:
         return float("nan")
     return float("inf") if area > 0 else float("-inf")
Beispiel #3
0
 def rightSide(self, index: int, data: Instances):
     """Return the right-hand side of the split condition for a branch.

     For a nominal split attribute the text is ``" = <label>"`` using the
     attribute value at position ``index``.  Otherwise branch 0 yields
     ``" <= <split point>"`` and every other branch ``" > <split point>"``,
     with the split point formatted by ``Utils.doubleToString(..., 6)``.
     """
     splitAttr = data.attribute(self.m_attIndex)
     if splitAttr.isNominal():
         return " = " + splitAttr.value(index)
     comparator = " <= " if index == 0 else " > "
     return comparator + Utils.doubleToString(self.m_splitPoint, 6)
Beispiel #4
0
    def buildClassifer(self, instances: Instances):
        """Reset the split statistics and build the split on ``m_attIndex``.

        Nominal attributes are delegated to ``handleEnumeratedAttribute``.
        For any other attribute ``instances`` is first sorted in place by
        that attribute and then passed to ``handleNumericAttribute``.
        """
        # Start from a clean slate: no subsets, no gain, no split point.
        self.m_numSubsets = 0
        self.m_splitPoint = float("inf")
        self.m_infoGain = 0
        self.m_gainRatio = 0

        splitAttr = instances.attribute(self.m_attIndex)
        if splitAttr.isNominal():
            self.handleEnumeratedAttribute(instances)
            return
        instances.sort(instances.attribute(self.m_attIndex))
        self.handleNumericAttribute(instances)
Beispiel #5
0
 def selectModel(self, data: Instances, test: Instances = None):
     """Choose the split model for ``data`` or fall back to a no-split model.

     A ``C45Split`` candidate is built for every non-class attribute.  Among
     the candidates that pass ``checkModel`` and whose information gain is
     at least the average gain of the counted candidates (minus 1e-3), the
     one with the largest gain ratio wins.  A ``NoSplit`` model is returned
     when there is too little weight to split, when all weight sits in one
     class, when no candidate was counted, or when the best gain ratio
     equals zero.

     ``test`` is accepted for signature compatibility only: when given, the
     call is forwarded to the single-argument form.
     """
     if test is not None:
         return self.selectModel(data)

     checkDistribution = Distribution(data)
     noSplitModel = NoSplit(checkDistribution)
     tooSmall = Utils.gr(2 * self.m_minNoObj, checkDistribution.total())
     if tooSmall or Utils.equal(
             checkDistribution.total(),
             checkDistribution.perClass(checkDistribution.maxClass())):
         return noSplitModel

     allData = self.m_allData
     multiVal = True
     if allData is not None:
         # multiVal survives only if no attribute is numeric and none has
         # fewer values than 30% of the number of instances in allData.
         multiVal = not any(
             attr.isNumeric() or Utils.gr(
                 0.3 * allData.numInstances(), attr.numValues())
             for attr in data.enumerateAttributes())

     numAttrs = data.numAttributes()
     currentModel = [None] * numAttrs  #type:List[C45Split]
     sumOfWeights = data.sumOfWeight()
     gainTotal = 0
     validModels = 0
     for i in range(numAttrs):
         if i == data.classIndex():
             # The class attribute never gets a candidate; slot stays None.
             continue
         candidate = C45Split(i, self.m_minNoObj, sumOfWeights,
                              self.m_useMDLcorrection)
         currentModel[i] = candidate
         candidate.buildClassifer(data)
         if not candidate.checkModel():
             continue
         if (allData is None
                 or data.attribute(i).isNumeric()
                 or multiVal
                 or Utils.gr(0.3 * allData.numInstances(),
                             data.attribute(i).numValues())):
             gainTotal = gainTotal + candidate.infoGain()
             validModels += 1

     if validModels == 0:
         return noSplitModel
     averageInfoGain = gainTotal / validModels

     minResult = 0
     for i, candidate in enumerate(currentModel):
         if i == data.classIndex() or not candidate.checkModel():
             continue
         if candidate.infoGain() >= averageInfoGain - 1e-3 and \
                 Utils.gr(candidate.gainRatio(), minResult):
             bestModel = candidate
             minResult = candidate.gainRatio()
     if Utils.equal(minResult, 0):
         return noSplitModel

     # Instances with an unknown value for the chosen attribute are folded
     # into the winning model's distribution.
     bestModel.distribution().addInstWithUnknown(data, bestModel.attIndex())
     if allData is not None and not self.m_doNotMakeSplitPointActualValue:
         bestModel.setSplitPoint(allData)
     return bestModel
Beispiel #6
0
 def rightSide(self, index: int, data: Instances):
     """Return the right-hand side of the binary split condition.

     For a nominal split attribute the label at position
     ``int(self.m_splitPoint)`` is compared with ``" = "`` on branch 0 and
     ``" != "`` on any other branch.  Otherwise branch 0 yields
     ``" <= <split point>"`` and any other branch ``" > <split point>"``,
     with the split point rendered through ``str``.
     """
     splitAttr = data.attribute(self.m_attIndex)
     onFirstBranch = index == 0
     if splitAttr.isNominal():
         comparator = " = " if onFirstBranch else " != "
         return comparator + splitAttr.value(int(self.m_splitPoint))
     comparator = " <= " if onFirstBranch else " > "
     return comparator + str(self.m_splitPoint)
Beispiel #7
0
 def setUpComboBoxes(self, inst: Instances):
     """Populate the X, Y and colour combo boxes from the attributes.

     Every attribute produces one entry per combo box of the form
     ``"<prefix><name> (<short type>)"`` with the prefixes ``"X: "``,
     ``"Y: "`` and ``"Colour: "``.  Afterwards X selects entry 0, Y selects
     entry 1 and the colour box selects the last attribute.
     """
     described = []
     for i in range(inst.numAttributes()):
         attr = inst.attribute(i)
         typeSuffix = " (" + Attribute.typeToStringShort(attr) + ")"
         described.append(attr.name() + typeSuffix)

     self.m_XCombo.addItems(["X: " + text for text in described])
     self.m_YCombo.addItems(["Y: " + text for text in described])
     self.m_ColourCombo.addItems(["Colour: " + text for text in described])

     self.m_XCombo.setCurrentIndex(0)
     self.m_YCombo.setCurrentIndex(1)
     self.m_ColourCombo.setCurrentIndex(inst.numAttributes() - 1)
Beispiel #8
0
 def copyStringValues(cls, inst:Instance, a0=None, a1=None, a2:AttributeLocator=None, a3:Instances=None, a4:AttributeLocator=None):
     """Copy the located attribute values of ``inst`` into a destination header.

     Two call forms are supported, selected by the runtime types:

     * ``(inst, destDataset, locator)`` - ``a0`` is an ``Instances`` and
       ``a1`` an ``AttributeLocator``.  The instance's own dataset is the
       source and the work is delegated to ``copyStringValuesFromSrc``.
     * ``(inst, instSrcCompat, srcDataset, srcLoc, destDataset, destLoc)`` -
       passed as ``a0`` .. ``a4``.  ``a0`` decides whether the instance is
       indexed with the source (truthy) or destination (falsy) locator.

     Raises ``Exception`` when the instance has no dataset, when source and
     destination differ in attribute count, or when the two locators
     disagree in the number of attribute or locator indices.
     """
     if isinstance(a0,Instances) and isinstance(a1,AttributeLocator):
         ownDataset = inst.dataset()
         if ownDataset is None:
             raise Exception("Instance has no dataset assigned!!")
         if ownDataset.numAttributes() != a0.numAttributes():
             raise Exception("Src and Dest differ in # of attributes: "
                       + str(ownDataset.numAttributes()) + " != "
                       + str(a0.numAttributes()))
         cls.copyStringValuesFromSrc(inst,True,ownDataset,a1,a0,a1)
         return

     # Long form: nothing to copy when source and destination are equal.
     if a1 == a3:
         return

     srcAttIndices = a2.getAttributeIndices()
     destAttIndices = a4.getAttributeIndices()
     if len(srcAttIndices) != len(destAttIndices):
         raise Exception("Src and Dest string indices differ in length: "
                         + str(len(srcAttIndices)) + " != "
                         + str(len(destAttIndices)))
     srcLocIndices = a2.getLocatorIndices()
     destLocIndices = a4.getLocatorIndices()
     if len(srcLocIndices) != len(destLocIndices):
         raise Exception("Src and Dest locator indices differ in length: "
                         + str(len(srcLocIndices)) + " != "
                         + str(len(destLocIndices)))

     for srcPos, destPos in zip(srcAttIndices, destAttIndices):
         srcActual = a2.getActualIndex(srcPos)
         destActual = a4.getActualIndex(destPos)
         instIndex = srcActual if a0 else destActual
         src = a1.attribute(srcActual)
         dest = a3.attribute(destActual)
         if inst.isMissing(instIndex):
             continue
         # Register the value with the destination attribute and store the
         # index it returns back into the instance.
         valIndex = dest.addStringValue(src, int(inst.value(instIndex)))
         inst.setValue(instIndex, valIndex)
Beispiel #9
0
 def setAttributes(self, inst: Instances, pos: int = -1):
     """Build the input form with one editor widget per attribute.

     Nominal attributes get a combo box (an empty entry followed by the
     attribute's values), numeric attributes a line edit restricted by a
     ``QDoubleValidator``, and every other attribute a plain line edit.
     The editors are remembered in ``m_WidgetList`` in attribute order.
     A final row holds the submit and cancel buttons, wired to
     ``submitClick`` and ``close``.  ``inst`` and ``pos`` are stored in
     ``m_Instance`` and ``m_InsertPos``.
     """
     def buildEditor(attr):
         # Choose the editor widget matching the attribute kind.
         if attr.isNominal():
             combo = QComboBox()
             combo.addItem("")
             combo.addItems(attr.values())
             return combo
         lineEdit = QLineEdit()
         if attr.isNumeric():
             # Placeholder text is Chinese for "enter a number".
             lineEdit.setPlaceholderText("输入数字")
             lineEdit.setValidator(QDoubleValidator(self))
         return lineEdit

     form = QFormLayout()
     form.setLabelAlignment(Qt.AlignRight)
     form.setContentsMargins(20, 20, 20, 20)
     form.setSpacing(15)

     self.m_Instance = inst
     self.m_WidgetList = []
     self.m_InsertPos = pos

     for i in range(inst.numAttributes()):
         attr = inst.attribute(i)
         caption = QLabel(attr.name())
         editor = buildEditor(attr)
         self.m_WidgetList.append(editor)
         form.addRow(caption, editor)

     # Button captions are Chinese for "submit" and "cancel".
     buttonRow = QHBoxLayout()
     for caption, handler in (("提交", self.submitClick),
                              ("取消", self.close)):
         button = QPushButton(caption)
         button.clicked.connect(handler)
         buttonRow.addWidget(button)

     buttonHolder = QWidget()
     buttonHolder.setLayout(buttonRow)
     form.addRow(buttonHolder)
     self.setLayout(form)
Beispiel #10
0
 def getPRCArea(cls, tcurve: Instances):
     """Compute the area under the precision/recall curve in ``tcurve``.

     ``tcurve`` must carry the relation name ``cls.RELATION_NAME`` and
     contain the columns named ``cls.PRECISION_NAME`` and
     ``cls.RECALL_NAME``.  Walking the rows from the second-to-last back to
     the first, each row adds its precision multiplied by the change in
     recall relative to the previously visited row.

     Returns NaN when the relation name does not match or the curve is
     empty, and ``Utils.missingValue()`` when the accumulated area is zero.
     """
     n = tcurve.numInstances()
     if cls.RELATION_NAME != tcurve.relationName() or n == 0:
         return float('nan')

     precisionColumn = tcurve.attribute(cls.PRECISION_NAME).index()
     recallColumn = tcurve.attribute(cls.RECALL_NAME).index()
     precisions = tcurve.attributeToDoubleArray(precisionColumn)
     recalls = tcurve.attributeToDoubleArray(recallColumn)

     area = 0
     previousRecall = recalls[n - 1]
     for i in reversed(range(n - 1)):
         area += precisions[i] * (recalls[i] - previousRecall)
         previousRecall = recalls[i]

     return Utils.missingValue() if area == 0 else area
Beispiel #11
0
    def buildClassifer(self, instances: Instances):
        """Reset the split statistics and build the split on ``m_attIndex``.

        For a nominal attribute ``m_complexityIndex`` and ``m_index`` are
        both set to the attribute's number of values and the work is
        delegated to ``handleEnumeratedAttribute``.  For any other attribute
        ``m_complexityIndex`` is 2, ``m_index`` is 0, ``instances`` is sorted
        in place by the attribute and ``handleNumericAttribute`` is called.

        The resulting number of subsets is reported through the ``logging``
        module at DEBUG level rather than printed, so building a tree no
        longer writes to stdout for every evaluated attribute.
        """
        # Function-scope import keeps the method self-contained.
        import logging
        log = logging.getLogger(__name__)

        self.m_numSubsets = 0
        self.m_splitPoint = float("inf")
        self.m_infoGain = 0
        self.m_gainRatio = 0

        if instances.attribute(self.m_attIndex).isNominal():
            self.m_complexityIndex = instances.attribute(
                self.m_attIndex).numValues()
            self.m_index = self.m_complexityIndex
            self.handleEnumeratedAttribute(instances)
            log.debug("att build after numSubsets: %s", self.numSubsets())
        else:
            self.m_complexityIndex = 2
            self.m_index = 0
            instances.sort(instances.attribute(self.m_attIndex))
            self.handleNumericAttribute(instances)
            log.debug("num build after numSubsets: %s", self.numSubsets())
Beispiel #12
0
 def forInstances(cls,
                  data: Instances,
                  multi: bool = False) -> 'Capabilities':
     """Derive the capabilities a scheme needs in order to handle ``data``.

     The result enables one class capability according to the class
     attribute's type (or ``NO_CLASS`` when the class index is -1), one or
     more attribute capabilities per non-class attribute, and the
     missing-value capabilities when a missing class value or a missing
     non-class value is found.

     ``multi`` is not used by this implementation.

     Raises ``Exception`` for a class or regular attribute whose type is
     none of nominal, numeric, string or date.
     """
     result = Capabilities(None)
     result.m_InterfaceDefinedCapabilities = set()

     # --- class attribute -------------------------------------------------
     if data.classIndex() == -1:
         result.enable(CapabilityEnum.NO_CLASS)
     else:
         classAttr = data.classAttribute()
         classType = classAttr.type()
         if classType == Attribute.NOMINAL:
             numLabels = classAttr.numValues()
             if numLabels == 1:
                 result.enable(CapabilityEnum.UNARY_CLASS)
             elif numLabels == 2:
                 result.enable(CapabilityEnum.BINARY_CLASS)
             else:
                 result.enable(CapabilityEnum.NOMINAL_CLASS)
         else:
             plainClassTypes = (
                 (Attribute.NUMERIC, CapabilityEnum.NUMERIC_CLASS),
                 (Attribute.STRING, CapabilityEnum.STRING_CLASS),
                 (Attribute.DATE, CapabilityEnum.DATE_CLASS),
             )
             for knownType, capability in plainClassTypes:
                 if classType == knownType:
                     result.enable(capability)
                     break
             else:
                 raise Exception("Unknown class attribute type '" +
                                 classAttr.name() + "'!")
         if any(data.instance(i).classIsMissing()
                for i in range(data.numInstances())):
             result.enable(CapabilityEnum.MISSING_CLASS_VALUES)

     # --- regular attributes ----------------------------------------------
     plainAttributeTypes = (
         (Attribute.NUMERIC, CapabilityEnum.NUMERIC_ATTRIBUTES),
         (Attribute.DATE, CapabilityEnum.DATE_ATTRIBUTES),
         (Attribute.STRING, CapabilityEnum.STRING_ATTRIBUTES),
     )
     for i in range(data.numAttributes()):
         if i == data.classIndex():
             continue
         attr = data.attribute(i)
         attrType = attr.type()
         if attrType == Attribute.NOMINAL:
             # Any nominal attribute enables UNARY; larger label sets
             # additionally enable BINARY or NOMINAL.
             result.enable(CapabilityEnum.UNARY_ATTRIBUTES)
             numLabels = attr.numValues()
             if numLabels == 2:
                 result.enable(CapabilityEnum.BINARY_ATTRIBUTES)
             elif numLabels > 2:
                 result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
             continue
         for knownType, capability in plainAttributeTypes:
             if attrType == knownType:
                 result.enable(capability)
                 break
         else:
             raise Exception("Unknown attribute type '" +
                             attr.name() + "'!")

     # --- missing values in non-class attributes --------------------------
     numAttrs = data.numAttributes()
     for i in range(data.numInstances()):
         inst = data.instance(i)
         if any(n != inst.classIndex() and inst.isMissing(n)
                for n in range(numAttrs)):
             result.enable(CapabilityEnum.MISSING_VALUES)
             break
     return result
Beispiel #13
0
 def setInstances(self, inst: Instances):
     """Store ``inst`` and refresh the class selector and start/stop buttons.

     The class combo box is refilled with one ``"(<short type>)<name>"``
     entry per attribute.  With at least one attribute the combo box is
     enabled and selects the dataset's class index (or the last attribute
     when the class index is -1); the start button is enabled only while
     no run thread exists and the stop button only while one does.  With
     no attributes both buttons are disabled.
     """
     self.m_Instances = inst
     attribNames = [
         "(" + Attribute.typeToStringShort(inst.attribute(i)) + ")" +
         inst.attribute(i).name()
         for i in range(inst.numAttributes())
     ]

     combo = self.m_ClassCombo
     combo.clear()
     combo.addItems(attribNames)

     if not attribNames:
         self.m_StartBut.setEnabled(False)
         self.m_StopBut.setEnabled(False)
         return

     classIdx = inst.classIndex()
     combo.setCurrentIndex(
         len(attribNames) - 1 if classIdx == -1 else classIdx)
     combo.setEnabled(True)
     idle = self.m_RunThread is None
     self.m_StartBut.setEnabled(idle)
     self.m_StopBut.setEnabled(not idle)
Beispiel #14
0
 def moveCentroid(self,centroidIndex:int,members:Instances,updateClusterInfo:bool,addToCentroidInstances:bool):
     """Compute the centroid of ``members`` and optionally record it.

     Numeric attributes receive the weighted mean of their non-missing
     values (``Utils.missingValue()`` when no weight is non-missing).
     Attributes that are not numeric receive the index of the value with
     the largest accumulated weight, or ``Utils.missingValue()`` when the
     missing weight exceeds that largest weight.

     When ``updateClusterInfo`` is set, the per-attribute missing weights
     and value counts are stored under ``centroidIndex``.  When
     ``addToCentroidInstances`` is set, the centroid is appended to
     ``m_ClusterCentroids`` as an ``Instance`` of weight 1.0.

     Returns the list of centroid values.
     """
     numAtts = members.numAttributes()
     attIndices = range(numAtts)
     vals = [0] * numAtts
     weightMissing = [0] * numAtts
     weightNonMissing = [0] * numAtts
     # One counter per nominal label; other attributes keep an empty list.
     nominalDists = [
         [0] * members.attribute(j).numValues()
         if members.attribute(j).isNominal() else []
         for j in attIndices
     ]

     # Pass 1: accumulate weights, weighted sums and label counts.
     for inst in members:
         for j in attIndices:
             if inst.isMissing(j):
                 weightMissing[j] += inst.weight()
                 continue
             weightNonMissing[j] += inst.weight()
             if members.attribute(j).isNumeric():
                 vals[j] += inst.weight() * inst.value(j)
             else:
                 nominalDists[j][int(inst.value(j))] += inst.weight()

     # Pass 2: turn the statistics into centroid values.
     for j in attIndices:
         if members.attribute(j).isNumeric():
             if weightNonMissing[j] > 0:
                 vals[j] /= weightNonMissing[j]
             else:
                 vals[j] = Utils.missingValue()
             continue
         labelWeights = nominalDists[j]
         heaviest = float('-inf')
         heaviestIndex = -1
         for i, labelWeight in enumerate(labelWeights):
             if labelWeight > heaviest:
                 heaviest = labelWeight
                 heaviestIndex = i
         # Without any label counters the value keeps its initial 0.
         if labelWeights:
             if heaviest < weightMissing[j]:
                 vals[j] = Utils.missingValue()
             else:
                 vals[j] = heaviestIndex

     if updateClusterInfo:
         for j in attIndices:
             self.m_ClusterMissingCounts[centroidIndex][j]=weightMissing[j]
             self.m_ClusterNominalCounts[centroidIndex][j]=nominalDists[j]
     if addToCentroidInstances:
         self.m_ClusterCentroids.add(Instance(1.0,vals))
     return vals
Beispiel #15
0
 def setSplitPoint(self, allInstances: Instances):
     """Replace the split point by the largest data value not exceeding it.

     Only acts when the split attribute is numeric and the split produced
     more than one subset.  All non-missing values of the split attribute
     in ``allInstances`` are scanned and the greatest one that is less than
     or equal to the current ``m_splitPoint`` becomes the new split point;
     when no value qualifies the split point becomes ``-inf``.
     """
     isNumericSplit = allInstances.attribute(self.m_attIndex).isNumeric()
     if not (isNumericSplit and self.m_numSubsets > 1):
         return

     closestBelow = float("-inf")
     for i in range(allInstances.numInstances()):
         candidate = allInstances.instance(i).value(self.m_attIndex)
         if Utils.isMissingValue(candidate):
             continue
         if closestBelow < candidate <= self.m_splitPoint:
             closestBelow = candidate
     self.m_splitPoint = closestBelow
Beispiel #16
0
 def copyStringValuesFromSrc(cls,instance:Instance,instSrcCompat:bool,srcDataset:Instances,srcLoc:AttributeLocator,
                      destDataset:Instances,destLoc:AttributeLocator):
     """Copy the located attribute values of ``instance`` from source to destination.

     For every attribute position reported by the locators, the value index
     stored in ``instance`` is registered with the destination attribute via
     ``addStringValue`` and the index returned by that call is written back
     into ``instance``.  Missing values are left untouched.

     Args:
         instance: the instance whose values are rewritten in place.
         instSrcCompat: if truthy, ``instance`` is indexed with the source
             locator's actual indices, otherwise with the destination's.
         srcDataset: header providing the source attributes.
         srcLoc: locator for the source attributes.
         destDataset: header providing the destination attributes.
         destLoc: locator for the destination attributes.

     Raises:
         Exception: if the two locators disagree in the number of attribute
             indices or locator indices.

     Nothing happens when ``srcDataset == destDataset``.
     """
     if srcDataset == destDataset:
         return

     srcAttIndices = srcLoc.getAttributeIndices()
     destAttIndices = destLoc.getAttributeIndices()
     if len(srcAttIndices) != len(destAttIndices):
         # The index collections are plain sequences: use len(), they have
         # no ``.length`` attribute.
         raise Exception("Src and Dest string indices differ in length: "
                   + str(len(srcAttIndices))+ " != "
                   + str(len(destAttIndices)))
     srcLocIndices = srcLoc.getLocatorIndices()
     destLocIndices = destLoc.getLocatorIndices()
     if len(srcLocIndices) != len(destLocIndices):
         raise Exception("Src and Dest locator indices differ in length: "
                   + str(len(srcLocIndices)) + " != "
                   + str(len(destLocIndices)))

     for srcPos, destPos in zip(srcAttIndices, destAttIndices):
         srcActual = srcLoc.getActualIndex(srcPos)
         destActual = destLoc.getActualIndex(destPos)
         instIndex = srcActual if instSrcCompat else destActual
         src = srcDataset.attribute(srcActual)
         dest = destDataset.attribute(destActual)
         if not instance.isMissing(instIndex):
             valIndex = dest.addStringValue(src, int(instance.value(instIndex)))
             instance.setValue(instIndex, valIndex)
Beispiel #17
0
 def handleEnumeratedAttribute(self, instances: Instances):
     """Search for the best one-value-against-the-rest split of a nominal attribute.

     A distribution over all attribute values is built from the instances
     whose split attribute is not missing and stored in ``m_distribution``.
     Then, for every value whose bag holds at least ``m_minNoObj`` weight,
     a two-way distribution derived from it is evaluated; if it passes
     ``check`` the split is marked binary and, when it is value 0 or beats
     the best gain ratio so far, its gain ratio, info gain, value index
     (as ``m_splitPoint``) and distribution are recorded.
     """
     attIndex = self.m_attIndex
     numAttValues = instances.attribute(attIndex).numValues()

     fullDistribution = Distribution(numAttValues, instances.numClasses())
     for inst in instances:
         if inst.isMissing(attIndex):
             continue
         fullDistribution.add(int(inst.value(attIndex)), inst)
     self.m_distribution = fullDistribution

     for i in range(numAttValues):
         bagIsLargeEnough = (
             Utils.gr(fullDistribution.perBag(i), self.m_minNoObj)
             or Utils.equal(fullDistribution.perBag(i), self.m_minNoObj))
         if not bagIsLargeEnough:
             continue
         binaryDistribution = Distribution(fullDistribution, i)
         if not binaryDistribution.check(self.m_minNoObj):
             continue
         self.m_numSubsets = 2
         currIG = self.infoGainCrit.splitCritValue(
             binaryDistribution, self.m_sumOfWeights)
         currGR = self.gainRatioCrit.splitCritValue(
             binaryDistribution, self.m_sumOfWeights, currIG)
         if i == 0 or Utils.gr(currGR, self.m_gainRatio):
             self.m_gainRatio = currGR
             self.m_infoGain = currIG
             self.m_splitPoint = i
             self.m_distribution = binaryDistribution
Beispiel #18
0
 def setInputFormat(self, instanceInfo: Instances):
     """Set the input header and derive the output header from the selection.

     The attribute range is bounded by the number of attributes and the
     selected indices are stored in ``m_SelectedAttributes``.  When every
     attribute is selected the input header is reused as the output format.
     Otherwise a new header is built from copies of the selected attributes;
     its class index is the position of the input class attribute within
     the selection, or -1 when the class attribute is not selected.

     Always returns True.
     """
     super().setInputFormat(instanceInfo)
     self.attributeIndices.setUpper(instanceInfo.numAttributes() - 1)
     selection = self.attributeIndices.getSelection()
     self.m_SelectedAttributes = selection

     if len(selection) == instanceInfo.numAttributes():
         self.setOutputFormat(instanceInfo)
         # NOTE(review): this path initialises the *output* locators while
         # the path below initialises the *input* locators - confirm the
         # asymmetry is intended.
         self.initOutputLocators(self.getInputFormat(), selection)
         return True

     keptAttributes = []
     outputClass = -1
     for position, current in enumerate(selection):
         if instanceInfo.classIndex() == current:
             outputClass = position
         keptAttributes.append(instanceInfo.attribute(current).copy())

     self.initInputLocators(self.getInputFormat(), selection)
     outputFormat = Instances(instanceInfo.relationName(), keptAttributes, 0)
     outputFormat.setClassIndex(outputClass)
     self.setOutputFormat(outputFormat)
     return True
class NormalizableDistance():
    """Base class for distance functions that can normalise numeric values.

    For every attribute of the reference dataset a ``[min, max, width]``
    entry is kept in ``m_Ranges`` (indexed with ``R_MIN`` / ``R_MAX`` /
    ``R_WIDTH``).  Numeric differences are scaled with these ranges unless
    ``m_DontNormalize`` is set.  ``updateDistance`` decides how the
    per-attribute differences are accumulated; the version in this class is
    only a placeholder.
    """
    # Positions inside each per-attribute entry of m_Ranges.
    R_MIN = 0
    R_MAX = 1
    R_WIDTH = 2

    def __init__(self, data: Instances = None):
        """Create the distance function, optionally bound to ``data``."""
        self.m_AttributeIndices = Range("first-last")
        self.m_DontNormalize = False
        self.m_Ranges = None  #type:List[List]
        self.m_ActiveIndices = None  #type:List
        # Defined up front so that the ``m_Data is None`` check in
        # initializeRanges() works even when no dataset was supplied.
        self.m_Data = None  #type:Instances
        self.m_Validated = False
        if data is None:
            self.invalidate()
        else:
            self.setInstances(data)

    def invalidate(self):
        """Mark ranges and active indices as stale; see ``validate``."""
        self.m_Validated = False

    def setInstances(self, inst: Instances):
        """Bind the distance function to ``inst`` and invalidate the ranges."""
        self.m_Data = inst
        self.invalidate()

    def clean(self):
        """Replace the stored dataset by a copy created with capacity 0."""
        self.m_Data = Instances(self.m_Data, 0)

    def update(self, ins: Instance):
        """Extend the stored ranges so that they cover ``ins``."""
        # Make sure ranges and active indices have been initialised.
        self.validate()
        self.m_Ranges = self.updateRanges(ins, self.m_Ranges)

    @overload
    def distance(self, first: Instance, second: Instance):
        ...

    @overload
    def distance(self, first: Instance, second: Instance,
                 stats: PerformanceStats):
        ...

    @overload
    def distance(self, first: Instance, second: Instance, cutOffValue: float):
        ...

    @overload
    def distance(self, first: Instance, second: Instance, cutOffValue: float,
                 stats: PerformanceStats):
        ...

    def distance(self, first: Instance, second: Instance, a0=None, a1=None):
        """Compute the distance between two instances.

        Supported call forms (see the overloads above):
        ``(first, second)``, ``(first, second, stats)``,
        ``(first, second, cutOffValue)`` and
        ``(first, second, cutOffValue, stats)``.

        The stored values of both instances are walked in parallel by
        attribute index.  The class attribute and inactive attributes are
        skipped; an attribute stored in only one instance is compared
        against 0.  Each difference is folded in with ``updateDistance``.

        Returns the accumulated distance, or ``float('inf')`` as soon as it
        exceeds the cut-off value.

        Raises:
            TypeError: if ``a0`` is neither None, a ``PerformanceStats`` nor
                a number.  Previously such calls (including an integer
                cut-off) silently returned None.
        """
        if a0 is None or isinstance(a0, PerformanceStats):
            # No cut-off given: re-dispatch with an infinite cut-off and
            # pass a0 along in the stats position.
            return self.distance(first, second, float("inf"), a0)
        if not isinstance(a0, (int, float)):
            raise TypeError(
                "cut-off value must be a number, got " + type(a0).__name__)

        cutOffValue = a0
        distance = 0
        firstNumValues = first.numValues()
        secondNumValues = second.numValues()
        numAttributes = self.m_Data.numAttributes()
        classIndex = self.m_Data.classIndex()
        self.validate()
        p1 = p2 = 0
        while p1 < firstNumValues or p2 < secondNumValues:
            # An exhausted instance is given the sentinel index
            # ``numAttributes`` so that the other one is always consumed.
            if p1 >= firstNumValues:
                firstI = numAttributes
            else:
                firstI = first.index(p1)
            if p2 >= secondNumValues:
                secondI = numAttributes
            else:
                secondI = second.index(p2)
            if firstI == classIndex:
                p1 += 1
                continue
            if firstI < numAttributes and not self.m_ActiveIndices[firstI]:
                p1 += 1
                continue
            if secondI == classIndex:
                p2 += 1
                continue
            if secondI < numAttributes and not self.m_ActiveIndices[
                    secondI]:
                p2 += 1
                continue
            if firstI == secondI:
                diff = self.difference(firstI, first.valueSparse(p1),
                                       second.valueSparse(p2))
                p1 += 1
                p2 += 1
            elif firstI > secondI:
                # Attribute stored only in ``second``: compare against 0.
                diff = self.difference(secondI, 0, second.valueSparse(p2))
                p2 += 1
            else:
                # Attribute stored only in ``first``: compare against 0.
                diff = self.difference(firstI, first.valueSparse(p1), 0)
                p1 += 1
            if isinstance(a1, PerformanceStats):
                a1.incrCoordCount()
            distance = self.updateDistance(distance, diff)
            if distance > cutOffValue:
                return float('inf')
        return distance

    def updateDistance(self, currDist: float, diff: float) -> float:
        """Fold ``diff`` into ``currDist`` and return the new distance.

        Placeholder: this base version has no body and returns None, so
        ``distance`` presumably relies on an override - verify against the
        subclasses.
        """
        ...

    def difference(self, index: int, val1: float, val2: float):
        """Return the difference of two values of attribute ``index``.

        Nominal attributes yield 0 for equal, non-missing values and 1
        otherwise.  Numeric attributes yield the (optionally normalised)
        difference; when values are missing, a substitute difference is
        derived from the stored range.  All other attribute types yield 0.
        """
        attrType = self.m_Data.attribute(index).type()
        if attrType == Attribute.NOMINAL:
            if Utils.isMissingValue(val1) or Utils.isMissingValue(
                    val2) or int(val1) != int(val2):
                return 1
            return 0
        elif attrType == Attribute.NUMERIC:
            if Utils.isMissingValue(val1) or Utils.isMissingValue(val2):
                if Utils.isMissingValue(val1) and Utils.isMissingValue(val2):
                    # Both missing: 1 when normalising, else the range width.
                    if not self.m_DontNormalize:
                        return 1
                    return self.m_Ranges[index][self.R_WIDTH]
                else:
                    # Exactly one value is missing: start from the known one.
                    if Utils.isMissingValue(val2):
                        diff = self.norm(
                            val1, index) if not self.m_DontNormalize else val1
                    else:
                        diff = self.norm(
                            val2, index) if not self.m_DontNormalize else val2
                    if not self.m_DontNormalize and diff < 0.5:
                        diff = 1 - diff
                    elif self.m_DontNormalize:
                        # Return the larger of the gaps to max and to min.
                        if (self.m_Ranges[index][self.R_MAX] - diff) > (
                                diff - self.m_Ranges[index][self.R_MIN]):
                            return self.m_Ranges[index][self.R_MAX] - diff
                        else:
                            return diff - self.m_Ranges[index][self.R_MIN]
                    return diff
            else:
                if not self.m_DontNormalize:
                    return self.norm(val1, index) - self.norm(val2, index)
                return val1 - val2
        else:
            return 0

    def norm(self, x: float, i: int):
        """Scale ``x`` by the stored range of attribute ``i``.

        Returns ``(x - min) / width``, or 0 when the width is 0.
        """
        if self.m_Ranges[i][self.R_WIDTH] == 0:
            return 0
        return (x -
                self.m_Ranges[i][self.R_MIN]) / self.m_Ranges[i][self.R_WIDTH]

    def validate(self):
        """Initialise ranges and active indices if they are stale."""
        if not self.m_Validated:
            self.initialize()
            self.m_Validated = True

    def initialize(self):
        """(Re)build the active-attribute flags and the ranges."""
        self.initializeAttributeIndices()
        self.initializeRanges()

    def initializeAttributeIndices(self):
        """Record for each attribute whether it lies in the index range."""
        self.m_AttributeIndices.setUpper(self.m_Data.numAttributes() - 1)
        self.m_ActiveIndices = [
            self.m_AttributeIndices.isInRange(i)
            for i in range(self.m_Data.numAttributes())
        ]

    def initializeRanges(self) -> List[List]:
        """Compute ``m_Ranges`` from the stored dataset and return it.

        Yields None without a dataset and "empty" ranges for a dataset
        without instances.
        """
        if self.m_Data is None:
            self.m_Ranges = None
            return self.m_Ranges
        numAtt = self.m_Data.numAttributes()
        ranges = [[0] * 3 for i in range(numAtt)]
        if self.m_Data.numInstances() <= 0:
            self.initializeRangesEmpty(numAtt, ranges)
            self.m_Ranges = ranges
            return self.m_Ranges
        else:
            self.updateRangesFirst(self.m_Data.instance(0), numAtt, ranges)
        # Instance 0 is visited again here; that is harmless because its
        # values already lie inside the seeded ranges.
        for i in range(self.m_Data.numInstances()):
            self.updateRanges(self.m_Data.instance(i), ranges)
        self.m_Ranges = ranges
        return self.m_Ranges

    def initializeRangesEmpty(self, numAtt: int, ranges: List[List]):
        """Fill the first ``numAtt`` entries of ``ranges`` with empty ranges.

        An empty range is ``min = +inf``, ``max = -inf``, ``width = +inf``
        so that the first value seen later replaces both bounds.  (The
        maximum used to be ``+inf``, which no value could ever exceed.)
        """
        for j in range(numAtt):
            ranges[j][self.R_MIN] = float('inf')
            ranges[j][self.R_MAX] = float('-inf')
            ranges[j][self.R_WIDTH] = float('inf')

    def updateRangesFirst(self, instance: Instance, numAtt: int,
                          ranges: List[List]):
        """Seed ``ranges`` from the first instance of the dataset.

        All entries are reset to ``[0, 0, 0]`` first, which is the correct
        range for attributes the instance does not store.  Every stored,
        non-missing value then becomes both minimum and maximum of its
        attribute (width 0); a stored missing value leaves an empty range
        (``+inf``, ``-inf``, ``+inf``).  Previously the method returned
        right after zeroing, so every range was forced to include 0.

        Returns True if the instance stores at least one non-missing value,
        else False.  ``numAtt`` is not used.
        """
        for entry in ranges:
            for j in range(len(entry)):
                entry[j] = 0
        sawValue = False
        for j in range(instance.numValues()):
            entry = ranges[instance.index(j)]
            if instance.isMissingSparse(j):
                entry[self.R_MIN] = float('inf')
                entry[self.R_MAX] = float('-inf')
                entry[self.R_WIDTH] = float('inf')
            else:
                val = instance.valueSparse(j)
                entry[self.R_MIN] = val
                entry[self.R_MAX] = val
                entry[self.R_WIDTH] = 0
                sawValue = True
        return sawValue

    def updateRanges(self, instance: Instance, ranges: List[List[float]]):
        """Widen ``ranges`` in place so that they cover ``instance``.

        Attributes skipped between two stored indices are treated as having
        the value 0.  Returns ``ranges``.
        """
        numVals = instance.numValues()
        prevIndex = 0
        for j in range(numVals):
            currIndex = instance.index(j)
            # Attributes in [prevIndex, currIndex) are not stored: make
            # sure their range includes 0.
            while prevIndex < currIndex:
                if 0 < ranges[prevIndex][self.R_MIN]:
                    ranges[prevIndex][self.R_MIN] = 0
                    ranges[prevIndex][self.R_WIDTH] = ranges[prevIndex][
                        self.R_MAX] - ranges[prevIndex][self.R_MIN]
                if 0 > ranges[prevIndex][self.R_MAX]:
                    ranges[prevIndex][self.R_MAX] = 0
                    ranges[prevIndex][self.R_WIDTH] = ranges[prevIndex][
                        self.R_MAX] - ranges[prevIndex][self.R_MIN]
                prevIndex += 1
            prevIndex += 1
            if not instance.isMissingSparse(j):
                val = instance.valueSparse(j)
                if val < ranges[currIndex][self.R_MIN]:
                    ranges[currIndex][self.R_MIN] = val
                    ranges[currIndex][self.R_WIDTH] = ranges[currIndex][
                        self.R_MAX] - ranges[currIndex][self.R_MIN]
                if val > ranges[currIndex][self.R_MAX]:
                    ranges[currIndex][self.R_MAX] = val
                    ranges[currIndex][self.R_WIDTH] = ranges[currIndex][
                        self.R_MAX] - ranges[currIndex][self.R_MIN]
        return ranges
Beispiel #20
0
class AttributeLocator():
    """Records which attributes among a set of allowed indices have a type.

    For every allowed attribute index the locator stores whether the
    attribute's type equals the requested type.  Nested locators are kept
    as a parallel list that this implementation only ever fills with
    ``None``.
    """

    def __init__(self, data: Instances, tp: int, a0=None, a1=None):
        """Set up the locator for ``data`` and attribute type ``tp``.

        Supported argument combinations:

        * no ``a0``/``a1``: all attributes of ``data`` are allowed
        * ``a0`` and ``a1`` both ints: the inclusive index range ``a0..a1``
        * ``a0`` a list and ``a1`` omitted: exactly the listed indices

        Any other combination leaves the locator uninitialised.
        """
        self.m_AllowedIndices = None  #type:List[int]
        self.m_Attributes = None  #type:List[bool]
        self.m_Locators = None  #type:List[AttributeLocator]
        self.m_Type = -1
        self.m_Data = None  #type:Instances
        self.m_Indices = None  #type:List[int]
        self.m_LocatorIndices = None  #type:List[int]
        if a0 is None and a1 is None:
            a0, a1 = 0, data.numAttributes() - 1
        if isinstance(a0, int) and isinstance(a1, int):
            self.initialize(data, tp, list(range(a0, a1 + 1)))
        elif isinstance(a0, List) and a1 is None:
            self.initialize(data, tp, a0)

    def initialize(self, data: Instances, type: int, indices: List[int]):
        """Store the configuration and compute the index lists.

        ``Instances(data, 0)`` is kept as the data reference (presumably a
        header-only copy; confirm against the ``Instances`` constructor) and
        ``indices`` is deep-copied so later changes by the caller do not
        affect the locator.
        """
        self.m_Data = Instances(data, 0)
        self.m_Type = type
        self.m_AllowedIndices = copy.deepcopy(indices)
        self.locate()
        self.m_Indices = self.find(True)
        self.m_LocatorIndices = self.find(False)

    def find(self, findAttrs: bool):
        """Return positions (within the allowed indices) of the matches.

        :param findAttrs: True to list positions whose attribute matched the
            type, False to list positions that hold a nested locator
        :return: a new list of positions
        """
        if findAttrs:
            return [pos for pos, matched in enumerate(self.m_Attributes)
                    if matched]
        return [pos for pos, locator in enumerate(self.m_Locators)
                if locator is not None]

    def locate(self):
        """Fill the per-index match flags and the (empty) locator list."""
        allowed = self.m_AllowedIndices
        self.m_Locators = [None] * len(allowed)
        self.m_Attributes = [
            self.m_Data.attribute(attIndex).type() == self.getType()
            for attIndex in allowed
        ]

    def getType(self):
        """Return the attribute type this locator searches for."""
        return self.m_Type

    def getAttributeIndices(self):
        """Return the positions whose attribute has the requested type."""
        return self.m_Indices

    def getLocatorIndices(self):
        """Return the positions that hold a nested locator."""
        return self.m_LocatorIndices

    def getActualIndex(self, index: int):
        """Map a position in the allowed list to the attribute index."""
        return self.m_AllowedIndices[index]

    def getLocator(self, index: int) -> 'AttributeLocator':
        """Return the nested locator stored at ``index`` (may be None)."""
        return self.m_Locators[index]

    def getData(self) -> Instances:
        """Return the data reference stored by ``initialize``."""
        return self.m_Data
Beispiel #21
0
    def threadClassifierRun(self):
        """Evaluate the selected classifier and publish the textual report.

        Presumably executed on the worker thread referenced by
        ``m_RunThread``.  The method copies the loaded data, derives the test
        mode from the radio buttons, builds/evaluates the classifier and
        writes the growing report into ``m_History``.  Test modes used below:

        * 1 - cross-validation (``m_CVBut``), folds read from ``m_CVText``
        * 2 - evaluation on the training data (``m_TrainBut``)
        * 3 - evaluation on the user supplied test set (``m_TestSplitBut``)

        Every exception raised during the run is reported through
        ``error_diglog_signal``.  Afterwards the start/stop buttons and
        ``m_RunThread`` are reset under ``mutex`` in all cases.
        """
        try:
            self.m_CEPanel.addToHistory()
            inst = Instances(self.m_Instances)
            trainTimeElapsed = testTimeElapsed = 0
            userTestStructure = None
            if self.m_SetTestFrame is not None:
                userTestData = self.m_SetTestFrame.getInstances()
                # The test-set frame can exist without any data loaded; only
                # copy when there is something to copy so the other test
                # modes keep working.
                if userTestData is not None:
                    userTestStructure = deepcopy(
                        userTestData)  #type:Instances
                    userTestStructure.setClassIndex(self.m_TestClassIndex)

            # Defaults: output model, output per-class stats, output
            # confusion matrix, store predictions for visualization.
            numFolds = 10
            classIndex = self.m_ClassCombo.currentIndex()
            inst.setClassIndex(classIndex)
            classifier = self.m_ClassifierEditor.getValue()  #type:Classifier
            name = time.strftime("%H:%M:%S - ")
            outPutResult = ""
            evaluation = None  #type:Evaluation
            grph = None

            if self.m_CVBut.isChecked():
                testMode = 1
                numFolds = int(self.m_CVText.text())
                if numFolds <= 1:
                    raise Exception("Number of folds must be greater than 1")
            elif self.m_TrainBut.isChecked():
                testMode = 2
            elif self.m_TestSplitBut.isChecked():
                testMode = 3
                if userTestStructure is None:
                    raise Exception("No user test set has been specified")
                if not inst.equalHeaders(userTestStructure):
                    # Abort instead of opening a message box from this
                    # worker routine and then evaluating on incompatible
                    # data; the except clause below forwards the message.
                    raise Exception("测试数据集属性不同")
            else:
                raise Exception("Unknown test mode")
            cname = classifier.__module__
            if cname.startswith("classifiers."):
                name += cname[len("classifiers."):]
            else:
                name += cname
            plotInstances = ClassifierErrorsPlotInstances()
            # For test mode 3 the user test set replaces this before setUp().
            plotInstances.setInstances(inst)
            plotInstances.setClassifier(classifier)
            plotInstances.setClassIndex(inst.classIndex())
            plotInstances.setPointSizeProportionalToMargin(False)
            outPutResult += "=== Run information ===\n\n"
            outPutResult += "Scheme:       " + cname

            outPutResult += "\n"
            outPutResult += "Relation:     " + inst.relationName() + '\n'
            outPutResult += "Instances:    " + str(inst.numInstances()) + '\n'
            outPutResult += "Attributes:   " + str(inst.numAttributes()) + '\n'
            if inst.numAttributes() < 100:
                for i in range(inst.numAttributes()):
                    outPutResult += "              " + inst.attribute(
                        i).name() + '\n'
            else:
                outPutResult += "              [list of attributes omitted]\n"
            outPutResult += "Test mode:    "
            if testMode == 1:
                outPutResult += str(numFolds) + "-fold cross-validation\n"
            elif testMode == 2:
                outPutResult += "evaluate on training data\n"
            elif testMode == 3:
                outPutResult += "user supplied test set: " + str(
                    userTestStructure.numInstances()) + " instances\n"
            outPutResult += "\n"
            self.m_History.addResult(name, outPutResult)
            self.m_History.setSingle(name)

            # NOTE(review): in cross-validation mode the classifier is not
            # built on the full data, so the model text below describes an
            # unbuilt classifier and the build time stays 0.
            if testMode == 2 or testMode == 3:
                trainTimeStart = time.time()
                classifier.buildClassifier(inst)
                trainTimeElapsed = time.time() - trainTimeStart
            outPutResult += "=== Classifier model (full training set) ===\n\n"
            outPutResult += str(classifier) + "\n"
            outPutResult += "\nTime taken to build model: " + Utils.doubleToString(
                trainTimeElapsed, 2) + " seconds\n\n"
            self.m_History.updateResult(name, outPutResult)
            if isinstance(classifier, Drawable):
                grph = classifier.graph()

            print("==========update Compelte=================")

            if testMode == 2:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                testTimeStart = time.time()
                # TODO: batch prediction is not supported yet
                for jj in range(inst.numInstances()):
                    plotInstances.process(inst.instance(jj), classifier,
                                          evaluation)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on training set ===\n"
            elif testMode == 1:
                rnd = 1
                inst.randomize(rnd)
                if inst.attribute(classIndex).isNominal():
                    inst.stratify(numFolds)
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                for fold in range(numFolds):
                    train = inst.trainCV(numFolds, fold, rnd)
                    evaluation = self.setupEval(evaluation, classifier, train,
                                                plotInstances, True)
                    evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                    # Each fold trains its own copy of the classifier.
                    current = deepcopy(classifier)
                    current.buildClassifier(train)
                    test = inst.testCV(numFolds, fold)
                    # TODO: batch prediction is not supported yet
                    for jj in range(test.numInstances()):
                        plotInstances.process(test.instance(jj), current,
                                              evaluation)
                if inst.attribute(classIndex).isNominal():
                    outPutResult += "=== Stratified cross-validation ===\n"
                else:
                    outPutResult += "=== Cross-validation ===\n"
            elif testMode == 3:
                evaluation = Evaluation(inst)
                evaluation = self.setupEval(evaluation, classifier, inst,
                                            plotInstances, False)

                plotInstances.setInstances(userTestStructure)
                evaluation.setMetricsToDisplay(self.m_selectedEvalMetrics)
                plotInstances.setUp()
                # TODO: batch prediction is not supported yet
                testTimeStart = time.time()
                for i in range(userTestStructure.numInstances()):
                    instance = userTestStructure.instance(i)
                    plotInstances.process(instance, classifier, evaluation)
                testTimeElapsed = time.time() - testTimeStart
                outPutResult += "=== Evaluation on test set ===\n"
            if testMode != 1:
                mode = ""
                if testMode == 2:
                    mode = "training data"
                elif testMode == 3:
                    mode = "supplied test set"
                outPutResult += "\nTime taken to test model on " + mode + ": " + Utils.doubleToString(
                    testTimeElapsed, 2) + " seconds\n\n"
            outPutResult += evaluation.toSummaryString(False) + '\n'
            self.m_History.updateResult(name, outPutResult)
            if inst.attribute(classIndex).isNominal():
                outPutResult += evaluation.toClassDetailsString() + '\n'
                outPutResult += evaluation.toMatrixString() + '\n'
            self.m_History.updateResult(name, outPutResult)
            Utils.debugOut(outPutResult)

            if (plotInstances is not None and plotInstances.canPlot(False)):
                visName = name + " (" + inst.relationName() + ")"
                pl2d = plotInstances.getPlotData(cname)
                plotInstances.cleanUp()
                # Objects handed to the history entry for visualization.
                vv = []
                trainHeader = Instances(self.m_Instances, 0)
                trainHeader.setClassIndex(classIndex)
                vv.append(trainHeader)
                if grph is not None:
                    vv.append(grph)
                if evaluation is not None and evaluation.predictions(
                ) is not None:
                    vv.append(evaluation.predictions())
                    vv.append(inst.classAttribute())
                self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
        except Exception as e:
            self.error_diglog_signal.emit(str(e))
        self.mutex.lock()
        try:
            self.m_StartBut.setEnabled(True)
            self.m_StopBut.setEnabled(False)
            self.m_RunThread = None
        finally:
            # Always release the lock, even if a widget call fails.
            self.mutex.unlock()
        print("RunFinished")
Beispiel #22
0
    def buildClusterer(self, data: Instances):
        """Build the k-means clustering model for ``data``.

        Steps: check capabilities, replace missing values through a filter,
        compute the statistics of the full data set, pick distinct initial
        centroids at random (seeded with ``getSeed()``), then alternate
        between assigning instances to clusters and moving the centroids
        until no assignment changes or ``m_MaxIterations`` is reached.
        Clusters that end up empty are dropped, which lowers
        ``NumClusters``.

        :param data: the instances to cluster; a copy is worked on
        :raises: whatever ``getCapabilities().testWithFail`` raises for
            unsupported data
        """
        self.getCapabilities().testWithFail(data)
        self.m_Iterations = 0
        # Replace missing values with a filter (original note: numeric
        # attributes use the mean, nominal ones the most frequent value).
        self.m_ReplaceMissingFilter = ReplaceMissingValues()
        instances = Instances(data)
        instances.setClassIndex(-1)
        self.m_ReplaceMissingFilter.setInputFormat(instances)
        instances = Filter.useFilter(instances, self.m_ReplaceMissingFilter)

        # Per-cluster attribute value frequencies, indexed
        # [cluster][attribute][value].
        self.m_ClusterNominalCounts = [
            [[] for i in range(instances.numAttributes())]
            for j in range(self.NumClusters)
        ]
        # Per-cluster, per-attribute missing-value counts.
        self.m_ClusterMissingCounts = [
            [0] * instances.numAttributes() for i in range(self.NumClusters)
        ]

        # Statistics of the whole data set, computed by treating it as
        # cluster 0.
        self.m_FullMeansOrMediansOrModes = self.moveCentroid(
            0, instances, True, False)
        self.m_FullMissingCounts = self.m_ClusterMissingCounts[0]
        self.m_FullNominalCounts = self.m_ClusterNominalCounts[0]
        sumofWeights = instances.sumOfWeight()
        for i in range(instances.numAttributes()):
            if instances.attribute(i).isNumeric():
                if self.m_FullMissingCounts[i] == sumofWeights:
                    self.m_FullMeansOrMediansOrModes[i] = float('nan')
            else:
                if self.m_FullMissingCounts[i] > self.m_FullNominalCounts[i][
                        Utils.maxIndex(self.m_FullNominalCounts[i])]:
                    self.m_FullMeansOrMediansOrModes[i] = -1
        self.m_ClusterCentroids = Instances(instances, self.NumClusters)
        clusterAssignments = [0] * instances.numInstances()
        self.m_DistanceFunction.setInstances(instances)
        random.seed(self.getSeed())
        initC = dict()  #type:Dict[DecisionTableHashKey,int]
        initInstances = instances

        # Draw distinct instances as starting centroids; the hash key is
        # used to skip instances already chosen.
        for j in range(initInstances.numInstances() - 1, -1, -1):
            instIndex = random.randint(0, j)
            hk = DecisionTableHashKey(initInstances.instance(instIndex),
                                      initInstances.numAttributes(), True)
            if hk not in initC:
                self.m_ClusterCentroids.add(initInstances.instance(instIndex))
                initC[hk] = None
            initInstances.swap(j, instIndex)
            if self.m_ClusterCentroids.numInstances() == self.NumClusters:
                break

        self.m_initialStartPoints = Instances(self.m_ClusterCentroids)
        # Fewer distinct instances than requested clusters lowers the count.
        self.NumClusters = self.m_ClusterCentroids.numInstances()
        converged = False
        tempI = []  #type:List[Instances]
        self.m_squaredErrors = [0] * self.NumClusters
        self.m_ClusterNominalCounts = [
            [[] for i in range(instances.numAttributes())]
            for j in range(self.NumClusters)
        ]
        self.m_ClusterMissingCounts = [
            [0] * instances.numAttributes() for i in range(self.NumClusters)
        ]
        # Iteratively update the centroids.
        while not converged:
            emptyClusterCount = 0
            self.m_Iterations += 1
            converged = True
            # Assignment always runs sequentially here.  It used to be
            # skipped entirely when m_executionSlots > 1, which left every
            # instance in cluster 0.
            for i in range(instances.numInstances()):
                toCluster = instances.instance(i)
                newC = self.clusterProcessedInstance(toCluster, False, True)
                if newC != clusterAssignments[i]:
                    converged = False
                clusterAssignments[i] = newC
            self.m_ClusterCentroids = Instances(instances, self.NumClusters)
            # Start every iteration with fresh, empty per-cluster containers;
            # reusing the old ones would accumulate instances across
            # iterations.
            tempI = [
                Instances(instances, 0) for _ in range(self.NumClusters)
            ]
            for i in range(instances.numInstances()):
                tempI[clusterAssignments[i]].add(instances.instance(i))
            for i in range(self.NumClusters):
                if tempI[i].numInstances() == 0:
                    emptyClusterCount += 1
                else:
                    self.moveCentroid(i, tempI[i], True, True)
            if self.m_Iterations == self.m_MaxIterations:
                converged = True
            if emptyClusterCount > 0:
                self.NumClusters -= emptyClusterCount
                if converged:
                    # Compact the final clusters and their nominal counts so
                    # that indices 0..NumClusters-1 are the non-empty ones.
                    t = [None] * self.NumClusters  #type:List[Instances]
                    index = 0
                    for k in range(len(tempI)):
                        if tempI[k].numInstances() > 0:
                            t[index] = tempI[k]
                            for i in range(tempI[k].numAttributes()):
                                self.m_ClusterNominalCounts[index][
                                    i] = self.m_ClusterNominalCounts[k][i]
                            index += 1
                    tempI = t
            if not converged:
                self.m_ClusterNominalCounts = [
                    [[] for i in range(instances.numAttributes())]
                    for j in range(self.NumClusters)
                ]
        if not self.m_FastDistanceCalc:
            for i in range(instances.numInstances()):
                self.clusterProcessedInstance(instances.instance(i), True,
                                              False)

        self.m_ClusterSizes = []
        for i in range(self.NumClusters):
            self.m_ClusterSizes.append(tempI[i].sumOfWeight())
        self.m_DistanceFunction.clean()
Beispiel #23
0
class KNN(AbstractClassifier):
    """k-nearest-neighbour classifier backed by a linear neighbour search.

    Predictions are class distributions built from the ``kNN`` nearest
    training instances, optionally weighted by distance (see the
    ``WEIGHT_*`` constants).
    """
    WEIGHT_NONE = 0
    WEIGHT_INVERSE = 1
    WEIGHT_SIMILARITY = 2
    TAGS_WEIGHTING = [Tag(WEIGHT_NONE, "No distance weighting"),
                      Tag(WEIGHT_INVERSE, "Weight by 1/distance"),
                      Tag(WEIGHT_SIMILARITY, "Weight by 1-distance")]
    # NOTE(review): both dicts are class-level and therefore shared by all
    # instances; setkNN() writes into propertyList.  Presumably consumed by
    # a property editor - confirm before changing.
    propertyList = {"kNN": "1", "DistanceWeighting": "TAGS_WEIGHTING"}
    methodList = {"kNN": "setkNN", "DistanceWeighting": "setDistanceWeighting"}

    def __init__(self, k: int = None):
        """Create the classifier with defaults; ``k`` overrides kNN=1."""
        super().__init__()
        self.m_NNSearch = LinearNNSearch()
        self.m_Train = None  #type:Instances
        self.initilize()
        if k is not None:
            self.setKNN(k)

    def __str__(self):
        """Return a short textual description of the model."""
        if self.m_Train is None:
            return "IBk: No model built yet."
        if self.m_Train.numInstances() == 0:
            return "Warning: no training instances - ZeroR model used."
        # TODO: advanced options are not reported yet
        result = "IB1 instance-based classifier\n" + "using " + str(self.kNN)
        if self.DistanceWeighting == self.WEIGHT_INVERSE:
            result += " inverse-distance-weighted"
        elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
            result += " similarity-weighted"
        result += " nearest neighbour(s) for classification\n"
        if self.WindowSize != 0:
            result += "using a maximum of " + str(
                self.WindowSize) + " (windowed) training instances\n"
        return result

    def setkNN(self, value: str):
        """Set the number of neighbours from its textual form.

        Text that ``int()`` rejects with ``ValueError`` is ignored and the
        current setting is kept.  On success the value goes through
        ``setKNN`` so the upper bound and the validity flag stay in sync,
        and the text is recorded in ``propertyList``.
        """
        try:
            val = int(value)
        except ValueError:
            return
        self.setKNN(val)
        self.propertyList.update({"kNN": value})

    def setDistanceWeighting(self, value: int):
        """Select the weighting scheme by its position in TAGS_WEIGHTING."""
        self.DistanceWeighting = self.TAGS_WEIGHTING[value].getID()

    def initilize(self):
        """Reset all options to their defaults."""
        self.setKNN(1)
        # Maximum number of training instances kept; 0 means the whole set.
        self.WindowSize = 0
        self.DistanceWeighting = self.WEIGHT_NONE
        self.CrossValidate = False
        self.MEanSquared = False

    def setKNN(self, k: int):
        """Set the number of neighbours and invalidate any selected k."""
        self.kNN = k
        self.m_kNNUpper = k
        self.m_kNNValid = False

    def getKNN(self):
        """Return the number of neighbours used."""
        return self.kNN

    def getCapabilities(self):
        """Return the capabilities: nominal/numeric/date attributes and
        classes, missing values allowed, no minimum number of instances."""
        result = super().getCapabilities()
        result.disableAll()

        result.enable(CapabilityEnum.NOMINAL_ATTRIBUTES)
        result.enable(CapabilityEnum.NUMERIC_ATTRIBUTES)
        result.enable(CapabilityEnum.DATE_ATTRIBUTES)
        result.enable(CapabilityEnum.MISSING_VALUES)

        result.enable(CapabilityEnum.NOMINAL_CLASS)
        result.enable(CapabilityEnum.NUMERIC_CLASS)
        result.enable(CapabilityEnum.DATE_CLASS)
        result.enable(CapabilityEnum.MISSING_CLASS_VALUES)

        result.setMinimumNumberInstances(0)
        return result

    def buildClassifier(self, data: Instances):
        """Store the training data and prepare the neighbour search.

        Instances with a missing class are removed from a copy of ``data``.
        When a window size is set, only the last ``WindowSize`` instances
        are kept.  A ZeroR model is built as fallback for an empty training
        set.
        """
        self.getCapabilities().testWithFail(data)
        instances = Instances(data)
        instances.deleteWithMissingClass()

        self.m_NumClasses = instances.numClasses()
        self.m_ClassType = instances.classAttribute().type()
        # Only the training instances themselves are stored.
        self.m_Train = Instances(instances, 0, instances.numInstances())
        if self.WindowSize > 0 and instances.numInstances() > self.WindowSize:
            self.m_Train = Instances(
                self.m_Train,
                self.m_Train.numInstances() - self.WindowSize,
                self.WindowSize)
        # Count the nominal/numeric non-class attributes; used to scale
        # distances in makeDistribution.
        self.m_NumAttributesUsed = 0
        for i in range(self.m_Train.numAttributes()):
            if i != self.m_Train.classIndex() and (
                    self.m_Train.attribute(i).isNominal()
                    or self.m_Train.attribute(i).isNumeric()):
                self.m_NumAttributesUsed += 1
        self.m_NNSearch.setInstances(self.m_Train)
        self.m_kNNValid = False
        self.m_defaultModel = ZeroR()
        self.m_defaultModel.buildClassifier(instances)

    def distributionForInstance(self, instance: Instance) -> List[float]:
        """Return the class distribution predicted for ``instance``."""
        if self.m_Train.numInstances() == 0:
            return self.m_defaultModel.distributionForInstance(instance)
        # Over the window capacity: drop the oldest instances one by one.
        if self.WindowSize > 0 and self.m_Train.numInstances() > self.WindowSize:
            self.m_kNNValid = False
            deletedInstance = False
            while self.m_Train.numInstances() > self.WindowSize:
                self.m_Train.delete(0)
                deletedInstance = True
            # Hand the trimmed data to the search again so it does not work
            # on stale instances.
            if deletedInstance:
                self.m_NNSearch.setInstances(self.m_Train)
        if not self.m_kNNValid and self.CrossValidate and self.m_kNNUpper >= 1:
            # TODO: selecting k by cross-validation is not implemented
            pass
        self.m_NNSearch.addInstanceInfo(instance)
        # Fetch the k nearest neighbours and their distances.
        neighbours = self.m_NNSearch.kNearestNeighbours(instance, self.kNN)
        distances = self.m_NNSearch.getDistances()
        distribution = self.makeDistribution(neighbours, distances)
        return distribution

    def makeDistribution(self, neighbours: Instances,
                         distances: List) -> List[float]:
        """Turn the neighbours into a normalized class distribution.

        :param neighbours: the nearest training instances
        :param distances: their distances; rescaled in place by the number
            of attributes used
        :return: for a nominal class one probability per class value, for a
            numeric class a single weighted-average prediction
        """
        distribution = [0] * self.m_NumClasses
        total = 0
        if self.m_ClassType == Attribute.NOMINAL:
            # Small prior for every class so no class gets probability 0.
            for i in range(self.m_NumClasses):
                distribution[i] = 1 / max(1, self.m_Train.numInstances())
            total = self.m_NumClasses / max(1, self.m_Train.numInstances())
        for i in range(neighbours.numInstances()):
            current = neighbours.instance(i)
            distances[i] = distances[i] * distances[i]
            distances[i] = math.sqrt(distances[i] / self.m_NumAttributesUsed)
            if self.DistanceWeighting == self.WEIGHT_INVERSE:
                # The small offset avoids a division by zero when the
                # neighbour coincides with the query instance.
                weight = 1 / (distances[i] + 0.001)
            elif self.DistanceWeighting == self.WEIGHT_SIMILARITY:
                weight = 1 - distances[i]
            else:
                weight = 1
            weight *= current.weight()
            if self.m_ClassType == Attribute.NOMINAL:
                distribution[int(current.classValue())] += weight
            elif self.m_ClassType == Attribute.NUMERIC:
                distribution[0] += current.classValue() * weight
            total += weight
        if total > 0:
            Utils.normalize(distribution, total)
        return distribution
Beispiel #24
0
    def testInstances(self, data: Instances, *args):
        if len(args) == 0:
            return self.testInstances(data, 0, data.numAttributes() - 1)
        fromIndex = args[0]
        toIndex = args[1]
        if self.doNotCheckCapabilities():
            return True
        if len(self.m_Capabilities) == 0 or (len(self.m_Capabilities) == 1
                                             and self.handles(
                                                 CapabilityEnum.NO_CLASS)):
            sys.stderr.write("No capabilities set!")
        if toIndex - fromIndex < 0:
            self.m_FailReason = CapabilityError("No attributes!")
            return False
        testClass = data.classIndex() > -1 and data.classIndex(
        ) >= fromIndex and data.classIndex() <= toIndex
        for i in range(fromIndex, toIndex + 1):
            att = data.attribute(i)
            if i == data.classIndex():
                continue
            if not self.testAttribute(att):
                return False
        if not self.handles(
                CapabilityEnum.NO_CLASS) and data.classIndex() == -1:
            self.m_FailReason = CapabilityError("Class attribute not set!")
            return False

        if self.handles(CapabilityEnum.NO_CLASS) and data.classIndex() > -1:
            cap = self.getClassCapabilities()
            cap.disable(CapabilityEnum.NO_CLASS)
            iter = cap.capabilities()
            if len(iter) == 0:
                self.m_FailReason = CapabilityError(
                    "Cannot handle any class attribute!")
                return False
        if testClass and not self.handles(CapabilityEnum.NO_CLASS):
            att = data.classAttribute()
            if not self.testAttribute(att, True):
                return False
            if not self.handles(CapabilityEnum.MISSING_CLASS_VALUES):
                for i in range(data.numInstances()):
                    if data.instance(i).classIsMissing():
                        self.m_FailReason = CapabilityError(
                            "Cannot handle missing class values!")
                        return False
            else:
                hasClass = 0
                for i in range(data.numInstances()):
                    if not data.instance(i).classIsMissing():
                        hasClass += 1
                if hasClass < self.getMinimumNumberInstances():
                    self.m_FailReason=CapabilityError("Not enough training instances with class labels (required: "\
                                                      + str(self.getMinimumNumberInstances())\
                                                      + ", provided: "\
                                                      + str(hasClass)\
                                                      + ")!")
                    return False
        missing = False
        for i in range(data.numInstances()):
            inst = data.instance(i)
            if not self.handles(CapabilityEnum.MISSING_VALUES):
                #TODO 使用稀疏矩阵pass
                # if isinstance(inst)
                #     pass
                #else
                for n in range(fromIndex, toIndex + 1):
                    if n == inst.classIndex():
                        continue
                    if inst.isMissing(n):
                        missing = True
                        break
                if missing:
                    self.m_FailReason = CapabilityError(
                        "Cannot handle missing values!")
                    return False
        if data.numInstances() < self.getMinimumNumberInstances():
            self.m_FailReason = CapabilityError(
                "Not enough training instances (required: " +
                str(self.getMinimumNumberInstances()) + ", provided: " +
                str(data.numInstances()) + ")!")
            return False
        # if self.handles(CapabilityEnum.ONLY_MULTIINSTANCE):
        #     if data.numAttributes() != 3:
        #         return False
        #     if not data.attribute(0).isNominal() or data.classIndex() != data.numAttributes()-1:
        #         return False
        #     owner=self.getOwner()
        #     if isinstance(owner,MultiInstanceCapabilitiesHandler):
        #         handler=owner
        #         cap=handler.getMultiInstanceCapabilities()
        #         if data.numInstances()>0 and data.attribute(1).numValues()>0:
        #             result=cap.testAttribute(data.attribute(1))
        return True
Beispiel #25
0
 def clusterRunThread(self):
     """Build and evaluate the selected clusterer and publish the report.

     Test mode 0 (``m_TrainBut``) evaluates on the training data, test
     mode 1 (``m_TestSplitBut``) on the user supplied test set.  The
     textual report is written step by step into ``m_History``; plot data
     is emitted through ``history_add_visualize_signal`` when available.

     Exceptions are not handled here and propagate to the caller, but the
     run state (``m_RunThread`` and the start/stop buttons) is reset in
     every case.
     """
     try:
         self.m_CLPanel.addToHistory()
         inst = Instances(self.m_Instances)
         inst.setClassIndex(-1)
         plotInstances = ClustererAssignmentsPlotInstances()
         plotInstances.setClusterer(self.m_ClustererEditor.getValue())
         userTest = None
         if self.m_SetTestFrame is not None:
             if self.m_SetTestFrame.getInstances() is not None:
                 userTest = Instances(self.m_SetTestFrame.getInstances())
         clusterer = self.m_ClustererEditor.getValue()
         outBuff = ""
         name = time.strftime("%H:%M:%S - ")
         cname = clusterer.__module__
         if cname.startswith("clusterers."):
             name += cname[len("clusterers."):]
         else:
             name += cname
         if self.m_TrainBut.isChecked():
             testMode = 0
         elif self.m_TestSplitBut.isChecked():
             testMode = 1
             if userTest is None:
                 raise Exception("No user test set has been opened")
             if not inst.equalHeaders(userTest):
                 raise Exception("Train and test set are not compatible\n" +
                                 inst.equalHeadersMsg(userTest))
         else:
             raise Exception("Unknown test mode")
         trainInst = Instances(inst)
         outBuff += "=== Run information ===\n\n"
         outBuff += "Scheme:       " + cname
         outBuff += "\n"
         outBuff += "Relation:     " + inst.relationName() + '\n'
         outBuff += "Instances:    " + str(inst.numInstances()) + '\n'
         outBuff += "Attributes:   " + str(inst.numAttributes()) + '\n'
         if inst.numAttributes() < 100:
             for i in range(inst.numAttributes()):
                 outBuff += "              " + inst.attribute(i).name() + '\n'
         else:
             outBuff += "              [list of attributes omitted]\n"
         outBuff += "Test mode:    "
         if testMode == 0:
             outBuff += "evaluate on training data\n"
         elif testMode == 1:
             # This line was previously a bare expression and never made it
             # into the report.
             outBuff += "user supplied test set: " + str(
                 userTest.numInstances()) + " instances\n"
         outBuff += '\n'
         self.m_History.addResult(name, outBuff)
         self.m_History.setSingle(name)
         trainTimeStart = time.time()
         if isinstance(clusterer, Clusterer):
             clusterer.buildClusterer(self.removeClass(trainInst))
         trainTimeElapsed = time.time() - trainTimeStart
         outBuff += "\n=== Clustering model (full training set) ===\n\n"
         outBuff += str(clusterer) + '\n'
         outBuff+="\nTime taken to build model (full training data) : " \
                  + Utils.doubleToString(trainTimeElapsed, 2)\
                 + " seconds\n\n"
         self.m_History.updateResult(name, outBuff)
         evaluation = ClusterEvaluation()
         evaluation.setClusterer(clusterer)
         if testMode == 0:
             evaluation.evaluateClusterer(trainInst, False)
             plotInstances.setInstances(inst)
             plotInstances.setClusterEvaluation(evaluation)
             outBuff += "=== Model and evaluation on training set ===\n\n"
         elif testMode == 1:
             userTestT = Instances(userTest)
             evaluation.evaluateClusterer(userTestT, False)
             plotInstances.setInstances(userTest)
             plotInstances.setClusterEvaluation(evaluation)
             outBuff += "=== Evaluation on test set ===\n"
         else:
             raise Exception("Test mode not implemented")
         outBuff += evaluation.clusterResultsToString()
         outBuff += '\n'
         self.m_History.updateResult(name, outBuff)
         if plotInstances is not None and plotInstances.canPlot(True):
             visName = name + " (" + inst.relationName() + ")"
             pl2d = plotInstances.getPlotData(name)
             plotInstances.cleanUp()
             vv = []
             trainHeader = Instances(self.m_Instances, 0)
             vv.append(trainHeader)
             self.history_add_visualize_signal.emit(name, vv, visName, pl2d)
     finally:
         # Reset the run state even when the run failed, so the start
         # button does not stay disabled.
         self.m_RunThread = None
         self.m_StartBut.setEnabled(True)
         self.m_StopBut.setEnabled(False)
     print("Run Finished")
Beispiel #26
0
 def leftSide(self, data: Instances):
     """Return the name of the attribute at ``m_attIndex`` in ``data``."""
     splitAttribute = data.attribute(self.m_attIndex)
     return splitAttribute.name()