def __call__(self, data, weight=None):
    """Fit a naive Bayes classifier on an entropy-discretized copy of *data*.

    The continuous attributes are discretized with MDL/entropy cut points
    first; the Bayes learner is then trained with threshold adjustment
    disabled (adjustThreshold=0).
    """
    discretized = orange.Preprocessor_discretize(
        data, method=orange.EntropyDiscretization())
    bayes_model = orange.BayesLearner(discretized, weight, adjustThreshold=0)
    return Classifier(classifier=bayes_model)
def __calculateMeasures(self):
    """Evaluate the current classifier on the hold-out set loaded from
    Elasticsearch and log precision/recall/F-measure for both classes.

    Class "1" is treated as the positive ("Good") class; anything else is
    negative ("Bad").
    """
    # NOTE(review): falsePositives/falseNegatives are initialized but never
    # updated or reported below.
    falsePositives = 0
    falseNegatives = 0
    truePositives = 0
    trueNegatives = 0
    totalPositives = 0
    totalNegatives = 0
    totalHoldOutGoodPhrases = 0
    totalHoldOutBadPhrases = 0
    # Load train and hold-out sets; the hold-out table reuses the training
    # domain so both share the same attribute definitions.
    self.trainD = self.__loadDataFromES("train", None)
    self.holdOutD = self.__loadDataFromES("hold", self.trainD.domain)
    # Discretize train data, then re-map the hold-out examples onto the
    # discretized domain so the classifier sees interval-valued features.
    self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
    self.holdOutD = orange.ExampleTable(self.trainD.domain, self.holdOutD)
    for row in self.holdOutD:
        actualClassType = row[-1].value
        # The phrase text is stored as the (single) meta attribute of the row.
        phrase = row.getmetas().values()[0].value
        featureSet = {}
        for i,feature in enumerate(self.features):
            featureSet[feature["name"]] = row[i].value
        # Lazily load the pickled classifier on first use.
        if self.classifier == None:
            classifierFile = open(self.classifierFilePath)
            self.classifier = pickle.load(classifierFile)
            classifierFile.close()
        # prob is computed but not used in the measures below.
        prob = self.classifier.prob_classify(featureSet).prob("1")
        classType = self.classifier.classify(featureSet)
        if classType == "1":
            totalPositives += 1
            if classType == actualClassType:
                truePositives += 1
        else:
            totalNegatives += 1
            if classType == actualClassType:
                trueNegatives += 1
        if actualClassType == "1":
            totalHoldOutGoodPhrases += 1
        else:
            totalHoldOutBadPhrases += 1
    # NOTE(review): these divisions raise ZeroDivisionError when a class is
    # never predicted or absent from the hold-out set — confirm the data
    # always contains both classes.
    precisionOfGood = 100.0 * truePositives/totalPositives
    recallOfGood = 100.0 * truePositives/totalHoldOutGoodPhrases
    fMeasureOfGood = 2.0 * precisionOfGood * recallOfGood / (precisionOfGood + recallOfGood)
    precisionOfBad = 100.0 * trueNegatives/totalNegatives
    recallOfBad = 100.0*trueNegatives/totalHoldOutBadPhrases
    fMeasureOfBad = 2.0 * precisionOfBad * recallOfBad / (precisionOfBad + recallOfBad)
    self.logger.info("\nPrecision of Good: " + str(round(precisionOfGood, 2)) + "%")
    self.logger.info("Recall of Good: " + str(round(recallOfGood, 2)) + "%")
    self.logger.info("Balanced F-measure of Good: " + str(round(fMeasureOfGood, 2)) + "%")
    self.logger.info("Precision of Bad: " + str(round(precisionOfBad, 2)) + "%")
    self.logger.info("Recall of Bad: " + str(round(recallOfBad, 2)) + "%")
    self.logger.info("Balanced F-measure of Bad: " + str(round(fMeasureOfBad, 2)) + "%")
def classify(self):
    """Worker loop: receive messages, classify phrases from Elasticsearch
    test data, and index the results back.

    Terminates on a "kill" message once no dispatchers remain registered.
    """
    while True:
        message = self.worker.receive()
        if message["content"] == "kill":
            # Echo the request id back and close the channel for this message.
            message["responseId"] = message["requestId"]
            self.worker.close(message)
            if len(self.dispatchers) == 0:
                # No one left to serve — shut the worker down and exit the loop.
                self.worker.end()
                break
            else:
                # Re-queue the kill so it is seen again after dispatchers drain.
                self.worker.send(content="kill", to=self.workerName)
                continue
        elif message["content"]["type"] == "classify":
            # Register a channel back to the dispatcher on first contact.
            if message["content"]["from"] not in self.dispatchers:
                self.dispatchers[message["content"]["from"]] = RemoteChannel(message["content"]["from"], self.config)
                self.dispatchers[message["content"]["from"]].listen(self.unregisterDispatcher)
            self.phraseId = message["content"]["phraseId"]
            # Train lazily on first classify request.
            if self.classifier == None:
                self.trainD = self.__loadDataFromES("train", None)
                self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
                self.__train()
            # Reload train data to define the domain, then map the test set
            # onto the entropy-discretized training domain.
            self.trainD = self.__loadDataFromES("train", None)
            testD = self.__loadDataFromES("test", self.trainD.domain)
            self.trainD = orange.Preprocessor_discretize(self.trainD, method=orange.EntropyDiscretization())
            testD = orange.ExampleTable(self.trainD.domain, testD)
            for row in testD:
                # Phrase text is carried in the row's meta attribute.
                phrase = row.getmetas().values()[0].value
                featureSet = {}
                for i,feature in enumerate(self.features):
                    featureSet[feature["name"]] = row[i].value
                prob = self.classifier.prob_classify(featureSet).prob("1")
                classType = self.classifier.classify(featureSet)
                self.phraseData["_source"]["prob"] = prob
                self.phraseData["_source"]["class_type"] = classType
                self.logger.info("Classified '" + phrase + "' as " + classType + " with probability " + str(prob))
                # NOTE(review): every row is indexed under the same
                # self.phraseId / self.phraseData — confirm the test set is
                # expected to contain a single phrase per request.
                self.esClient.index(index=self.processorIndex, doc_type=self.processorPhraseType, id=self.phraseId, body=self.phraseData["_source"])
            self.worker.reply(message, {"phraseId": self.phraseId, "status" : "classified", "type" : "reply"}, 120000000)
    self.logger.info("Terminating classification worker")
def __call__(self, data, targetClass, num_of_rules=0):
    '''Returns CN2-SD rules by performing weighted covering algorithm.

    data          -- orange ExampleTable; continuous attributes are
                     entropy-discretized first.
    targetClass   -- class value the induced rules describe.
    num_of_rules  -- maximum number of rules to induce (0 = unlimited).
    '''
    data_discretized = False
    # If any of the attributes are continuous, discretize them
    if data.domain.hasContinuousAttributes():
        original_data = data
        data_discretized = True
        new_domain = []
        discretize = orange.EntropyDiscretization(forceAttribute=True)
        for attribute in data.domain.attributes:
            if attribute.varType == orange.VarTypes.Continuous:
                d_attribute = discretize(attribute, data)
                # An attribute is irrelevant, if it is discretized into a single interval
                # if len(d_attribute.getValueFrom.transformer.points) > 0:
                new_domain.append(d_attribute)
            else:
                new_domain.append(attribute)
        data = original_data.select(new_domain + [original_data.domain.classVar])
    self.data = data
    self.max_rules = num_of_rules
    rules = []
    tc = orange.Value(data.domain.classVar, targetClass)
    # weighted covering
    self.data.addMetaAttribute(self.weightID)  # set weights of all examples to 1
    self.data.addMetaAttribute(self.counter)   # set counters of all examples to 0
    targetClassRule = SDRule(data, targetClass, conditions=[], g=1)
    # Repeatedly find the best rule, keep it, and down-weight the examples
    # it covers until rule quality drops to 0 or the rule budget is spent.
    tmpRule = self.rbf(data, self.weightID, targetClass, None)
    while (tmpRule.quality > 0) and (self.max_rules == 0 or len(rules) < self.max_rules):
        bestRule = SDRule(self.data, tc, tmpRule.filter.conditions)
        bestRule.quality = tmpRule.quality
        self.decreaseExampleWeights(bestRule)
        rules.append(bestRule)
        tmpRule = self.rbf(data, self.weightID, targetClass, None)
    if data_discretized:
        targetClassRule = SDRule(original_data, targetClass, conditions=[], g=1)
        # change beam so the rules apply to original data
        rules = [rule.getUndiscretized(original_data) for rule in rules]
    else:
        targetClassRule = SDRule(data, targetClass, conditions=[], g=1)
    return SDRules(rules, targetClassRule, "CN2-SD")
def getDiscretizer(self):
    """Construct the preprocessor matching the selected discretization
    index (self.discInd): 0 = entropy, 1/2 = table-driven methods from
    self.DISCRETIZERS, 3 = drop continuous attributes."""
    ind = self.discInd
    if ind == 0:
        chosen = Preprocessor_discretizeEntropy(
            method=orange.EntropyDiscretization())
    elif ind in (1, 2):
        _label, disc_cls, kwds = self.DISCRETIZERS[ind]
        # Widget settings override the per-method defaults where present.
        overrides = dict((key, getattr(self, key, default))
                         for key, default in kwds.items())
        chosen = Preprocessor_discretize(method=disc_cls(**overrides))
    elif ind == 3:
        chosen = Preprocessor_removeContinuous()
    return chosen
def discretizeDomain(data, removeUnusedValues=1, numberOfIntervals=2):
    """Return *data* translated to a fully discrete domain, or None for
    empty input.

    Continuous attributes are entropy-discretized when a discrete class
    exists, otherwise split into *numberOfIntervals* equal-frequency bins.
    A continuous class is equal-frequency discretized first. Attributes
    that cannot be discretized are dropped with a warning.
    """
    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals=numberOfIntervals)
    discAttrs = []
    className = data and len(data) > 0 and data.domain.classVar and data.domain.classVar.name or None
    # if className:
    #     data = data.filterref(orange.Filter_hasClassValue())  # remove examples with missing classes
    if not data or len(data) == 0:
        return None
    # if we have a continuous class we have to discretize it before we can discretize the attributes
    if className and data.domain.classVar.varType == orange.VarTypes.Continuous:
        try:
            newClass = equiDisc(data.domain.classVar.name, data)
            newClass.name = className
        except orange.KernelException as ex:
            # Fall back to a class-less domain when the class cannot be binned.
            warnings.warn("Could not discretize class variable '%s'. %s" % (data.domain.classVar.name, ex.message))
            newClass = None
            className = None
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)
    for attr in data.domain.attributes:
        try:
            name = attr.name
            if attr.varType == orange.VarTypes.Continuous:
                # if continuous attribute then use entropy discretization
                if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
                    new_attr = entroDisc(attr, data)
                else:
                    new_attr = equiDisc(attr, data)
            else:
                new_attr = attr
            if removeUnusedValues:
                # RemoveUnusedValues returns None when no values survive;
                # convert that into the KernelException handled below.
                new_attr = orange.RemoveUnusedValues(new_attr, data)
                if new_attr is None:
                    raise orange.KernelException("No values")
            new_attr.name = name
            discAttrs.append(new_attr)
        except orange.KernelException as ex:
            # if all values are missing, entropy discretization will throw an exception. in such cases ignore the attribute
            warnings.warn("Could not discretize %s attribute. %s" % (attr.name, ex.message))
    if className:
        discAttrs.append(data.domain.classVar)
    d2 = data.translate(discAttrs, True)
    return d2
def __call__(self, data, targetClass, num_of_rules):
    """Run the SD beam-search algorithm and return the induced rules.

    data         -- orange ExampleTable; continuous attributes are
                    entropy-discretized first.
    targetClass  -- class value the rules should describe.
    num_of_rules -- size of the final rule subset (0 = keep whole beam).

    Returns an SDRules object (or None when dataOK() rejects the data).

    BUG FIX: the original used the undefined names ``true``/``false``
    (NameError in Python); they are now the builtin True/False.
    """
    if self.dataOK(data):  # Checks whether targetClass is discrete
        data_discretized = False
        # If any of the attributes are continuous, discretize them
        if data.domain.hasContinuousAttributes():
            original_data = data
            data_discretized = True
            new_domain = []
            discretize = orange.EntropyDiscretization(forceAttribute=True)
            for attribute in data.domain.attributes:
                if attribute.varType == orange.VarTypes.Continuous:
                    d_attribute = discretize(attribute, data)
                    # An attribute is irrelevant, if it is discretized into a single interval
                    # if len(d_attribute.getValueFrom.transformer.points) > 0:
                    new_domain.append(d_attribute)
                else:
                    new_domain.append(attribute)
            data = original_data.select(new_domain + [original_data.domain.classVar])
        # initialization of beams
        beam = [SDRule(data=data, targetClass=targetClass, g=self.g)] * self.beamWidth
        newBeam = [SDRule(data=data, targetClass=targetClass, g=self.g)] * self.beamWidth
        worstRuleIndex = 0
        improvements = True
        # Keep specializing beam rules while any refinement improves the beam.
        while improvements:
            improvements = False
            for rule in beam:
                for attr in data.domain.attributes:
                    value = attr.firstvalue()
                    while(value):
                        newRule = rule.cloneAndAddCondition(attr, value)
                        if newRule.support > self.minSupport and self.betterThanWorstRule(newRule, newBeam, worstRuleIndex) and self.isRelevant(newRule, newBeam):
                            worstRuleIndex = self.replaceWorstRule(newRule, newBeam, worstRuleIndex)
                            improvements = True
                        value = attr.nextvalue(value)
            beam = newBeam
        # perform rule subset selection
        if num_of_rules != 0:
            beam = self.ruleSubsetSelection(beam, num_of_rules, data)
        if data_discretized:
            targetClassRule = SDRule(original_data, targetClass, conditions=[], g=1)
            # change beam so the rules apply to original data
            beam = [rule.getUndiscretized(original_data) for rule in beam]
        else:
            targetClassRule = SDRule(data, targetClass, conditions=[], g=1)
        return SDRules(beam, targetClassRule, "SD")
def entropyDiscretization(data):
    """Entropy-discretize the continuous attributes of *data*.

    Attributes discretized into a single interval (constants) are
    removed from the returned example table.

    Arguments: data
    Returns: table of examples with discretized attributes.
    """
    orange.setrandseed(0)
    discretized = orange.Preprocessor_discretize(
        data, method=orange.EntropyDiscretization())
    # Keep only attributes with more than one value, plus the class.
    keep = [a for a in discretized.domain.attributes if len(a.values) > 1]
    keep.append(discretized.domain.classVar)
    return discretized.select(keep)
def discretizeDomain(data, removeUnusedValues = 1, numberOfIntervals = 2):
    """Return *data* restricted to a fully discrete domain, or None for
    empty input.

    Continuous attributes are entropy-discretized when a discrete class
    exists, otherwise split into *numberOfIntervals* equal-frequency bins.
    A continuous class is equal-frequency discretized first. Attributes
    that cannot be discretized (e.g. all values missing) are dropped.

    FIXES vs. original: the bare ``except: pass`` swallowed every error,
    including real bugs; it is narrowed to orange.KernelException (what
    entropy discretization actually raises). RemoveUnusedValues may return
    None (all values unused); that case is now handled explicitly instead
    of relying on the swallowed AttributeError.
    """
    entroDisc = orange.EntropyDiscretization()
    equiDisc = orange.EquiNDiscretization(numberOfIntervals = numberOfIntervals)
    discAttrs = []
    className = data and len(data) > 0 and data.domain.classVar and data.domain.classVar.name or None
    # if className:
    #     data = data.filterref(orange.Filter_hasClassValue())  # remove examples with missing classes
    if not data or len(data) == 0:
        return None
    # if we have a continuous class we have to discretize it before we can discretize the attributes
    if className and data.domain.classVar.varType == orange.VarTypes.Continuous:
        newClass = equiDisc(data.domain.classVar.name, data)
        newClass.name = className
        newDomain = orange.Domain(data.domain.attributes, newClass)
        data = orange.ExampleTable(newDomain, data)
    for attr in data.domain.attributes:
        try:
            name = attr.name
            if attr.varType == orange.VarTypes.Continuous:
                # if continuous attribute then use entropy discretization
                if data.domain.classVar and data.domain.classVar.varType == orange.VarTypes.Discrete:
                    attr = entroDisc(attr, data)
                else:
                    attr = equiDisc(attr, data)
            if removeUnusedValues:
                attr = orange.RemoveUnusedValues(attr, data)
                if attr is None:
                    continue  # every value unused -> drop the attribute
            attr.name = name
            discAttrs.append(attr)
        except orange.KernelException:
            # if all values are missing, entropy discretization will throw an
            # exception. in such cases ignore the attribute
            continue
    if className:
        discAttrs.append(data.domain.classVar)
    return data.select(discAttrs)
def _prepare(self, t):
    # prepares an Orange table so that it doesn't contain continuous
    # attributes or missing values
    ### DISCRETIZE VARIABLES ###
    newatt = []
    oldatt = []
    entroD = orange.EntropyDiscretization()
    equiD = orange.EquiNDiscretization(numberOfIntervals=2)
    for i in t.domain.attributes:
        # varType == 2 is orange's continuous type; entropy-discretize,
        # falling back to 2-bin equal-frequency when entropy yields < 2 values.
        if i.varType == 2:
            d = entroD(i, t)
            if len(d.values) < 2:
                # prevent discretization into a single value
                d = equiD(i, t)
            d.name = 'E' + d.name
            warnings.warn('Discretizing %s into %s with %d values.' % (i.name, d.name, len(d.values)))
            newatt.append(d)
        else:
            oldatt.append(i)
    if len(newatt) > 0:
        t = t.select(oldatt + newatt + [t.domain.classVar])
    ### FIX MISSING VALUES ###
    special_attributes = []
    # 2006-08-23: fixed by PJ: append classVar only if it exists
    ## all_attributes = [i for i in t.domain.attributes]+[t.domain.classVar]
    all_attributes = [i for i in t.domain.attributes]
    if t.domain.classVar:
        all_attributes += [t.domain.classVar]
    # Record the indices of attributes that have at least one missing value.
    for i in range(len(all_attributes)):
        for j in t:
            if j[i].isSpecial():
                special_attributes.append(i)
                break
    # create new attributes
    if len(special_attributes) > 0:
        # prepare attributes: attributes with missing values get an extra
        # '.' value that stands for "missing".
        newatts = []
        for i in range(len(all_attributes)):
            old = all_attributes[i]
            if i in special_attributes:
                oldv = [v for v in old.values]
                assert ('.' not in oldv)
                new = orange.EnumVariable(name='M_' + old.name, values=oldv + ['.'])
                warnings.warn('Removing special values from %s into %s.' % (old.name, new.name))
                newatts.append(new)
            else:
                newatts.append(old)
        # convert table
        exs = []
        # 2006-08-23: added by PJ: add a class variable (if not already existing)
        if not t.domain.classVar:
            newatts.append(orange.EnumVariable("class", values=["."]))
            t = orange.ExampleTable(orange.Domain(t.domain.attributes, newatts[-1]), t)
        newd = orange.Domain(newatts)
        # Re-encode every example, mapping missing values to the '.' value.
        for ex in t:
            nex = []
            for i in range(len(newatts)):
                if ex[i].isSpecial():
                    v = newatts[i]('.')
                else:
                    v = newatts[i](int(ex[i]))
                nex.append(v)
            exs.append(orange.Example(newd, nex))
        t = orange.ExampleTable(exs)
    return t
# Description: Entropy based discretization compared to discretization with equal-frequency # of instances in intervals # Category: preprocessing # Uses: iris.tab # Classes: Preprocessor_discretize, EntropyDiscretization # Referenced: o_categorization.htm import orange def show_values(data, heading): print heading for a in data.domain.attributes: print "%s: %s" % ( a.name, reduce(lambda x, y: x + ', ' + y, [i for i in a.values])) data = orange.ExampleTable("iris") data_ent = orange.Preprocessor_discretize( data, method=orange.EntropyDiscretization()) show_values(data_ent, "Entropy based discretization") print data_n = orange.Preprocessor_discretize( data, method=orange.EquiNDiscretization(numberOfIntervals=3)) show_values(data_n, "Equal-frequency intervals")
def __init__(self, discr=None, learnr=None):
    """Store the discretization method and the base learner.

    discr  -- discretization object (default: a fresh
              orange.EntropyDiscretization())
    learnr -- learner to train on discretized data (default: a fresh
              orange.BayesLearner())

    FIX: the original created the defaults in the ``def`` line, so one
    shared EntropyDiscretization/BayesLearner instance was silently reused
    by every instance of this class (mutable-default pitfall). Defaults
    are now created per instance.
    """
    self.disc = discr if discr is not None else orange.EntropyDiscretization()
    self.learner = learnr if learnr is not None else orange.BayesLearner()
# Description: Attribute-based discretization. Shows how different attributes may be discretized with different categorization methods and how the default attribute values names used by these methods may be simply replaced by the list of user-defined names. # Category: preprocessing # Uses: iris # Classes: EquiNDiscretization, EntropyDiscretization # Referenced: o_categorization.htm def printexamples(data, inxs, msg="%i examples"): print msg % len(inxs) for i in inxs: print i, data[i] print import orange iris = orange.ExampleTable("iris") equiN = orange.EquiNDiscretization(numberOfIntervals=4) entropy = orange.EntropyDiscretization() pl = equiN("petal length", iris) sl = equiN("sepal length", iris) pl.values = sl.values = ["very low", "low", "high", "very high"] sl_ent = entropy("sepal length", iris) inxs = [0, 15, 35, 50, 98] d_iris = iris.select( ["sepal width", pl, "sepal length", sl, sl_ent, iris.domain.classVar]) printexamples(iris, inxs, "%i examples before discretization") printexamples(d_iris, inxs, "%i examples before discretization")
def computeDiscretizer(self, i, idx, onlyDefaults=False):
    """Build (or rebuild) the discretizer for attribute *idx* according to
    its per-attribute settings, store it in self.discretizers, and refresh
    the list label / graph splits in the GUI.

    i   -- position of the attribute in the displayed list
    idx -- index of the attribute in self.data.domain
    onlyDefaults -- when True, only recompute attributes that follow the
                    global default method
    """
    attr = self.data.domain[idx]
    indiData = self.indiData[idx]
    # indiData[:2] holds (method index, interval count) for this attribute;
    # a method index of 0 means "use the global default".
    discType, intervals = indiData[:2]
    discName = self.shortDiscNames[discType]
    defaultUsed = not discType
    if defaultUsed:
        discType = self.discretization+1
        intervals = self.intervals
    # Custom (manual cut-point) methods: parse the stored cut points; fall
    # back to the global default when none parse.
    if discType >= self.D_N_METHODS + 1:
        try:
            customs = [float(r) for r in indiData[discType-self.D_N_METHODS+1]]
        except:
            customs = []
        if not customs:
            discType = self.discretization+1
            intervals = self.intervals
            discName = "%s ->%s)" % (self.shortDiscNames[indiData[0]][:-1], self.shortDiscNames[discType][2:-1])
            defaultUsed = True
    if onlyDefaults and not defaultUsed:
        return
    discType -= 1
    # None = leave continuous; False = remove / failed to discretize.
    try:
        if discType == self.D_LEAVE:  # leave continuous
            discretizer = None
        elif discType == self.D_ENTROPY:
            discretizer = orange.EntropyDiscretization(attr, self.data)
        elif discType == self.D_FREQUENCY:
            discretizer = orange.EquiNDiscretization(attr, self.data, numberOfIntervals = intervals)
        elif discType == self.D_WIDTH:
            discretizer = orange.EquiDistDiscretization(attr, self.data, numberOfIntervals = intervals)
        elif discType == self.D_REMOVE:
            discretizer = False
        else:
            discretizer = orange.IntervalDiscretizer(points = customs).constructVariable(attr)
    except:
        # Any failure in orange is shown to the user as "<can't discretize>".
        discretizer = False
    self.discretizers[idx] = discretizer
    # Build the label suffix listing the resulting cut points.
    if discType == self.D_LEAVE:
        discInts = ""
    elif discType == self.D_REMOVE:
        discInts = ""
    elif not discretizer:
        discInts = ": "+"<can't discretize>"
    else:
        points = discretizer.getValueFrom.transformer.points
        discInts = points and (": " + ", ".join([str(attr(x)) for x in points])) or ": "+"<removed>"
    self.indiLabels[i] = discInts + discName
    self.attrList.reset()
    # Keep the preview graph in sync with the selected attribute's splits.
    if i == self.selectedAttr:
        self.graph.setSplits(discretizer and discretizer.getValueFrom.transformer.points or [])
def __init__(self, parent=None, signalManager=None, name="Preprocess"):
    """Build the Preprocess widget UI: a preprocessor list, a saved-schema
    list with filtering, and the output/commit controls.

    FIX: the dataChanged handler was ``lambda arg1, arg2: self.commitIf``,
    which evaluated to the bound method without calling it (a no-op); it
    now actually invokes self.commitIf().
    """
    OWWidget.__init__(self, parent, signalManager, name)

    self.inputs = [("Example Table", ExampleTable, self.setData)]  #, ("Learner", orange.Learner, self.setLearner)]
    self.outputs = [("Preprocess", orngWrap.PreprocessedLearner),
                    ("Preprocessed Example Table", ExampleTable)]  #, ("Preprocessor", orange.Preprocessor)]

    self.autoCommit = False
    self.changedFlag = False

    # self.allSchemas = [PreprocessorSchema("Default" , [Preprocessor_discretize(method=orange.EntropyDiscretization()), Preprocessor_dropMissing()])]
    self.allSchemas = [("Default", [
        Preprocessor_discretizeEntropy(method=orange.EntropyDiscretization()),
        Preprocessor_dropMissing()
    ], 0)]
    self.lastSelectedSchemaIndex = 0

    self.preprocessorsList = PyListModel([], self)

    # --- Preprocessors list box ---
    box = OWGUI.widgetBox(self.controlArea, "Preprocessors", addSpace=True)
    box.layout().setSpacing(1)
    self.setStyleSheet("QListView::item { margin: 1px;}")
    self.preprocessorsListView = QListView()
    self.preprocessorsListSelectionModel = ListSingleSelectionModel(
        self.preprocessorsList, self)
    self.preprocessorsListView.setItemDelegate(PreprocessorItemDelegate(self))
    self.preprocessorsListView.setModel(self.preprocessorsList)
    self.preprocessorsListView.setSelectionModel(
        self.preprocessorsListSelectionModel)
    self.preprocessorsListView.setSelectionMode(QListView.SingleSelection)
    self.connect(self.preprocessorsListSelectionModel,
                 SIGNAL("selectedIndexChanged(QModelIndex)"),
                 self.onPreprocessorSelection)
    # Commit (if auto-commit is on) whenever the list data changes.
    self.connect(self.preprocessorsList,
                 SIGNAL("dataChanged(QModelIndex, QModelIndex)"),
                 lambda arg1, arg2: self.commitIf())
    box.layout().addWidget(self.preprocessorsListView)

    # Add/remove buttons for the preprocessor list.
    self.addPreprocessorAction = QAction("+", self)
    self.addPreprocessorAction.pyqtConfigure(
        toolTip="Add a new preprocessor to the list")
    self.removePreprocessorAction = QAction("-", self)
    self.removePreprocessorAction.pyqtConfigure(
        toolTip="Remove selected preprocessor from the list")
    self.removePreprocessorAction.setEnabled(False)
    # Enable "remove" only while a row is selected.
    self.connect(self.preprocessorsListSelectionModel,
                 SIGNAL("selectedIndexChanged(QModelIndex)"),
                 lambda index: self.removePreprocessorAction.setEnabled(index.isValid()))

    actionsWidget = ModelActionsWidget(
        [self.addPreprocessorAction, self.removePreprocessorAction])
    actionsWidget.layout().setSpacing(1)
    actionsWidget.layout().addStretch(10)
    box.layout().addWidget(actionsWidget)

    self.connect(self.addPreprocessorAction, SIGNAL("triggered()"),
                 self.onAddPreprocessor)
    self.connect(self.removePreprocessorAction, SIGNAL("triggered()"),
                 self.onRemovePreprocessor)

    # --- Saved schemas box (filterable list of named preprocessor sets) ---
    box = OWGUI.widgetBox(self.controlArea, "Saved Schemas", addSpace=True)
    self.schemaFilterEdit = OWGUIEx.LineEditFilter(self)
    box.layout().addWidget(self.schemaFilterEdit)

    self.schemaList = PyListModel([], self,
                                  flags=Qt.ItemIsSelectable | Qt.ItemIsEditable | Qt.ItemIsEnabled)
    self.schemaListProxy = PySortFilterProxyModel(filter_fmt="{0.name}",
                                                  parent=self)
    self.schemaListProxy.setFilterCaseSensitivity(Qt.CaseInsensitive)
    self.schemaListProxy.setSourceModel(self.schemaList)
    self.schemaListView = QListView()
    self.schemaListView.setItemDelegate(PreprocessorSchemaDelegate(self))
    # self.schemaListView.setModel(self.schemaList)
    self.schemaListView.setModel(self.schemaListProxy)
    self.connect(self.schemaFilterEdit, SIGNAL("textEdited(QString)"),
                 self.schemaListProxy.setFilterRegExp)
    box.layout().addWidget(self.schemaListView)

    self.schemaListSelectionModel = ListSingleSelectionModel(
        self.schemaListProxy, self)
    self.schemaListView.setSelectionMode(QListView.SingleSelection)
    self.schemaListView.setSelectionModel(self.schemaListSelectionModel)
    self.connect(self.schemaListSelectionModel,
                 SIGNAL("selectedIndexChanged(QModelIndex)"),
                 self.onSchemaSelection)

    # Add/update/remove buttons for the schema list.
    self.addSchemaAction = QAction("+", self)
    self.addSchemaAction.pyqtConfigure(toolTip="Add a new preprocessor schema")
    self.updateSchemaAction = QAction("Update", self)
    self.updateSchemaAction.pyqtConfigure(
        toolTip="Save changes made in the current schema")
    self.removeSchemaAction = QAction("-", self)
    self.removeSchemaAction.pyqtConfigure(toolTip="Remove selected schema")

    self.updateSchemaAction.setEnabled(False)
    self.removeSchemaAction.setEnabled(False)

    actionsWidget = ModelActionsWidget([])
    actionsWidget.addAction(self.addSchemaAction)
    actionsWidget.addAction(self.updateSchemaAction).setSizePolicy(
        QSizePolicy.MinimumExpanding, QSizePolicy.Fixed)
    actionsWidget.addAction(self.removeSchemaAction)
    actionsWidget.layout().setSpacing(1)
    box.layout().addWidget(actionsWidget)

    self.connect(self.addSchemaAction, SIGNAL("triggered()"), self.onAddSchema)
    self.connect(self.updateSchemaAction, SIGNAL("triggered()"), self.onUpdateSchema)
    self.connect(self.removeSchemaAction, SIGNAL("triggered()"), self.onRemoveSchema)

    # Menu actions for each available preprocessor type; pp/kwargs are bound
    # as lambda defaults to avoid the late-binding closure pitfall.
    self.addPreprocessorsMenuActions = actions = []
    for name, pp, kwargs in self.preprocessors:
        action = QAction(name, self)
        self.connect(action, SIGNAL("triggered()"),
                     lambda pp=pp, kwargs=kwargs: self.addPreprocessor(pp(**kwargs)))
        actions.append(action)

    # --- Output / commit controls ---
    box = OWGUI.widgetBox(self.controlArea, "Output")
    cb = OWGUI.checkBox(box, self, "autoCommit", "Commit on any change",
                        callback=self.commitIf)
    b = OWGUI.button(box, self, "Commit", callback=self.commit)
    OWGUI.setStopper(self, b, cb, "changedFlag", callback=self.commitIf)

    self.mainAreaStack = QStackedLayout()
    self.stackedEditorsCache = {}

    OWGUI.widgetBox(self.mainArea, orientation=self.mainAreaStack)

    self.data = None
    self.learner = None

    self.loadSettings()
    self.activateLoadedSettings()
def __init__(self, method=None):
    """Store the entropy discretization method.

    method -- an orange.EntropyDiscretization instance; when None, a fresh
              instance is created per object.

    Raises TypeError if *method* is not an orange.EntropyDiscretization.

    FIXES vs. original: the default was created once in the ``def`` line
    and shared by all instances; and validation used ``assert``, which is
    stripped under ``python -O``. Validation now happens before assignment.
    """
    if method is None:
        method = orange.EntropyDiscretization()
    if not isinstance(method, orange.EntropyDiscretization):
        raise TypeError("method must be an orange.EntropyDiscretization instance")
    self.method = method
# Description: Shows how usage of different classes for discretization, including manual discretization # Category: discretization, categorization, preprocessing # Classes: EntropyDiscretization, EquiDistDiscretization, BiModalDiscretization, Discretization, IntervalDiscretizer, Discretizer, BiModalDiscretizer # Uses: iris # Referenced: discretization.htm import orange data = orange.ExampleTable("iris") print "\nEntropy discretization, first 10 examples" sep_w = orange.EntropyDiscretization("sepal width", data) data2 = data.select([data.domain["sepal width"], sep_w, data.domain.classVar]) for ex in data2[:10]: print ex print "\nDiscretized attribute:", sep_w print "Continuous attribute:", sep_w.getValueFrom.whichVar print "Cut-off points:", sep_w.getValueFrom.transformer.points print "\nManual construction of IntervalDiscretizer - single attribute" idisc = orange.IntervalDiscretizer(points=[3.0, 5.0]) sep_l = idisc.constructVariable(data.domain["sepal length"]) data2 = data.select([data.domain["sepal length"], sep_l, data.domain.classVar]) for ex in data2[:10]: print ex print "\nManual construction of IntervalDiscretizer - all attributes" idisc = orange.IntervalDiscretizer(points=[3.0, 5.0]) newattrs = [idisc.constructVariable(attr) for attr in data.domain.attributes]
def __call__(self, data, weight=None):
    """Train naive Bayes on an entropy-discretized copy of *data* and
    wrap the result in a Classifier."""
    method = orange.EntropyDiscretization()
    table = orange.Preprocessor_discretize(data, method=method)
    return Classifier(classifier=orange.BayesLearner(table, weight))
def __call__(self, data, targetClass, max_rules=0):
    '''Returns the Apriori-C classifier.

    data        -- orange ExampleTable; continuous attributes are
                   entropy-discretized first.
    targetClass -- class value the induced rules must predict.
    max_rules   -- maximum number of rules to keep (0 = unlimited).
    '''
    data_discretized = False
    # If any of the attributes are continuous, discretize them
    if data.domain.hasContinuousAttributes():
        original_data = data
        data_discretized = True
        new_domain = []
        discretize = orange.EntropyDiscretization(forceAttribute=True)
        for attribute in data.domain.attributes:
            if attribute.varType == orange.VarTypes.Continuous:
                d_attribute = discretize(attribute, data)
                # An attribute is irrelevant, if it is discretized into a single interval
                # if len(d_attribute.getValueFrom.transformer.points) > 0:
                new_domain.append(d_attribute)
            else:
                new_domain.append(attribute)
        data = original_data.select(new_domain + [original_data.domain.classVar])
    self.data = data
    self.rulesSD = []
    # build association classification rules
    rules = orange.AssociationRulesInducer(data, support=self.minSup, classificationRules=1, maxItemSets=10000000)
    #_______________________________ post-processing step 1
    # select rules that classify in the target class: build a template
    # example that is "don't know" everywhere except the class value.
    right = orange.Example(data.domain, [orange.Value(orange.VarTypes.Discrete, orange.ValueTypes.DK)] * len(data.domain))
    right.setclass(targetClass)
    rules = rules.filter(lambda rule: rule.right == right)
    # select rules with confidence >= minConfidence
    rules = rules.filter(lambda rule: rule.confidence >= self.minConf)
    #________________________________ post processing step 2
    # weighted covering
    self.data.addMetaAttribute(self.weightID)  # set weights of all examples to 1
    bestRuleWRacc = 100
    # Greedily pick the best remaining rule and down-weight the examples it
    # covers, until nothing useful remains or the rule budget is spent.
    while len(rules) > 0 and self.uncoveredExamples() > 0 and bestRuleWRacc > 0 and (max_rules == 0 or len(self.rulesSD) < max_rules):
        (bestRule, bestRuleWRacc) = self.findBestRule(rules)
        rules.remove(bestRule)
        self.removeSimilarRules(bestRule, rules)
        self.decreaseExampleWeights(bestRule)
        self.rulesSD.append(bestRule)
    #____________________________ transform rules to SD format
    beam = []
    targetClassRule = SDRule(data, targetClass, conditions=[], g=1)
    for r in self.rulesSD:
        # Convert each association-rule antecedent into SD value filters,
        # skipping "don't care" positions.
        cond = []
        for i in range(len(r.left)):
            if not orange.Value.is_DC(r.left[i]):
                cond.append(
                    orange.ValueFilter_discrete(
                        position=i,
                        values=[
                            orange.Value(data.domain.attributes[i], r.left[i])
                        ]))
        rSD = SDRule(data, targetClass, cond)
        beam.append(rSD)
    if data_discretized:
        targetClassRule = SDRule(original_data, targetClass, conditions=[], g=1)
        # change beam so the rules apply to original data
        beam = [rule.getUndiscretized(original_data) for rule in beam]
    else:
        targetClassRule = SDRule(data, targetClass, conditions=[], g=1)
    return SDRules(beam, targetClassRule, "Apriori-SD")
# Description: Shows how to assess the quality of attributes not in the dataset # Category: attribute quality # Classes: EntropyDiscretization, MeasureAttribute, MeasureAttribute_info # Uses: iris # Referenced: MeasureAttribute.htm import orange data = orange.ExampleTable("iris") d1 = orange.EntropyDiscretization("petal length", data) print orange.MeasureAttribute_relief(d1, data) meas = orange.MeasureAttribute_relief() for t in meas.thresholdFunction("petal length", data): print "%5.3f: %5.3f" % t thresh, score, distr = meas.bestThreshold("petal length", data) print "\nBest threshold: %5.3f (score %5.3f)" % (thresh, score)
class TestDiscretizeEntropy(testing.PreprocessorTestCase):
    # Run the shared PreprocessorTestCase suite with entropy discretization
    # as the preprocessor under test.
    PREPROCESSOR = Preprocessor_discretize(
        method=orange.EntropyDiscretization())
# Description: Entropy based discretization compared to discretization with equal-frequency # of instances in intervals # Category: preprocessing # Uses: wdbc.tab # Classes: Preprocessor_discretize, EntropyDiscretization # Referenced: o_categorization.htm import orange def show_values(data, heading): for a in data.domain.attributes: print "%s/%d: %s" % (a.name, len(a.values), reduce(lambda x,y: x+', '+y, [i for i in a.values])) data = orange.ExampleTable("../datasets/wdbc") print '%d features in original data set, discretized:' % len(data.domain.attributes) data_ent = orange.Preprocessor_discretize(data, method=orange.EntropyDiscretization()) show_values(data_ent, "Entropy based discretization") print '\nFeatures with sole value after discretization:' for a in data_ent.domain.attributes: if len(a.values)==1: print a.name import orngDisc reload(orngDisc) data_ent2 = orngDisc.entropyDiscretization(data) print '%d features after removing features discretized to a constant value' % len(data_ent2.domain.attributes)