def test_normalize(self):
    """normalize() divides each frequency by the total count (150 for iris)."""
    data = orange.ExampleTable("iris")
    dist = orange.Distribution(0, data)
    original = orange.Distribution(dist)  # untouched copy for comparison
    dist.normalize()
    self.assertTrue(all(normed == raw / 150
                        for normed, raw in zip(dist.values(), original.values())))
def test_discrete(self):
    """RandomClassifier: construction variants, per-example determinism, errors."""
    data = orange.ExampleTable("iris")
    class_dist = orange.Distribution(data.domain.class_var, data)

    # built from the variable alone
    rc = orange.RandomClassifier(data.domain.class_var)
    self.assertEqual(rc.probabilities.variable, rc.class_var)

    # built from a distribution: the classifier adopts it (same object)
    rc2 = orange.RandomClassifier(None, class_dist)
    self.assertEqual(rc2.class_var, data.domain.class_var)
    self.assertEqual(rc2.probabilities.variable, rc2.class_var)
    self.assertEqual(id(rc2.probabilities), id(class_dist))
    self.assertTrue(all(freq == 50 for freq in rc2.probabilities))

    for clf in [rc, rc2]:
        # repeated calls on the same example always give the same answer ...
        for ex in data[0:150:20]:
            answers = set()
            for _ in range(5):
                answers.add(clf(ex))
            self.assertEqual(len(answers), 1)
        # ... but across the whole table every class appears
        answers = set()
        for ex in data:
            answers.add(clf(ex))
        self.assertEqual(len(answers), 3)

    for ex in data[0:150:20]:
        self.assertTrue(all(p == 50
                            for p in rc2(ex, orange.Classifier.GetProbabilities)))

    # invalid constructions
    self.assertRaises(TypeError, orange.RandomClassifier, class_dist)
    self.assertRaises(ValueError, orange.RandomClassifier, None,
                      orange.DiscDistribution())
    self.assertRaises(ValueError, orange.RandomClassifier, data.domain[1],
                      orange.Distribution(data.domain[0]))
def test_pickle(self):
    """ConstantClassifier survives a pickle round-trip, for a continuous
    attribute and for the discrete class variable, before and after
    changing default_val."""
    import pickle
    data = orange.ExampleTable("iris")
    for variable, new_default in ((data.domain[0], 42),
                                  (data.domain.class_var, 1)):
        dist = orange.Distribution(variable, data)
        clf = orange.ConstantClassifier(dist)
        clone = pickle.loads(pickle.dumps(clf))
        self.assertEqual(clf.class_var, clone.class_var)
        self.assertEqual(clf.default_val, clone.default_val)
        self.assertEqual(clf.default_distribution, clone.default_distribution)
        # an updated default value must also round-trip
        clf.default_val = new_default
        clone = pickle.loads(pickle.dumps(clf))
        self.assertEqual(clf.default_val, clone.default_val)
def test_hash(self):
    """Equal discrete distributions hash equal; a changed cell changes the hash."""
    data = orange.ExampleTable("zoo")
    by_name = orange.Distribution("type", data)
    by_var = orange.Distribution(data.domain.classVar, data)
    self.assertEqual(hash(by_name), hash(by_var))
    by_var[0] += 1
    self.assertNotEqual(hash(by_name), hash(by_var))
def t2est_discrete(self):
    """ConstantClassifier with a discrete class (name mangled to disable the
    test; drop the '2' to re-enable).

    NOTE(review): fixed a copy-paste bug -- the cc4 block compared
    cc4.default_distribution.variable against cc3.class_var instead of
    cc4.class_var; also removed an unused `anss = set()` in the cc4 loop.
    """
    d = orange.ExampleTable("iris")
    dist = orange.Distribution(d.domain.class_var, d)

    # from the variable alone: a fresh default distribution is created
    cc = orange.ConstantClassifier(d.domain.class_var)
    self.assertEqual(cc.class_var, d.domain.class_var)
    self.assertEqual(cc.default_distribution.variable, cc.class_var)

    # from a distribution alone: the distribution object is adopted as-is
    cc2 = orange.ConstantClassifier(dist)
    self.assertEqual(cc2.class_var, d.domain.class_var)
    self.assertEqual(cc2.default_distribution.variable, cc2.class_var)
    self.assertEqual(id(cc2.default_distribution), id(dist))
    self.assertTrue(all(x == 50 for x in cc2.default_distribution))

    # variable + distribution, no default value
    cc3 = orange.ConstantClassifier(d.domain.class_var, None, dist)
    self.assertEqual(cc3.class_var, d.domain.class_var)
    self.assertEqual(cc3.default_distribution.variable, cc3.class_var)
    self.assertEqual(id(cc3.default_distribution), id(dist))
    self.assertTrue(all(x == 50 for x in cc3.default_distribution))

    # variable + explicit default value + distribution
    cc4 = orange.ConstantClassifier(d.domain.class_var, "Iris-setosa", dist)
    self.assertEqual(cc4.class_var, d.domain.class_var)
    self.assertEqual(cc4.default_distribution.variable, cc4.class_var)  # was cc3
    self.assertEqual(id(cc4.default_distribution), id(dist))
    self.assertTrue(all(x == 50 for x in cc4.default_distribution))

    for cl in [cc, cc2, cc3]:
        # repeated calls on one example are deterministic ...
        for e in d[0:150:20]:
            anss = set()
            for i in range(5):
                anss.add(cl(e))
            self.assertEqual(len(anss), 1)
        # ... and all three classes appear over the whole table
        anss = set()
        for e in d:
            anss.add(cl(e))
        self.assertEqual(len(anss), 3)

    # with an explicit default value the prediction is constant
    for e in d[0:150:20]:
        for i in range(5):
            self.assertEqual(cc4(e), "Iris-setosa")

    for cl in [cc2, cc3, cc4]:
        for e in d[0:150:20]:
            self.assertTrue(all(
                x == 50 for x in cl(e, orange.Classifier.GetProbabilities)))

    # invalid constructions
    self.assertRaises(TypeError, orange.ConstantClassifier,
                      d.domain.class_var, dist)
    self.assertRaises(ValueError, orange.ConstantClassifier, None, "?",
                      orange.DiscDistribution())
    self.assertRaises(ValueError, orange.ConstantClassifier, d.domain[1], "?",
                      orange.Distribution(d.domain[0]))

    # default_distribution accepts a plain list, at assignment ...
    cc4.default_distribution = [50, 50, 50]
    self.assertEqual(list(cc4.default_distribution), [50, 50, 50])
    # ... and at construction
    cc5 = orange.ConstantClassifier(d.domain.class_var, "Iris-setosa",
                                    [50, 50, 50])
    self.assertEqual(list(cc5.default_distribution), [50, 50, 50])
def test():
    """Print entropy/mutual-information figures for two attributes and the
    class of the module-level `data` table (py2; uses module helpers
    _entropy, p2f, joint_entropy, mutual_information)."""
    x = data.domain.attributes[1]
    y = data.domain.attributes[2]
    c = data.domain.classVar
    # single-argument parenthesized print: identical output under Python 2
    print("H(%s) = %5.5f" % (x.name, _entropy(p2f(orange.Distribution(x, data)))))
    print("H(%s) = %5.5f" % (y.name, _entropy(p2f(orange.Distribution(y, data)))))
    print("H(%s,%s)= %5.5f" % (x.name, y.name, joint_entropy(x, y, data)))
    print("I(%s;%s)= %5.5f" % (x.name, y.name, mutual_information(x, y, data)))
    print("H(%s|%s)= %5.5f" % (x.name, c.name, mutual_information(x, c, data)))
    print("InfoGain = %5.5f" % orange.MeasureAttribute_info(x, data))
def test_construction(self):
    """DiscDistribution builds from a frequency list, not from (value, freq) pairs."""
    data = orange.ExampleTable("zoo")
    self.assertRaises(TypeError, orange.DiscDistribution,
                      zip(data.domain["type"].values, self.freqs))
    from_data = orange.Distribution("type", data)
    from_freqs = orange.DiscDistribution(self.freqs)
    self.assertEqual(from_data, from_freqs)
    # constructing from a discrete variable yields a DiscDistribution
    empty = orange.Distribution(data.domain.classVar)
    self.assertTrue(isinstance(empty, orange.DiscDistribution))
def test_hash(self):
    """Continuous distribution hashing tracks cell edits and new keys."""
    data = orange.ExampleTable("iris")
    dist = orange.Distribution("sepal length", data)
    copy = orange.Distribution(dist)
    self.assertEqual(hash(dist), hash(copy))
    # bump one frequency: hashes diverge
    copy[4.4] += 1
    self.assertNotEqual(hash(dist), hash(copy))
    # undo the bump: hashes agree again
    copy[4.4] -= 1
    self.assertEqual(hash(dist), hash(copy))
    # a brand-new key also changes the hash
    copy[42] = 2011
    self.assertNotEqual(hash(dist), hash(copy))
def test_fromExamples(self):
    """The class distribution is the same whether addressed by name,
    variable, positive index, -1, or the get_class_distribution helper."""
    data = orange.ExampleTable("zoo")
    by_name = orange.Distribution("type", data)
    by_var = orange.Distribution(data.domain.classVar, data)
    self.assertEqual(by_name, by_var)
    by_index = orange.Distribution(len(data.domain.attributes), data)
    self.assertEqual(by_name, by_index)
    by_neg_index = orange.Distribution(-1, data)
    self.assertEqual(by_name, by_neg_index)
    by_helper = orange.get_class_distribution(data)
    self.assertEqual(by_name, by_helper)
def test_classAttr_cont(self):
    # Contingency of (class, continuous attribute 0) on iris: checks inner/
    # outer distributions, a pickle round-trip, normalization, incremental
    # updates, and indexing errors.
    d = orange.ExampleTable("iris")
    cd = orange.get_class_distribution(d)  # outer (class) distribution
    ad = orange.Distribution(0, d)         # inner (attribute) distribution
    cont = orange.ContingencyClassAttr(0, d)
    fv = cont[0].keys()[0]  # an attribute value observed with class 0 (py2 keys())
    self.assertEqual(cont.inner_distribution, ad)
    self.assertEqual(cont.outer_distribution, cd)
    self.assertEqual(len(cont), len(cd))
    # pickling must preserve both distributions, both variables and the rows
    s = pickle.dumps(cont)
    cont2 = pickle.loads(s)
    self.assertEqual(cont.innerDistribution, cont2.innerDistribution)
    self.assertEqual(cont.innerVariable, cont2.innerVariable)
    self.assertEqual(cont.outerDistribution, cont2.outerDistribution)
    self.assertEqual(cont.outerVariable, cont2.outerVariable)
    self.assertEqual(cont[0], cont2[0])
    cont.normalize()
    # after normalize() each per-class row sums to 1
    self.assertAlmostEqual(sum(cont.p_attr(0).values()), 1.0)
    # p_attr(class)[value] and p_attr(value, class) spell the same number;
    # the unnormalized copy must agree after dividing by the row total
    self.assertEqual(cont.p_attr(0)[fv], cont.p_attr(fv, 0))
    self.assertEqual(cont.p_attr(0)[fv],
                     cont2.p_attr(fv, 0) / cont2.p_attr(0).abs)
    x = cont[0][0]
    cont.add_var_class(0, 0, 0.5)  # add weight 0.5 to cell (value 0, class 0)
    self.assertEqual(x + 0.5, cont[0][0])
    # single-index row lookup equals the two-index cell lookup's row
    self.assertEqual(cont[fv][0], cont[fv, 0])
    # unknown values are not valid contingency indices
    with self.assertRaises(IndexError):
        cont["?"]
def test_random(self):
    """1000 draws from the class distribution hit every class value."""
    data = orange.ExampleTable("zoo")
    dist = orange.Distribution("type", data)
    drawn = set()
    for _ in range(1000):
        drawn.add(int(dist.random()))
    self.assertEqual(drawn, set(range(len(data.domain.classVar.values))))
def test_continuous(self):
    """RandomClassifier over a continuous class variable."""
    data = orange.ExampleTable("iris")
    # drop the discrete class: the last attribute becomes a continuous class
    stripped = orange.Domain(data.domain.attributes)
    data = orange.ExampleTable(stripped, data)
    self.assertEqual(data.domain.class_var.var_type,
                     orange.Variable.Type.Continuous)

    dist = orange.Distribution(data.domain.class_var, data)

    # without a distribution the classifier cannot predict
    rc = orange.RandomClassifier(data.domain.class_var)
    self.assertEqual(rc.class_var, data.domain.class_var)
    self.assertEqual(rc.probabilities.variable, rc.class_var)
    self.assertRaises(ValueError, rc, data[0])

    rc2 = orange.RandomClassifier(None, dist)
    self.assertEqual(rc2.class_var, data.domain.class_var)
    self.assertEqual(rc2.probabilities.variable, rc2.class_var)
    self.assertEqual(id(rc2.probabilities), id(dist))

    # deterministic per example ...
    for ex in data[0:150:20]:
        seen = set()
        for _ in range(5):
            seen.add(rc2(ex))
        self.assertEqual(len(seen), 1)
    # ... with many distinct predictions across the table
    seen = set()
    for ex in data:
        seen.add(rc2(ex))
    self.assertGreater(len(seen), 10)
def filterAndStore(self): self.examples = self.filter(self.data) # set examples self.classifier = self.learner(self.examples) # set classifier distribution = [0.0] * len(self.data.domain.classVar.values) self.complexity = len(self.filter.conditions) # set rule complexity if len(self.examples) > 0: for d in self.examples: distribution[int(d.getclass())] += 1 distribution = map(lambda d: d / len(self.examples), distribution) self.classDistribution = orange.Distribution( distribution) # set distribution self.TP = self.examples.filter( {self.examples.domain.classVar: self.targetClass}) self.FP = self.examples.filter( {self.examples.domain.classVar: self.targetClass}, negate=1) # self.TP = filter(lambda e: e.getclass()==self.targetClass, self.examples) # True positives # self.FP = filter(lambda e: e.getclass()!=self.targetClass, self.examples) # flase positives TPlen = len(self.TP) * 1.0 self.quality = TPlen / (len( self.FP) + self.g) # set rule quality: generalization quocient self.support = 1.0 * len(self.examples) / len( self.data) # set rule support self.confidence = TPlen / len(self.examples) else: self.classDistribution = distribution self.TP = [] self.FP = [] self.quality = 0 # set rule quality: generalization kvocient self.support = 0 # set rule support
def err(condDist, att, value, targetClass, priorError, data):
    """Standard error of the class-probability estimate for att == value.

    Relies on the module-level constant `aproxZero` and `math`.
    """
    total = sum(condDist)
    hits = condDist[targetClass]
    attDist = orange.Distribution(att, data)
    # binomial-style information term, clamped away from zero
    info = attDist[value] * (hits / total) * (1 - hits / total)
    info = max(info, aproxZero)
    # variance cannot be negative
    variance = max(1 / info - priorError * priorError, 0)
    return math.sqrt(variance)
def test_random(self):
    """random() draws values present in the distribution and is not constant."""
    data = orange.ExampleTable("iris")
    dist = orange.Distribution(0, data)
    self.assertTrue(dist.random() in dist.keys())
    draws = set()
    for _ in range(100):
        draws.add(float(dist.random()))
    self.assertTrue(len(draws) > 1)
def __call__(self, data, weight=0):
    """Fit logistic betas on `data` plus the stored anchor examples, then
    estimate a zero-point offset by refitting with each attribute group
    removed in turn.

    Returns (self.OK, beta, beta_se, 0); beta_se is all zeros here.
    NOTE(review): debugging prints left in place; `array`, `concatenate`,
    `reduce` and `math` are assumed to be imported at module level.
    """
    (X, y) = self.createArrayData(data)
    # collect the anchor examples into one table in this data's domain
    exTable = orange.ExampleTable(data.domain)
    for id, ex in self.anch_examples:
        exTable.extend(orange.ExampleTable(ex, data.domain))
    (X_anch, y_anch) = self.createArrayData(exTable)
    betas = array([0.0] * (len(data.domain.attributes) + 1))
    likelihood, betas = self.estimateBeta(X, y, betas, [0] * (len(betas)),
                                          X_anch, y_anch)
    # get attribute groups atGroup = [(startIndex, number of values), ...)
    ats = data.domain.attributes
    # tag each attribute with whether it stems from a different source
    # variable than its predecessor (getValueFrom.whichVar groups dummies)
    atVec = reduce(
        lambda x, y: x + [(y, not y == x[-1][0])],
        [a.getValueFrom and a.getValueFrom.whichVar or a for a in ats],
        [(ats[0].getValueFrom and ats[0].getValueFrom.whichVar or ats[0],
          0)])[1:]
    atGroup = [[0, 0]]
    for v_i, v in enumerate(atVec):
        if v[1] == 0:
            atGroup[-1][1] += 1
        else:
            atGroup.append([v_i, 1])
    # compute zero values for attributes
    sumB = 0.
    for ag in atGroup:
        # drop this group's columns; column 0 (intercept) is always kept
        X_temp = concatenate((X[:, :ag[0] + 1], X[:, ag[0] + 1 + ag[1]:]), 1)
        if X_anch:
            X_anch_temp = concatenate(
                (X_anch[:, :ag[0] + 1], X_anch[:, ag[0] + 1 + ag[1]:]), 1)
        else:
            X_anch_temp = X_anch
        ##        print "1", concatenate((betas[:i+1],betas[i+2:]))
        ##        print "2", betas
        likelihood_temp, betas_temp = self.estimateBeta(
            X_temp, y,
            concatenate((betas[:ag[0] + 1], betas[ag[0] + ag[1] + 1:])),
            [0] + [1] * (len(betas) - 1 - ag[1]), X_anch_temp, y_anch)
        print "finBetas", betas, betas_temp
        print "betas", betas[0], betas_temp[0]
        # accumulate the intercept shift caused by removing this group
        sumB += betas[0] - betas_temp[0]
    apriori = orange.Distribution(data.domain.classVar, data)
    aprioriProb = apriori[0] / apriori.abs
    print "koncni rezultat", sumB, math.log(
        (1 - aprioriProb) / aprioriProb), betas[0]
    beta = []
    beta_se = []
    print "likelihood2", likelihood
    # copy betas out; standard errors are not computed here
    for i in range(len(betas)):
        beta.append(betas[i])
        beta_se.append(0.0)
    return (self.OK, beta, beta_se, 0)
def __init__(self, rules=None, examples=None, weightID=0, **argkw):
    """Store the rule list and examples; when examples are given, also cache
    their class variable and the prior class distribution."""
    self.rules = rules
    self.examples = examples
    self.weightID = weightID
    if examples is not None:
        self.classVar = examples.domain.classVar
    else:
        self.classVar = None
    # extra keyword arguments become attributes
    self.__dict__.update(argkw)
    if examples is not None:
        self.prior = orange.Distribution(examples.domain.classVar, examples)
def entropy(x, data):
    """Entropy of an attribute x from dataset data.

    x may be a single EnumVariable, or a list of attributes; for a pair the
    joint entropy is computed from their contingency. Larger sets are not
    implemented yet (falls through and returns None).
    """
    if type(x) == orange.EnumVariable:
        return _entropy(p2f(orange.Distribution(x, data)))
    if type(x) == list:
        if len(x) == 2:
            # joint entropy of a pair of attributes
            # BUGFIX: the pair elements were referenced as undefined names
            # (x, y) instead of x[0], x[1], raising NameError
            c = orange.ContingencyAttrAttr(x[0], x[1], data)
            return _entropy(p2f(flatten(c)))
        else:
            # joint entropy for a set of attributes -- not implemented
            pass
def add_sub_rules(rules, examples, weight, learner, dists):
    """Extend `rules` with simpler variants (subsets of each rule's
    conditions) that still beat the apriori class probability, plus one
    unconditional rule per class value.

    Returns a new orange.RuleList; the input `rules` list is not modified.
    """
    apriori = orange.Distribution(examples.domain.classVar, examples, weight)
    newRules = orange.RuleList()
    for r in rules:
        newRules.append(r)

    # loop through rules
    for r in rules:
        # start from the empty-condition version of r ...
        tmpList = orange.RuleList()
        tmpRle = r.clone()
        tmpRle.filter.conditions = []
        tmpRle.parentRule = None
        tmpRle.filterAndStore(examples, weight, r.classifier.defaultVal)
        tmpList.append(tmpRle)
        # ... then grow condition subsets one condition at a time, up to the
        # length of r's own condition list
        while tmpList and len(tmpList[0].filter.conditions) <= len(
                r.filter.conditions):
            tmpList2 = orange.RuleList()
            for tmpRule in tmpList:
                # evaluate tmpRule (expected-prob mode temporarily disabled
                # and restored afterwards)
                oldREP = learner.ruleFinder.evaluator.returnExpectedProb
                learner.ruleFinder.evaluator.returnExpectedProb = False
                learner.ruleFinder.evaluator.evDistGetter.dists = createEVDistList(
                    dists[int(r.classifier.defaultVal)])
                tmpRule.quality = learner.ruleFinder.evaluator(
                    tmpRule, examples, weight, r.classifier.defaultVal,
                    apriori)
                learner.ruleFinder.evaluator.returnExpectedProb = oldREP
                # if rule not in rules already, add it to the list
                if not True in [rules_equal(ri, tmpRule) for ri in newRules] and len(
                        tmpRule.filter.conditions
                ) > 0 and tmpRule.quality > apriori[
                        r.classifier.defaultVal] / apriori.abs:
                    newRules.append(tmpRule)
                # create new tmpRules, set parent Rule, append them to tmpList2
                if not True in [rules_equal(ri, tmpRule) for ri in newRules]:
                    for c in r.filter.conditions:
                        tmpRule2 = tmpRule.clone()
                        tmpRule2.parentRule = tmpRule
                        tmpRule2.filter.conditions.append(c)
                        tmpRule2.filterAndStore(examples, weight,
                                                r.classifier.defaultVal)
                        # keep only rules strictly more specific than parent
                        if tmpRule2.classDistribution.abs < tmpRule.classDistribution.abs:
                            tmpList2.append(tmpRule2)
            tmpList = tmpList2

    # finally, add one condition-free rule per class value
    for cl in examples.domain.classVar:
        tmpRle = orange.Rule()
        tmpRle.filter = orange.Filter_values(domain=examples.domain)
        tmpRle.parentRule = None
        tmpRle.filterAndStore(examples, weight, int(cl))
        tmpRle.quality = tmpRle.classDistribution[int(
            cl)] / tmpRle.classDistribution.abs
        newRules.append(tmpRle)
    return newRules
def data_center(data):
    """Return the central - average - point in the data set"""
    attributes = data.domain.attributes
    stats = orange.DomainBasicAttrStat(data)
    center = []
    for attr in attributes:
        if attr.varType == orange.VarTypes.Continuous:
            # mean for continuous attributes
            center.append(stats[attr].avg)
        elif attr.varType == orange.VarTypes.Discrete:
            # most frequent value (by index) for discrete attributes
            freqs = orange.Distribution(attr, data)
            center.append(max(enumerate(freqs), key=lambda pair: pair[1])[0])
        else:
            center.append(None)
    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)
def test_construction(self):
    """GaussianDistribution copies the moments of a sample distribution."""
    data = orange.ExampleTable("iris")
    sample = orange.Distribution("sepal length", data)
    gauss = orange.GaussianDistribution(sample)
    self.assertAlmostEqual(gauss.average(), sample.average())
    # a Gaussian's mode equals its mean
    self.assertAlmostEqual(gauss.modus(), sample.average())
    self.assertAlmostEqual(gauss.var(), sample.var())
    self.assertAlmostEqual(gauss.dev(), sample.dev())
    # building from (mean, dev) directly gives the same distribution
    gauss2 = orange.GaussianDistribution(sample.average(), sample.dev())
    self.assertEqual(gauss, gauss2)
def test_add(self):
    """add() accepts Value, int and float indices, with an optional weight."""
    data = orange.ExampleTable("zoo")
    expected = orange.Distribution("type", data)

    from_values = orange.Distribution(data.domain.classVar)
    for ex in data:
        from_values.add(ex[-1])
    self.assertEqual(expected, from_values)

    from_ints = orange.Distribution(data.domain.classVar)
    for ex in data:
        from_ints.add(int(ex[-1]), 1.0)
    self.assertEqual(expected, from_ints)

    from_floats = orange.Distribution(data.domain.classVar)
    for ex in data:
        from_floats.add(float(ex[-1]), 1.0)
    self.assertEqual(expected, from_floats)

    # even a tiny weight must change the stored frequency
    from_floats.add(0, 1e-8)
    self.assertNotEqual(from_floats[0], expected[0])
def test_stat(self):
    """Summary statistics of the first iris attribute (sepal length)."""
    data = orange.ExampleTable("iris")
    dist = orange.Distribution(0, data)
    self.assertAlmostEqual(dist.average(), 5.843333333)
    self.assertAlmostEqual(dist.dev(), 0.8253012, 4)
    self.assertEqual(dist.percentile(50), 5.8)
    # percentiles outside [0, 100] are rejected
    for bad in (-5, 105):
        self.assertRaises(ValueError, dist.percentile, bad)
    self.assertEqual(dist.density(5.0), 10)
    self.assertEqual(dist.density(4.95), 8)
def test_add(self):
    """ContDistribution.add reproduces counting from data, with or without
    an explicit weight."""
    data = orange.ExampleTable("iris")
    expected = orange.Distribution(0, data)

    by_var = orange.Distribution(data.domain[0])
    for ex in data:
        by_var.add(ex[0])
    self.assertEqual(expected, by_var)

    unweighted = orange.ContDistribution()
    for ex in data:
        unweighted.add(ex[0])
    self.assertEqual(expected, unweighted)

    weighted = orange.ContDistribution()
    for ex in data:
        weighted.add(ex[0], 1.0)
    self.assertEqual(expected, weighted)

    # even a tiny weight must change the stored frequency
    weighted.add(0, 1e-8)
    self.assertNotEqual(weighted[0], expected[0])
def t2est_continuous(self):
    """ConstantClassifier over a continuous class (name mangled to disable
    the test; drop the '2' to re-enable).

    NOTE(review): fixed a copy-paste bug -- the cc4 block compared
    cc4.default_distribution.variable against cc3.class_var instead of
    cc4.class_var.
    """
    d = orange.ExampleTable("iris")
    # drop the discrete class: the last attribute becomes a continuous class
    dom2 = orange.Domain(d.domain.attributes)
    d = orange.ExampleTable(dom2, d)
    # NOTE(review): the sibling test uses orange.Variable.Type.Continuous;
    # confirm orange.Variable.Continuous names the same constant
    self.assertEqual(d.domain.class_var.var_type, orange.Variable.Continuous)
    dist = orange.Distribution(d.domain.class_var, d)

    cc = orange.ConstantClassifier(d.domain.class_var)
    self.assertEqual(cc.class_var, d.domain.class_var)
    self.assertEqual(cc.default_distribution.variable, cc.class_var)

    cc2 = orange.ConstantClassifier(dist)
    self.assertEqual(cc2.class_var, d.domain.class_var)
    self.assertEqual(cc2.default_distribution.variable, cc2.class_var)
    self.assertEqual(id(cc2.default_distribution), id(dist))

    cc3 = orange.ConstantClassifier(d.domain.class_var, None, dist)
    self.assertEqual(cc3.class_var, d.domain.class_var)
    self.assertEqual(cc3.default_distribution.variable, cc3.class_var)
    self.assertEqual(id(cc3.default_distribution), id(dist))

    cc4 = orange.ConstantClassifier(d.domain.class_var, 5, dist)
    self.assertEqual(cc4.class_var, d.domain.class_var)
    self.assertEqual(cc4.default_distribution.variable, cc4.class_var)  # was cc3
    self.assertEqual(id(cc4.default_distribution), id(dist))

    # without an explicit default value the prediction is the mean ...
    for cl in [cc2, cc3]:
        for e in d:
            self.assertEqual(cl(e), dist.average())
    # ... with one, that value is returned
    for e in d:
        self.assertEqual(cc4(e), 5)

    # invalid constructions
    self.assertRaises(TypeError, orange.ConstantClassifier,
                      d.domain.class_var, dist)
    self.assertRaises(ValueError, orange.ConstantClassifier, None, "?",
                      orange.DiscDistribution())
    self.assertRaises(ValueError, orange.ConstantClassifier, d.domain[1], "?",
                      orange.Distribution(d.domain[0]))

    # default_distribution accepts a plain list
    cc4.default_distribution = [50, 50, 50]
    self.assertEqual(list(cc4.default_distribution), [50, 50, 50])
def test_construction(self):
    """A ContDistribution compares equal to a plain {value: freq} mapping
    and can be constructed from one."""
    data = orange.ExampleTable("iris")
    dist = orange.Distribution("sepal length", data)
    import collections
    counts = collections.defaultdict(float)
    for ex in data:
        counts[float(ex["sepal length"])] += 1
    self.assertEqual(dist, counts)
    dist2 = orange.ContDistribution(counts)
    self.assertEqual(dist, dist2)
def __call__(self, examples, weight=0, fulldata=0):
    """Train a (possibly multi-class, one-vs-most-frequent) robust logistic
    classifier on `examples`.

    `fulldata`, when given, is used for domain analysis instead of
    `examples`. Returns an ArrayLogisticClassifier (multi-class) or a
    BasicLogisticClassifier (binary).
    """
    if examples.domain.classVar.varType != 1:
        # BUGFIX: was `raise "..."` -- string exceptions are illegal
        # (TypeError at raise time since Python 2.6); raise a real exception
        raise ValueError("Logistic learner only works with discrete class.")
    translate = orng2Array.DomainTranslation(self.translation_mode_d,
                                             self.translation_mode_c)
    if fulldata != 0:
        translate.analyse(fulldata, weight, warning=0)
    else:
        translate.analyse(examples, weight, warning=0)
    translate.prepareLR()
    mdata = translate.transform(examples)
    # get the attribute importances (ReliefF), sorted ascending
    t = examples
    importance = []
    for i in xrange(len(t.domain.attributes)):
        qi = orange.MeasureAttribute_relief(t.domain.attributes[i], t)
        importance.append((qi, i))
    importance.sort()
    freqs = list(orange.Distribution(examples.domain.classVar, examples))
    s = 1.0 / sum(freqs)
    freqs = [x * s for x in freqs]  # normalize
    rl = RobustBLogisticLearner(regularization=self.regularization)
    if len(examples.domain.classVar.values) > 2:
        ## form several experiments:
        # sort class frequencies; tfreqs[-1][1] is the most frequent (base) class
        tfreqs = [(freqs[i], i) for i in xrange(len(freqs))]
        tfreqs.sort()
        classifiers = []
        for i in xrange(len(tfreqs) - 1):
            # edit the translation
            alter = tfreqs[i][1]
            cfreqs = [tfreqs[-1][0], tfreqs[i][0]]  # 0=base,1=alternative
            # relabel all the examples: 1 for the alternative class, 0 otherwise
            for j in xrange(len(mdata)):
                c = int(examples[j].getclass())
                if c == alter:
                    mdata[j][-1] = 1
                else:
                    mdata[j][-1] = 0
            r = rl(mdata, translate, importance, cfreqs)
            classifiers.append(r)
        return ArrayLogisticClassifier(classifiers, translate, tfreqs,
                                       examples.domain.classVar, len(mdata))
    else:
        r = rl(mdata, translate, importance, freqs)
        return BasicLogisticClassifier(r, translate)
def maxNu(examples):
    """ Given example table compute the maximum nu parameter for Nu_SVC """
    counts = list(orange.Distribution(examples.domain.classVar, examples))
    # nu is bounded by 1.0 and by 2*min(n1,n2)/(n1+n2) over all class pairs
    candidates = [1.0]
    for i, n1 in enumerate(counts):
        for n2 in counts[i + 1:]:
            if n1 != 0 and n2 != 0:
                candidates.append(2.0 * min(n1, n2) / (n1 + n2))
    return min(candidates)
def __call__(self, examples, weight=0):
    """Learn an unordered CN2 rule set: run the base RuleLearner once per
    class value and pool the resulting rules."""
    supervisedClassCheck(examples)
    rules = orange.RuleList()
    self.ruleStopping.apriori = orange.Distribution(
        examples.domain.classVar, examples)
    # optional progress reporting, scaled by each class's relative frequency
    progress = getattr(self, "progressCallback", None)
    if progress:
        progress.start = 0.0
        progress.end = 0.0
    classDist = orange.Distribution(examples.domain.classVar, examples,
                                    weight)
    classDist.normalize()
    for targetClass in examples.domain.classVar:
        if progress:
            progress.start = progress.end
            progress.end += classDist[targetClass]
        self.targetClass = targetClass
        learned = orange.RuleLearner.__call__(self, examples, weight)
        for rule in learned.rules:
            rules.append(rule)
    if progress:
        progress(1.0, None)
    return CN2UnorderedClassifier(rules, examples, weight)
def data_center(data):
    """
    Returns a center of the instances in the data set (average across data
    instances for continuous attributes, most frequent value for discrete
    attributes).
    """
    atts = data.domain.attributes
    astats = orange.DomainBasicAttrStat(data)
    # BUGFIX: removed a commented-out alternative that was embedded after a
    # backslash line-continuation inside the comprehension, which broke /
    # obscured the expression; the comprehension below is the intended logic
    center = [astats[a].avg if a.varType == orange.VarTypes.Continuous
              else _modus(orange.Distribution(a, data))
              if a.varType == orange.VarTypes.Discrete
              else None
              for a in atts]
    if data.domain.classVar:
        center.append(0)
    return orange.Example(data.domain, center)