def learnClassifier(self, examples):
    transformer = orange.DomainContinuizer()
    transformer.multinomialTreatment = orange.DomainContinuizer.NValues
    transformer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
    transformer.classTreatment = orange.DomainContinuizer.Ignore
    newdomain = transformer(examples)
    newexamples = examples.translate(newdomain)
    #print newexamples[0]
    params = {}
    parameters = []
    self.learner.normalization = False  ## normalization already done above
    if self.svm_type in [1, 4]:
        numOfNuValues = 9
        maxNu = max(self.maxNu(newexamples) - 1e-7, 0.0)
        parameters.append(
            ("nu", [i / 10.0 for i in range(1, 9) if i / 10.0 < maxNu] + [maxNu]))
    else:
        parameters.append(("C", [2**a for a in range(-5, 15, 2)]))
    if self.kernel_type == 2:
        parameters.append(("gamma", [2**a for a in range(-5, 5, 2)] + [0]))
    tunedLearner = orngWrap.TuneMParameters(object=self.learner,
                                            parameters=parameters,
                                            folds=self.folds)
    return SVMClassifierClassEasyWrapper(
        tunedLearner(newexamples, verbose=self.verbose),
        newdomain, examples)
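# For context: the grids above follow the usual libsvm-style log2 search
# (C = 2^-5 .. 2^13, gamma = 2^-5 .. 2^3 plus 0), with nu on a 0.1 grid
# clamped below its data-dependent maximum. A minimal standalone sketch of
# the same grid construction; build_grids and the maxNu default are
# hypothetical, not part of the class above.
def build_grids(svm_type, kernel_type, maxNu=0.65):
    parameters = []
    if svm_type in [1, 4]:  # nu-SVC / nu-SVR: nu must stay below maxNu
        parameters.append(
            ("nu", [i / 10.0 for i in range(1, 9) if i / 10.0 < maxNu] + [maxNu]))
    else:                   # C-SVC / C-SVR
        parameters.append(("C", [2**a for a in range(-5, 15, 2)]))
    if kernel_type == 2:    # RBF kernel
        parameters.append(("gamma", [2**a for a in range(-5, 5, 2)] + [0]))
    return parameters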
def test_continuizer_iris(self):
    d = orange.ExampleTable("iris")
    dc = orange.DomainContinuizer()
    dc.class_treatment = dc.ClassTreatment.LeaveUnlessTarget
    dc.continuous_treatment = dc.ContinuousTreatment.Leave
    cdomain = dc(d.domain)
    self.assertEqual(cdomain.variables, d.domain.variables)

    dc.continuous_treatment = dc.ContinuousTreatment.NormalizeBySpan
    self.assertRaises(ValueError, dc, d.domain)
    cdomain = dc(d)
    dd = orange.ExampleTable(cdomain, d)
    bs = orange.DomainBasicAttrStat(d)
    for e, ec in zip(d[:10], dd):
        for i in range(4):
            self.assertEqual((e[i] - bs[i].min) / (bs[i].max - bs[i].min), ec[i])

    dc.continuous_treatment = dc.ContinuousTreatment.NormalizeByVariance
    self.assertRaises(ValueError, dc, d.domain)
    cdomain = dc(d)
    dd = orange.ExampleTable(cdomain, d)
    bs = orange.DomainBasicAttrStat(d)
    for e, ec in zip(d[:10], dd):
        for i in range(4):
            self.assertEqual((e[i] - bs[i].avg) / bs[i].dev, ec[i])
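# The test above pins down the two continuous treatments numerically:
# NormalizeBySpan maps x to (x - min) / (max - min) and NormalizeByVariance
# maps x to (x - avg) / dev. A minimal numpy sketch of the same arithmetic
# (plain numpy, not the Orange API; the sample rows are made up):
import numpy

X = numpy.array([[5.1, 3.5, 1.4, 0.2],
                 [4.9, 3.0, 1.4, 0.2],
                 [4.7, 3.2, 1.3, 0.2]])

by_span = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))  # columns in [0, 1]
by_var = (X - X.mean(axis=0)) / X.std(axis=0)                    # zero mean, unit std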
def _normalize(self, examples):
    dc = orange.DomainContinuizer()
    dc.classTreatment = orange.DomainContinuizer.Ignore
    dc.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
    dc.multinomialTreatment = orange.DomainContinuizer.NValues
    newdomain = dc(examples)
    return examples.translate(newdomain)
def __call__(self, data, weightId=0):
    continuizer = orange.DomainContinuizer(
        zeroBased=self.zeroBased,
        multinomialTreatment=self.multinomialTreatment,
        continuousTreatment=self.continuousTreatment,
        classTreatment=self.classTreatment)
    c_domain = continuizer(data, weightId)
    return data.translate(c_domain)
def __setstate__(self, state):
    print state
    self.__dict__.update(state)
    transformer = orange.DomainContinuizer()
    transformer.multinomialTreatment = orange.DomainContinuizer.NValues
    transformer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
    transformer.classTreatment = orange.DomainContinuizer.Ignore
    print self.examples
    self.domain = transformer(self.oldexamples)
def defaultContinuizer(dataset):
    """Default continuizer with:
    - multinomial -> as normalized ordinal
    - class -> ignore
    - continuous -> leave
    """
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.AsNormalizedOrdinal
    continuizer.classTreatment = continuizer.Ignore
    continuizer.continuousTreatment = continuizer.Leave
    return continuizer(dataset)
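# A short usage sketch for defaultContinuizer, following the translate
# pattern used elsewhere in these snippets (the "bridges" table is just an
# example input):
data = orange.ExampleTable("bridges")
newdomain = defaultContinuizer(data)   # multinomial -> normalized ordinal, class ignored
newdata = data.translate(newdomain)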
print "%16s: %s" % (val.variable.name, val) data = orange.ExampleTable("bridges") for attr in data.domain: if attr.varType == orange.VarTypes.Continuous: print "%20s: continuous" % attr.name else: print "%20s: %s" % (attr.name, attr.values) print print "Original 15th example:" printExample(data[15]) continuizer = orange.DomainContinuizer() continuizer.multinomialTreatment = continuizer.LowestIsBase domain0 = continuizer(data) data0 = data.translate(domain0) print print "Lowest is base" printExample(data0[15]) continuizer.multinomialTreatment = continuizer.FrequentIsBase domain0 = continuizer(data) data0 = data.translate(domain0) print print "Frequent is base" printExample(data0[15])
def __call__(self, examples):
    if getattr(self, "imputer", 0):
        examples = self.imputer(examples)(examples)
    if getattr(self, "removeMissing", 0):
        examples = orange.Preprocessor_dropMissing(examples)
    continuizer = orange.DomainContinuizer(
        zeroBased=1,
        continuousTreatment=orange.DomainContinuizer.Leave,
        multinomialTreatment=orange.DomainContinuizer.FrequentIsBase,
        classTreatment=orange.DomainContinuizer.Ignore)

    attr = []
    remain_attr = examples.domain.attributes[:]

    # get LL for Majority Learner
    tempDomain = orange.Domain(attr, examples.domain.classVar)
    tempData = orange.Preprocessor_dropMissing(examples.select(tempDomain))
    ll_Old = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
    ll_Best = -1000000
    length_Old = float(len(tempData))

    stop = 0
    while not stop:
        # loop until all variables are added or neither deletion nor
        # addition of an attribute is possible
        worstAt = None
        # if there is more than one attribute, perform backward elimination
        if len(attr) >= 2:
            minG = 1000
            worstAt = attr[0]
            ll_Best = ll_Old
            length_Best = length_Old
            for at in attr:
                # for each attribute, check whether its removal hurts LL
                tempAttr = filter(lambda x: x != at, attr)
                tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
                tempDomain.addmetas(examples.domain.getmetas())
                # continuize the domain, then compute P for the LL change
                tempDomain = continuizer(
                    orange.Preprocessor_dropMissing(examples.select(tempDomain)))
                tempData = orange.Preprocessor_dropMissing(
                    examples.select(tempDomain))
                ll_Delete = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
                length_Delete = float(len(tempData))
                length_Avg = (length_Delete + length_Old) / 2.0
                G = -2 * length_Avg * (ll_Delete / length_Delete - ll_Old / length_Old)
                # set new worst attribute
                if G < minG:
                    worstAt = at
                    minG = G
                    ll_Best = ll_Delete
                    length_Best = length_Delete
            # deletion of attribute
            if worstAt.varType == orange.VarTypes.Continuous:
                P = lchisqprob(minG, 1)
            else:
                P = lchisqprob(minG, len(worstAt.values) - 1)
            if P >= self.deleteCrit:
                attr.remove(worstAt)
                remain_attr.append(worstAt)
                nodeletion = 0
                ll_Old = ll_Best
                length_Old = length_Best
            else:
                nodeletion = 1
        else:
            nodeletion = 1
        # END OF DELETION PART

        # if enough attributes have been chosen, stop the procedure
        if self.numAttr > -1 and len(attr) >= self.numAttr:
            remain_attr = []

        # try adding each of the remaining attributes
        maxG = -1
        ll_Best = ll_Old
        length_Best = length_Old
        bestAt = None
        for at in remain_attr:
            tempAttr = attr + [at]
            tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
            tempDomain.addmetas(examples.domain.getmetas())
            # continuize the domain, then compute P for the LL improvement
            tempDomain = continuizer(
                orange.Preprocessor_dropMissing(examples.select(tempDomain)))
            tempData = orange.Preprocessor_dropMissing(
                examples.select(tempDomain))
            ll_New = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
            # number of examples in tempData, to normalize the likelihood
            length_New = float(len(tempData))
            # P = Pr(CHI^2 > G), G = -2(L(0) - L(1)) = 2(E(0) - E(1))
            length_avg = (length_New + length_Old) / 2
            G = -2 * length_avg * (ll_Old / length_Old - ll_New / length_New)
            if G > maxG:
                bestAt = at
                maxG = G
                ll_Best = ll_New
                length_Best = length_New
        if not bestAt:
            stop = 1
            continue

        if bestAt.varType == orange.VarTypes.Continuous:
            P = lchisqprob(maxG, 1)
        else:
            P = lchisqprob(maxG, len(bestAt.values) - 1)
        # add the attribute with the smallest P to attr
        if P <= self.addCrit:
            attr.append(bestAt)
            remain_attr.remove(bestAt)
            ll_Old = ll_Best
            length_Old = length_Best

        if (P > self.addCrit and nodeletion) or (bestAt == worstAt):
            stop = 1

    return attr
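# Both the deletion and addition steps above score an attribute with the
# likelihood-ratio statistic G = -2 * (LL_smaller - LL_larger) and turn it
# into a p-value via the chi-square upper tail, with df = 1 for a continuous
# attribute and len(values) - 1 for a discrete one. A minimal sketch of that
# test using scipy.stats in place of lchisqprob (an assumption; lchisqprob
# computes the same tail probability):
from scipy.stats import chi2

def lr_test(ll_small, ll_large, df):
    G = -2.0 * (ll_small - ll_large)  # the larger model fits at least as well
    return G, chi2.sf(G, df)          # P(CHI^2 > G), as in lchisqprob(G, df)

# e.g. adding a 3-valued discrete attribute contributes df = 2 parameters
G, P = lr_test(ll_small=-520.3, ll_large=-512.8, df=2)  # compare P to addCrit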
def __call__(self, data, weight=None):
    if self.use_attributes is not None:
        new_domain = orange.Domain(self.use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    if self.stepwise and self.stepwise_before:
        use_attributes = stepwise(data, add_sig=self.add_sig,
                                  remove_sig=self.remove_sig)
        new_domain = orange.Domain(use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    # continuization (replaces discrete with continuous attributes)
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.FrequentIsBase
    continuizer.zeroBased = True
    domain0 = continuizer(data)
    data = data.translate(domain0)

    if self.stepwise and not self.stepwise_before:
        use_attributes = stepwise(data, weight, add_sig=self.add_sig,
                                  remove_sig=self.remove_sig)
        new_domain = orange.Domain(use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    # missing values handling (impute missing)
    imputer = orange.ImputerConstructor_model()
    imputer.learnerContinuous = orange.MajorityLearner()
    imputer.learnerDiscrete = orange.MajorityLearner()
    imputer = imputer(data)
    data = imputer(data)

    # conversion to numpy
    A, y, w = data.toNumpy()  # weights ??
    if A is None:
        n = len(data)
        m = 0
    else:
        n, m = numpy.shape(A)

    if self.beta0:
        if A is None:
            X = numpy.ones([len(data), 1])
        else:
            X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones
    else:
        X = A

    # set weights
    W = numpy.identity(len(data))
    if weight:
        for di, d in enumerate(data):
            W[di, di] = float(d[weight])

    # the pseudo-inverse adds some robustness; a normal inverse could fail
    # due to singularity of X.T * W * X
    D = dot(dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W)
    beta = dot(D, y)

    yEstimated = dot(X, beta)  # estimation

    # some descriptive statistics
    muY, sigmaY = numpy.mean(y), numpy.std(y)
    muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

    # model statistics
    SST, SSR = numpy.sum((y - muY)**2), numpy.sum((yEstimated - muY)**2)
    SSE, RSquare = SST - SSR, SSR / SST
    R = numpy.sqrt(RSquare)  # coefficient of determination
    RAdjusted = 1 - (1 - RSquare) * (n - 1) / (n - m - 1)
    F = (SSR / m) / ((SST - SSR) / (n - m - 1))  # F statistic
    df = m - 1

    sigmaSquare = SSE / (n - m - 1)
    # standard error of estimated coefficients
    errCoeff = sqrt(sigmaSquare * inv(dot(X.T, X)).diagonal())

    # t statistic, significance
    t = beta / errCoeff
    df = n - 2
    significance = []
    for tt in t:
        try:
            significance.append(
                statc.betai(df * 0.5, 0.5, df / (df + tt * tt)))
        except:
            significance.append(1.0)

    # standardized coefficients
    if m > 0:
        stdCoeff = (sqrt(covX.diagonal()) / sigmaY) * beta
    else:
        stdCoeff = (sqrt(covX) / sigmaY) * beta

    model = {
        'descriptives': {
            'meanX': muX, 'covX': covX, 'meanY': muY, 'sigmaY': sigmaY
        },
        'model': {
            'estCoeff': beta, 'stdErrorEstimation': errCoeff
        },
        'model summary': {
            'TotalVar': SST, 'ExplVar': SSR, 'ResVar': SSE,
            'R': R, 'RAdjusted': RAdjusted,
            'F': F, 't': t, 'sig': significance
        }
    }
    return LinearRegression(statistics=model, domain=data.domain,
                            name=self.name, beta0=self.beta0, imputer=imputer)
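# The closed form computed for beta above is the weighted least-squares
# solution beta = (X^T W X)^-1 X^T W y, with a pseudo-inverse guarding
# against a singular X^T W X. A self-contained numpy sketch of just that
# step, on made-up data:
import numpy

rng = numpy.random.RandomState(0)
X = numpy.insert(rng.rand(20, 3), 0, 1, axis=1)   # 20 examples + intercept column
y = X.dot([1.0, 2.0, -1.0, 0.5]) + 0.01 * rng.randn(20)
W = numpy.diag(rng.rand(20) + 0.5)                # per-example weights on the diagonal

beta = numpy.linalg.pinv(X.T.dot(W).dot(X)).dot(X.T).dot(W).dot(y)
# beta comes out close to [1.0, 2.0, -1.0, 0.5]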
def test_continuizer_zoo(self):
    d = orange.ExampleTable("zoo")
    dd = orange.DomainDistributions(d)
    for i, e in enumerate(dd):
        if i == 2:
            break

    dc = orange.DomainContinuizer()
    dc.multinomial_treatment = dc.MultinomialTreatment.LowestIsBase
    dc.class_treatment = dc.ClassTreatment.ErrorIfCannotHandle
    self.assertRaises(ValueError, dc, d.domain)

    dc.class_treatment = dc.ClassTreatment.LeaveUnlessTarget
    cdomain = dc(d.domain)
    dd = orange.ExampleTable(cdomain, d)
    self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
    for l in [2, 4, 5, 6, 8]:
        self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
    self.assertFalse("legs=0" in cdomain)
    self.assertEqual(cdomain.classVar.name, "type")
    self.assertFalse(cdomain.has_discrete_attributes())
    self.assertFalse(cdomain.has_discrete_attributes(False))
    self.assertTrue(cdomain.has_discrete_attributes(True))

    dc.class_treatment = dc.ClassTreatment.AsOrdinal
    cdomain = dc(d.domain)
    dd = orange.ExampleTable(cdomain, d)
    self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
    for l in [2, 4, 5, 6, 8]:
        self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
    self.assertFalse("legs=0" in cdomain)
    self.assertEqual(cdomain.classVar.name, "C_type")
    self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
    self.assertFalse(cdomain.has_discrete_attributes())

    dc.class_treatment = dc.ClassTreatment.AsOrdinal
    cdomain = dc(d)
    dd = orange.ExampleTable(cdomain, d)
    self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
    for l in [2, 4, 5, 6, 8]:
        self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
    self.assertFalse("legs=0" in cdomain)
    self.assertEqual(cdomain.classVar.name, "C_type")
    self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
    self.assertFalse(cdomain.has_discrete_attributes())

    dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
    self.assertRaises(ValueError, dc, d.domain)
    cdomain = dc(d)
    dd = orange.ExampleTable(cdomain, d)
    self.assertEqual(dd[0, 0], 1)
    self.assertEqual(dd[0, 1], 0)
    self.assertEqual(dd[0, 2], 1)

    dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
    dc.zero_based = False
    self.assertRaises(ValueError, dc, d.domain)
    cdomain = dc(d)
    dd = orange.ExampleTable(cdomain, d)
    self.assertEqual(dd[0, 0], 1)
    self.assertEqual(dd[0, 1], -1)
    self.assertEqual(dd[0, 2], 1)

    dc.zero_based = True
    dc.multinomial_treatment = dc.MultinomialTreatment.NValues
    cdomain = dc(d.domain)
    dd = orange.ExampleTable(cdomain, d)
    for l in [0, 2, 4, 5, 6, 8]:
        self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)

    dc.multinomial_treatment = dc.MultinomialTreatment.Ignore
    cdomain = dc(d.domain)
    for l in [0, 2, 4, 5, 6, 8]:
        self.assertFalse("legs=%i" % l in cdomain)

    dc.multinomial_treatment = dc.MultinomialTreatment.IgnoreAllDiscrete
    cdomain = dc(d.domain)
    self.assertEqual(cdomain.variables, [cdomain.class_var])

    dc.multinomial_treatment = dc.MultinomialTreatment.ReportError
    self.assertRaises(ValueError, dc, d.domain)

    dc.multinomial_treatment = dc.MultinomialTreatment.AsOrdinal
    cdomain = dc(d.domain)
    dd = orange.ExampleTable(cdomain, d)
    for e, ec in zip(d[:10], dd):
        self.assertEqual(int(e["legs"]), ec["C_legs"])

    dc.multinomial_treatment = dc.MultinomialTreatment.AsNormalizedOrdinal
    cdomain = dc(d.domain)
    dd = orange.ExampleTable(cdomain, d)
    for e, ec in zip(d[:10], dd):
        self.assertEqual(int(e["legs"]) / 5.0, ec["C_legs"])