Beispiel #1
0
    def learnClassifier(self, examples):
        """Learn a tuned SVM classifier on *examples*.

        The examples are first continuized (multinomial attributes -> N
        indicator values, continuous -> normalized by span, class ignored),
        so the wrapped learner's own normalization is switched off.  For
        nu-SVM types the nu parameter is tuned, otherwise C; for the RBF
        kernel gamma is tuned as well.  Returns the tuned classifier wrapped
        together with the continuized domain and the original examples.
        """
        transformer = orange.DomainContinuizer()
        transformer.multinomialTreatment = orange.DomainContinuizer.NValues
        transformer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
        transformer.classTreatment = orange.DomainContinuizer.Ignore
        newdomain = transformer(examples)
        newexamples = examples.translate(newdomain)
        parameters = []
        self.learner.normalization = False  ## Normalization already done

        if self.svm_type in [1, 4]:
            # nu-SVM: candidate nus must stay strictly below the feasible
            # maximum for this data (shrunk by an epsilon to avoid the edge).
            maxNu = max(self.maxNu(newexamples) - 1e-7, 0.0)
            parameters.append(
                ("nu", [i / 10.0
                        for i in range(1, 9) if i / 10.0 < maxNu] + [maxNu]))
        else:
            parameters.append(("C", [2**a for a in range(-5, 15, 2)]))
        if self.kernel_type == 2:
            # RBF kernel: tune gamma over a log grid; 0 is also a candidate.
            parameters.append(("gamma", [2**a for a in range(-5, 5, 2)] + [0]))
        tunedLearner = orngWrap.TuneMParameters(object=self.learner,
                                                parameters=parameters,
                                                folds=self.folds)

        return SVMClassifierClassEasyWrapper(
            tunedLearner(newexamples, verbose=self.verbose), newdomain,
            examples)
Beispiel #2
0
    def test_continuizer_iris(self):
        """Continuize the iris data: Leave keeps the domain unchanged, while
        NormalizeBySpan / NormalizeByVariance rescale every continuous
        attribute (and require data, not just a domain)."""
        data = orange.ExampleTable("iris")
        cont = orange.DomainContinuizer()
        cont.class_treatment = cont.ClassTreatment.LeaveUnlessTarget
        cont.continuous_treatment = cont.ContinuousTreatment.Leave
        cdom = cont(data.domain)
        self.assertEqual(cdom.variables, data.domain.variables)

        cont.continuous_treatment = cont.ContinuousTreatment.NormalizeBySpan
        # normalization needs statistics, so a bare domain must be rejected
        self.assertRaises(ValueError, cont, data.domain)
        cdom = cont(data)
        converted = orange.ExampleTable(cdom, data)
        stats = orange.DomainBasicAttrStat(data)
        for orig, conv in zip(data[:10], converted):
            for i in range(4):
                lo, hi = stats[i].min, stats[i].max
                self.assertEqual((orig[i] - lo) / (hi - lo), conv[i])

        cont.continuous_treatment = cont.ContinuousTreatment.NormalizeByVariance
        self.assertRaises(ValueError, cont, data.domain)
        cdom = cont(data)
        converted = orange.ExampleTable(cdom, data)
        stats = orange.DomainBasicAttrStat(data)
        for orig, conv in zip(data[:10], converted):
            for i in range(4):
                self.assertEqual((orig[i] - stats[i].avg) / stats[i].dev,
                                 conv[i])
Beispiel #3
0
 def _normalize(self, examples):
     """Return *examples* translated into a continuized domain: class is
     ignored, continuous attributes are normalized by span and multinomial
     attributes are expanded into N indicator values."""
     continuizer = orange.DomainContinuizer()
     continuizer.multinomialTreatment = orange.DomainContinuizer.NValues
     continuizer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
     continuizer.classTreatment = orange.DomainContinuizer.Ignore
     return examples.translate(continuizer(examples))
Beispiel #4
0
 def __call__(self, data, weightId=0):
     """Continuize *data* using the treatments configured on this object
     and return the translated example table."""
     settings = dict(zeroBased=self.zeroBased,
                     multinomialTreatment=self.multinomialTreatment,
                     continuousTreatment=self.continuousTreatment,
                     classTreatment=self.classTreatment)
     continuizer = orange.DomainContinuizer(**settings)
     return data.translate(continuizer(data, weightId))
Beispiel #5
0
 def __setstate__(self, state):
     print state
     self.__dict__.update(state)
     transformer = orange.DomainContinuizer()
     transformer.multinominalTreatment = orange.DomainContinuizer.NValues
     transformer.continuousTreatment = orange.DomainContinuizer.NormalizeBySpan
     transformer.classTreatment = orange.DomainContinuizer.Ignore
     print self.examples
     self.domain = transformer(self.oldexamples)
Beispiel #6
0
def defaultContinuizer(dataset):
    """Continuize *dataset* with the default treatments:

    - multinomial -> as normalized ordinal
    - class -> ignore
    - continuous -> leave

    Returns the continuized domain.
    """
    dc = orange.DomainContinuizer()
    dc.classTreatment = dc.Ignore
    dc.continuousTreatment = dc.Leave
    dc.multinomialTreatment = dc.AsNormalizedOrdinal
    return dc(dataset)
Beispiel #7
0
        print "%16s: %s" % (val.variable.name, val)


# Load the bridges data and list each attribute: continuous ones are just
# labelled, discrete ones get their value list printed.
data = orange.ExampleTable("bridges")

for attr in data.domain:
    if attr.varType == orange.VarTypes.Continuous:
        print "%20s: continuous" % attr.name
    else:
        print "%20s: %s" % (attr.name, attr.values)

print
print "Original 15th example:"
printExample(data[15])

# Continuize the same data twice with different base-value choices for
# multinomial attributes and show how the same example is represented.
continuizer = orange.DomainContinuizer()

continuizer.multinomialTreatment = continuizer.LowestIsBase
domain0 = continuizer(data)
data0 = data.translate(domain0)
print
print "Lowest is base"
printExample(data0[15])

continuizer.multinomialTreatment = continuizer.FrequentIsBase
domain0 = continuizer(data)
data0 = data.translate(domain0)
print
print "Frequent is base"
printExample(data0[15])
Beispiel #8
0
    def __call__(self, examples):
        """Stepwise (backward/forward) attribute selection for logistic
        regression.

        Starting from an empty attribute set, each iteration first tries to
        delete the chosen attribute whose removal hurts the model
        log-likelihood least (removed if insignificant at self.deleteCrit),
        then tries to add the remaining attribute that improves it most
        (added if significant at self.addCrit).  The change is judged with
        the G statistic, which is chi-square distributed.  Stops when
        neither step applies, or when self.numAttr attributes are selected.
        Returns the list of selected attributes.
        """
        # Optional preprocessing: impute and/or drop incomplete examples.
        if getattr(self, "imputer", 0):
            examples = self.imputer(examples)(examples)
        if getattr(self, "removeMissing", 0):
            examples = orange.Preprocessor_dropMissing(examples)
        # Continuizer applied to every candidate domain before fitting.
        continuizer = orange.DomainContinuizer(
            zeroBased=1,
            continuousTreatment=orange.DomainContinuizer.Leave,
            multinomialTreatment=orange.DomainContinuizer.FrequentIsBase,
            classTreatment=orange.DomainContinuizer.Ignore)
        attr = []  # attributes selected so far
        remain_attr = examples.domain.attributes[:]  # candidates left to add

        # get LL for Majority Learner
        tempDomain = orange.Domain(attr, examples.domain.classVar)
        #tempData  = orange.Preprocessor_dropMissing(examples.select(tempDomain))
        tempData = orange.Preprocessor_dropMissing(examples.select(tempDomain))

        ll_Old = getLikelihood(orange.LogRegFitter_Cholesky(), tempData)
        ll_Best = -1000000
        length_Old = float(len(tempData))

        stop = 0
        while not stop:
            # LOOP until all variables are added or no further deletion nor addition of attribute is possible
            worstAt = None
            # if there are more than 1 attribute then perform backward elimination
            if len(attr) >= 2:
                minG = 1000
                worstAt = attr[0]
                ll_Best = ll_Old
                length_Best = length_Old
                for at in attr:
                    # check all attribute whether its presence enough increases LL?

                    tempAttr = filter(lambda x: x != at, attr)
                    tempDomain = orange.Domain(tempAttr,
                                               examples.domain.classVar)
                    tempDomain.addmetas(examples.domain.getmetas())
                    # domain, calculate P for LL improvement.
                    tempDomain = continuizer(
                        orange.Preprocessor_dropMissing(
                            examples.select(tempDomain)))
                    tempData = orange.Preprocessor_dropMissing(
                        examples.select(tempDomain))

                    ll_Delete = getLikelihood(orange.LogRegFitter_Cholesky(),
                                              tempData)
                    length_Delete = float(len(tempData))
                    length_Avg = (length_Delete + length_Old) / 2.0

                    # Log-likelihoods are normalized by example counts so
                    # that models fitted on differently-sized subsets (after
                    # dropping missing values) remain comparable.
                    G = -2 * length_Avg * (ll_Delete / length_Delete -
                                           ll_Old / length_Old)

                    # set new worst attribute
                    if G < minG:
                        worstAt = at
                        minG = G
                        ll_Best = ll_Delete
                        length_Best = length_Delete
                # deletion of attribute

                # chi-square df: 1 for continuous, #values - 1 for discrete
                if worstAt.varType == orange.VarTypes.Continuous:
                    P = lchisqprob(minG, 1)
                else:
                    P = lchisqprob(minG,
                                   len(worstAt.values) - 1)
                if P >= self.deleteCrit:
                    attr.remove(worstAt)
                    remain_attr.append(worstAt)
                    nodeletion = 0
                    ll_Old = ll_Best
                    length_Old = length_Best
                else:
                    nodeletion = 1
            else:
                nodeletion = 1
                # END OF DELETION PART

            # if enough attributes has been chosen, stop the procedure
            if self.numAttr > -1 and len(attr) >= self.numAttr:
                remain_attr = []

            # for each attribute in the remaining
            maxG = -1
            ll_Best = ll_Old
            length_Best = length_Old
            bestAt = None
            for at in remain_attr:
                tempAttr = attr + [at]
                tempDomain = orange.Domain(tempAttr, examples.domain.classVar)
                tempDomain.addmetas(examples.domain.getmetas())
                # domain, calculate P for LL improvement.
                tempDomain = continuizer(
                    orange.Preprocessor_dropMissing(
                        examples.select(tempDomain)))
                tempData = orange.Preprocessor_dropMissing(
                    examples.select(tempDomain))
                ll_New = getLikelihood(orange.LogRegFitter_Cholesky(),
                                       tempData)

                length_New = float(
                    len(tempData)
                )  # get number of examples in tempData to normalize likelihood

                # P=PR(CHI^2>G), G=-2(L(0)-L(1))=2(E(0)-E(1))
                length_avg = (length_New + length_Old) / 2
                G = -2 * length_avg * (ll_Old / length_Old -
                                       ll_New / length_New)
                if G > maxG:
                    bestAt = at
                    maxG = G
                    ll_Best = ll_New
                    length_Best = length_New
            if not bestAt:
                stop = 1
                continue

            if bestAt.varType == orange.VarTypes.Continuous:
                P = lchisqprob(maxG, 1)
            else:
                P = lchisqprob(maxG,
                               len(bestAt.values) - 1)
            # Add attribute with smallest P to attributes(attr)
            if P <= self.addCrit:
                attr.append(bestAt)
                remain_attr.remove(bestAt)
                ll_Old = ll_Best
                length_Old = length_Best

            # Stop when nothing can be added significantly and nothing was
            # deleted, or when the attribute just deleted was re-added.
            if (P > self.addCrit and nodeletion) or (bestAt == worstAt):
                stop = 1

        return attr
Beispiel #9
0
    def __call__(self, data, weight=None):
        """Fit a linear regression model on *data*.

        Pipeline: optional attribute subsetting / stepwise selection,
        continuization of discrete attributes, imputation of missing
        values, then a weighted least-squares fit (solved through the
        pseudo-inverse for robustness against singular X^T W X) plus the
        usual summary statistics (R^2, adjusted R^2, F, t, significance).
        Returns a LinearRegression classifier carrying the statistics.
        """
        if self.use_attributes is not None:
            new_domain = orange.Domain(self.use_attributes,
                                       data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        if self.stepwise and self.stepwise_before:
            use_attributes = stepwise(data,
                                      add_sig=self.add_sig,
                                      remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # continuization (replaces discrete with continuous attributes)
        continuizer = orange.DomainContinuizer()
        continuizer.multinomialTreatment = continuizer.FrequentIsBase
        continuizer.zeroBased = True
        domain0 = continuizer(data)
        data = data.translate(domain0)

        if self.stepwise and not self.stepwise_before:
            use_attributes = stepwise(data,
                                      weight,
                                      add_sig=self.add_sig,
                                      remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # missing values handling (impute missing)
        imputer = orange.ImputerConstructor_model()
        imputer.learnerContinuous = orange.MajorityLearner()
        imputer.learnerDiscrete = orange.MajorityLearner()
        imputer = imputer(data)
        data = imputer(data)

        # conversion to numpy
        A, y, w = data.toNumpy()  # weights ??
        if A is None:
            n = len(data)
            m = 0
        else:
            n, m = numpy.shape(A)

        if self.beta0:
            if A is None:
                X = numpy.ones([len(data), 1])
            else:
                X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones
        else:
            X = A

        # set weights (identity unless a weight meta attribute is given)
        W = numpy.identity(len(data))
        if weight:
            for di, d in enumerate(data):
                W[di, di] = float(d[weight])

        D = dot(
            dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W
        )  # adds some robustness by computing the pseudo inverse; normal inverse could fail due to singularity of the X.T*W*X
        beta = dot(D, y)

        yEstimated = dot(X, beta)  # fitted values
        # some descriptive statistics
        muY, sigmaY = numpy.mean(y), numpy.std(y)
        muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

        # model statistics: total, regression (explained) and residual
        # sums of squares
        SST, SSR = numpy.sum((y - muY)**2), numpy.sum((yEstimated - muY)**2)
        SSE, RSquare = SST - SSR, SSR / SST
        R = numpy.sqrt(RSquare)  # coefficient of determination
        RAdjusted = 1 - (1 - RSquare) * (n - 1) / (n - m - 1)
        # FIX: F = MSR / MSE; the original's missing parentheses computed
        # (SSR/m) / (SST - SSR/(n-m-1)) instead of dividing the residual
        # sum of squares by its degrees of freedom.
        F = (SSR / m) / ((SST - SSR) / (n - m - 1))  # F statistic

        sigmaSquare = SSE / (n - m - 1)

        # standard error of estimated coefficients
        # NOTE(review): uses the unweighted X^T X even when weights were
        # supplied -- confirm this is intended for the weighted fit.
        errCoeff = sqrt(sigmaSquare * inv(dot(X.T, X)).diagonal())

        # t statistic and two-sided significance for each coefficient
        t = beta / errCoeff
        df = n - 2
        significance = []
        for tt in t:
            try:
                significance.append(
                    statc.betai(df * 0.5, 0.5, df / (df + tt * tt)))
            except Exception:
                # betai can fail numerically; fall back to "not significant"
                significance.append(1.0)

        # standardized coefficients
        # NOTE(review): stdCoeff is computed but never returned in the model.
        if m > 0:
            stdCoeff = (sqrt(covX.diagonal()) / sigmaY) * beta
        else:
            stdCoeff = (sqrt(covX) / sigmaY) * beta

        model = {
            'descriptives': {
                'meanX': muX,
                'covX': covX,
                'meanY': muY,
                'sigmaY': sigmaY
            },
            'model': {
                'estCoeff': beta,
                'stdErrorEstimation': errCoeff
            },
            'model summary': {
                'TotalVar': SST,
                # FIX: the original swapped these -- SSR is the explained
                # (regression) sum of squares, SSE the residual one.
                'ExplVar': SSR,
                'ResVar': SSE,
                'R': R,
                'RAdjusted': RAdjusted,
                'F': F,
                't': t,
                'sig': significance
            }
        }
        return LinearRegression(statistics=model,
                                domain=data.domain,
                                name=self.name,
                                beta0=self.beta0,
                                imputer=imputer)
Beispiel #10
0
    def test_continuizer_zoo(self):
        """Continuize the zoo data: exercise every class treatment and every
        multinomial treatment on a domain with discrete attributes."""
        d = orange.ExampleTable("zoo")
        dd = orange.DomainDistributions(d)
        # NOTE(review): this loop only iterates and breaks; it looks like a
        # debugging leftover -- confirm before removing.
        for i, e in enumerate(dd):
            if i == 2:
                break

        dc = orange.DomainContinuizer()

        dc.multinomial_treatment = dc.MultinomialTreatment.LowestIsBase

        dc.class_treatment = dc.ClassTreatment.ErrorIfCannotHandle
        self.assertRaises(ValueError, dc, d.domain)

        dc.class_treatment = dc.ClassTreatment.LeaveUnlessTarget
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "type")
        self.assertFalse(cdomain.has_discrete_attributes())
        self.assertFalse(cdomain.has_discrete_attributes(False))
        self.assertTrue(cdomain.has_discrete_attributes(True))

        dc.class_treatment = dc.ClassTreatment.AsOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "C_type")
        self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
        self.assertFalse(cdomain.has_discrete_attributes())

        # same as above, but continuizing from data instead of a bare domain
        dc.class_treatment = dc.ClassTreatment.AsOrdinal
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(list(map(int, d[0]))[:3], list(map(int, dd[0]))[:3])
        for l in [2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)
        self.assertFalse("legs=0" in cdomain)
        self.assertEqual(cdomain.classVar.name, "C_type")
        self.assertEqual(dd[0, -1], d.domain.class_var.values.index("mammal"))
        self.assertFalse(cdomain.has_discrete_attributes())

        dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
        # value frequencies are unknown for a bare domain
        self.assertRaises(ValueError, dc, d.domain)
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(dd[0, 0], 1)
        self.assertEqual(dd[0, 1], 0)
        self.assertEqual(dd[0, 2], 1)

        dc.multinomial_treatment = dc.MultinomialTreatment.FrequentIsBase
        dc.zero_based = False
        self.assertRaises(ValueError, dc, d.domain)
        cdomain = dc(d)
        dd = orange.ExampleTable(cdomain, d)
        self.assertEqual(dd[0, 0], 1)
        self.assertEqual(dd[0, 1], -1)
        self.assertEqual(dd[0, 2], 1)
        dc.zero_based = True

        dc.multinomial_treatment = dc.MultinomialTreatment.NValues
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for l in [0, 2, 4, 5, 6, 8]:
            self.assertEqual(int(dd[0, "legs=%i" % l]), l == 4)

        dc.multinomial_treatment = dc.MultinomialTreatment.Ignore
        cdomain = dc(d.domain)
        for l in [0, 2, 4, 5, 6, 8]:
            # FIX: the original tested the literal string "legs=%i" (missing
            # "% l"), so the indicator variables were never actually checked.
            self.assertFalse("legs=%i" % l in cdomain)

        dc.multinomial_treatment = dc.MultinomialTreatment.IgnoreAllDiscrete
        cdomain = dc(d.domain)
        self.assertEqual(cdomain.variables, [cdomain.class_var])

        dc.multinomial_treatment = dc.MultinomialTreatment.ReportError
        self.assertRaises(ValueError, dc, d.domain)

        dc.multinomial_treatment = dc.MultinomialTreatment.AsOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for e, ec in zip(d[:10], dd):
            self.assertEqual(int(e["legs"]), ec["C_legs"])

        dc.multinomial_treatment = dc.MultinomialTreatment.AsNormalizedOrdinal
        cdomain = dc(d.domain)
        dd = orange.ExampleTable(cdomain, d)
        for e, ec in zip(d[:10], dd):
            self.assertEqual(int(e["legs"]) / 5, ec["C_legs"])