Example no. 1
0
# Show one example with missing values, then the same example after
# imputation.  NOTE(review): `data` and `imputer` are defined before this
# excerpt begins -- presumably an average/default-value imputer; confirm
# against the full script.
print data[19]
print "Imputed:"
print imputer(data[19])
print

# Impute the whole table at once and compare a few rows before/after.
impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print


print "\n*** TREE-BASED IMPUTATION ***\n"

import orngTree
# Model-based imputation: a predictive model is learned for each attribute
# and its predictions fill in the missing values.
imputer = orange.ImputerConstructor_model()
# Same tree learner for both continuous and discrete attributes;
# minSubset = 20 limits how small a tree node may get.
imputer.learnerContinuous = imputer.learnerDiscrete = orngTree.TreeLearner(minSubset = 20)
# Calling the constructor on the data yields the actual imputer.
imputer = imputer(data)

print "Example w/ missing values"
print data[19]
print "Imputed:"
print imputer(data[19])
print


# Impute the entire table with the tree-based imputer and compare rows.
impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print
Example no. 2
0
    def __call__(self, data, weight=None):
        """Fit a weighted least-squares linear regression on `data`.

        Pipeline: optional restriction to `self.use_attributes` ->
        optional stepwise attribute selection (before and/or after
        continuization) -> continuization of discrete attributes ->
        imputation of missing values -> weighted least-squares fit via a
        pseudo-inverse -> descriptive and model statistics.

        :param data: orange.ExampleTable with a continuous class variable.
        :param weight: optional id of the meta attribute holding example
            weights; when falsy, all examples are weighted equally.
        :return: a LinearRegression classifier carrying the fitted
            coefficients and a statistics dictionary.
        """
        if self.use_attributes is not None:
            new_domain = orange.Domain(self.use_attributes,
                                       data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        if self.stepwise and self.stepwise_before:
            # NOTE(review): unlike the post-continuization call below,
            # `weight` is not forwarded to stepwise here -- confirm that
            # this asymmetry is intentional.
            use_attributes = stepwise(data,
                                      add_sig=self.add_sig,
                                      remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # continuization (replaces discrete with continuous attributes)
        continuizer = orange.DomainContinuizer()
        continuizer.multinomialTreatment = continuizer.FrequentIsBase
        continuizer.zeroBased = True
        domain0 = continuizer(data)
        data = data.translate(domain0)

        if self.stepwise and not self.stepwise_before:
            use_attributes = stepwise(data,
                                      weight,
                                      add_sig=self.add_sig,
                                      remove_sig=self.remove_sig)
            new_domain = orange.Domain(use_attributes, data.domain.classVar)
            new_domain.addmetas(data.domain.getmetas())
            data = orange.ExampleTable(new_domain, data)

        # missing values handling: impute each attribute with a majority
        # (most frequent / average) model learned from the data
        imputer = orange.ImputerConstructor_model()
        imputer.learnerContinuous = orange.MajorityLearner()
        imputer.learnerDiscrete = orange.MajorityLearner()
        imputer = imputer(data)
        data = imputer(data)

        # conversion to numpy; n = examples, m = attributes
        A, y, w = data.toNumpy()  # weights ??
        if A is None:
            n = len(data)
            m = 0
        else:
            n, m = numpy.shape(A)

        if self.beta0:
            # prepend an all-ones column so beta[0] is the intercept
            if A is None:
                X = numpy.ones([len(data), 1])
            else:
                X = numpy.insert(A, 0, 1, axis=1)
        else:
            X = A

        # diagonal weight matrix (identity when no weight id is given)
        W = numpy.identity(len(data))
        if weight:
            for di, d in enumerate(data):
                W[di, di] = float(d[weight])

        # Weighted normal equations solved with the pseudo-inverse for
        # robustness: a plain inverse could fail when X.T*W*X is singular.
        D = dot(
            dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W
        )
        beta = dot(D, y)

        yEstimated = dot(X, beta)  # fitted values
        # descriptive statistics
        muY, sigmaY = numpy.mean(y), numpy.std(y)
        muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

        # model statistics: SST total, SSR explained (regression),
        # SSE residual sums of squares
        SST, SSR = numpy.sum((y - muY)**2), numpy.sum((yEstimated - muY)**2)
        SSE, RSquare = SST - SSR, SSR / SST
        R = numpy.sqrt(RSquare)  # multiple correlation coefficient
        RAdjusted = 1 - (1 - RSquare) * (n - 1) / (n - m - 1)
        # F statistic: explained mean square over residual mean square.
        # (Fixes the original parenthesization, which subtracted
        # SSR / (n - m - 1) from SST instead of dividing SST - SSR by the
        # residual degrees of freedom.)
        F = (SSR / m) / (SSE / (n - m - 1))

        sigmaSquare = SSE / (n - m - 1)  # residual variance estimate

        # standard error of the estimated coefficients
        # NOTE(review): uses the unweighted X.T*X -- confirm whether the
        # weighted form dot(dot(X.T, W), X) was intended here.
        errCoeff = sqrt(sigmaSquare * inv(dot(X.T, X)).diagonal())

        # t statistic and two-sided significance of each coefficient
        t = beta / errCoeff
        df = n - 2
        significance = []
        for tt in t:
            try:
                significance.append(
                    statc.betai(df * 0.5, 0.5, df / (df + tt * tt)))
            except Exception:
                # degenerate cases (e.g. zero standard error) are
                # reported as not significant rather than crashing
                significance.append(1.0)

        # standardized (beta) coefficients
        # NOTE(review): computed but never stored in `model` or returned.
        if m > 0:
            stdCoeff = (sqrt(covX.diagonal()) / sigmaY) * beta
        else:
            stdCoeff = (sqrt(covX) / sigmaY) * beta

        model = {
            'descriptives': {
                'meanX': muX,
                'covX': covX,
                'meanY': muY,
                'sigmaY': sigmaY
            },
            'model': {
                'estCoeff': beta,
                'stdErrorEstimation': errCoeff
            },
            'model summary': {
                'TotalVar': SST,
                # The original stored SSE under 'ExplVar' and SSR under
                # 'ResVar'; the labels are now matched to the values.
                'ExplVar': SSR,
                'ResVar': SSE,
                'R': R,
                'RAdjusted': RAdjusted,
                'F': F,
                't': t,
                'sig': significance
            }
        }
        return LinearRegression(statistics=model,
                                domain=data.domain,
                                name=self.name,
                                beta0=self.beta0,
                                imputer=imputer)