print data[19]
print "Imputed:"
print imputer(data[19])
print

impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print

print "\n*** TREE-BASED IMPUTATION ***\n"

import orngTree
imputer = orange.ImputerConstructor_model()
imputer.learnerContinuous = imputer.learnerDiscrete = orngTree.TreeLearner(minSubset=20)
imputer = imputer(data)

print "Example w/ missing values"
print data[19]
print "Imputed:"
print imputer(data[19])
print

impdata = imputer(data)
for i in range(20, 25):
    print data[i]
    print impdata[i]
    print
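The same pattern works with the simpler built-in imputer constructors, which need no learner at all. Below is a minimal sketch for contrast; the voting data set and the use of ImputerConstructor_average and ImputerConstructor_minimal here are assumptions for illustration, not part of the listing above:

import orange

data = orange.ExampleTable("voting")  # assumed: any data set with missing values

# average imputation: continuous attributes get the mean,
# discrete attributes the most frequent value
imputer = orange.ImputerConstructor_average(data)
print "Original:", data[19]
print "Imputed: ", imputer(data[19])

# minimal imputation: every missing value is replaced
# by the attribute's lowest value
imputer = orange.ImputerConstructor_minimal(data)
print "Imputed: ", imputer(data[19])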
# assumes module-level imports: import orange, numpy, statc;
# from numpy import dot, sqrt; from numpy.linalg import inv
def __call__(self, data, weight=None):
    if self.use_attributes is not None:
        new_domain = orange.Domain(self.use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    if self.stepwise and self.stepwise_before:
        use_attributes = stepwise(data, add_sig=self.add_sig, remove_sig=self.remove_sig)
        new_domain = orange.Domain(use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    # continuization (replaces discrete with continuous attributes)
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.FrequentIsBase
    continuizer.zeroBased = True
    domain0 = continuizer(data)
    data = data.translate(domain0)

    if self.stepwise and not self.stepwise_before:
        use_attributes = stepwise(data, weight, add_sig=self.add_sig, remove_sig=self.remove_sig)
        new_domain = orange.Domain(use_attributes, data.domain.classVar)
        new_domain.addmetas(data.domain.getmetas())
        data = orange.ExampleTable(new_domain, data)

    # missing value handling (impute with per-attribute majority models)
    imputer = orange.ImputerConstructor_model()
    imputer.learnerContinuous = orange.MajorityLearner()
    imputer.learnerDiscrete = orange.MajorityLearner()
    imputer = imputer(data)
    data = imputer(data)

    # conversion to numpy
    A, y, w = data.toNumpy()  # weights ??
    if A is None:
        n = len(data)
        m = 0
    else:
        n, m = numpy.shape(A)

    if self.beta0 == True:
        if A is None:
            X = numpy.ones([len(data), 1])
        else:
            X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones
    else:
        X = A

    # set weights
    W = numpy.identity(len(data))
    if weight:
        for di, d in enumerate(data):
            W[di, di] = float(d[weight])

    # the pseudo-inverse adds some robustness; the ordinary inverse
    # could fail due to singularity of X.T*W*X
    D = dot(dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W)
    beta = dot(D, y)
    yEstimated = dot(X, beta)  # estimation

    # some descriptive statistics
    muY, sigmaY = numpy.mean(y), numpy.std(y)
    muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

    # model statistics
    SST, SSR = numpy.sum((y - muY) ** 2), numpy.sum((yEstimated - muY) ** 2)
    SSE, RSquare = SST - SSR, SSR / SST
    R = numpy.sqrt(RSquare)  # coefficient of determination
    RAdjusted = 1 - (1 - RSquare) * (n - 1) / (n - m - 1)
    F = (SSR / m) / ((SST - SSR) / (n - m - 1))  # F statistic
    sigmaSquare = SSE / (n - m - 1)

    # standard error of the estimated coefficients
    errCoeff = sqrt(sigmaSquare * inv(dot(X.T, X)).diagonal())

    # t statistic and significance
    t = beta / errCoeff
    df = n - 2
    significance = []
    for tt in t:
        try:
            significance.append(statc.betai(df * 0.5, 0.5, df / (df + tt * tt)))
        except:
            significance.append(1.0)

    # standardized coefficients (computed for reference; not included
    # in the returned statistics)
    if m > 0:
        stdCoeff = (sqrt(covX.diagonal()) / sigmaY) * beta
    else:
        stdCoeff = (sqrt(covX) / sigmaY) * beta

    model = {'descriptives': {'meanX': muX, 'covX': covX, 'meanY': muY, 'sigmaY': sigmaY},
             'model': {'estCoeff': beta, 'stdErrorEstimation': errCoeff},
             'model summary': {'TotalVar': SST, 'ExplVar': SSR, 'ResVar': SSE,
                               'R': R, 'RAdjusted': RAdjusted,
                               'F': F, 't': t, 'sig': significance}}
    return LinearRegression(statistics=model, domain=data.domain,
                            name=self.name, beta0=self.beta0, imputer=imputer)
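The method above is an excerpt from a learner class. Assuming it belongs to a learner along the lines of LinearRegressionLearner in the orngRegression module, a typical call might look like the sketch below; the module name, class name, and the housing data set are assumptions, and only the statistics dictionary is taken from the listing above:

import orange, orngRegression

data = orange.ExampleTable("housing")  # assumed: a regression data set
learner = orngRegression.LinearRegressionLearner(name="lr")
model = learner(data)

# fitted coefficients and fit statistics live in model.statistics
summary = model.statistics["model summary"]
print "R =", summary["R"], " adjusted R^2 =", summary["RAdjusted"]

# the returned object is a classifier: call it to predict
for e in data[:5]:
    print e.getclass(), model(e)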