def attest_ind(a, b, dim=None):
    """Return the independent two-sample t-test for arrays a and b.

    The test is the equal-variance (pooled) Student's t-test, evaluated
    along axis ``dim``, or over all elements when ``dim`` is None. Masked
    entries are ignored: the means, the variances and the sample sizes
    are all taken over the unmasked values only.

    Returns a ``(t, prob)`` pair. When ``t`` is 0-dimensional, ``prob``
    is a single two-tailed p-value, or ``ma.masked`` when ``t`` is
    masked. Otherwise ``prob`` is a list with one p-value (or
    ``ma.masked``) per element of ``t``.
    """
    mean_a, mean_b = ma.mean(a, dim), ma.mean(b, dim)
    # The pooled-variance formula below expects the unbiased (N - 1)
    # sample variances; ma.var defaults to the biased (N) estimate.
    var_a, var_b = ma.var(a, dim, ddof=1), ma.var(b, dim, ddof=1)
    # Count only unmasked observations so the sample sizes agree with the
    # mask-aware means and variances above.
    n_a, n_b = ma.count(a, dim), ma.count(b, dim)
    df = n_a + n_b - 2.0
    pooled_var = ((n_a - 1) * var_a + (n_b - 1) * var_b) / df
    t = (mean_a - mean_b) / ma.sqrt(pooled_var * (1.0 / n_a + 1.0 / n_b))

    def p_value(t_value, dof):
        # Two-tailed p-value through the regularized incomplete beta
        # function; masked when t is masked or its argument is invalid.
        if t_value is ma.masked:
            return ma.masked
        x = dof / (dof + t_value * t_value)
        if x <= 1.0:
            return statc.betai(0.5 * dof, 0.5, x)
        return ma.masked

    if t.ndim == 0:
        return t, p_value(t, float(df))
    # With an explicit axis the counts (and so the degrees of freedom)
    # are per-element arrays shaped like t.
    prob = [p_value(t_i, float(df_i)) for t_i, df_i in zip(t, df)]
    return t, prob
def aF_oneway(*args, **kwargs):
    """Return the one-way ANOVA F statistic and its p-value.

    Each positional argument is one group of observations. The optional
    keyword argument ``dim`` selects the axis along which observations
    run; when it is omitted (None) every group is reduced over all of its
    elements. Masked values are excluded from both the sums and the
    observation counts.

    Returns ``(F, prob)``. When the reduction gives 0-dimensional
    results both are scalars; otherwise ``F`` is an array and ``prob`` a
    list with one entry per element. A p-value is ``ma.masked`` wherever
    ``F`` is masked or the incomplete-beta argument falls outside [0, 1].
    """
    dim = kwargs.get("dim", None)
    groups = args
    dfnum = float(len(groups) - 1.0)

    def unmasked_count(x):
        # Summing a mask-carrying array of ones counts the valid values.
        ones = ma.array(ma.ones(x.shape), mask=ma.asarray(x).mask)
        return ma.sum(ones, dim)

    group_sizes = [unmasked_count(g) for g in groups]
    alldata = ma.concatenate(groups, dim if dim is not None else 0)
    bign = unmasked_count(alldata)

    correction = ma.sum(alldata, dim) ** 2 / bign
    sstot = ma.sum(alldata ** 2, dim) - correction
    # Putting the per-group terms in a list adds a leading "group" axis,
    # so the groups have to be summed over axis 0 whatever ``dim`` is.
    group_terms = [ma.sum(g, dim) ** 2 / size
                   for g, size in zip(groups, group_sizes)]
    ssbn = ma.sum(group_terms, 0) - correction
    sswn = sstot - ssbn
    dfwn = bign - len(groups)
    F = (ssbn / dfnum) / (sswn / dfwn)

    def p_value(df_den, f):
        # Upper-tail probability of F via the regularized incomplete beta
        # function; masked when F is masked or the argument is invalid.
        if f is ma.masked:
            return ma.masked
        x = df_den / float(df_den + dfnum * f)
        if 0.0 <= x <= 1.0:
            return statc.betai(0.5 * df_den, 0.5 * dfnum, x)
        return ma.masked

    if F.ndim == 0 and dfwn.ndim == 0:
        return F, p_value(dfwn, F)
    prob = [p_value(df_den, f) for df_den, f in zip(dfwn, F)]
    return F, prob
def __call__(self, data, weight=None):
    """Fit a (weighted) least-squares linear regression on ``data``.

    Processing steps, in order:

    1. restrict the domain to ``self.use_attributes`` when it is set;
    2. optionally run stepwise attribute selection, before or after
       continuization depending on ``self.stepwise_before``;
    3. continuize discrete attributes and impute missing values with
       majority learners;
    4. solve for the coefficients with a pseudo-inverse and compute the
       descriptive and model statistics.

    ``weight``, when truthy, is used to index every example
    (``d[weight]``) for its weight; a falsy value (including 0) means
    all examples weigh 1.

    Returns a ``LinearRegression`` built from the statistics dictionary,
    the final domain, ``self.name``, ``self.beta0`` and the fitted
    imputer.
    """

    def restrict_to(attributes, table):
        # Rebuild `table` over `attributes`, keeping class var and metas.
        domain = orange.Domain(attributes, table.domain.classVar)
        domain.addmetas(table.domain.getmetas())
        return orange.ExampleTable(domain, table)

    if self.use_attributes is not None:
        data = restrict_to(self.use_attributes, data)

    if self.stepwise and self.stepwise_before:
        # NOTE(review): weight is forwarded here as well, assuming
        # stepwise takes it as its second positional argument exactly
        # like the call after continuization does.
        selected = stepwise(data, weight, add_sig=self.add_sig,
                            remove_sig=self.remove_sig)
        data = restrict_to(selected, data)

    # continuization (replaces discrete with continuous attributes)
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.FrequentIsBase
    continuizer.zeroBased = True
    data = data.translate(continuizer(data))

    if self.stepwise and not self.stepwise_before:
        selected = stepwise(data, weight, add_sig=self.add_sig,
                            remove_sig=self.remove_sig)
        data = restrict_to(selected, data)

    # missing values handling (impute missing)
    imputer = orange.ImputerConstructor_model()
    imputer.learnerContinuous = orange.MajorityLearner()
    imputer.learnerDiscrete = orange.MajorityLearner()
    imputer = imputer(data)
    data = imputer(data)

    # conversion to numpy; the third returned item is not used here
    A, y, _ = data.toNumpy()
    if A is None:
        n, m = len(data), 0
    else:
        n, m = numpy.shape(A)

    if self.beta0 == True:
        if A is None:
            X = numpy.ones([len(data), 1])
        else:
            X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones
    else:
        X = A

    # diagonal weight matrix; identity when no weight is given
    W = numpy.identity(len(data))
    if weight:
        for di, d in enumerate(data):
            W[di, di] = float(d[weight])

    # The pseudo-inverse adds robustness: a plain inverse could fail when
    # X.T*W*X is singular.
    D = dot(dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W)
    beta = dot(D, y)
    yEstimated = dot(X, beta)

    # descriptive statistics
    muY, sigmaY = numpy.mean(y), numpy.std(y)
    muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

    # model statistics
    SST = numpy.sum((y - muY) ** 2)           # total sum of squares
    SSR = numpy.sum((yEstimated - muY) ** 2)  # explained by the regression
    SSE = SST - SSR                           # residual sum of squares
    RSquare = SSR / SST                       # coefficient of determination
    R = numpy.sqrt(RSquare)                   # multiple correlation coefficient
    dfResidual = n - m - 1
    RAdjusted = 1 - (1 - RSquare) * (n - 1) / dfResidual
    sigmaSquare = SSE / dfResidual
    # F = mean square of the regression / mean square of the residuals
    F = (SSR / m) / sigmaSquare

    # standard error of estimated coefficients; pinv keeps this step as
    # tolerant of a singular X.T*X as the coefficient solve above
    errCoeff = sqrt(sigmaSquare * numpy.linalg.pinv(dot(X.T, X)).diagonal())

    # t statistics and their significance, on the same residual degrees
    # of freedom that sigmaSquare is based on
    t = beta / errCoeff
    df = dfResidual
    significance = []
    for tt in t:
        try:
            p = statc.betai(df * 0.5, 0.5, df / (df + tt * tt))
        except Exception:
            # best effort: an uncomputable p-value counts as not significant
            p = 1.0
        significance.append(p)

    model = {
        'descriptives': {
            'meanX': muX,
            'covX': covX,
            'meanY': muY,
            'sigmaY': sigmaY,
        },
        'model': {
            'estCoeff': beta,
            'stdErrorEstimation': errCoeff,
        },
        'model summary': {
            'TotalVar': SST,
            'ExplVar': SSR,
            'ResVar': SSE,
            'R': R,
            'RAdjusted': RAdjusted,
            'F': F,
            't': t,
            'sig': significance,
        },
    }
    return LinearRegression(statistics=model, domain=data.domain,
                            name=self.name, beta0=self.beta0,
                            imputer=imputer)
def __call__(self, data, weight=None):
    """Fit a (weighted) least-squares linear regression on ``data``.

    Processing steps, in order:

    1. restrict the domain to ``self.use_attributes`` when it is set;
    2. optionally run stepwise attribute selection, before or after
       continuization depending on ``self.stepwise_before``;
    3. continuize discrete attributes and impute missing values with
       majority learners;
    4. solve for the coefficients with a pseudo-inverse and compute the
       descriptive and model statistics.

    ``weight``, when truthy, is used to index every example
    (``d[weight]``) for its weight; a falsy value (including 0) means
    all examples weigh 1.

    Returns a ``LinearRegression`` built from the statistics dictionary,
    the final domain, ``self.name``, ``self.beta0`` and the fitted
    imputer.
    """

    def restrict_to(attributes, table):
        # Rebuild `table` over `attributes`, keeping class var and metas.
        domain = orange.Domain(attributes, table.domain.classVar)
        domain.addmetas(table.domain.getmetas())
        return orange.ExampleTable(domain, table)

    if self.use_attributes is not None:
        data = restrict_to(self.use_attributes, data)

    if self.stepwise and self.stepwise_before:
        # NOTE(review): weight is forwarded here as well, assuming
        # stepwise takes it as its second positional argument exactly
        # like the call after continuization does.
        selected = stepwise(data, weight, add_sig=self.add_sig,
                            remove_sig=self.remove_sig)
        data = restrict_to(selected, data)

    # continuization (replaces discrete with continuous attributes)
    continuizer = orange.DomainContinuizer()
    continuizer.multinomialTreatment = continuizer.FrequentIsBase
    continuizer.zeroBased = True
    data = data.translate(continuizer(data))

    if self.stepwise and not self.stepwise_before:
        selected = stepwise(data, weight, add_sig=self.add_sig,
                            remove_sig=self.remove_sig)
        data = restrict_to(selected, data)

    # missing values handling (impute missing)
    imputer = orange.ImputerConstructor_model()
    imputer.learnerContinuous = orange.MajorityLearner()
    imputer.learnerDiscrete = orange.MajorityLearner()
    imputer = imputer(data)
    data = imputer(data)

    # conversion to numpy; the third returned item is not used here
    A, y, _ = data.toNumpy()
    if A is None:
        n, m = len(data), 0
    else:
        n, m = numpy.shape(A)

    if self.beta0 == True:
        if A is None:
            X = numpy.ones([len(data), 1])
        else:
            X = numpy.insert(A, 0, 1, axis=1)  # adds a column of ones
    else:
        X = A

    # diagonal weight matrix; identity when no weight is given
    W = numpy.identity(len(data))
    if weight:
        for di, d in enumerate(data):
            W[di, di] = float(d[weight])

    # The pseudo-inverse adds robustness: a plain inverse could fail when
    # X.T*W*X is singular.
    D = dot(dot(numpy.linalg.pinv(dot(dot(X.T, W), X)), X.T), W)
    beta = dot(D, y)
    yEstimated = dot(X, beta)

    # descriptive statistics
    muY, sigmaY = numpy.mean(y), numpy.std(y)
    muX, covX = numpy.mean(X, axis=0), numpy.cov(X, rowvar=0)

    # model statistics
    SST = numpy.sum((y - muY) ** 2)           # total sum of squares
    SSR = numpy.sum((yEstimated - muY) ** 2)  # explained by the regression
    SSE = SST - SSR                           # residual sum of squares
    RSquare = SSR / SST                       # coefficient of determination
    R = numpy.sqrt(RSquare)                   # multiple correlation coefficient
    dfResidual = n - m - 1
    RAdjusted = 1 - (1 - RSquare) * (n - 1) / dfResidual
    sigmaSquare = SSE / dfResidual
    # F = mean square of the regression / mean square of the residuals
    F = (SSR / m) / sigmaSquare

    # standard error of estimated coefficients; pinv keeps this step as
    # tolerant of a singular X.T*X as the coefficient solve above
    errCoeff = sqrt(sigmaSquare * numpy.linalg.pinv(dot(X.T, X)).diagonal())

    # t statistics and their significance, on the same residual degrees
    # of freedom that sigmaSquare is based on
    t = beta / errCoeff
    df = dfResidual
    significance = []
    for tt in t:
        try:
            p = statc.betai(df * 0.5, 0.5, df / (df + tt * tt))
        except Exception:
            # best effort: an uncomputable p-value counts as not significant
            p = 1.0
        significance.append(p)

    model = {
        'descriptives': {
            'meanX': muX,
            'covX': covX,
            'meanY': muY,
            'sigmaY': sigmaY,
        },
        'model': {
            'estCoeff': beta,
            'stdErrorEstimation': errCoeff,
        },
        'model summary': {
            'TotalVar': SST,
            'ExplVar': SSR,
            'ResVar': SSE,
            'R': R,
            'RAdjusted': RAdjusted,
            'F': F,
            't': t,
            'sig': significance,
        },
    }
    return LinearRegression(statistics=model, domain=data.domain,
                            name=self.name, beta0=self.beta0,
                            imputer=imputer)