def getview(self, view, pbar):  # {{{
    """Return the weighted mean of ``self.var`` for the requested view.

    The weighted data and the weights are accumulated chunk by chunk (as
    delivered by ``loopover``), each reduced over ``self.indices``; the
    result is the ratio of the two accumulated arrays.
    """
    import numpy as np
    from pygeode.tools import loopover, npsum

    weighted_total = np.zeros(view.shape, self.dtype)
    weight_total = np.zeros(view.shape, self.dtype)

    chunks = loopover([self.var, self.mweights], view, self.var.axes, pbar=pbar)
    for outsl, (data, weights) in chunks:
        # Numerator: product of data and weights, reduced over self.indices.
        weighted_total[outsl] += npsum(data * weights, self.indices)

        # Ratio of the data chunk size to (weights chunk size * output chunk size).
        # NOTE(review): presumably rescales the weight sum when the weights
        # chunk covers fewer elements than the data chunk -- confirm.
        scale = data.size / (weights.size * weighted_total[outsl].size)

        # Denominator: sum of the weights, rescaled.
        weight_total[outsl] += npsum(weights, self.indices) * scale

    return weighted_total / weight_total
def getview(self, view, pbar):
    """Return the sample variance of ``self.var`` for the requested view.

    Sums of the data and of the squared data are accumulated chunk by chunk
    over ``self.indices``; the variance is then formed as
    ``(sum(x**2) - N * mean**2) / (N - 1)`` with ``N = self.N``.
    """
    import numpy as np
    from pygeode.tools import loopover, npsum

    total = np.zeros(view.shape, self.dtype)
    total_sq = np.zeros(view.shape, self.dtype)
    count = self.N

    for outsl, (chunk,) in loopover(self.var, view, pbar=pbar):
        total[outsl] += npsum(chunk, self.indices)
        total_sq[outsl] += npsum(chunk**2, self.indices)

    # Turn the accumulated sum into the mean (in place).
    total /= count
    return (total_sq - count * total**2) / (count - 1)
def getview(self, view, pbar):
    """Compute the sample variance of ``self.var`` over ``self.indices``.

    The first and second moments are accumulated across the chunks produced
    by ``loopover`` and combined as ``(sum(x**2) - N * mean**2) / (N - 1)``,
    where ``N`` is ``self.N``.
    """
    import numpy as np
    from pygeode.tools import loopover, npsum

    nsamples = self.N
    sum_x = np.zeros(view.shape, self.dtype)
    sum_x2 = np.zeros(view.shape, self.dtype)

    for outsl, (block,) in loopover(self.var, view, pbar=pbar):
        sum_x[outsl] += npsum(block, self.indices)
        sum_x2[outsl] += npsum(block**2, self.indices)

    # In-place division: sum_x now holds the mean.
    sum_x /= nsamples
    numerator = sum_x2 - nsamples * sum_x**2
    return numerator / (nsamples - 1)
def getview(self, view, pbar):
    """Return the sum of ``self.var`` over ``self.indices`` for the given view.

    The input is read chunk by chunk through ``loopover`` and each reduced
    chunk is added into the matching slice of the output array.
    """
    import numpy as np
    from pygeode.tools import loopover, npsum

    total = np.zeros(view.shape, self.dtype)
    for outsl, (chunk,) in loopover(self.var, view, pbar=pbar):
        total[outsl] += npsum(chunk, self.indices)
    return total
def getview(self, view, pbar):
    """Accumulate the sum of ``self.var`` reduced over ``self.indices``.

    Returns an array of shape ``view.shape`` and dtype ``self.dtype``.
    """
    import numpy as np
    from pygeode.tools import loopover, npsum

    accum = np.zeros(view.shape, self.dtype)
    pieces = loopover(self.var, view, pbar=pbar)
    for outsl, (piece,) in pieces:
        accum[outsl] += npsum(piece, self.indices)
    return accum
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):  # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

    Parameters
    ==========
    Xs : list of :class:`Var` instances
      Variables to treat as independent regressors. Must have at least one axis
      in common with each other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to the Xs and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'B,p'. Unrecognized entries are ignored.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a ``PBar`` instance is
      created.

    Returns
    =======
    results : dataset
      The requested variables, bundled by ``asdataset``. The order follows the
      list below, not the order of the ``output`` string. Variables are named
      after the quantity they hold (i.e. if ``ds`` is the returned dataset, the
      regression coefficients can be obtained by ``ds.beta``).

      A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
      Note that a constant term is not included by default. The following
      parameters can be returned:

      * 'beta' (or 'B'): Linear coefficients :math:`\beta_i` of each regressor
      * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
      * 'p': p-value of regression; see notes.
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      The outputs 'beta', 'p', and 'sb' carry an additional trailing
      ``regressor`` axis with one entry per regressor; 'covb' carries
      ``regressor`` and ``regressor2`` axes.

    Raises
    ======
    ValueError
      If ``output`` contains no recognized output name.
    KeyError
      If an entry of ``axes`` is not shared by the Xs and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.4. The p-value 'p' is computed using the t-statistic appropriate
    for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in
    section 8.4.2; it corresponds to the probability of obtaining the
    regression coefficient under the null hypothesis that there is no linear
    relationship. Note this may not be the best way to determine if a given
    parameter is contributing a significant fraction to the explained variance
    of Y. The variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square
    root of the diagonal elements of
    :math:`\hat{\sigma}^2_E (\chi^T\chi)^{-1}` in von Storch and Zwiers,
    respectively. The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now. 'B' is the documented (and default) spelling of
    # the regression coefficients, so it is accepted as an alias of 'beta';
    # previously it was silently discarded by the filter below.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    requested = [o.strip() for o in output.split(',')]
    output = [aliases.get(o, o) for o in requested]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple_regress. '
            'Possible outputs are %s.' % str(ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)
    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        # Restrict the regression to the requested axes; the remaining shared
        # axes become output axes.
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes:
                ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Positions (within inaxes) of the axes that are summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, \
        'Regressors and %s share no axes to be regressed over' % (Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr,)
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape1, 'd')
    xx = np.zeros(oshape2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i,)] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; it is mirrored below.
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx for every output point at once; np.linalg.inv operates on the
    # last two dimensions of a stacked array.
    xxinv = np.linalg.inv(xx)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    # Residual sum of squares, and the per-regressor standard deviations with
    # the regressor axis last (shape: oshape + (Nr,)), matching `beta`.
    resid = np.asarray(yy - vare)
    xxinv_diag = np.diagonal(xxinv, axis1=-2, axis2=-1)
    sigbeta = np.sqrt(resid[..., np.newaxis] * xxinv_diag / N_eff)

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr), regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr), regressor=xns,
                            name='regressor2')

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra,), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        # Two-sided p-value from the t-distribution with N_eff - Nr degrees of
        # freedom. beta and sigbeta both have the regressor axis last, so no
        # axis shuffling is required.
        p = 2. * (1. - tdist.cdf(np.abs(beta / sigbeta), N_eff - Nr))
        p = Var(oaxes + (ra,), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sb = Var(oaxes + (ra,), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = resid[..., np.newaxis, np.newaxis] * xxinv / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts['description'] = \
        'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):  # {{{
    r'''Least-squares multiple regression of Y against the variables in Xs.

    Parameters
    ==========
    Xs : list of :class:`Var` instances
      Independent regressors. Must have at least one axis in common with each
      other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, all
      axes common to the Xs and Y are used.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a ``PBar`` instance is
      created.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number is the number estimated from the dataset divided by
      ``N_fac``.

    output : string, optional
      Comma-separated list of the parameters to return; see the Returns
      section. Defaults to 'B,p'.

    Returns
    =======
    results : list
      One entry per item of ``output``, in the order requested. A fit of the
      form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed (no constant
      term). The following parameters can be requested:

      * 'B': Linear coefficients :math:`\beta_i` of each regressor
      * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
      * 'p': For each regressor, the t-distribution CDF evaluated at
        :math:`|\beta_i| / \sigma_{\beta_i}` with ``N_eff - Nr`` degrees of
        freedom, multiplied by the sign of :math:`\beta_i`
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      If there are no output axes the values are returned as plain numbers or
      arrays; otherwise they are wrapped in :class:`Var` instances ('covb' is
      always wrapped). 'B', 'p' and 'sb' yield one value per regressor. An
      unrecognized name prints a message and contributes nothing.

    Notes
    =====
    The statistics are computed following von Storch and Zwiers 1999, section
    8.4. The data is assumed to be normally distributed.'''
    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    nreg = len(Xs)

    Xaxes = combine_axes(Xs)
    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])

    if axes is not None:
        # Keep only the requested regression axes; the other shared axes are
        # moved to the output.
        chosen = []
        for ax in axes:
            idx = whichaxis(srcaxes, ax)
            if idx not in riaxes:
                raise KeyError('One of the Xs or Y does not have the axis %s.' % ax)
            chosen.append(idx)
        oiaxes.extend([r for r in riaxes if r not in chosen])
        riaxes = chosen

    oaxes = [srcaxes[k] for k in oiaxes]
    inaxes = oaxes + [srcaxes[k] for k in riaxes]
    oview = View(oaxes)
    # Positions of the summed axes within inaxes.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

    # Work arrays for the accumulated sums.
    oshape = oview.shape
    vec_shape = oshape + (nreg,)
    mat_shape = oshape + (nreg, nreg)

    sum_y = np.zeros(oshape, 'd')
    sum_yy = np.zeros(oshape, 'd')
    sum_xy = np.zeros(vec_shape, 'd')
    sum_xx = np.zeros(mat_shape, 'd')
    inv_xx = np.zeros(mat_shape, 'd')

    N = np.prod([len(srcaxes[k]) for k in riaxes])

    # Accumulate the sums chunk by chunk.
    for outsl, chunks in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = chunks[-1].astype('d')
        xdata = [chunks[k].astype('d') for k in range(nreg)]
        sum_y[outsl] += npsum(ydata, siaxes)
        sum_yy[outsl] += npsum(ydata**2, siaxes)
        for i, xi in enumerate(xdata):
            sum_xy[outsl + (i,)] += npsum(xi * ydata, siaxes)
            # Lower triangle only; mirrored after the loop.
            for j in range(i + 1):
                sum_xx[outsl + (i, j)] += npsum(xi * xdata[j], siaxes)

    # Mirror the lower triangle of xTx into the upper triangle.
    for i in range(nreg):
        for j in range(i):
            sum_xx[..., j, i] = sum_xx[..., i, j]

    # Invert xTx separately at every output point.
    flat_xx = sum_xx.reshape(-1, nreg, nreg)
    flat_inv = inv_xx.reshape(-1, nreg, nreg)
    for k in range(flat_xx.shape[0]):
        flat_inv[k, :, :] = np.linalg.inv(flat_xx[k, :, :])
    inv_xx = flat_inv.reshape(mat_shape)

    beta = np.sum(sum_xy.reshape(oshape + (1, nreg)) * inv_xx, -1)
    vare = np.sum(sum_xy * beta, -1)

    N_eff = N if N_fac is None else N // N_fac

    sigbeta = [np.sqrt((sum_yy - vare) * inv_xx[..., i, i] / N_eff)
               for i in range(nreg)]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var

    is_scalar = len(oaxes) == 0
    ret = []

    for key in output.split(','):
        if key == 'B':
            if is_scalar:
                ret.append(beta)
            else:
                ret.append([Var(oaxes, values=beta[..., i], name='beta_%s' % xns[i])
                            for i in range(nreg)])

        elif key == 'r':
            vary = (sum_yy - sum_y**2 / N)
            R2 = 1 - (sum_yy - vare) / vary
            if is_scalar:
                ret.append(R2)
            else:
                ret.append(Var(oaxes, values=R2, name='R2'))

        elif key == 'p':
            ps = [tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - nreg)
                  * np.sign(beta[..., i])
                  for i in range(nreg)]
            if is_scalar:
                ret.append(ps)
            else:
                ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i])
                            for i in range(nreg)])

        elif key == 'sb':
            if is_scalar:
                ret.append(sigbeta)
            else:
                ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i])
                            for i in range(nreg)])

        elif key == 'covb':
            from .axis import NonCoordinateAxis as nca
            cr1 = nca(values=list(range(nreg)),
                      regressor1=[X.name for X in Xs], name='regressor1')
            cr2 = nca(values=list(range(nreg)),
                      regressor2=[X.name for X in Xs], name='regressor2')
            sigmat = np.zeros(mat_shape, 'd')
            for i in range(nreg):
                for j in range(nreg):
                    sigmat[..., i, j] = (sum_yy - vare) * inv_xx[..., i, j] / N_eff
            ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))

        elif key == 'se':
            se = np.sqrt((sum_yy - vare) / N_eff)
            if is_scalar:
                ret.append(se)
            else:
                ret.append(Var(oaxes, values=se, name='sig_resid'))

        else:
            print('multiple_regress: unrecognized output "%s"' % key)

    return ret
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'):  # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a ``PBar`` instance is
      created.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``. It is applied consistently to both the
      residual variance and the t-distribution used for 'p'.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'm,b,p'.

    Returns
    =======
    results : list of :class:`Var` instances.
      The requested values, always in the order listed below regardless of the
      order given in ``output``. A fit of the form
      :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
      can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': The t-distribution CDF evaluated at :math:`|m| / \sigma_m` with
        ``N_eff - 2`` degrees of freedom, multiplied by the sign of ``m``
      * 'sm': Standard deviation of the linear coefficient
      * 'se': Standard deviation of the residuals

    Raises
    ======
    KeyError
      If an entry of ``axes`` is not shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.3. The value 'p' is computed using the t-statistic given in
    section 8.3.8, and confidence intervals for the slope and intercept can be
    computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
    respectively). The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        # Restrict the regression to the requested axes; the remaining shared
        # axes become output axes.
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes that are summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    y = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    xy = np.zeros(oview.shape, 'd')
    yy = np.zeros(oview.shape, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        x[outsl] += npsum(xdata, siaxes)
        y[outsl] += npsum(ydata, siaxes)
        xx[outsl] += npsum(xdata**2, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        xy[outsl] += npsum(xdata * ydata, siaxes)

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx -= x**2 / N
    yy -= y**2 / N
    xy -= (x * y) / N

    m = xy / xx
    b = (y - m * x) / float(N)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    sige = (yy - m * xy) / (N_eff - 2.)
    sigm = np.sqrt(sige / xx)
    t = np.abs(m) / sigm
    # Use the effective sample size here as well, so that N_fac rescales the
    # degrees of freedom of the t-distribution consistently with `sige`.
    p = tdist.cdf(t, N_eff - 2) * np.sign(m)

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    output = [o.strip() for o in output.split(',')]
    ret = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
        ret.append(M)
    if 'b' in output:
        B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
        ret.append(B)
    if 'r' in output:
        ret.append(Var(oaxes, values=xy**2 / (xx * yy), name='R2(%s vs. %s)' % (yn, xn)))
    if 'p' in output:
        P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
        ret.append(P)
    if 'sm' in output:
        # NOTE: the value is the standard deviation of the slope; the variable
        # name (which mentions the intercept) is kept unchanged for
        # compatibility with code that looks it up by name.
        ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
    if 'se' in output:
        ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

    return ret