Beispiel #1
0
  def getview (self, view, pbar):
  # {{{
    import numpy as np
    from pygeode.tools import loopover, npsum
    out = np.zeros(view.shape, self.dtype)
    W = np.zeros(view.shape, self.dtype)
    for outsl, (indata, inw) in loopover([self.var, self.mweights], view, self.var.axes, pbar=pbar):
      out[outsl] += npsum(indata * inw, self.indices)  # Product of data and weights
      f = indata.size / (inw.size * out[outsl].size)
      W[outsl] += npsum(inw, self.indices) * f # Sum of weights

    return out / W
Beispiel #2
0
  def getview (self, view, pbar):
    import numpy as np
    from pygeode.tools import loopover, npsum
    x = np.zeros(view.shape, self.dtype)
    x2 = np.zeros(view.shape, self.dtype)
    N = self.N
    for outsl, (indata,) in loopover(self.var, view, pbar=pbar):
      x[outsl] += npsum(indata, self.indices)  
      x2[outsl] += npsum(indata**2, self.indices)  

    x /= N
    return (x2 - N*x**2) / (N - 1)
Beispiel #3
0
    def getview(self, view, pbar):
        import numpy as np
        from pygeode.tools import loopover, npsum
        x = np.zeros(view.shape, self.dtype)
        x2 = np.zeros(view.shape, self.dtype)
        N = self.N
        for outsl, (indata, ) in loopover(self.var, view, pbar=pbar):
            x[outsl] += npsum(indata, self.indices)
            x2[outsl] += npsum(indata**2, self.indices)

        x /= N
        return (x2 - N * x**2) / (N - 1)
Beispiel #4
0
  def getview (self, view, pbar):
  # {{{
    import numpy as np
    from pygeode.tools import loopover, npsum
    out = np.zeros(view.shape, self.dtype)
    W = np.zeros(view.shape, self.dtype)
    for outsl, (indata, inw) in loopover([self.var, self.mweights], view, self.var.axes, pbar=pbar):
      out[outsl] += npsum(indata * inw, self.indices)  # Product of data and weights
      f = indata.size / (inw.size * out[outsl].size)
      W[outsl] += npsum(inw, self.indices) * f # Sum of weights

    return out / W
Beispiel #5
0
 def getview (self, view, pbar):
   import numpy as np
   from pygeode.tools import loopover, npsum
   out = np.zeros(view.shape, self.dtype)
   for outsl, (indata,) in loopover(self.var, view, pbar=pbar):
     out[outsl] += npsum(indata, self.indices)
   return out
Beispiel #6
0
 def getview(self, view, pbar):
     import numpy as np
     from pygeode.tools import loopover, npsum
     out = np.zeros(view.shape, self.dtype)
     for outsl, (indata, ) in loopover(self.var, view, pbar=pbar):
         out[outsl] += npsum(indata, self.indices)
     return out
Beispiel #7
0
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regession; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors. 

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    os = oview.shape
    os1 = os + (Nr, )
    os2 = os + (Nr, Nr)
    y = np.zeros(os, 'd')
    yy = np.zeros(os, 'd')
    xy = np.zeros(os1, 'd')
    xx = np.zeros(os2, 'd')
    xxinv = np.zeros(os2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Compute inverse of covariance matrix (could be done more intellegently? certainly the python
    # loop over oview does not help)
    xx = xx.reshape(-1, Nr, Nr)
    xxinv = xxinv.reshape(-1, Nr, Nr)
    for i in range(xx.shape[0]):
        xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
    xx = xx.reshape(os2)
    xxinv = xxinv.reshape(os2)

    beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')
    Nd = len(oaxes)

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        p = np.transpose(np.array(p), [Nd] + list(range(Nd)))
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sigbeta = np.transpose(np.array(sigbeta), [Nd] + list(range(Nd)))
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(os2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
Beispiel #8
0
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):
# {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'B,p'.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a constant term
    is not included by default. The following parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    If the regression is computed over all axes so that the result is a scalar,
    the above are returned as a tuple of floats in the order specified by
    ``output``. Otherwise they are returned as :class:`Var` instances. The outputs
    'B', 'p', and 'sb' will produce as many outputs as there are regressors. 

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes: ri_new.append(ia)
      else: raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr,Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl+(i,)] += npsum(xdata[i]*ydata, siaxes)
      for j in range(i+1):
        xx[outsl+(i,j)] += npsum(xdata[i]*xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intellegently? certainly the python
  # loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i,:,:] = np.linalg.inv(xx[i,:,:])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  for o in output:
    if o == 'B':
      if len(oaxes) == 0:
        ret.append(beta)
      else:
        ret.append([Var(oaxes, values=beta[...,i], name='beta_%s' % xns[i]) for i in range(Nr)])
    elif o == 'r':
      vary = (yy - y**2/N)
      R2 = 1 - (yy - vare) / vary
      if len(oaxes) == 0:
        ret.append(R2)
      else:
        ret.append(Var(oaxes, values=R2, name='R2'))
    elif o == 'p':
      ps = [tdist.cdf(np.abs(beta[...,i]/sigbeta[i]), N_eff-Nr) * np.sign(beta[...,i]) for i in range(Nr)]
      if len(oaxes) == 0:
        ret.append(ps)
      else:
        ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
    elif o == 'sb':
      if len(oaxes) == 0:
        ret.append(sigbeta)
      else:
        ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
    elif o == 'covb':
      from .axis import NonCoordinateAxis as nca
      cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
      cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
      sigmat = np.zeros(os2, 'd')
      for i in range(Nr):
        for j in range(Nr):
          #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
          sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
      ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
    elif o == 'se':
      se = np.sqrt((yy - vare) / N_eff)
      if len(oaxes) == 0:
        ret.append(se)
      else:
        ret.append(Var(oaxes, values=se, name='sig_resid'))
    else:
      print('multiple_regress: unrecognized output "%s"' % o)

  return ret
Beispiel #9
0
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'):
# {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  Returns
  =======
  results : list of :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
    can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero
    * 'sm': Variance in linear coefficient
    * 'se': Variance of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'se' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes: 
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  xy = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    x[outsl] += npsum(xdata, siaxes)
    y[outsl] += npsum(ydata, siaxes)
    xx[outsl] += npsum(xdata**2, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    xy[outsl] += npsum(xdata*ydata, siaxes)

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx -= x**2/N
  yy -= y**2/N
  xy -= (x*y)/N

  m = xy/xx
  b = (y - m*x)/float(N)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac
  sige = (yy - m * xy) / (N_eff - 2.)
  sigm = np.sqrt(sige / xx)
  t = np.abs(m) / sigm
  p = tdist.cdf(t, N-2) * np.sign(m)
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
    ret.append(M)
  if 'b' in output:
    B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
    ret.append(B)
  if 'r' in output:
    ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
  if 'p' in output:
    P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
    ret.append(P)
  if 'sm' in output:
    ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
  if 'se' in output:
    ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

  return ret