Example #1
0
def write_var(ncfile, dataset, unlimited=None, compress=False):
    # {{{
    '''Write the variables of a dataset into an already-open netCDF file object.

    Parameters
    ----------
    ncfile : netCDF4-style dataset or group
      Open, writable file object. Its ``createDimension``, ``createVariable``,
      ``setncatts``, ``variables`` and ``filepath()`` members are used.
    dataset : Dataset
      Source of the variables (``dataset.vars``) and of the global attributes
      (``dataset.atts``).
    unlimited : string, optional
      Name of the axis to create as an unlimited dimension (size ``None``).
      Every other axis gets a fixed size equal to its length.
    compress : boolean, optional
      Passed to ``createVariable`` as the ``zlib`` argument.
    '''
    from pygeode.view import View
    import numpy as np
    from pygeode.progress import PBar
    from pygeode.tools import combine_axes

    variables = list(dataset.vars)
    axes = combine_axes(v.axes for v in variables)

    # Define the dimensions
    for a in axes:
        ncfile.createDimension(a.name,
                               size=(None if a.name == unlimited else len(a)))

    # Define the variables (including axes) and copy their attributes
    for var in variables:
        ncvar = ncfile.createVariable(var.name,
                                      datatype=var.dtype,
                                      dimensions=[a.name for a in var.axes],
                                      zlib=compress,
                                      fill_value=var.atts.get('_FillValue'))
        ncvar.setncatts(var.atts)

    # global attributes
    ncfile.setncatts(dataset.atts)

    # Relative progress of each variable, as cumulative percentages.
    # Guard the normalization so an empty dataset does not divide by zero.
    sizes = [v.size for v in variables]
    total = np.sum(sizes)
    if total == 0:
        total = 1
    prog = np.cumsum([0.] + sizes) / total * 100

    pbar = PBar(message="Saving '%s':" % ncfile.filepath())

    # Write the data
    for i, var in enumerate(variables):
        ncvar = ncfile.variables[var.name]
        varpbar = pbar.subset(prog[i], prog[i + 1])

        # The pieces are listed up front so that each one can be given its
        # share of this variable's progress range.
        views = list(View(var.axes).loop_mem())

        for j, v in enumerate(views):
            vpbar = varpbar.part(j, len(views))
            ncvar[v.slices] = v.get(var, pbar=vpbar)
Example #2
0
def clim_detrend(var, yrlen, itime=-1, sig=False):
    # {{{
    '''Return a detrended time series, with the trend fitted by regression.

    The time axis is cut into consecutive segments of ``yrlen`` steps with
    ``composite``, the segment axis is renumbered 0, 1, 2, ... and
    ``stats.regress`` is run against it.  The fitted line is subtracted and the
    result is flattened back onto the axes of ``var``.

    Parameters
    ----------
    var : Var
      Input variable.
    yrlen : int
      Number of time steps per segment (presumably one year).
    itime : int, optional
      Index of the time axis; -1 (default) looks up the ``Time`` axis of ``var``.
    sig : boolean, optional
      If True, the third value returned by ``stats.regress`` (presumably the
      p-value) is returned as well.

    Returns
    -------
    (m, b, varz) or (m, b, varz, p)
      The first two regression outputs, the detrended variable, and optionally
      the third regression output.
    '''
    from numpy import arange
    from pygeode.progress import PBar
    from pygeode.timeaxis import Time
    from . import stats

    if itime == -1:
        itime = var.whichaxis(Time)

    # Start index of every segment along the time axis
    starts = list(range(0, var.shape[itime], yrlen))
    stacked = composite(var, itime, starts, yrlen)

    # Renumber the segment axis so it can serve as the regressor
    years = stacked.axes[itime]
    years.values = arange(len(years)).astype(years.dtype)

    print('Computing regression')
    slope, intercept, pval = stats.regress(years, stacked, pbar=PBar())

    residual = stacked - (slope * years + intercept)
    varz = flatten(residual, itime + 1)
    varz.axes = var.axes

    # Since the axes have been modified after initialization, redo the init to get
    # shortcuts to the axes names
    Var.__init__(varz, varz.axes, varz.dtype)

    if var.name != '':
        varz.name = var.name + "'"

    return (slope, intercept, varz, pval) if sig else (slope, intercept, varz)
Example #3
0
def write_var (ncfile, dataset, unlimited=None, compress=False):
# {{{
  '''Write the variables of a dataset into an already-open netCDF file object.

  Parameters
  ----------
  ncfile : netCDF4-style dataset or group
    Open, writable file object. Its ``createDimension``, ``createVariable``,
    ``setncatts``, ``variables`` and ``filepath()`` members are used.
  dataset : Dataset
    Source of the variables (``dataset.vars``) and of the global attributes
    (``dataset.atts``).
  unlimited : string, optional
    Name of the axis to create as an unlimited dimension (size ``None``).
    Every other axis gets a fixed size equal to its length.
  compress : boolean, optional
    Passed to ``createVariable`` as the ``zlib`` argument.
  '''
  from pygeode.view import View
  import numpy as np
  from pygeode.progress import PBar
  from pygeode.tools import combine_axes

  variables = list(dataset.vars)
  axes = combine_axes(v.axes for v in variables)

  # Define the dimensions
  for a in axes:
    ncfile.createDimension(a.name, size=(None if a.name == unlimited else len(a)))

  # Define the variables (including axes) and copy their attributes
  for var in variables:
    dimensions = [a.name for a in var.axes]
    ncvar = ncfile.createVariable(var.name, datatype=var.dtype, dimensions=dimensions, zlib=compress, fill_value=var.atts.get('_FillValue'))
    ncvar.setncatts(var.atts)

  # global attributes
  ncfile.setncatts(dataset.atts)

  # Relative progress of each variable, as cumulative percentages.
  # Guard the normalization so an empty dataset does not divide by zero.
  sizes = [v.size for v in variables]
  total = np.sum(sizes)
  if total == 0:
    total = 1
  prog = np.cumsum([0.]+sizes) / total * 100

  pbar = PBar(message="Saving '%s':"%ncfile.filepath())

  # Write the data
  for i,var in enumerate(variables):
    ncvar = ncfile.variables[var.name]
    varpbar = pbar.subset(prog[i], prog[i+1])

    # The pieces are listed up front so that each one can be given its
    # share of this variable's progress range.
    views = list(View(var.axes).loop_mem())

    for j,v in enumerate(views):
      vpbar = varpbar.part(j, len(views))
      ncvar[v.slices] = v.get(var, pbar=vpbar)
Example #4
0
    def scan_files(self, files, opener):
        '''Record the variables found in the given files in ``self.table``.

        Parameters
        ----------
        files : list of strings
          Paths of the files to scan.  Nothing is done if the list is empty.
        opener : callable
          Called with a normalized filename; must return an iterable of
          variables exposing ``name``, ``axes`` and ``atts``.

        Notes
        -----
        A file already present in the table is skipped unless its modification
        time is newer than ``self.mtime``, in which case its entry is rebuilt.
        ``self.mtime`` is raised to the newest modification time seen, and
        ``self.modified_table`` is set whenever an entry is (re)built.
        '''
        from os.path import getmtime, normpath
        from pygeode.progress import PBar

        table = self.table

        # Special case: no files given
        if len(files) == 0: return

        self.selected_files.extend(files)

        # Strip out extra separators, etc. from the filenames.
        # Otherwise, if the files are scanned a second time with different
        # separators, it may cause the same file to be included more than once.
        files = [normpath(f) for f in files]

        if self.filename is not None:
            pbar = PBar(message="Scanning files for %s" % self.filename)
        else:
            pbar = PBar(message="Scanning files")

        nfiles = len(files)

        # Construct / add to the table
        for i, f in enumerate(files):
            pbar.update(i * 100. / nfiles)

            # Read the timestamp once, so the staleness test and the recorded
            # table time are based on the same value.
            mtime = int(getmtime(f))

            if f in table:
                if mtime <= self.mtime:
                    # We've already dealt with this file, so skip it.
                    continue
                # File has changed since last time: remove existing info
                del table[f]

            # Always use the latest modification time to represent the valid time of
            # the whole table.
            self.mtime = max(self.mtime, mtime)

            # Record all variables from the file.
            entries = []
            table[f] = entries
            for var in opener(f):
                axes = self.axis_manager.lookup_axes(var.axes)
                entries.append((var.name, axes, var.atts))

            self.modified_table = True

        pbar.update(100)
Example #5
0
    def load(self, pbar=True):
        # {{{
        ''' Returns a version of this variable with all data loaded into memory.

        Parameters
        ----------
        pbar : boolean
          If True, display a progress bar while loading data.  Any other value
          is handed to ``self.get`` unchanged.

        Returns
        -------
        This object itself if it already has a ``values`` attribute; otherwise
        a new Var built on the same axes, with metadata copied over by
        ``copy_meta``.
        '''
        from pygeode.progress import PBar

        # Data already in memory: nothing to do
        if hasattr(self, 'values'):
            return self

        if pbar is True:
            progress = PBar(message="Loading %s:" % repr(self))
        else:
            progress = pbar

        loaded = Var(self.axes, values=self.get(pbar=progress))
        copy_meta(self, loaded)
        return loaded
Example #6
0
  def scan_files (self, files, opener):
    '''Record the variables found in the given files in ``self.table``.

    Parameters
    ----------
    files : list of strings
      Paths of the files to scan.  Nothing is done if the list is empty.
    opener : callable
      Called with a normalized filename; must return an iterable of
      variables exposing ``name``, ``axes`` and ``atts``.

    Notes
    -----
    A file already present in the table is skipped unless its modification
    time is newer than ``self.mtime``, in which case its entry is rebuilt.
    ``self.mtime`` is raised to the newest modification time seen, and
    ``self.modified_table`` is set whenever an entry is (re)built.
    '''
    from os.path import getmtime, normpath
    from pygeode.progress import PBar

    table = self.table

    # Special case: no files given
    if len(files) == 0: return

    self.selected_files.extend(files)

    # Strip out extra separators, etc. from the filenames.
    # Otherwise, if the files are scanned a second time with different
    # separators, it may cause the same file to be included more than once.
    files = [normpath(f) for f in files]

    if self.filename is not None:
      pbar = PBar (message = "Scanning files for %s"%self.filename)
    else:
      pbar = PBar (message = "Scanning files")

    nfiles = len(files)

    # Construct / add to the table
    for i,f in enumerate(files):
      pbar.update(i*100./nfiles)

      # Read the timestamp once, so the staleness test and the recorded
      # table time are based on the same value.
      mtime = int(getmtime(f))

      if f in table:
        if mtime <= self.mtime:
          # We've already dealt with this file, so skip it.
          continue
        # File has changed since last time: remove existing info
        del table[f]

      # Always use the latest modification time to represent the valid time of
      # the whole table.
      self.mtime = max(self.mtime, mtime)

      # Record all variables from the file.
      entries = []
      table[f] = entries
      for var in opener(f):
        axes = self.axis_manager.lookup_axes(var.axes)
        entries.append((var.name, axes, var.atts))

      self.modified_table = True

    pbar.update(100)
Example #7
0
def correlate(X, Y, axes=None, pbar=None):
# {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  rho, p : :class:`Var`
    The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.
    Both are NaN wherever no valid (non-NaN) pair of data points was found.

  Raises
  ======
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding the given
  result under the hypothesis that the true correlation coefficient between X
  and Y is zero. It is computed from the t-statistic given in eq (8.7), in
  section 8.2.3, and assumes normally distributed quantities.

  As implemented, the returned ``p`` is ``tdist.cdf(|t|, N - 2)`` multiplied by
  the sign of :math:`\rho_{XY}`, where N is the number of valid data pairs.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Put all the axes being reduced over at the end
  # so that we can reshape
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    # Shared axes that were not requested become output axes
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  # Positions of the reduction axes within each chunk (they come last in inaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays.  np.full always yields an ndarray, even for a 0-d
  # output shape, so that item assignment below is always possible.
  x  = np.full(oview.shape, np.nan, 'd')
  y  = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata*ydata

    # Expand both inputs to the shape of their product
    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)

    # Only use points where both X and Y are valid
    bad = np.isnan(xydata)
    xdata[bad] = np.nan
    ydata[bad] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl]  = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl]  = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Sum of weights
    Na[outsl] = np.nansum([Na[outsl], npnansum(~bad, siaxes)], 0)

  eps = 1e-14
  imsk = ~(Na < eps)

  # Convert raw sums of products into sums about the mean
  xx[imsk] -= (x*x)[imsk]/Na[imsk]
  yy[imsk] -= (y*y)[imsk]/Na[imsk]
  xy[imsk] -= (x*y)[imsk]/Na[imsk]

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx*yy)[imsk])
  dmsk = (den > 0.)
  rho[dmsk] = xy[dmsk] / den[dmsk]

  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  den = np.maximum(1 - rho**2, eps)

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.)/den[imsk])
  p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])
  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X' # Note: could write:  xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
  P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
  return Rho, P
Example #8
0
def correlate(X, Y, axes=None, output='r2,p', pbar=None):
    # {{{
    r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p':  The p-value; see notes.

    Both are NaN wherever there are no valid data pairs or where the variance
    of X or Y vanishes.

  Raises
  ======
  ValueError
    If ``output`` names none of the supported quantities.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding a correlation
  coefficient of equal or greater magnitude (two-sided) to the given result
  under the hypothesis that the true correlation coefficient between X and Y is
  zero. It is computed from the t-statistic given in eq (8.7), in section
  8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduction axes within each chunk (they come last in inaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the shape of their product
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # Only use points where both X and Y are valid
        bad = np.isnan(xydata)
        xdata[bad] = np.nan
        ydata[bad] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum([Na[outsl], npnansum(~bad, siaxes)], 0)

    imsk = (Na > 0)

    # Convert raw sums of products into sums about the mean
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / den[dmsk]

    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings.
    # The floor is defined here so the function does not rely on a module-level name.
    eps = 1e-14
    den = np.maximum(1 - rho**2, eps)

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write:  xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
Example #9
0
def paired_difference(X,
                      Y,
                      axes=None,
                      alpha=0.05,
                      N_fac=None,
                      output='d,p,ci',
                      pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  Raises
  ======
  ValueError
    If ``output`` names none of the supported quantities.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.
  AssertionError
    If X and Y differ in size along the reduction axes, or have only one
    element along them.

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
        raxes = axes
    else:
        # Default: reduce over every axis shared by X and Y, identified by name
        raxes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Reduction axes expressed as positions within X and within Y
    ixaxes = [X.whichaxis(n) for n in raxes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in raxes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y],
                                          oview,
                                          inaxes=srcaxes,
                                          pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    # Effective sample size after the optional rescaling
    if N_fac is not None: eN = N // N_fac
    else: eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # NOTE(review): the sample variance is divided by (eN - 1) here rather than
    # eN; confirm against eq (6.21) of von Storch and Zwiers.
    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    # p temporarily holds |t| before being replaced by the two-sided p-value
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # The reported quantity is X - Y, so name the operands in that order
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
Example #10
0
def difference(X,
               Y,
               axes=None,
               alpha=0.05,
               Nx_fac=None,
               Ny_fac=None,
               output='d,p,ci',
               pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional, defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

    'df', 'p' and 'ci' are NaN wherever the test statistic cannot be formed.

  Raises
  ======
  ValueError
    If ``output`` names none of the supported quantities.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.
  AssertionError
    If X or Y has only one element along the reduction axes.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
        raxes = axes
    else:
        # Default: reduce over every axis shared by X and Y, identified by name
        raxes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Reduction axes expressed as positions within X and within Y
    ixaxes = [X.whichaxis(n) for n in raxes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in raxes if Y.hasaxis(n)]

    # Nominal sample sizes, used only for the sanity checks below
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (Nx, Ny now become per-point counts of valid data)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes after the optional rescaling
    if Nx_fac is not None: eNx = Nx // Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny // Ny_fac
    else: eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Effective degrees of freedom: (xx + yy)^2 over the weighted sum of squares
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    # den is reused: from here on it is the standard error of the difference
    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    # p temporarily holds |t| before being replaced by the two-sided p-value
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))

    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # The reported quantity is X - Y, so name the operands in that order
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
Example #11
0
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma;
    unrecognized entries are ignored. Defaults to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument (e.g. if
    ``ds`` is the returned dataset, the linear coefficients of the regression
    can be obtained by ``ds.beta``).

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'beta' (or its alias 'B'): Linear coefficients :math:`\beta_i` of each
      regressor (variable ``beta``)
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`;
      variable ``R2``)
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'beta', 'p', and 'sb' carry an extra ``regressor`` axis with as
    many entries as there are regressors; 'covb' carries two such axes.

  Raises
  ======
  ValueError
    If ``output`` contains no recognized output name.
  KeyError
    If one of the requested ``axes`` is not shared by the Xs and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now. 'B' is the name used by the documentation and
    # by the default request, so accept it as an alias for 'beta'.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    requested = [o.strip() for o in output.split(',')]
    output = [aliases.get(o, o) for o in requested]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple regression. Possible outputs are %s.'
            % str(ovars))

    # Accept any sequence of regressors (the list is concatenated with [Y] below)
    Xs = list(Xs)
    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        # Shared axes that are not regressed over become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr, )
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape1, 'd')
    xx = np.zeros(oshape2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; it is mirrored below
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx at every output point; np.linalg.inv operates on the trailing
    # two dimensions of a stacked array, so no Python loop is needed.
    xxinv = np.linalg.inv(xx)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)
    resid = yy - vare

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    sigbeta = [
        np.sqrt(resid * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')
    Nd = len(oaxes)

    # Permutation taking an array of shape (Nr,) + oshape to oshape + (Nr,):
    # output axis k comes from input axis k + 1, and the regressor axis goes last.
    regressor_last = list(range(1, Nd + 1)) + [0]

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - resid / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        p = np.transpose(np.array(p), regressor_last)
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sigbeta = np.transpose(np.array(sigbeta), regressor_last)
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(oshape2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                sigmat[..., i, j] = resid * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt(resid / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
Example #12
0
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):
    # {{{
    r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma;
    unrecognized entries are ignored. Defaults to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

    Output points with too few valid samples, or with no variance in X, are
    set to NaN.

  Raises
  ======
  ValueError
    If ``output`` contains no recognized output name.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  NaN values in either X or Y are excluded pairwise from the sums.
  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o.strip() for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that are not regressed over become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes being summed over
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both operands to the broadcast shape so that a sample missing
        # in one variable can be masked out of the other as well
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        missing = np.isnan(xydata)
        xdata[missing] = np.nan
        ydata[missing] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights (number of valid sample pairs)
        Na[outsl] = np.nansum([Na[outsl], npnansum(~missing, siaxes)], 0)

    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    # Points with enough samples to estimate two parameters
    nmsk = (N_eff > 0.)

    # Convert raw sums of products into centred sums (S_XX, S_YY, S_XY)
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    # Points where X actually varies
    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # sige holds the residual variance until the square root is taken below
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # Both sides must use the same mask: indexing the right-hand side with
    # dmsk selects a different set of points whenever the two masks differ
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    # Invalidate points where the fit is not defined
    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
Example #13
0
def save (filename, var, iaxis=None, fps=15, palette='bw', minmax=None):
  '''Encode a 3D variable as an MPEG-4 movie using the external ``mencoder`` tool.

  Each step along the animation axis is rendered to a temporary JPEG frame;
  the frames are then handed to mencoder and the temporary files are removed.

  Parameters
  ==========
  filename : string
    Path of the movie file to create.

  var : :class:`Var`
    The data to animate. Must have exactly three non-degenerate axes.

  iaxis : integer, optional
    Index of the axis to animate over (after degenerate axes are removed and
    the axes are sorted). Defaults to the time axis.

  fps : number, optional
    Frames per second passed to mencoder. Defaults to 15.

  palette : string, optional
    Either 'bw' (greyscale) or 'rainbow' (piecewise linear blue-green-red).

  minmax : pair of numbers, optional
    Data values mapped to the bottom and top of the palette. Values outside
    this range are clipped. Defaults to the minimum and maximum of ``var``.

  Raises
  ======
  OSError
    If mencoder cannot be launched.
  '''
  from pygeode.axis import TAxis
  from pygeode.var import Var
  from pygeode.progress import PBar
  import tempfile, shutil, subprocess
  try:
    from PIL import Image
  except ImportError:
    # Very old PIL installations expose the module at top level
    import Image
  import numpy as np
  import os

  assert isinstance(var, Var)
  assert palette in ('bw', 'rainbow'), "unknown palette '%s'" % palette

  # Remove any degenerate dimensions, make sure the axes are in a consistent order
  var = var.squeeze().sorted()
  assert var.naxes == 3, "can only work with 3D data"
  
  if iaxis is None: iaxis = var.whichaxis(TAxis)
  assert iaxis >= 0, "no time axis found"

  sl = [slice(None)] * 3

  # Get max & min values of the whole dataset
  if minmax is None:
    #TODO: calculate both of these at once, with a progress bar to help the process
    vmin = float(var.min())
    vmax = float(var.max())
  else:
    assert len(minmax) == 2, "invalid minmax argument"
    vmin, vmax = minmax

  # A constant field would otherwise divide by zero; render it as a uniform frame
  span = vmax - vmin
  if span == 0: span = 1.

  print("Saving %s:"%filename)
  pbar = PBar()

  nframes = len(var.axes[iaxis])

  tmpdir = tempfile.mkdtemp (prefix='pygeode_mpeg')
  try:
    # Loop over each timestep, generate a temporary image file
    for i in range(nframes):
      fpbar = pbar.part(i,nframes)
      sl[iaxis] = i
      # Get data, flip y axis, add an 'RGB' axis
      data = var[sl].squeeze()[::-1,:,np.newaxis]
      data = (data-vmin)/span * 255
      # Keep user-supplied minmax from wrapping around in the uint8 cast below
      data = np.clip(data, 0, 255)
      if palette == 'bw':
        # Same data for R, G, and B channels
        data = np.concatenate([data,data,data], axis=2)
      elif palette == 'rainbow':
        # Piecewise linear palette
        part1 = data <= 85
        part2 = (85 < data) & (data <= 170)
        part3 = 170 < data
        b = np.zeros(data.shape)
        b[part1] = 255
        b[part2] = 255 - (data[part2] - 85)*3
        g = np.zeros(data.shape)
        g[part1] = data[part1] * 3
        g[part2] = 255
        g[part3] = 255 - (data[part3] - 170) * 3
        r = np.zeros(data.shape)
        r[part2] = (data[part2] - 85) * 3
        r[part3] = 255

        data = np.concatenate([r,g,b], axis=2)

      # Encode as an 8-bit array
      data = np.asarray(np.round(data), 'uint8')
      # Save
      framefile = os.path.join(tmpdir, "frame%04d.jpg"%i)
      Image.fromarray(data,"RGB").save(framefile, quality=95)
      fpbar.update(100)

    # Frame dimensions are the two axes left after removing the animation axis
    shape = list(var.shape)
    h, w = shape[:iaxis] + shape[iaxis+1:]

    # Make the movie file. The arguments are passed as a list (no shell), so
    # filenames containing spaces or shell metacharacters are handled safely;
    # the mf:// wildcard is left for mencoder itself to expand.
    subprocess.call([
      "mencoder", "mf://%s/*.jpg" % tmpdir,
      "-mf", "w=%s:h=%s:type=jpg:fps=%s" % (w, h, fps),
      "-ovc", "lavc", "-lavcopts", "vcodec=mpeg4:vbitrate=8000",
      "-oac", "copy",
      "-o", filename,
    ])
  finally:
    # Clean up files, even if rendering or encoding failed
    shutil.rmtree (tmpdir)
Example #14
0
def paired_difference(X, Y, axes, alpha=0.05, N_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have the same size along the reduction axes.

  axes : list
    Axes over which to compute means.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number of valid (non-NaN) pairs
    divided by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The mean of the paired differences, X - Y (NaN pairs are excluded)
    * The effective number of degrees of freedom, :math:`df`
    * A signed significance measure: the t-distribution CDF evaluated at
      :math:`|t|`, multiplied by the sign of the mean difference (its
      magnitude therefore lies between 0.5 and 1)
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset. 

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The confidence interval is computed based on the t-statistic in eq
  (6.21).'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes) 

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'
  
  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays (NaN marks "nothing accumulated yet")
  d = np.full(oview.shape, np.nan, 'd')
  dd = np.full(oview.shape, np.nan, 'd')
  N = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right): 1 where the pair is valid,
    # NaN (and hence skipped by the nan-sum) where it is not
    N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0) 

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  dd = (dd - d**2/N) / (N - 1)
  # The sum above skipped NaN pairs, so normalize by the number of valid
  # pairs N rather than by the full sample size Nx
  d /= N

  if N_fac is not None: eN = N//N_fac
  else: eN = N

  den = np.sqrt(dd/(eN - 1))

  p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, eN - 1) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=eN-1, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, eN-1, p, ci
Example #15
0
def save(filename,
         in_dataset,
         version=3,
         pack=None,
         compress=False,
         cfmeta=True,
         unlimited=None):
    # {{{
    '''Write a dataset to a netCDF file through the netCDF C library.

    Parameters
    ==========
    filename : string
      Path of the file to create.

    in_dataset : dataset-like
      The data to write; converted with ``asdataset`` and then passed through
      ``finalize_save`` together with ``cfmeta`` and ``pack``.

    version : integer, optional
      netCDF format version, 3 or 4. Forced to 4 when ``compress`` is set.

    pack : optional
      Passed through to ``finalize_save``.

    compress : boolean, optional
      If True, deflate compression is enabled on every variable.

    cfmeta : boolean, optional
      Passed through to ``finalize_save``.

    unlimited : string, optional
      Name of the axis to define as the unlimited (record) dimension.

    Raises
    ======
    IOError
      If the file cannot be created.
    AssertionError
      On invalid arguments, or if any subsequent netCDF library call fails.
    '''
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?
    if compress: version = 4
    assert version in (3, 4)

    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing chunks, keyed by netCDF type code
    chunkf = {
        1: lib.nc_put_vara_schar,
        2: lib.nc_put_vara_text,
        3: lib.nc_put_vara_short,
        4: lib.nc_put_vara_int,
        5: lib.nc_put_vara_float,
        6: lib.nc_put_vara_double,
        7: lib.nc_put_vara_uchar,
        8: lib.nc_put_vara_ushort,
        9: lib.nc_put_vara_uint,
        10: lib.nc_put_vara_longlong,
        11: lib.nc_put_vara_ulonglong
    }

    # Create the file (0x1000 = NC_NETCDF4; 0 selects the classic format)
    cmode = 0x1000 if version == 4 else 0
    ret = lib.nc_create(filename.encode('ascii'), cmode, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))

    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            # A length of 0 marks the dimension as unlimited
            dimlen = c_long(0) if unlimited == a.name else c_long(len(a))
            ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), dimlen,
                                 byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        axes_list = list(axes)
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[axes_list.index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t,
                                 var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only works for netcdf4 or (higher?))
            if compress:
                # Arguments: shuffle=1, deflate=1, deflate_level=2
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes (-1 = NC_GLOBAL)
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file (256 = NC_NOFILL)
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)

        # Relative progress of each variable
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

        pbar = PBar(message="Saving '%s':" % filename)
        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            dtype = numpy_type[t]

            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):

                vpbar = varpbar.part(j, len(views))

                # Should always be slices (since we're looping over whole thing contiguously?)
                for sl in v.slices:
                    assert isinstance(sl, slice)
                for sl in v.slices:
                    assert sl.step in (1, None)

                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    # var.values gives us the *whole* axis, but under extreme
                    # conditions we may be looping over smaller pieces
                    data = var.values
                    data = data[start[0]:start[0] + count[0]]
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished; always close the file so a failed write does not leak the handle
        lib.nc_close(fileid)
Example #16
0
def difference(X, Y, axes, alpha=0.05, Nx_fac = None, Ny_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list
    Axes over which to compute means. This argument is required: the list is
    iterated directly, so ``None`` is not accepted.

  alpha : float
    Confidence level for which to compute confidence interval.

  Nx_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset. 

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  # Split the combined axes of X and Y into the ones being averaged over
  # and the ones retained in the output.
  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes) 

  # Nominal number of elements along the reduction axes of each variable.
  # These scalars are only used for the sanity checks below; the per-point
  # counts (which account for missing values) are accumulated into Nx / Ny.
  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  nx_total = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])
  
  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
  assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  Nx = np.zeros(oview.shape, 'd')
  Ny = np.zeros(oview.shape, 'd')

  # Start every accumulator at NaN; the nansum-based updates below treat
  # NaN as "no contribution yet".
  x[()] = np.nan
  y[()] = np.nan
  xx[()] = np.nan
  yy[()] = np.nan
  Nx[()] = np.nan
  Ny[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0) 

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0) 

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Nx) / (Nx - 1)
  yy = (yy - y**2/Ny) / (Ny - 1)
  x /= Nx
  y /= Ny

  # Effective sample sizes, optionally reduced by the user-supplied factors
  if Nx_fac is not None: eNx = Nx//Nx_fac
  else: eNx = Nx
  if Ny_fac is not None: eNy = Ny//Ny_fac
  else: eNy = Ny
  #print 'average eff. Nx = %.1f, average eff. Ny = %.1f' % (eNx.mean(), eNy.mean())

  d = x - y
  den = np.sqrt(xx/eNx + yy/eNy)
  df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

  # Signed probability: the sign carries the sign of the difference
  p = tdist.cdf(abs(d/den), df)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, df) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=df, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, df, p, ci
Example #17
0
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):
# {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma;
    whitespace around each specification is ignored. Defaults to 'B,p'.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a constant term
    is not included by default. The following parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    If the regression is computed over all axes so that the result is a scalar,
    the above are returned as a tuple of floats in the order specified by
    ``output``. Otherwise they are returned as :class:`Var` instances. The outputs
    'B', 'p', and 'sb' will produce as many outputs as there are regressors. 
    An unrecognized specification is reported with a printed message and skipped.

  Raises
  ======
  KeyError
    If one of the requested ``axes`` is not shared by the Xs and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  # Determine which axes are kept in the output (oiaxes) and which are
  # regressed over (riaxes); an explicit `axes` list narrows the latter.
  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes: ri_new.append(ia)
      else: raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  # Positions of the regression axes within `inaxes` (they follow the output axes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  oshape = oview.shape
  oshape1 = oshape + (Nr,)
  oshape2 = oshape + (Nr,Nr)
  y = np.zeros(oshape, 'd')
  yy = np.zeros(oshape, 'd')
  xy = np.zeros(oshape1, 'd')
  xx = np.zeros(oshape2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl+(i,)] += npsum(xdata[i]*ydata, siaxes)
      # Only the lower triangle is accumulated; it is mirrored below
      for j in range(i+1):
        xx[outsl+(i,j)] += npsum(xdata[i]*xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Invert xTx at every output point.  np.linalg.inv operates on the last
  # two dimensions of a stacked array, so no Python-level loop is needed.
  xxinv = np.linalg.inv(xx)

  beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  # Tolerate whitespace around the comma-separated specifications
  output = [o.strip() for o in output.split(',')]
  ret = []

  for o in output:
    if o == 'B':
      if len(oaxes) == 0:
        ret.append(beta)
      else:
        ret.append([Var(oaxes, values=beta[...,i], name='beta_%s' % xns[i]) for i in range(Nr)])
    elif o == 'r':
      vary = (yy - y**2/N)
      R2 = 1 - (yy - vare) / vary
      if len(oaxes) == 0:
        ret.append(R2)
      else:
        ret.append(Var(oaxes, values=R2, name='R2'))
    elif o == 'p':
      ps = [tdist.cdf(np.abs(beta[...,i]/sigbeta[i]), N_eff-Nr) * np.sign(beta[...,i]) for i in range(Nr)]
      if len(oaxes) == 0:
        ret.append(ps)
      else:
        ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
    elif o == 'sb':
      if len(oaxes) == 0:
        ret.append(sigbeta)
      else:
        ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
    elif o == 'covb':
      from .axis import NonCoordinateAxis as nca
      cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
      cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
      sigmat = np.zeros(oshape2, 'd')
      for i in range(Nr):
        for j in range(Nr):
          #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
          sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
      ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
    elif o == 'se':
      se = np.sqrt((yy - vare) / N_eff)
      if len(oaxes) == 0:
        ret.append(se)
      else:
        ret.append(Var(oaxes, values=se, name='sig_resid'))
    else:
      print('multiple_regress: unrecognized output "%s"' % o)

  return ret
Example #18
0
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'):
# {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.
    The effective number is used both for the residual variance and for the
    t-distribution from which 'p' is computed.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma;
    whitespace around each specification is ignored. Defaults to 'm,b,p'.

  Returns
  =======
  results : list of :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
    can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero
    * 'sm': Standard deviation of the linear coefficient
    * 'se': Standard deviation of residuals

    The requested quantities are always returned in the order listed above,
    regardless of the order in which they appear in ``output``.

  Raises
  ======
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  # Determine which axes are kept in the output (oiaxes) and which are
  # regressed over (riaxes); an explicit `axes` list narrows the latter.
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes: 
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  # Positions of the regression axes within `inaxes` (they follow the output axes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  xy = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    x[outsl] += npsum(xdata, siaxes)
    y[outsl] += npsum(ydata, siaxes)
    xx[outsl] += npsum(xdata**2, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    xy[outsl] += npsum(xdata*ydata, siaxes)

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx -= x**2/N
  yy -= y**2/N
  xy -= (x*y)/N

  m = xy/xx
  b = (y - m*x)/float(N)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac
  sige = (yy - m * xy) / (N_eff - 2.)
  sigm = np.sqrt(sige / xx)
  t = np.abs(m) / sigm
  # The t-statistic is built from a variance with N_eff - 2 degrees of
  # freedom, so the reference distribution must use the same count.
  p = tdist.cdf(t, N_eff - 2) * np.sign(m)
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  # Tolerate whitespace around the comma-separated specifications
  output = [o.strip() for o in output.split(',')]
  ret = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
    ret.append(M)
  if 'b' in output:
    B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
    ret.append(B)
  if 'r' in output:
    ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
  if 'p' in output:
    P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
    ret.append(P)
  if 'sm' in output:
    # NOTE(review): this is the standard error of the slope even though the
    # variable name says "Intercept"; the name is kept for compatibility.
    ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
  if 'se' in output:
    ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

  return ret
Example #19
0
def save (filename, in_dataset, version=3, pack=None, compress=False, cfmeta = True, unlimited=None):
# {{{
  '''Writes a dataset to a netCDF file through the netCDF C library.

  Parameters
  ==========
  filename : str
    Path of the file to create. It is passed to the C library ASCII-encoded.

  in_dataset : dataset-like
    The data to write; it is converted with ``asdataset`` and then prepared
    with ``finalize_save``.

  version : int, optional
    netCDF format version, 3 or 4. Forced to 4 when ``compress`` is set.

  pack : optional
    Forwarded to ``finalize_save``.

  compress : bool, optional
    If True, every variable is defined with ``nc_def_var_deflate``
    (arguments 1, 1, 2) and the file is created in netCDF4 format.

  cfmeta : bool, optional
    Forwarded to ``finalize_save``.

  unlimited : str, optional
    Name of the axis whose dimension should be defined with size 0 (the
    netCDF unlimited dimension). Must be the name of one of the output axes.

  Raises
  ======
  IOError
    If the file cannot be created.
  AssertionError
    If an argument is invalid or any later netCDF library call reports an error.
  '''
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename,str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3,4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)
  # Materialized once so dimension ids can be looked up by axis position
  axis_list = list(axes)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a,DummyAxis)]
  #vars.extend(axes)

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), "vars must have unique names: %s"% [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing chunks, keyed by netCDF type code
  chunkf = {1:lib.nc_put_vara_schar, 2:lib.nc_put_vara_text, 3:lib.nc_put_vara_short,
       4:lib.nc_put_vara_int, 5:lib.nc_put_vara_float,
       6:lib.nc_put_vara_double, 7:lib.nc_put_vara_uchar,
       8:lib.nc_put_vara_ushort, 9:lib.nc_put_vara_uint,
      10:lib.nc_put_vara_longlong, 11:lib.nc_put_vara_ulonglong}


  # Create the file
  if version == 3:
    ret = lib.nc_create (filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create (filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception

  # From here on the file is open; the finally clause guarantees it is closed
  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i,a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[axis_list.index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var (fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only works for netcdf4 or (higher?))
      if compress:
        ret = lib.nc_def_var_deflate (fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes (fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      # modify axes to be netcdf friendly (CF-compliant, etc.)
      put_attributes (fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill (fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef (fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable (cumulative percentage by size)
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':"%filename)
    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

      varpbar = pbar.subset(prog[i], prog[i+1])

      # Write the variable one memory-sized view at a time
      views = list(View(var.axes).loop_mem())
      for j,v in enumerate(views):

        vpbar = varpbar.part(j, len(views))

        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1,None)

        # Hyperslab origin and extent along each axis, as C arrays
        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long*var.naxes)(*start)
        count = (c_long*var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          data = data[start[0]:start[0]+count[0]] # the above gives us the *whole* axis,
                                                  # but under extreme conditions we may be looping over smaller pieces
          vpbar.update(100)
        else: data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)
Example #20
0
def plotvar (var, **kwargs):
# {{{ 
  ''' plotvar(var, title, clevs, cmap, ax, ifig, hold)

  Produces a plot of the pygeode variable var. The routine can plot
  1d or 2d data; degenerate axes (of length 1) are ignored; their value is 
  displayed in the title of the plot.

  If the axes are longitude and latitude, the Basemap package is used to plot
  variable on a map of the world. 

  If one of the axes is a ZAxis, it is plotted on the y-axes, logarithmically if
  appropriate.

  keyword arguments:
    title: Title of the plot
    pbar: If True (the default), show a progress bar while loading the values
    ax: A matplotlib axes object on which to produce the plot
    lblx: Show xaxis titles and labels 
    lbly: Show yaxis titles and labels 
    scaleAx: Scale values with coordinate value (for logarithmic axes only) 
    scaleX, scaleY: As scaleAx, for the x / y axis of 2d plots
    logVal: Use a log scale for the values (1d), or plot log10 of the values (2d)
    colorbar: Show colorbar (a dict of arguments passed to fig.colorbar)
    clevs: Filled contour levels, if None, no filled contours are plotted
    cmap: A colormap passed on to the contour pylab function
    clines: Outlined levels, if None, no contour lines are plotted
    pcolor: If True, colour individual grid boxes instead of drawing filled contours
    mask: A function of the values returning a mask of the points to hide
    map: If False, never use Basemap, even for lon/lat data
    projection: A dict of Basemap arguments; may also hold 'meridians',
          'parallels' and 'labels' to control the grid labelling
    perx: Roll values in x axis (appropriate for periodic axes)
    ifig: Index of the matplotlib figure on which to produce the plot
    hold: If True, don't clear the contents of the axis
    wait: if True, don't invoke the show() command
          (the plotting main loop is not called, so subsequent pygeode commands 
           can be invoked)

  Any remaining keyword arguments are passed on to the matplotlib plotting call.

  For 2d plots the last contour / pcolor object created is returned; for 1d
  plots nothing is returned.
  '''
  from matplotlib.pyplot import figure, show, ion, ioff, draw, cm, clf, isinteractive
###  from matplotlib.numerix import ma
  from numpy import ma
  from numpy import isnan, isinf, where
  from pygeode.progress import PBar
  from copy import copy

  # Get # of dimensions - can only do 1D or 2D
  nd = len([s for s in var.shape if s > 1])
  assert nd > 0, "the specified data has no dimensions.  Nothing to plot!"
  assert nd == 1 or nd == 2, "can only plot 1D or 2D arrays.  Try slicing along some dimensions."

  axes = var.axes
  ret = None
        
  # Create title if none has been specified
  title = kwargs.pop('title', None)
  if title is None:
    title = _buildvartitle(axes, var.name, **var.plotatts)

  pbar = kwargs.pop('pbar', True)
  if pbar is True:
    pbar = PBar(message='Loading plot values from %s:'%repr(var))
    values = var.get(pbar=pbar).squeeze()
  else:
    values = var.get().squeeze()

  # Mask out missing values (NaN)
  values = ma.masked_where(isnan(values), values)
  
  # Apply linear rescaling for plotting
  values = _scalevalues(values, **var.plotatts)
  
  # Scaling by coordinate value preserves integral for log-scaling
  scaleAx = kwargs.pop('scaleAx',False) # for line plots
  scaleX = kwargs.pop('scaleX',False) # for surface plots
  scaleY = kwargs.pop('scaleY',False) # for surface plots
  
  # Log scale for values (not axis)
  logVal = kwargs.pop('logVal',False)

  # Suspend interactive mode while the plot is built; restored at the end
  wasint = isinteractive()
  ioff()

  ax = kwargs.pop('ax', None)
  ifig = kwargs.pop('ifig', None)
  hold = kwargs.pop('hold', False)
  wait = kwargs.pop('wait', False)
  if ax is None:
    if ifig is None:
      fig = figure()
    else:
      fig=figure(ifig)
      if not hold: clf()
      
    ax = fig.add_subplot(111)
  else:
    fig = ax.figure

  if not hold and title: ax.set_title(title)

  # 1D case:
  if nd == 1:
    from pygeode.axis import ZAxis, Pres, Hybrid
    xaxis = [copy(a) for a in axes if len(a)>1][0]
    
    # adjust axis scaling
    #if xaxis.atts['units'] != xaxis.plotatts['plotunits']:
    xaxis.values = xaxis.values*xaxis.plotatts.get('scalefactor',1) + xaxis.plotatts.get('offset',0) 
    
    # Scaling by coordinate value preserves integral for log-scaling
    if (scaleAx and xaxis.plotatts.get('plotscale', 'linear')=='log' and
        var.plotatts.get('preserve', 'value')=='area'): 
      values = values * xaxis.values
    
    # Vertical?
    if isinstance(xaxis,ZAxis):
      lblx = kwargs.pop('lblx', False) # preserve previous behaviour
      lbly = kwargs.pop('lbly', True)
      
      ax.plot(values, xaxis.values, **kwargs)
      if logVal or var.plotatts.get('plotscale', 'linear')=='log': ax.set_xscale('log') # value axis 
      else: ax.set_xscale('linear') # value axis
#      ax.set_xscale(var.plotatts.get('plotscale', 'linear')) # value axis
      
      ax.set_yscale(xaxis.plotatts.get('plotscale', 'linear')) # coordinate
      ylims = min(xaxis.values),max(xaxis.values)
      ax.set_ylim(ylims[::xaxis.plotatts['plotorder']])
      
      # coordinate axis
      ax.yaxis.set_major_formatter(xaxis.formatter())
      if lbly:
        loc = xaxis.locator()
        if loc is not None: ax.yaxis.set_major_locator(loc)
        ax.set_ylabel(_buildaxistitle(**xaxis.plotatts))

      # value axis
      if lblx:
        ax.set_xlabel(_buildaxistitle(name = var.name, **var.plotatts))
            
    else:
      lblx = kwargs.pop('lblx', True)
      lbly = kwargs.pop('lbly', False) # preserve previous behaviour
      
      ax.plot(xaxis.values, values, **kwargs)
      if logVal or var.plotatts.get('plotscale', 'linear')=='log': ax.set_yscale('log') # value axis 
      else: ax.set_yscale('linear') # value axis
#      ax.set_yscale(var.plotatts.get('plotscale', 'linear')) # value axis

      ax.set_xscale(xaxis.plotatts['plotscale']) # coordinate
      xlims = min(xaxis.values),max(xaxis.values)
      ax.set_xlim(xlims[::xaxis.plotatts['plotorder']])

      ax.xaxis.set_major_formatter(xaxis.formatter())
      # coordinate axis
      if lblx:
        loc = xaxis.locator()
        if loc is not None: ax.xaxis.set_major_locator(loc)
        ax.set_xlabel(_buildaxistitle(**xaxis.plotatts))

      # value axis
      if lbly:
        ax.set_ylabel(_buildaxistitle(name = var.name, **var.plotatts))

  # 2D case:
  elif nd == 2:
    from numpy import meshgrid, concatenate, log10
    from matplotlib.pyplot import contourf, colorbar, xlim, ylim, xlabel, ylabel, gca
    from pygeode.axis import Lat, Lon, ZAxis, Pres, Hybrid, SpectralM, SpectralN

    # Patch for some versions of matplotlib, which leave gaps between polygons
    kwargs.setdefault('antialiased',False)

    # After squeezing, the values are indexed as (yaxis, xaxis)
    yaxis, xaxis = [copy(a) for a in axes if len(a) > 1]
    
    # adjust x-axis scaling
    #if xaxis.atts['units'] != xaxis.plotatts['plotunits']:
    xaxis.values = xaxis.values*xaxis.plotatts.get('scalefactor',1) + xaxis.plotatts.get('offset',0)
    # adjust y-axis scaling
    #if yaxis.atts['units'] != yaxis.plotatts['plotunits']:
    yaxis.values = yaxis.values*yaxis.plotatts.get('scalefactor',1) + yaxis.plotatts.get('offset',0)
 
    # Transpose vertical axis?
    if isinstance(xaxis, ZAxis):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis
    if isinstance(xaxis, SpectralN) and isinstance(yaxis, SpectralM):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis
    if isinstance(xaxis, Lat) and isinstance(yaxis, Lon):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis


    perx = kwargs.pop('perx', False)
    if perx:
      # Periodic x axis: append one extra coordinate past the end, and repeat
      # the first column of data there so that the mesh and the values keep
      # matching shapes.  ma.concatenate is used so the mask is preserved.
      xvals = concatenate([xaxis.values, [xaxis.values[-1] + (xaxis.values[1] - xaxis.values[0])]])
      yvals = yaxis.values
      values = ma.concatenate([values, values[:, 0:1]], axis=1)
      meshx, meshy = meshgrid (xvals, yvals)
    else:
      xvals = xaxis.values
      yvals = yaxis.values
      meshx, meshy = meshgrid (xvals, yvals)
      
    # Scaling by coordinate value preserves integral for log-scaling
    if (scaleX and xaxis.plotatts.get('plotscale', 'linear')=='log' and 
      var.plotatts.get('preserve', 'value')=='area'): 
      values = values * meshx
    if (scaleY and yaxis.plotatts.get('plotscale', 'linear')=='log' and 
        var.plotatts.get('preserve', 'value')=='area'): 
      values = values * meshy
      
    # scaling of field values
    if logVal: values = log10(values)

    #cmap = kwargs.pop('cmap', cm.gist_rainbow_r)
    clevs = kwargs.pop('clevs', 21)
    clines = kwargs.pop('clines', None)
    cbar = kwargs.pop('colorbar', {'orientation':'vertical'})
    pcolor = kwargs.pop('pcolor', False)

    mask = kwargs.pop('mask', None)
    if mask is not None:
      values = ma.masked_where(mask(values), values)

    #
    # Map?
    Basemap = None
    if kwargs.pop('map', True):
      # New toolkit path
      try:
        from mpl_toolkits.basemap import Basemap
      except ImportError: pass
      # Old toolkit path
      try:
        from matplotlib.toolkits.basemap import Basemap
      except ImportError: pass

    if isinstance(xaxis,Lon) and isinstance(yaxis,Lat) and Basemap is not None:
      from numpy import arange
      
      # pop some arguments related to projection grid labelling 
      projargs = dict(kwargs.pop('projection', {}))
      # meridians setup (latitude / y)
      meridians = projargs.pop('meridians',[-180,-90,0,90,180,270,360])
      # parallels setup (longitude / x)
      parallels = projargs.pop('parallels',[-90,-60,-30,0,30,60,90])
      # show labels for meridians and parallels in given location
      # labels[0]: left, labels[1]: right, labels[2]: top, labels[3]: bottom    
      labels = projargs.pop('labels',[1,0,0,1]) 
      
      # default axes boundaries 
      bnds = {'llcrnrlat':yvals.min(),
              'urcrnrlat':yvals.max(),
              'llcrnrlon':xvals.min(),
              'urcrnrlon':xvals.max()}
      # default projection      
      proj = {'projection':'cyl', 'resolution':'l'}
      
      # read projection arguments
      proj.update(projargs)
      if proj['projection'] in ['cyl', 'merc', 'mill', 'gall']:
        # User-supplied projection arguments take precedence over the
        # default boundaries derived from the data
        bnds.update(proj)
        proj.update(bnds)
            
      # construct projection axis
      m = Basemap(ax=ax, **proj)
      m.drawcoastlines(ax=ax)      
      # draw meridians and parallels (using arguments from above) 
      m.drawmeridians(meridians,labels=labels,ax=ax)
      m.drawparallels(parallels,labels=labels,ax=ax)
      m.drawmapboundary()

      # Transform mesh
      px, py = m(meshx, meshy)

      cont = None

      # Colour individual grid boxes? (no contours)
      if pcolor:
        clevs = None  # can't have both
        cont = m.pcolor(px, py, values, **kwargs)
        ret = cont

      # Filled contours?
      if clevs is not None:
        cont = m.contourf(px, py, values, clevs, **kwargs)
        ret = cont

      # Colour bar?
      if cbar and cont is not None: 
        fig.colorbar(cont, ax=ax, **cbar)

      # Contour lines?
      if clines is not None:
        ret = m.contour(px, py, values, clines, colors='k')
    else:
      cont = None

      # Colour individual grid boxes? (no contours)
      if pcolor:
        clevs = None  # can't have both
        cont = ax.pcolor(meshx, meshy, values, **kwargs)
        ret = cont

      # Filled contours?
      if clevs is not None:
        cont = ax.contourf(meshx, meshy, values, clevs, **kwargs)
        ret = cont

      # Colour bar?
      if cbar and cont is not None: 
        fig.colorbar(cont, ax=ax, **cbar)

      # Contour lines?
      if clines is not None:
        ret = ax.contour(meshx, meshy, values, clines, colors='k')

      # Disable autoscale.  Otherwise, if we set a log scale below, then
      # the range of our axes will get screwed up.
      # (This is a 'feature' of matplotlib!)
      # http://www.mail-archive.com/[email protected]/msg10527.html
      # Applied to the axes being drawn on, which need not be the current axes.
      ax.set_autoscale_on(False)

      # Set the axis limits
      ax.set_xscale(xaxis.plotatts['plotscale'])
      xlims = min(xvals),max(xvals)
      ax.set_xlim(xlims[::xaxis.plotatts['plotorder']])

      ax.set_yscale(yaxis.plotatts['plotscale'])
      ylims = min(yaxis.values),max(yaxis.values)
      ax.set_ylim(ylims[::yaxis.plotatts['plotorder']])

      # Set x and y labels and formatters     
      if kwargs.pop('lblx', True):
        ax.set_xlabel(_buildaxistitle(**xaxis.plotatts))
        ax.xaxis.set_major_formatter(xaxis.formatter())
        loc = xaxis.locator()
        if loc is not None: ax.xaxis.set_major_locator(loc)
      else:
        ax.set_xticklabels('')      
      if kwargs.pop('lbly', True):
        ax.set_ylabel(_buildaxistitle(**yaxis.plotatts))
        ax.yaxis.set_major_formatter(yaxis.formatter())
        loc = yaxis.locator()
        if loc is not None: ax.yaxis.set_major_locator(loc)
      else:
        ax.set_yticklabels('')

  # Restore interactive mode if it was on when we were called
  if wasint:
    ion()
    draw()
    if not wait: show()

  if ret is not None: return ret
Example #21
0
def _frame_to_rgb(data, palette):
    """Expand one scaled frame into red, green and blue channels.

    Parameters
    ----------
    data : numpy array with a trailing length-1 axis (as built by ``save``)
        Frame values already scaled to the range 0..255.
    palette : str
        'bw' copies the values into all three channels; any other value
        selects the piecewise-linear 'rainbow' ramp, which runs
        blue (0) -> cyan (85) -> yellow (170) -> red (255).

    Returns
    -------
    Array with the three channels concatenated along axis 2.
    """
    import numpy as np

    if palette == 'bw':
        # Same data for R, G, and B channels
        return np.concatenate([data, data, data], axis=2)

    # The three segments of the piecewise linear palette
    low = data <= 85
    mid = (85 < data) & (data <= 170)
    high = 170 < data

    blue = np.zeros(data.shape)
    blue[low] = 255
    blue[mid] = 255 - (data[mid] - 85) * 3

    green = np.zeros(data.shape)
    green[low] = data[low] * 3
    green[mid] = 255
    green[high] = 255 - (data[high] - 170) * 3

    red = np.zeros(data.shape)
    red[mid] = (data[mid] - 85) * 3
    red[high] = 255

    return np.concatenate([red, green, blue], axis=2)


def save(filename, var, iaxis=None, fps=15, palette='bw', minmax=None):
    """Render a 3D variable as a movie, one frame per step along ``iaxis``.

    Each step along the animation axis is written as a temporary JPEG, and
    the external ``mencoder`` program then assembles the frames into
    ``filename``.  The temporary directory is removed afterwards, even when
    an error occurs part-way through.

    Parameters
    ----------
    filename : str
        Output movie file, passed to ``mencoder -o``.
    var : Var
        Data to animate.  Must have exactly 3 axes once degenerate
        dimensions are squeezed out.
    iaxis : int, optional
        Index of the axis to animate over (after squeezing and sorting).
        Defaults to the variable's ``TAxis``.
    fps : number
        Frames per second handed to mencoder.
    palette : {'bw', 'rainbow'}
        Colour mapping used for the frames.
    minmax : pair, optional
        (min, max) values mapped to the ends of the palette.  Defaults to
        the minimum and maximum of the whole variable.  Data outside this
        range is clamped to the ends of the palette.

    Raises
    ------
    ValueError
        If ``palette`` is not one of the supported names.
    AssertionError
        If ``var`` is not a Var, is not 3D, has no time axis when ``iaxis``
        is omitted, or ``minmax`` does not have two entries.

    Notes
    -----
    A failure to run mencoder (missing executable or non-zero exit status)
    is reported through ``warnings.warn`` rather than raised.
    """
    # Validate the cheap argument before doing any work or importing anything
    if palette not in ('bw', 'rainbow'):
        raise ValueError(
            "unknown palette %r (expected 'bw' or 'rainbow')" % (palette,))

    from pygeode.axis import TAxis
    from pygeode.var import Var
    from pygeode.progress import PBar
    import os
    import shutil
    import subprocess
    import tempfile
    import warnings
    import numpy as np
    try:
        from PIL import Image
    except ImportError:
        # Legacy PIL installs expose Image as a top-level module
        import Image

    assert isinstance(var, Var)

    # Remove any degenerate dimensions, make sure the axes are in a consistent order
    var = var.squeeze().sorted()
    assert var.naxes == 3, "can only work with 3D data"

    if iaxis is None: iaxis = var.whichaxis(TAxis)
    assert iaxis >= 0, "no time axis found"

    # Get max & min values of the whole dataset
    if minmax is None:
        #TODO: calculate both of these at once, with a progress bar to help the process
        vmin = float(var.min())
        vmax = float(var.max())
    else:
        assert len(minmax) == 2, "invalid minmax argument"
        vmin, vmax = minmax
    span = vmax - vmin

    # Frame size is whatever remains once the animation axis is dropped
    shape = list(var.shape)
    h, w = shape[:iaxis] + shape[iaxis + 1:]

    nframes = len(var.axes[iaxis])
    sl = [slice(None)] * 3

    print("Saving %s:" % filename)
    pbar = PBar()

    # Everything that touches the temporary directory lives inside the
    # try/finally so that the frames never outlive this call.
    tmpdir = tempfile.mkdtemp(prefix='pygeode_mpeg')
    try:
        # Loop over each timestep, generate a temporary image file
        for i in range(nframes):
            fpbar = pbar.part(i, nframes)
            sl[iaxis] = i
            # Get data, flip y axis, add an 'RGB' axis
            data = var[sl].squeeze()[::-1, :, np.newaxis]
            if span != 0:
                data = (data - vmin) / span * 255
            else:
                # Constant field: avoid dividing by zero, use the low end
                data = np.zeros(data.shape)

            data = _frame_to_rgb(data, palette)

            # Encode as an 8-bit array; clamp first so that values outside
            # a user-supplied minmax do not wrap around in the uint8 cast.
            data = np.asarray(np.round(np.clip(data, 0, 255)), 'uint8')

            # Save
            framefile = os.path.join(tmpdir, "frame%04d.jpg" % i)
            Image.fromarray(data, "RGB").save(framefile, quality=95)
            fpbar.update(100)

        # Make the movie file.  An argument list (no shell) keeps unusual
        # characters in filename from being interpreted by a shell; the
        # mf:// wildcard is expanded by mencoder itself.
        command = [
            "mencoder", "mf://%s/*.jpg" % tmpdir,
            "-mf", "w=%s:h=%s:type=jpg:fps=%s" % (w, h, fps),
            "-ovc", "lavc",
            "-lavcopts", "vcodec=mpeg4:vbitrate=8000",
            "-oac", "copy",
            "-o", "%s" % filename,
        ]
        try:
            status = subprocess.call(command)
        except OSError as err:
            warnings.warn("could not run mencoder: %s" % err)
        else:
            if status != 0:
                warnings.warn("mencoder exited with status %s" % status)
    finally:
        # Clean up files
        shutil.rmtree(tmpdir)
Example #22
0
def check_dataset (dataset):
  '''Read every record of every variable in a dataset and report I/O failures.

  Each variable (and each axis used by the variables) is read one index at
  a time along its first axis.  Any exception raised while fetching a
  record is caught and remembered.  Afterwards a summary is printed per
  failing variable, with consecutive records that gave the same message
  merged into a single line.

  Parameters
  ==========
  dataset : Dataset, or anything accepted by ``asdataset``
    The data to check (e.g. a plain list of vars).

  Raises
  ======
  Exception
    If at least one record of any variable could not be read.
  '''
  from pygeode.tools import combine_axes
  from pygeode.progress import PBar
  from pygeode.dataset import asdataset
  import numpy as np

  # Make sure we have a dataset (in case we're sent a simple list of vars)
  dataset = asdataset(dataset)

  allvars = list(dataset.vars)

  # Include axes in the list of vars (to check these values too)
  allvars.extend(combine_axes(v.axes for v in allvars))

  # Relative progress of each variable
  sizes = [v.size for v in allvars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Checking %s for I/O errors:"%repr(dataset))

  # One (var, indices, messages) record per checked variable.  Kept as a
  # list rather than dicts keyed on var.name, so that two entries sharing
  # a name cannot overwrite each other's results.
  results = []

  # Loop over the data
  for i,var in enumerate(allvars):

    varpbar = pbar.subset(prog[i], prog[i+1])

    # Scan the outer axis (record axis?) for failures.
    N = var.shape[0]
    indices = []
    messages = []

    for j in range(N):
      vpbar = varpbar.part(j, N)
      try:
        # Try fetching the data, see if something fails
        var[j] if var.naxes == 1 else var[j,...]
      except Exception as e:
        indices.append(j)
        messages.append(str(e))
      vpbar.update(100)

    results.append((var, indices, messages))

  # Print summary information for each variable
  everything_ok = True
  for var, indices, messages in results:
    if not indices: continue

    everything_ok = False

    # Single-argument print(...) parses the same under Python 2 and 3
    print("\nFailures encountered with variable '%s':"%var.name)

    # Merge records into runs [first, last, message].  A run is extended
    # only while the message is unchanged and the record index directly
    # follows the previous one; indices arrive in ascending order, so
    # comparing against the end of the last run is sufficient.
    runs = []
    for ind,msg in zip(indices,messages):
      if runs and runs[-1][2] == msg and runs[-1][1] == ind-1:
        runs[-1][1] = ind
      else:
        runs.append([ind, ind, msg])

    # Print each run together with the error message it produced
    for first,last,msg in runs:
      print("=> at %s:\n    %s"% (var.axes[0].slice[first:last+1], msg))

  if not everything_ok: raise Exception("Problem encountered with the dataset.")
Example #23
0
def isnonzero(X, axes=None, alpha=0.05, N_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X, against the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number is the count of non-missing values floor-divided
    by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a default
    :class:`PBar` is created.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Three quantities are computed:

    * The mean value of X
    * ``p``: the t-distribution CDF evaluated at \|t\|, multiplied by the
      sign of the mean (note that this is not a two-sided p-value)
    * The half-width of the confidence interval of the mean at the level
      specified by alpha

    If the average is taken over all axes of X resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset, named after
    X (``<name>``, ``p_<name>`` and ``CI_<name>``).

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`. 
  The p-value and confidence interval are computed for the t-statistic defined in 
  eq (6.61) of von Storch and Zwiers 1999.'''

  from pygeode.tools import loopover, npnansum
  from pygeode.view import View

  # Indices of the axes being averaged over; None selects every axis of X
  if axes is None:
    riaxes = list(range(len(X.axes)))
  else:
    riaxes = [X.whichaxis(n) for n in axes]

  # Axes that survive the reduction, and the view used to loop over them
  oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
  oview = View(oaxes) 

  # np.prod replaces the np.product alias, which NumPy 2.0 removed
  N = np.prod([len(X.axes[i]) for i in riaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert N > 1, '%s has only one element along the reduction axes' % X.name

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  Na = np.zeros(oview.shape, 'd')

  x[()] = np.nan
  xx[()] = np.nan
  Na[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Na[outsl] = np.nansum([Na[outsl], npnansum(1. + xdata*0., riaxes)], 0) 

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Na) / (Na - 1)
  x /= Na

  # Effective sample size, optionally reduced by N_fac
  if N_fac is not None: 
    eNa = Na//N_fac
  else: 
    eNa = Na

  # Standard deviation of the mean
  sdom = np.sqrt(xx/eNa)

  p = tdist.cdf(abs(x/sdom), eNa - 1)*np.sign(x)
  ci = tdist.ppf(1. - alpha/2, eNa - 1) * sdom

  name = X.name if X.name != '' else 'X'

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    X = Var(oaxes, values=x, name=name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([X, P, CI])
  else: # Degenerate case
    return x, p, ci
Example #24
0
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):
    # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
    the hypothesis that it is 0.

    Parameters
    ==========
    X : :class:`Var`
      Variable to average.

    axes : list, optional
      Axes over which to compute the mean; if nothing is specified, the mean is
      computed over all axes.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number is the count of non-missing values floor-divided
      by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'm,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a default
      :class:`PBar` is created.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if
      ``ds`` is the returned dataset, the mean value can be obtained through
      ``ds.m``). The following quantities can be calculated.

      * 'm': The mean value of X
      * 'p': The two-sided p-value of the mean under the hypothesis that the
        population mean is zero
      * 'ci': The half-width of the confidence interval of the mean at the
        level specified by alpha

      The requested quantities are always wrapped in :class:`Var` objects
      defined on the axes of X that were not averaged over.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed for
    the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    # Indices of the axes being averaged over.  The default axes=None used
    # to fail when iterated; it now selects every axis, as documented.
    if axes is None:
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]

    # Axes that survive the reduction, and the view used to loop over them
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # np.prod replaces the np.product alias, which NumPy 2.0 removed
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    Na = np.zeros(oview.shape, 'd')

    x[()] = np.nan
    xx[()] = np.nan
    Na[()] = np.nan

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)

        # Count of non-missing values contributing to each output point
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    # Only points with at least one valid sample are processed further
    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    # Effective sample size, optionally reduced by N_fac
    if N_fac is not None:
        eNa = Na // N_fac
    else:
        eNa = Na

    sdom = np.zeros((oview.shape), 'd')
    p = np.zeros((oview.shape), 'd')
    t = np.zeros((oview.shape), 'd')
    ci = np.zeros((oview.shape), 'd')

    # Standard deviation of the mean; the t-statistic is left at zero
    # wherever it would require dividing by a zero standard deviation.
    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    # Wrap the requested quantities as Vars, in the fixed order m, p, ci
    rvs = []

    if 'm' in output:
        mvar = Var(oaxes, values=x, name='m')
        mvar.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(mvar)

    if 'p' in output:
        pvar = Var(oaxes, values=p, name='p')
        pvar.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(pvar)

    if 'ci' in output:
        civar = Var(oaxes, values=ci, name='ci')
        civar.atts[
            'longname'] = 'Confidence intervale of the mean value of %s' % (
                name, )
        rvs.append(civar)

    return asdataset(rvs)