Example #1
0
  def __init__ (self, *args):
  # {{{
    '''Set up a variable representing ``self.op`` applied to ``args``.

    Parameters
    ==========
    *args : Var instances and/or scalars
      Operands, in the order expected by ``self.op``. Python integers, floats
      and complex numbers are treated as scalars; any other argument must
      expose a ``dtype`` attribute.

    Notes
    =====
    The output dtype is found by applying ``self.op`` to 0-d dummy arrays
    carrying the dtypes of the operands. A default name is built from
    ``self.symbol`` (or from ``self.op.__name__`` when the symbol is None) and
    the names of the operands.'''
    from pygeode.tools import combine_axes
    from pygeode.var import combine_meta
    import numpy as np

    assert self.op is not None, "can't instantiate UfuncVar directly"

    # ``long`` only exists on Python 2. On Python 3 fall back to plain ``int``
    # instead of failing with a NameError.
    try:
      int_types = (int, long)
    except NameError:
      int_types = (int,)

    # Positions of the Var arguments, and the Var arguments themselves
    ivars = [i for i,v in enumerate(args) if isinstance(v, Var)]
    vars = [args[i] for i in ivars]

    axes = combine_axes(vars)

    self.args = args
    self.ivars = ivars

    # Create some dummy scalar args to test the dtype
    dummy_dtypes = []
    for a in args:
      if isinstance(a, int_types): dummy_dtypes.append('int64')
      elif isinstance(a, float): dummy_dtypes.append('float64')
      elif isinstance(a, complex): dummy_dtypes.append('complex128')
      else: dummy_dtypes.append(a.dtype)
    dummy_args = [np.array(1,dtype=d) for d in dummy_dtypes]
    dtype = self.op(*dummy_args).dtype

    # TODO: Type check arguments. numpy arrays probably shouldn't be allowed

    # Generate a default name
    symbol = self.symbol
    names = [(arg.name or '??') if isinstance(arg,Var) else str(arg) for arg in args]
    # Strip out the enclosing parentheses if there's only one name
    if len(names) == 1:
      if names[0].startswith('(') and names[0].endswith(')'):
        names[0] = names[0][1:-1]

    if symbol is None:
      # No symbol: function-call notation, e.g. op(a,b)
      name = self.op.__name__ + '(' + ','.join(names) + ')'

    elif isinstance(symbol,(list,tuple)):
      # A (prefix, suffix) pair wrapped around a single operand
      assert len(names) == 1
      name = symbol[0] + names[0] + symbol[1]

    else:
      # Infix symbol placed between the operands
      assert isinstance(symbol, str)
      name = '(' + symbol.join(names) + ')'

    # Special case: applying a scalar to a Var object with a simple name.
    # In this case, keep the original name.
    if len(args) == 2 and len(vars) == 1:  # One scalar, one var
      if '(' not in vars[0].name and ')' not in vars[0].name:
        if self.symbol in ('+','-','*','/'):  # Basic arithmetic only
          name = vars[0].name

    Var.__init__(self, axes, dtype=dtype)

    # Copy any common generic metadata
    combine_meta(vars, self)
    # Use our locally derived name (override combine_meta)
    self.name = name
Example #2
0
  def __init__ (self, var, indices):
  # {{{
    '''Set up a variable that reduces ``var`` over the given axes.

    Parameters
    ==========
    var : Var (or list/tuple of Var)
      The input variable. For a list or tuple, the axes and dtype are
      combined across its members.
    indices : sequence
      Identifiers of the axes to reduce over, each resolved with
      ``var.whichaxis``.

    Raises
    ======
    ValueError
      If one of the reduction axes has length 0.'''
    from pygeode.var import Var
    import numpy as np
    from pygeode.tools import combine_axes, common_dtype
    # Are we given a list of variables to work on in parallel?
    # NOTE(review): the code below still uses var.whichaxis / var.name /
    # var.atts, which a plain list or tuple would not provide - confirm how
    # this branch is meant to be used.
    if isinstance(var,(tuple,list)):
      axes = combine_axes(var)
      dtype = common_dtype(var)
    else:
      axes = var.axes
      dtype = var.dtype

    # Sorted integer positions of the reduction axes
    indices = np.sort([var.whichaxis(i) for i in indices])
    assert len(indices) > 0, "no reduction axes specified"

    N = [len(axes[i]) for i in indices]
    # Check for degenerate reductions (ill-defined).
    # Pair each length with its axis position so that the error message names
    # the axis that is actually empty.
    for i,n in zip(indices, N):
      if n == 0:  raise ValueError("Can't do a reduction over axis '%s' - length is 0."%axes[i].name)
    # np.prod rather than np.product, which was removed in NumPy 2.0
    N = int(np.prod(N))
    self.N =  N # number of values to reduce over
    self.var = var
    self.indices = indices

    self.in_axes = axes

    # Remove the reduction axis from the output variable
    axes = [a for i,a in enumerate(axes) if i not in indices]

    Var.__init__(self, axes, dtype=dtype, name=var.name, atts=var.atts, plotatts=var.plotatts)
Example #3
0
def tidy_axes(dataset, unlimited=None):
    # {{{
    """Return a dataset holding the variables of ``dataset`` plus its axes.

    Every axis used by the variables is appended to the variable list, except
    those stored in ``dataset`` as a DummyAxis. All resulting names must be
    unique. When ``unlimited`` is given, it must be the name of one of the
    axes.
    """
    from pygeode.tools import combine_axes
    from pygeode.axis import DummyAxis
    from pygeode.dataset import asdataset

    data_vars = list(dataset.vars)
    # The output axes
    all_axes = combine_axes(v.axes for v in data_vars)

    # Keep only axes with intrinsic values. The type check goes through the
    # original dataset, because finalize_save may have forced everything to
    # be NamedAxis.
    real_axes = []
    for ax in all_axes:
        if isinstance(dataset[ax.name], DummyAxis):
            continue
        real_axes.append(ax)
    members = data_vars + real_axes

    # Variables (and axes) must all have unique names
    names = [m.name for m in members]
    assert len(set(names)) == len(
        members), "vars must have unique names: %s" % names

    if unlimited is not None:
        assert unlimited in [ax.name for ax in all_axes]

    return asdataset(members)
Example #4
0
  def __init__ (self, var, indices):
  # {{{
    '''Set up a variable that reduces ``var`` over the given axes.

    Parameters
    ==========
    var : Var (or list/tuple of Var)
      The input variable. For a list or tuple, the axes and dtype are
      combined across its members.
    indices : sequence
      Identifiers of the axes to reduce over, each resolved with
      ``var.whichaxis``.

    Raises
    ======
    ValueError
      If one of the reduction axes has length 0.'''
    from pygeode.var import Var
    import numpy as np
    from pygeode.tools import combine_axes, common_dtype
    # Are we given a list of variables to work on in parallel?
    # NOTE(review): the code below still uses var.whichaxis / var.name /
    # var.atts, which a plain list or tuple would not provide - confirm how
    # this branch is meant to be used.
    if isinstance(var,(tuple,list)):
      axes = combine_axes(var)
      dtype = common_dtype(var)
    else:
      axes = var.axes
      dtype = var.dtype

    # Sorted integer positions of the reduction axes
    indices = np.sort([var.whichaxis(i) for i in indices])
    assert len(indices) > 0, "no reduction axes specified"

    N = [len(axes[i]) for i in indices]
    # Check for degenerate reductions (ill-defined).
    # Pair each length with its axis position so that the error message names
    # the axis that is actually empty.
    for i,n in zip(indices, N):
      if n == 0:  raise ValueError("Can't do a reduction over axis '%s' - length is 0."%axes[i].name)
    # np.prod rather than np.product, which was removed in NumPy 2.0
    N = int(np.prod(N))
    self.N =  N # number of values to reduce over
    self.var = var
    self.indices = indices

    self.in_axes = axes

    # Remove the reduction axis from the output variable
    axes = [a for i,a in enumerate(axes) if i not in indices]

    Var.__init__(self, axes, dtype=dtype, name=var.name, atts=var.atts, plotatts=var.plotatts)
Example #5
0
def write_var(ncfile, dataset, unlimited=None, compress=False):
    # {{{
    """Define and write all variables of ``dataset`` into ``ncfile``.

    Parameters
    ----------
    ncfile : open, writable netCDF file object
        Must provide ``createDimension``, ``createVariable``, ``setncatts``,
        ``variables`` and ``filepath`` (presumably a ``netCDF4.Dataset`` -
        confirm against the caller).
    dataset : Dataset
        Source of the variables and of the global attributes.
    unlimited : str, optional
        Name of the axis to create as an unlimited dimension (size ``None``).
    compress : bool, optional
        Passed as ``zlib`` when creating each variable.
    """
    from pygeode.view import View
    import numpy as np
    from pygeode.progress import PBar
    from pygeode.tools import combine_axes

    vars = list(dataset.vars)
    axes = combine_axes(v.axes for v in vars)

    # Define the dimensions
    for a in axes:
        ncfile.createDimension(a.name,
                               size=(None if a.name == unlimited else len(a)))

    # Define the variables (including axes)
    for var in vars:
        dimensions = [a.name for a in var.axes]
        v = ncfile.createVariable(var.name,
                                  datatype=var.dtype,
                                  dimensions=dimensions,
                                  zlib=compress,
                                  fill_value=var.atts.get('_FillValue', None))
        v.setncatts(var.atts)

    # global attributes
    ncfile.setncatts(dataset.atts)

    # Relative progress of each variable: cumulative share of the total size,
    # in percent, so that prog[i]..prog[i+1] is the slice owned by variable i
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % ncfile.filepath())

    # Write the data
    for i, var in enumerate(vars):
        ncvar = ncfile.variables[var.name]
        varpbar = pbar.subset(prog[i], prog[i + 1])

        # Break the variable into pieces produced by View.loop_mem()
        views = list(View(var.axes).loop_mem())
        nviews = len(views)

        for j, v in enumerate(views):
            vpbar = varpbar.part(j, nviews)
            ncvar[v.slices] = v.get(var, pbar=vpbar)
Example #6
0
def write_var (ncfile, dataset, unlimited=None, compress=False):
# {{{
  '''Define and write all variables of ``dataset`` into ``ncfile``.

  Parameters
  ==========
  ncfile : open, writable netCDF file object
    Must provide ``createDimension``, ``createVariable``, ``setncatts``,
    ``variables`` and ``filepath`` (presumably a ``netCDF4.Dataset`` -
    confirm against the caller).
  dataset : Dataset
    Source of the variables and of the global attributes.
  unlimited : str, optional
    Name of the axis to create as an unlimited dimension (size ``None``).
  compress : bool, optional
    Passed as ``zlib`` when creating each variable.'''
  from pygeode.view import View
  import numpy as np
  from pygeode.progress import PBar
  from pygeode.tools import combine_axes

  vars = list(dataset.vars)
  axes = combine_axes(v.axes for v in vars)

  # Define the dimensions
  for a in axes:
    ncfile.createDimension(a.name, size=(None if a.name == unlimited else len(a)))

  # Define the variables (including axes)
  for var in vars:
    dimensions = [a.name for a in var.axes]
    v = ncfile.createVariable(var.name, datatype=var.dtype, dimensions=dimensions, zlib=compress, fill_value=var.atts.get('_FillValue',None))
    v.setncatts(var.atts)

  # global attributes
  ncfile.setncatts(dataset.atts)

  # Relative progress of each variable: cumulative share of the total size,
  # in percent, so that prog[i]..prog[i+1] is the slice owned by variable i
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Saving '%s':"%ncfile.filepath())

  # Write the data
  for i,var in enumerate(vars):
    ncvar = ncfile.variables[var.name]
    varpbar = pbar.subset(prog[i], prog[i+1])

    # Break the variable into pieces produced by View.loop_mem()
    views = list(View(var.axes).loop_mem())
    nviews = len(views)

    for j,v in enumerate(views):
      vpbar = varpbar.part(j, nviews)
      ncvar[v.slices] = v.get(var, pbar=vpbar)
Example #7
0
def tidy_axes(dataset, unlimited=None):
# {{{
  '''Return a dataset holding the variables of ``dataset`` plus its axes.

  Every axis used by the variables is appended to the variable list, except
  those stored in ``dataset`` as a DummyAxis. All resulting names must be
  unique. When ``unlimited`` is given, it must be the name of one of the
  axes.'''
  from pygeode.tools import combine_axes
  from pygeode.axis import DummyAxis
  from pygeode.dataset import asdataset

  data_vars = list(dataset.vars)
  # The output axes
  all_axes = combine_axes(v.axes for v in data_vars)

  # Keep only axes with intrinsic values. The type check goes through the
  # original dataset, because finalize_save may have forced everything to
  # be NamedAxis.
  real_axes = []
  for ax in all_axes:
    if isinstance(dataset[ax.name], DummyAxis):
      continue
    real_axes.append(ax)
  members = data_vars + real_axes

  # Variables (and axes) must all have unique names
  names = [m.name for m in members]
  assert len(set(names)) == len(members), "vars must have unique names: %s"% names

  if unlimited is not None:
    assert unlimited in [ax.name for ax in all_axes]

  return asdataset(members)
Example #8
0
def difference(X, Y, axes, alpha=0.05, Nx_fac = None, Ny_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list
    Axes over which to compute means. This argument is required: every entry
    is looked up in the combined axes of X and Y.

  alpha : float
    Significance level; the confidence interval is evaluated at the
    ``1 - alpha/2`` quantile of the t distribution.

  Nx_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and
  confidence interval are computed based on the t-statistic in eq (6.19).'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
  from pygeode.view import View

  # Split the combined axes into those reduced over and those kept
  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  # Positions of the reduction axes within X and Y, and the number of
  # elements along them (np.prod: np.product was removed in NumPy 2.0)
  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays. From here on Nx and Ny are arrays counting the
  # contributing elements per output point, replacing the scalar sizes above.
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  Nx = np.zeros(oview.shape, 'd')
  Ny = np.zeros(oview.shape, 'd')

  # Start from NaN; np.nansum below treats NaN as zero when accumulating
  x[()] = np.nan
  y[()] = np.nan
  xx[()] = np.nan
  yy[()] = np.nan
  Nx[()] = np.nan
  Ny[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0)

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Nx) / (Nx - 1)
  yy = (yy - y**2/Ny) / (Ny - 1)
  x /= Nx
  y /= Ny

  # Effective sample sizes after the optional rescaling
  if Nx_fac is not None: eNx = Nx//Nx_fac
  else: eNx = Nx
  if Ny_fac is not None: eNy = Ny//Ny_fac
  else: eNy = Ny

  d = x - y
  den = np.sqrt(xx/eNx + yy/eNy)
  df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

  # t CDF evaluated at |d/den|, carrying the sign of the difference
  p = tdist.cdf(abs(d/den), df)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, df) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=df, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, df, p, ci
Example #9
0
def save(filename,
         in_dataset,
         version=3,
         pack=None,
         compress=False,
         cfmeta=True,
         unlimited=None):
    # {{{
    """Write a dataset to a netCDF file through the C library handle ``lib``.

    ``lib``, ``nc_type``, ``numpy_type`` and ``put_attributes`` are names
    defined elsewhere in this module.

    Parameters
    ----------
    filename : str
        Path of the file to create; encoded as ASCII for the C call.
    in_dataset : dataset-like
        Converted with ``asdataset`` and then passed through
        ``finalize_save`` together with ``cfmeta`` and ``pack``.
    version : int
        netCDF format version, 3 or 4. Forced to 4 when ``compress`` is true.
    pack, cfmeta
        Forwarded unchanged to ``finalize_save``.
    compress : bool
        When true, ``nc_def_var_deflate`` is applied to every variable.
    unlimited : str, optional
        Name of the axis whose dimension is defined with length 0
        (presumably the unlimited-dimension marker - confirm against the
        netCDF C API).

    Raises
    ------
    IOError
        If the file cannot be created.
    AssertionError
        For invalid arguments and for any later netCDF library call that
        returns a non-zero status.
    """
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?
    if compress: version = 4
    assert version in (3, 4)

    # Receives the file id from nc_create (passed by reference below)
    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]
    #vars.extend(axes)

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing entire array
    # (keyed by the type code looked up in nc_type; not used below)
    allf = {
        1: lib.nc_put_var_schar,
        2: lib.nc_put_var_text,
        3: lib.nc_put_var_short,
        4: lib.nc_put_var_int,
        5: lib.nc_put_var_float,
        6: lib.nc_put_var_double,
        7: lib.nc_put_var_uchar,
        8: lib.nc_put_var_ushort,
        9: lib.nc_put_var_uint,
        10: lib.nc_put_var_longlong,
        11: lib.nc_put_var_ulonglong
    }

    # Functions for writing chunks
    # (same keys as above; selected per variable in the write loop)
    chunkf = {
        1: lib.nc_put_vara_schar,
        2: lib.nc_put_vara_text,
        3: lib.nc_put_vara_short,
        4: lib.nc_put_vara_int,
        5: lib.nc_put_vara_float,
        6: lib.nc_put_vara_double,
        7: lib.nc_put_vara_uchar,
        8: lib.nc_put_vara_ushort,
        9: lib.nc_put_vara_uint,
        10: lib.nc_put_vara_longlong,
        11: lib.nc_put_vara_ulonglong
    }

    # Create the file
    if version == 3:
        ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    elif version == 4:
        ret = lib.nc_create(filename.encode('ascii'), 0x1000,
                            byref(fileid))  # 0x1000 = NC_NETCDF4
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    else: raise Exception

    # Everything from here on runs with the file open; the finally clause
    # closes it whether or not an error occurs.
    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            if unlimited == a.name:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0),
                                     byref(dimids[i]))
            else:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'),
                                     c_long(len(a)), byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        chunks = [None] * len(vars)
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[list(axes).index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t,
                                 var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only works for netcdf4 or (higher?))
            if compress:
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            # modify axes to be netcdf friendly (CF-compliant, etc.)
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file
        # (256 is presumably the NC_NOFILL mode constant - confirm)
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)

        # Relative progress of each variable
        # (cumulative share of the total size, in percent)
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

        #  print "Saving '%s':"%filename
        pbar = PBar(message="Saving '%s':" % filename)
        #  pbar = FakePBar()
        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            dtype = numpy_type[t]

            #    print 'writing', var.name

            # number of actual variables (non-axes) for determining our progress
            # NOTE(review): N is not referenced again in this function.
            N = len([v for v in vars if not isinstance(v, Axis)])
            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):

                vpbar = varpbar.part(j, len(views))
                #      print '???', repr(str(v))

                # Should always be slices (since we're looping over whole thing contiguously?)
                for sl in v.slices:
                    assert isinstance(sl, slice)
                for sl in v.slices:
                    assert sl.step in (1, None)

                # Start offset and extent of this piece along each axis
                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    data = var.values
                    data = data[
                        start[0]:start[0] +
                        count[0]]  # the above gives us the *whole* axis,
                    # but under extreme conditions we may be looping over smaller pieces
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished
        lib.nc_close(fileid)
Example #10
0
    def __init__(self, *args):
        # {{{
        """Set up a variable representing ``self.op`` applied to ``args``.

        ``args`` may mix Var instances with Python scalars (int, float,
        complex); any other argument must expose a ``dtype`` attribute. The
        output dtype is found by applying ``self.op`` to 0-d dummy arrays,
        and a default name is assembled from ``self.symbol`` and the names
        of the operands.
        """
        from pygeode.tools import combine_axes
        from pygeode.var import combine_meta
        import numpy as np

        assert self.op is not None, "can't instantiate UfuncVar directly"

        # Positions of the Var operands, and the operands themselves
        var_positions = []
        var_args = []
        for pos, arg in enumerate(args):
            if isinstance(arg, Var):
                var_positions.append(pos)
                var_args.append(arg)

        axes = combine_axes(var_args)

        self.args = args
        self.ivars = var_positions

        # Pick a stand-in dtype for each operand, then let the operator
        # itself decide the output dtype from dummy 0-d arrays.
        probe_dtypes = []
        for arg in args:
            if isinstance(arg, int):
                probe_dtypes.append('int64')
            elif isinstance(arg, float):
                probe_dtypes.append('float64')
            elif isinstance(arg, complex):
                probe_dtypes.append('complex128')
            else:
                probe_dtypes.append(arg.dtype)
        probes = [np.array(1, dtype=kind) for kind in probe_dtypes]
        dtype = self.op(*probes).dtype

        # TODO: Type check arguments. numpy arrays probably shouldn't be allowed

        # Generate a default name
        symbol = self.symbol
        labels = []
        for arg in args:
            if isinstance(arg, Var):
                labels.append(arg.name or '??')
            else:
                labels.append(str(arg))

        # A lone operand loses its enclosing parentheses
        if len(labels) == 1:
            sole = labels[0]
            if sole.startswith('(') and sole.endswith(')'):
                labels[0] = sole[1:-1]

        if symbol is None:
            # Function-call notation
            name = self.op.__name__ + '(' + ','.join(labels) + ')'
        elif isinstance(symbol, (list, tuple)):
            # (prefix, suffix) pair around a single operand
            assert len(labels) == 1
            name = symbol[0] + labels[0] + symbol[1]
        else:
            # Infix symbol between the operands
            assert isinstance(symbol, str)
            name = '(' + symbol.join(labels) + ')'

        # Special case: basic arithmetic between one scalar and one Var
        # with a simple (parenthesis-free) name keeps the Var's name.
        if len(args) == 2 and len(var_args) == 1:
            simple = var_args[0].name
            if not ('(' in simple or ')' in simple):
                if self.symbol in ('+', '-', '*', '/'):
                    name = simple

        Var.__init__(self, axes, dtype=dtype)

        # Copy any common generic metadata
        combine_meta(var_args, self)
        # Use our locally derived name (override combine_meta)
        self.name = name
Example #11
0
def paired_difference(X,
                      Y,
                      axes=None,
                      alpha=0.05,
                      N_fac=None,
                      output='d,p,ci',
                      pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
    of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
    shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must share all axes over which the means are being computed.

    axes : list, optional
      Axes over which to compute means; if nothing is specified, the mean
      is computed over all axes shared by X and Y.

    alpha : float
      Significance level; the confidence interval is evaluated at the
      ``1 - alpha/2`` quantile of the t distribution.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom of
      X and Y; the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible outputs
      in the Returns section. The specifications must be separated by a comma. Defaults
      to 'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be displayed
      if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The names
      of the variables match the output request string (i.e. if ``ds`` is the
      returned dataset, the average of the difference can be obtained by
      ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified by ``alpha``

    Raises
    ======
    ValueError
      If ``output`` requests none of the supported quantities.
    KeyError
      If an entry of ``axes`` is not shared by X and Y.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
    hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
    provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated explicitly by this
    routine. The p-value and confidence interval are computed based on the t-statistic in eq
    (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # Default: reduce over every axis shared by X and Y. Without this the
        # lookups below would try to iterate over None.
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within X and Y, and the number of
    # elements along them (np.prod: np.product was removed in NumPy 2.0)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays, starting from NaN; np.nansum below treats NaN as
    # zero when accumulating
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y],
                                          oview,
                                          inaxes=srcaxes,
                                          pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    # Effective sample size after the optional rescaling
    if N_fac is not None: eN = N // N_fac
    else: eN = N

    emsk = (eN > 1)

    # Outputs stay at zero wherever the test cannot be evaluated
    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    # Two-sided p-value from the t statistic |d| / den
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # The difference is X - Y, matching the longname attributes above
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
Example #12
0
def correlate(X, Y, axes=None, output='r2,p', pbar=None):
    # {{{
    r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p':  The p-value; see notes.

    Both outputs are NaN wherever there are no valid (non-NaN) pairs or where
    the variance of either variable vanishes.

  Raises
  ======
  ValueError
    If ``output`` contains none of the recognized output names.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding a correlation
  coefficient of equal or greater magnitude (two-sided) to the given result
  under the hypothesis that the true correlation coefficient between X and Y is
  zero. It is computed from the t-statistic given in eq (8.7), in section
  8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduction axes within each chunk (they follow oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays (NaN until the first chunk is accumulated)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape of their product
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A pair is invalid if either member is NaN; blank out both members
        # so that every accumulated sum uses the same set of points
        bad = np.isnan(xydata)
        xdata[bad] = np.nan
        ydata[bad] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum([Na[outsl], npnansum(~bad, siaxes)], 0)

    imsk = (Na > 0)

    # Convert raw sums of products into centred sums of squares/products
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    # den is only non-zero inside imsk, so den[dmsk] is sqrt(xx * yy) there
    rho[dmsk] = xy[dmsk] / den[dmsk]

    # Floor for 1 - rho**2; defined here so the function does not depend on a
    # name from the enclosing module.
    eps = 1e-14
    omr2 = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    omr2[omr2 < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / omr2[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    # No valid pairs at all
    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    # Zero variance in X or Y: correlation is undefined
    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write:  xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
Example #13
0
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument.

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'B' (or 'beta'): Linear coefficients :math:`\beta_i` of each regressor;
      returned as the variable ``beta``
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`);
      returned as the variable ``R2``
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' carry an extra trailing ``regressor`` axis
    with one entry per regressor; 'covb' carries two such axes.

  Raises
  ======
  ValueError
    If ``output`` contains none of the recognized output names.
  KeyError
    If one of the requested ``axes`` is not shared by the Xs and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now. 'B' is the documented (and default) name of the
    # coefficient output; map it onto the internal name 'beta' so that the
    # default request actually returns the coefficients.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    output = [aliases.get(o, o) for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple regression. Possible outputs are %s.'
            % str(['B'] + ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Positions of the reduction axes within each chunk (they follow oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    os = oview.shape
    os1 = os + (Nr, )
    os2 = os + (Nr, Nr)
    y = np.zeros(os, 'd')
    yy = np.zeros(os, 'd')
    xy = np.zeros(os1, 'd')
    xx = np.zeros(os2, 'd')
    xxinv = np.zeros(os2, 'd')

    # Number of samples along the reduction axes
    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data (list() so that a tuple of regressors is accepted too)
    for outsl, datatuple in loopover(list(Xs) + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; it is mirrored below
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Compute inverse of covariance matrix (could be done more intelligently? certainly the python
    # loop over oview does not help)
    xx = xx.reshape(-1, Nr, Nr)
    xxinv = xxinv.reshape(-1, Nr, Nr)
    for i in range(xx.shape[0]):
        xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
    xx = xx.reshape(os2)
    xxinv = xxinv.reshape(os2)

    # beta_i = sum_j xxinv[i, j] * xy[j]
    beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
    # Part of sum(y**2) accounted for by the fit; yy - vare is the residual sum of squares
    vare = np.sum(xy * beta, -1)

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    # One array (shaped like the output view) per regressor
    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')
    Nd = len(oaxes)

    # Stacking a list of per-regressor arrays puts the regressor axis first;
    # this permutation moves it to the end (matching oaxes + (ra,)) while
    # keeping the output axes in their original order.
    regressor_last = list(range(1, Nd + 1)) + [0]

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        p = np.transpose(np.array(p), regressor_last)
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sb_vals = np.transpose(np.array(sigbeta), regressor_last)
        sb = Var(oaxes + (ra, ), values=sb_vals, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(os2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
Example #14
0
def difference(X,
               Y,
               axes=None,
               alpha=0.05,
               Nx_fac=None,
               Ny_fac=None,
               output='d,p,ci',
               pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional, defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

    'df', 'p' and 'ci' are NaN wherever either sample has fewer than two
    effective data points or both sample variances vanish.

  Raises
  ======
  ValueError
    If ``output`` contains none of the recognized output names.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # Documented default: reduce over every axis shared by X and Y.
        # NOTE(review): assumes each axis object exposes a ``name`` that
        # whichaxis()/hasaxis() accept as an identifier -- confirm against
        # pygeode.axis.
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Indices of the reduction axes within X and Y themselves (the two
    # variables are looped over separately below)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    # Total sample sizes along the reduction axes; only used for this sanity
    # check (the per-point counts of valid data are accumulated below)
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (NaN until the first chunk is accumulated)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes after rescaling
    if Nx_fac is not None: eNx = Nx // Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny // Ny_fac
    else: eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Denominator of the effective degrees of freedom estimate
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    # den is reused: it now holds the standard error of the difference
    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    # p temporarily holds |t| before being replaced by the two-sided p-value
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))

    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # The quantity computed is X - Y, as in the longnames above
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
Example #15
0
def correlate(X, Y, axes=None, pbar=None):
# {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  rho, p : :class:`Var`
    The correlation coefficient :math:`\rho_{XY}` and a significance measure,
    respectively. ``p`` is the cumulative t-distribution evaluated at the
    magnitude of the t-statistic, multiplied by the sign of ``rho``. Both are
    NaN where there are no valid (non-NaN) pairs.

  Raises
  ======
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The significance is derived from the t-statistic given
  in eq (8.7), in section 8.2.3, under the hypothesis that the true
  correlation coefficient between X and Y is zero, and assumes normally
  distributed quantities.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Put all the axes being reduced over at the end 
  # so that we can reshape 
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes: 
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    # Shared axes that were not requested become output axes
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  # Positions of the reduction axes within each chunk (they follow oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays (NaN until the first chunk is accumulated)
  x  = np.full(oview.shape, np.nan, 'd')
  y  = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata*ydata

    # Expand both inputs to the broadcast shape of their product
    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)

    # A pair is invalid if either member is NaN; blank out both members
    # so that every accumulated sum uses the same set of points
    bad = np.isnan(xydata)
    xdata[bad] = np.nan
    ydata[bad] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl]  = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Sum of weights
    Na[outsl] = np.nansum([Na[outsl], npnansum(~bad, siaxes)], 0)

  eps = 1e-14
  imsk = ~(Na < eps)

  # Convert raw sums of products into centred sums of squares/products
  xx[imsk] -= (x*x)[imsk]/Na[imsk]
  yy[imsk] -= (y*y)[imsk]/Na[imsk]
  xy[imsk] -= (x*y)[imsk]/Na[imsk]

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx*yy)[imsk])
  # den is only non-zero inside imsk, so den[dmsk] is sqrt(xx*yy) there
  dmsk = (den > 0.)
  rho[dmsk] = xy[dmsk] / den[dmsk]

  omr2 = 1 - rho**2
  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  omr2[omr2 < eps] = eps

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.)/omr2[imsk])
  # Signed CDF of the t-statistic (not a two-sided p-value)
  p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])
  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X' # Note: could write:  xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
  P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
  return Rho, P
Example #16
0
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):
    # {{{
    r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

    Outputs are NaN wherever there are too few valid (non-NaN) pairs for a
    positive number of degrees of freedom, or where the variance of X
    vanishes.

  Raises
  ======
  ValueError
    If ``output`` contains none of the recognized output names.
  KeyError
    If one of the requested ``axes`` is not shared by X and Y.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduction axes within each chunk (they follow oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (NaN until the first chunk is accumulated)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape of their product
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # A pair is invalid if either member is NaN; blank out both members
        # so that every accumulated sum uses the same set of points
        bad = np.isnan(xydata)
        xdata[bad] = np.nan
        ydata[bad] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum([Na[outsl], npnansum(~bad, siaxes)], 0)

    # Degrees of freedom: two are used up by the slope and the intercept
    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    # Convert raw sums of products into centred sums of squares/products
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    # x and y still hold sums here, hence the division by the sample count
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # sige first holds the residual variance ...
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # ... and is then converted in place to a standard deviation. The same
    # mask must be used on both sides: reading from a different mask either
    # fails with a shape mismatch or scatters values to the wrong points.
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    # Blank out points with too few samples or no variance in X
    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
Example #17
0
def paired_difference(X, Y, axes, alpha=0.05, N_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have
  the same size along the axes being averaged over.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list
    Axes over which to compute means. This argument is required; X and Y must
    have the same size (more than one element) along these axes.

  alpha : float
    Significance level of the confidence interval; the returned half-width
    is the ``1 - alpha/2`` quantile of the t-distribution times the standard
    error of the mean difference.

  N_fac : integer, optional
    A factor by which to rescale the sample size; the effective sample size is
    the number of valid (non-NaN) pairs floor-divided by ``N_fac``. If nothing
    is specified, no rescaling is done.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a default :class:`PBar` is
    created.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The mean of the paired differences, X - Y
    * The effective number of degrees of freedom, :math:`df` (effective sample size less one)
    * The t-distribution CDF evaluated at :math:`|t|`, multiplied by the sign of the mean difference
    * The half-width of the confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21). NaN differences are excluded from the sums and from the sample count.'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes) 

  # Nominal sample sizes along the reduction axes (np.prod: np.product was
  # removed in NumPy 2.0)
  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'
  
  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays
  d = np.zeros(oview.shape, 'd')
  dd = np.zeros(oview.shape, 'd')

  N = np.zeros(oview.shape, 'd')

  # Start from NaN so that the first np.nansum below simply yields the first
  # partial sum
  d[()] = np.nan
  dd[()] = np.nan
  N[()] = np.nan

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right)
    N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0) 

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  dd = (dd - d**2/N) / (N - 1)
  # Mean difference: divide by the number of valid pairs actually summed (N),
  # consistent with the variance above, rather than the nominal size Nx.
  d /= N

  if N_fac is not None: eN = N//N_fac
  else: eN = N

  # Standard error of the mean difference
  den = np.sqrt(dd/(eN - 1))

  p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, eN - 1) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=eN-1, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, eN-1, p, ci
Example #18
0
def check_dataset (dataset):
  '''Reads every variable and axis of a dataset, record by record along the
  outermost axis, to detect I/O errors.

  Parameters
  ==========
  dataset : dataset, or anything accepted by ``asdataset``
    The data to check (for example a simple list of vars).

  Raises
  ======
  Exception
    If fetching any record failed. Before raising, a summary is printed for
    each affected variable, grouping consecutive records that produced the
    same error message.
  '''
  from pygeode.view import View
  from pygeode.tools import combine_axes
  from pygeode.progress import PBar
  from pygeode.dataset import asdataset
  import numpy as np

  # Make sure we have a dataset (in case we're sent a simple list of vars)
  dataset = asdataset(dataset)

  vars = list(dataset.vars)

  # Include axes in the list of vars (to check these values too)
  axes = combine_axes(v.axes for v in vars)
  vars.extend(axes)

  # Relative progress of each variable
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Checking %s for I/O errors:"%repr(dataset))

  failed_indices = {}
  error_messages = {}

  # Loop over the data
  for i,var in enumerate(vars):

    varpbar = pbar.subset(prog[i], prog[i+1])

    # Scan the outer axis (record axis?) for failures.
    N = var.shape[0]
    failed_indices[var.name] = []
    error_messages[var.name] = []

    for j in range(N):
      vpbar = varpbar.part(j, N)
      try:
        # Try fetching the data, see if something fails
        var[j] if var.naxes == 1 else var[j,...]
      except Exception as e:
        # Deliberately broad: any failure while reading is recorded and
        # reported in the summary below instead of aborting the scan.
        failed_indices[var.name].append(j)
        error_messages[var.name].append(str(e))
      vpbar.update(100)

  # Print summary information for each variable
  everything_ok = True
  for var in vars:
    indices = failed_indices[var.name]
    messages = error_messages[var.name]
    if len(indices) == 0: continue

    everything_ok = False

    # Single-argument print(...) produces the same output on Python 2 and 3
    print("\nFailures encountered with variable '%s':"%var.name)

    # Group together record indices that give the same error message
    unique_messages = []
    aggregated_indices = []
    for ind,msg in zip(indices,messages):
      if len(unique_messages) == 0 or msg != unique_messages[-1]:
        unique_messages.append(msg)
        aggregated_indices.append([ind])
      else:
        aggregated_indices[-1].append(ind)

    # Print each error message encountered (and the record indices that give the error)
    for ind,msg in zip(aggregated_indices,unique_messages):

      # Group records together that are consecutive (instead of printing each record separately)
      groups = []
      for rec in ind:
        if len(groups) == 0 or rec-1 not in groups[-1]:
          groups.append([rec])
        else:
          groups[-1].append(rec)
      for g in groups:
        print("=> at %s:\n    %s"% (var.axes[0].slice[g[0]:g[-1]+1], msg))

  if not everything_ok: raise Exception("Problem encountered with the dataset.")
Example #19
0
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):
# {{{
  r'''Least-squares multiple regression of Y onto the list of regressors Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    The independent variables (regressors). They must share at least one axis
    with each other and with Y.

  Y : :class:`Var`
    The dependent variable. It must share at least one axis with the Xs.

  axes : list, optional
    Axes over which the regression is computed. By default every axis shared
    by the Xs and Y is used. Naming an axis that is not shared raises a
    ``KeyError``.

  pbar : progress bar, optional
    A progress bar object. A default :class:`PBar` is created if none is given.

  N_fac : integer
    Rescaling factor for the number of degrees of freedom; the sample size
    along the regression axes is floor-divided by ``N_fac``.

  output : string, optional
    Comma-separated list of the quantities to return (see Returns). Defaults
    to 'B,p'.

  Returns
  =======
  results : list
    One entry per item requested in ``output``, in the order requested. The
    fit has the form :math:`Y = \sum_i \beta_i X_i + \epsilon`; no constant
    term is added. Recognized items are:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': t-distribution CDF at :math:`|\beta_i|/\sigma_{\beta_i}`, signed like :math:`\beta_i`
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    When the regression consumes every axis, plain numerical values are
    returned ('covb' is always a :class:`Var`); otherwise :class:`Var`
    instances are returned. 'B', 'p' and 'sb' yield one value per regressor.
    An unrecognized item is reported with a printed message and skipped.

  Notes
  =====
  The statistics follow von Storch and Zwiers 1999, section 8.4. The data is
  assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  nreg = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    # Restrict the regression to the requested axes; the remaining shared
    # axes become output axes.
    chosen = []
    for ax in axes:
      idx = whichaxis(srcaxes, ax)
      if idx not in riaxes:
        raise KeyError('One of the Xs or Y does not have the axis %s.' % ax)
      chosen.append(idx)
    oiaxes.extend([r for r in riaxes if r not in chosen])
    riaxes = chosen

  oaxes = [srcaxes[k] for k in oiaxes]
  inaxes = oaxes + [srcaxes[k] for k in riaxes]
  oview = View(oaxes) 
  # Positions of the regression axes within the data chunks
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Work arrays: sums of y, y^2, x_i*y and x_i*x_j
  oshape = oview.shape
  vec_shape = oshape + (nreg,)
  mat_shape = oshape + (nreg, nreg)
  y = np.zeros(oshape, 'd')
  yy = np.zeros(oshape, 'd')
  xy = np.zeros(vec_shape, 'd')
  xx = np.zeros(mat_shape, 'd')
  xxinv = np.zeros(mat_shape, 'd')

  N = np.prod([len(srcaxes[k]) for k in riaxes])

  # Accumulate the sums chunk by chunk (only the lower triangle of xx)
  for outsl, chunks in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ychunk = chunks[-1].astype('d')
    xchunks = [chunks[k].astype('d') for k in range(nreg)]
    y[outsl] += npsum(ychunk, siaxes)
    yy[outsl] += npsum(ychunk**2, siaxes)
    for row in range(nreg):
      xy[outsl+(row,)] += npsum(xchunks[row]*ychunk, siaxes)
      for col in range(row+1):
        xx[outsl+(row,col)] += npsum(xchunks[row]*xchunks[col], siaxes)

  # Mirror the lower triangle into the upper triangle
  for row in range(1, nreg):
    for col in range(row):
      xx[..., col, row] = xx[..., row, col]

  # Invert the matrix at every output point; the reshaped arrays are views,
  # so writing into flat_inv fills xxinv.
  flat_xx = xx.reshape(-1, nreg, nreg)
  flat_inv = xxinv.reshape(-1, nreg, nreg)
  for k in range(flat_xx.shape[0]):
    flat_inv[k,:,:] = np.linalg.inv(flat_xx[k,:,:])
  xxinv = flat_inv.reshape(mat_shape)

  beta = np.sum(xy.reshape(oshape + (1, nreg)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  N_eff = N if N_fac is None else N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., k, k] / N_eff) for k in range(nreg)]

  xns = [X.name if X.name != '' else 'X%d' % k for k, X in enumerate(Xs)]

  from pygeode.var import Var
  scalar = (len(oaxes) == 0)
  ret = []

  for item in output.split(','):
    if item == 'B':
      if scalar:
        ret.append(beta)
      else:
        ret.append([Var(oaxes, values=beta[...,k], name='beta_%s' % xns[k]) for k in range(nreg)])
    elif item == 'r':
      vary = (yy - y**2/N)
      R2 = 1 - (yy - vare) / vary
      if scalar:
        ret.append(R2)
      else:
        ret.append(Var(oaxes, values=R2, name='R2'))
    elif item == 'p':
      ps = [tdist.cdf(np.abs(beta[...,k]/sigbeta[k]), N_eff-nreg) * np.sign(beta[...,k]) for k in range(nreg)]
      if scalar:
        ret.append(ps)
      else:
        ret.append([Var(oaxes, values=ps[k], name='p_%s' % xns[k]) for k in range(nreg)])
    elif item == 'sb':
      if scalar:
        ret.append(sigbeta)
      else:
        ret.append([Var(oaxes, values=sigbeta[k], name='sig_%s' % xns[k]) for k in range(nreg)])
    elif item == 'covb':
      from .axis import NonCoordinateAxis as nca
      cr1 = nca(values=list(range(nreg)), regressor1=[X.name for X in Xs], name='regressor1')
      cr2 = nca(values=list(range(nreg)), regressor2=[X.name for X in Xs], name='regressor2')
      covmat = np.zeros(mat_shape, 'd')
      for row in range(nreg):
        for col in range(nreg):
          covmat[..., row, col] = (yy - vare) * xxinv[..., row, col] / N_eff
      ret.append(Var(oaxes + [cr1, cr2], values=covmat, name='smat'))
    elif item == 'se':
      se = np.sqrt((yy - vare) / N_eff)
      if scalar:
        ret.append(se)
      else:
        ret.append(Var(oaxes, values=se, name='sig_resid'))
    else:
      print('multiple_regress: unrecognized output "%s"' % item)

  return ret
Example #20
0
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'):
# {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the
    regression is computed over all axes common to X and Y. Naming an axis
    that is not shared by X and Y raises a ``KeyError``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a default :class:`PBar` is
    created.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.
    The effective number is used both for the residual variance and for the
    t-distribution from which 'p' is computed.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  Returns
  =======
  results : list of :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
    can be returned (always in the order listed here, regardless of the
    order in ``output``):

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': t-distribution CDF at the t-statistic of the slope, multiplied by the sign of the slope
    * 'sm': Standard deviation of the linear coefficient
    * 'se': Standard deviation of the residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    # Restrict the regression to the requested axes; the remaining shared
    # axes become output axes.
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes: 
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  # Positions of the regression axes within the data chunks
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays (running sums of x, y, x^2, x*y and y^2)
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  xy = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    x[outsl] += npsum(xdata, siaxes)
    y[outsl] += npsum(ydata, siaxes)
    xx[outsl] += npsum(xdata**2, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    xy[outsl] += npsum(xdata*ydata, siaxes)

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx -= x**2/N
  yy -= y**2/N
  xy -= (x*y)/N

  m = xy/xx
  b = (y - m*x)/float(N)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac
  sige = (yy - m * xy) / (N_eff - 2.)   # residual variance
  sigm = np.sqrt(sige / xx)             # standard deviation of the slope
  t = np.abs(m) / sigm
  # Use the effective degrees of freedom here as well, so that N_fac is
  # honoured consistently with the residual variance above.
  p = tdist.cdf(t, N_eff - 2) * np.sign(m)
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
    ret.append(M)
  if 'b' in output:
    B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
    ret.append(B)
  if 'r' in output:
    ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
  if 'p' in output:
    P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
    ret.append(P)
  if 'sm' in output:
    # NOTE: the variable name says "Intercept" although the values are the
    # standard deviation of the slope; the name is kept unchanged so that
    # code looking the variable up by name keeps working.
    ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
  if 'se' in output:
    ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

  return ret
Example #21
0
def save (filename, in_dataset, version=3, pack=None, compress=False, cfmeta = True, unlimited=None):
# {{{
  '''Writes a dataset to a netCDF file through the netCDF C library.

  Parameters
  ==========
  filename : str
    Path of the file to create. It is encoded as ASCII before being handed
    to the C library.

  in_dataset : dataset, or anything accepted by ``asdataset``
    The data to write. It is passed through ``finalize_save`` before writing.

  version : int, optional
    netCDF format version, either 3 (default) or 4. Forced to 4 when
    ``compress`` is set.

  pack : optional
    Forwarded unchanged to ``finalize_save``.

  compress : bool, optional
    If true, deflate compression is requested for every variable.

  cfmeta : bool, optional
    Forwarded unchanged to ``finalize_save``.

  unlimited : str, optional
    Name of the axis to define with length 0 (the unlimited dimension).
    Must be the name of one of the output axes.

  Raises
  ======
  IOError
    If the file cannot be created.
  AssertionError
    If an argument is invalid, if variable/axis names are not unique, or if
    a later netCDF library call returns an error code.

  Notes
  =====
  The file is always closed before returning, including when an error occurs
  after it has been created.
  '''
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename,str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3,4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a,DummyAxis)]

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), "vars must have unique names: %s"% [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing chunks, keyed by netCDF type code
  chunkf = {1:lib.nc_put_vara_schar, 2:lib.nc_put_vara_text, 3:lib.nc_put_vara_short,
       4:lib.nc_put_vara_int, 5:lib.nc_put_vara_float,
       6:lib.nc_put_vara_double, 7:lib.nc_put_vara_uchar,
       8:lib.nc_put_vara_ushort, 9:lib.nc_put_vara_uint,
      10:lib.nc_put_vara_longlong, 11:lib.nc_put_vara_ulonglong}


  # Create the file
  if version == 3:
    ret = lib.nc_create (filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create (filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception

  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i,a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        # A length of 0 requests the unlimited dimension
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[list(axes).index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var (fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only works for netcdf4 or (higher?))
      if compress:
        ret = lib.nc_def_var_deflate (fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes (fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      put_attributes (fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill (fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef (fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':"%filename)
    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

      varpbar = pbar.subset(prog[i], prog[i+1])

      # Split the variable into pieces and write them one at a time
      views = list(View(var.axes).loop_mem())
      for j,v in enumerate(views):

        vpbar = varpbar.part(j, len(views))

        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1,None)

        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long*var.naxes)(*start)
        count = (c_long*var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          data = data[start[0]:start[0]+count[0]] # the above gives us the *whole* axis,
                                                  # but under extreme conditions we may be looping over smaller pieces
          vpbar.update(100)
        else: data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)