Example #1
0
def prep (var, iaxis, weight, out):
  """Normalize the inputs for an EOF-type computation.

  Validates the output request, applies area weighting, decides which
  axes act as record (time) axes, and partitions the variable's axes
  into record and spatial groups.  Returns the (possibly preloaded)
  variable plus View objects bundling the record axes and spatial axes.
  """
  from pygeode.timeaxis import Time
  from pygeode.var import Var
  from pygeode.axis import Axis
  from pygeode.view import View
  from warnings import warn
  from pygeode import MAX_ARRAY_SIZE

  assert isinstance(var, Var)
  assert var.naxes >= 2, "need at least 2 axes"

  # Validate the output request (the result is not needed here)
  whichout(out)
  del out

  # Keep the name
  name = var.name

  # Apply area weighting, if requested
  var = apply_weights(var, weight=weight)
  del weight

  record = iaxis
  del iaxis
  if record is None:
    if var.hasaxis(Time):
      record = Time
    else:
      warn ("No explicit record axis provided.  Using the first axis.", stacklevel=2)
      record = 0

  # Wrap a single axis specification in a list
  # (there may be more than one record axis, e.g. time and ensemble)
  if not isinstance(record, (list, tuple)):
    assert isinstance(record, int) or issubclass(record, Axis), 'unknown iaxis type %s'%type(record)
    record = [record]

  # Resolve to integer positions, then split record vs. spatial axes
  rec_ind = [var.whichaxis(a) for a in record]
  sp_ind = [i for i in range(var.naxes) if i not in rec_ind]

  # Back to axis objects
  rec_axes = [var.axes[i] for i in rec_ind]
  sp_axes = [var.axes[i] for i in sp_ind]

  # Views bundle the axes together and provide 'shape' / 'size'
  time = View(axes=rec_axes)
  space = View(axes=sp_axes)

  # Preload the data, if it fits in memory
  if var.size <= MAX_ARRAY_SIZE: var = var.load()

  return var, time, space
Example #2
0
    def getview (self, view, pbar):
      """Fetch the requested view, reordering the axes so that the axes
      named in `iaxes` come last and are read in full, then mapping each
      chunk back to the caller's original axis order and slicing out the
      originally requested part.

      NOTE(review): relies on `iaxes` and `old_getview` from the
      enclosing closure -- confirm their definitions at the wrap site.
      """
      from pygeode.view import View
      import numpy as np
      # Indices of the full axes
      fullaxis_ind = [self.whichaxis(a) for a in iaxes]
      # Prepend the other axes
      ind = [i for i in range(self.naxes) if i not in fullaxis_ind] + fullaxis_ind
#      print "ind:", ind
      # Reverse order
      # (rind is the inverse permutation of 'ind': rind[old] = new)
      rind = [-1] * len(ind)
      for i,I in enumerate(ind):
        rind[I] = i
      # Sanity check: 'ind' must be a permutation of all axis indices
      assert len(ind) == self.naxes and len(set(ind)) == self.naxes
      # Construct a view with this new order of axes, and with the specified axes unsliced.
      axes = tuple([view.axes[i] for i in ind])
      slices = tuple([view.slices[i] for i in ind])
      bigview = View(axes, slices = slices)
      bigview = bigview.unslice(*fullaxis_ind)
      viewloop = list(bigview.loop_mem())
      out = np.empty(view.shape, self.dtype)

      for i,smallview in enumerate(viewloop):
#        print '??', i
        # Each memory-sized chunk must still span the full axes entirely
        for I in fullaxis_ind:
          assert smallview.shape[I] == bigview.shape[I], "can't get all of axis '%s' at once"%view.axes[I].name

        # Slicing relative to the original view
        outsl = tuple(smallview.map_to(bigview.clip()).slices)

        # Reorder the axes to the original order
        axes = tuple([smallview.axes[I] for I in rind])
        assert axes == self.axes
        slices = tuple([smallview.slices[I] for I in rind])
        smallview = View (axes, slices = slices)

        # fudge outsl for this new order
        outsl = tuple([outsl[I] for I in rind])

        # Slicing the 'full' axes to get what we originally needed
        insl = [slice(None)] * self.naxes
        for I in fullaxis_ind: insl[I] = view.slices[I]



        # Get the data
        tmp = old_getview (self, smallview, pbar = pbar.part(i,len(viewloop)) )

#        print '??', out.shape, '[', outsl, ']', ' = ', tmp.shape, '[', insl, ']'
        out[outsl] = tmp[insl]

      return out
Example #3
0
    def getview(self, view, pbar):
        """Assemble the requested view from the chunks available across
        multiple files.

        Each entry of self._table is (filename, opener, axes).  For each
        file whose axes intersect the requested region, the overlapping
        chunk is read and scattered into the output array.  Points not
        covered by any file remain NaN.
        """
        import numpy as np
        from pygeode.view import View, simplify
        out = np.empty(view.shape, dtype=self.dtype)
        # Pre-fill with NaN so uncovered points read as missing
        out[()] = float('nan')
        out_axes = view.clip().axes
        # Loop over all available files.
        N = 0  # Number of points covered so far
        for filename, opener, axes in self._table:
            # Overlap between the request and this file, axis by axis
            subaxes = [
                self._axis_manager._get_axis_intersection([a1, a2])
                for a1, a2 in zip(out_axes, axes)
            ]
            reorder = []
            mask = []
            # Skip files that have no overlap along some axis
            if any(len(a) == 0 for a in subaxes): continue
            for a1, a2 in zip(out_axes, subaxes):
                # Figure out where the input chunk fits into the output
                re = np.searchsorted(a2.values, a1.values)
                # Mask out elements that we don't actually have in the chunk
                m = [
                    r < len(a2.values) and a2.values[r] == v
                    for r, v in zip(re, a1.values)
                ]
                m = np.array(m)
                # Convert mask to integer indices
                m = np.arange(len(m))[m]
                # and then to a slice (where possible)
                m = simplify(m)
                re = re[m]
                # Try to simplify the re-ordering array
                # (only safe when it is already monotonic)
                if np.all(re == np.sort(re)):
                    re = simplify(re)
                reorder.append(re)
                mask.append(m)
            # Open the file and pick out the variable by name
            var = [v for v in opener(filename) if v.name == self._varname][0]
            v = View(subaxes)
            chunk = v.get(var)
            # Note: this may break if there is more than one axis with integer indices.
            assert len([
                r for r in reorder if isinstance(r, (tuple, np.ndarray))
            ]) <= 1, "Unhandled advanced indexing case."
            assert len([m for m in mask if isinstance(m, (tuple, np.ndarray))
                        ]) <= 1, "Unhandled advanced indexing case."
            # Scatter the chunk into place
            out[mask] = chunk[reorder]
            N = N + chunk.size
            pbar.update(100. * N / out.size)

        return out
Example #4
0
 def getview (self, view, pbar):
   """Fetch the requested view by forwarding the slices directly onto
   the wrapped variable's axes (valid when the axes correspond 1:1)."""
   from pygeode.view import View
   import numpy as np
   # Brute-force mapping of the request onto the inner variable's axes
   mapped = View(self.var.axes, force_slices=view.slices,
                 force_integer_indices=view.integer_indices)
   return mapped.get(self.var, pbar=pbar)
Example #5
0
def write_xdr(var, wfile):
  """Write a variable's values to *wfile* in XDR encoding.

  The payload starts with the element count packed twice as big-endian
  longs (the XDR array-header convention), followed by the values,
  streamed in memory-friendly chunks and encoded per the DAP type.

  NOTE(review): relies on module-level `np2dap`, `lib` and
  `get_data_trap_io`.  If a DAP type is not one of the handled cases,
  `s` would be undefined at the final write -- presumably np2dap only
  maps to these types; confirm.
  """
  import struct
  import numpy as np
  from pygeode.view import View

  # XDR header: the length is written twice ('!2l' = two big-endian longs)
  lenstr = struct.pack('!2l', var.size, var.size)
  wfile.write(lenstr)

  # Break the values into memory-friendly chunks
  if hasattr (var, 'values'):
    # Data is already fully loaded in memory
    values_iter = [var.values]
  else:
    view = View(var.axes)
    # Trap and handle any I/O errors
    viewloop = view.loop_mem()
    #TODO: make this more general - should we be futzing around with the axes at this level
    # Break it up even further along the time axis?  (so we don't start a long process through the whole dataset)
    if var.naxes > 2:
      new_viewloop = []
      for v in viewloop:
        for s in v.integer_indices[0]:
          new_viewloop.append(v.modify_slice(0,[s]))
      viewloop = new_viewloop

    values_iter = (get_data_trap_io(v,var) for v in viewloop)

  for values in values_iter:

    # Choose the encoder based on the DAP type of the data
    daptype = np2dap[values.dtype.name]
    if daptype in ('Byte','String'):
#      # Do byte encoding here
#      raise Exception
      values = np.ascontiguousarray(values, 'uint8');
      s = lib.int8toStr(values)
    elif daptype in ('UInt16', 'Int16', 'UInt32', 'Int32'):
      values = np.ascontiguousarray(values, 'int32')
      s = lib.int32toStr(values)
    elif daptype == 'Float32':
      values = np.ascontiguousarray(values, 'float32')
      s = lib.float32toStr(values)
    elif daptype == 'Float64':
      values = np.ascontiguousarray(values, 'float64')
      s = lib.float64toStr(values)

    wfile.write(s)
Example #6
0
def write_xdr(var, wfile):
    """Write a variable's values to *wfile* in XDR encoding.

    The payload starts with the element count packed twice as big-endian
    longs (the XDR array-header convention), followed by the values,
    streamed in memory-friendly chunks and encoded per the DAP type.

    NOTE(review): relies on module-level `np2dap`, `lib` and
    `get_data_trap_io` defined elsewhere in the file.
    """
    import struct
    import numpy as np
    from pygeode.view import View

    # Fix: the format string was '!2', which is invalid (a repeat count
    # with no type code raises struct.error at runtime).  '!2l' packs
    # the size twice as big-endian longs, matching the XDR array header
    # and the sibling implementation of this function.
    lenstr = struct.pack('!2l', var.size, var.size)
    wfile.write(lenstr)

    # Break the values into memory-friendly chunks
    if hasattr(var, 'values'):
        # Data is already fully loaded in memory
        values_iter = [var.values]
    else:
        view = View(var.axes)
        # Trap and handle any I/O errors
        viewloop = view.loop_mem()
        #TODO: make this more general - should we be futzing around with the axes at this level
        # Break it up even further along the time axis?  (so we don't start a long process through the whole dataset)
        if var.naxes > 2:
            new_viewloop = []
            for v in viewloop:
                for s in v.integer_indices[0]:
                    new_viewloop.append(v.modify_slice(0, [s]))
            viewloop = new_viewloop

        values_iter = (get_data_trap_io(v, var) for v in viewloop)

    for values in values_iter:

        # Choose the encoder based on the DAP type of the data
        daptype = np2dap[values.dtype.name]
        if daptype in ('Byte', 'String'):
            #      # Do byte encoding here
            #      raise Exception
            values = np.ascontiguousarray(values, 'uint8')
            s = lib.int8toStr(values)
        elif daptype in ('UInt16', 'Int16', 'UInt32', 'Int32'):
            values = np.ascontiguousarray(values, 'int32')
            s = lib.int32toStr(values)
        elif daptype == 'Float32':
            values = np.ascontiguousarray(values, 'float32')
            s = lib.float32toStr(values)
        elif daptype == 'Float64':
            values = np.ascontiguousarray(values, 'float64')
            s = lib.float64toStr(values)

        wfile.write(s)
Example #7
0
  def getview (self, view, pbar):
    """Assemble the requested view from the chunks available across
    multiple files (entries of self._table are (filename, opener, axes));
    points not covered by any file remain NaN."""
    import numpy as np
    from pygeode.view import View, simplify
    out = np.empty(view.shape, dtype=self.dtype)
    # Pre-fill with NaN so uncovered points read as missing
    out[()] = float('nan')
    out_axes = view.clip().axes
    # Loop over all available files.
    N = 0  # Number of points covered so far
    for filename, opener, axes in self._table:
      # Overlap between the request and this file, axis by axis
      subaxes = [self._axis_manager._get_axis_intersection([a1,a2]) for a1,a2 in zip(out_axes,axes)]
      reorder = []
      mask = []
      # Skip files that have no overlap along some axis
      if any(len(a)==0 for a in subaxes): continue
      for a1,a2 in zip(out_axes,subaxes):
        # Figure out where the input chunk fits into the output
        re = np.searchsorted(a2.values, a1.values)
        # Mask out elements that we don't actually have in the chunk
        m = [r<len(a2.values) and a2.values[r]==v for r,v in zip(re,a1.values)]
        m = np.array(m)
        # Convert mask to integer indices
        m = np.arange(len(m))[m]
        # and then to a slice (where possible)
        m = simplify(m)
        re = re[m]
        # Try to simplify the re-ordering array
        # (only safe when it is already monotonic)
        if np.all(re == np.sort(re)):
          re = simplify(re)
        reorder.append(re)
        mask.append(m)
      # Open the file and pick out the variable by name
      var = [v for v in opener(filename) if v.name == self._varname][0]
      v = View(subaxes)
      chunk = v.get(var)
      # Note: this may break if there is more than one axis with integer indices.
      assert len([r for r in reorder if isinstance(r,(tuple,np.ndarray))]) <= 1, "Unhandled advanced indexing case."
      assert len([m for m in mask if isinstance(m,(tuple,np.ndarray))]) <= 1, "Unhandled advanced indexing case."
      # Scatter the chunk into place
      out[mask] = chunk[reorder]
      N = N + chunk.size
      pbar.update(100.*N/out.size)

    return out
Example #8
0
def EOF_cov (x, num=1, iaxis=None, weight=True, out=None):
  """Compute the leading EOFs of `x` by explicitly accumulating the
  spatial covariance matrix and eigendecomposing it.

  Practical only when the spatial dimension is small enough to hold a
  dense (space x space) matrix in memory.
  """
  import numpy as np
  from pygeode.view import View

  x, time, space = prep (x, iaxis, weight=weight, out=out)
  del iaxis

  nspace = space.size

  # Accumulate X^T X over memory-sized chunks of the data
  C = np.zeros ((nspace, nspace), dtype='d')
  for inview in View(x.axes).loop_mem():
    chunk = inview.get(x)
    assert chunk.size >= nspace, "Spatial pattern is too large"
    chunk = chunk.reshape(-1, nspace)
    C += np.dot(chunk.T, chunk)

  # Eigendecomposition of the (unbiased) covariance estimate.
  # eigh returns eigenvalues in ascending order, so reverse to put the
  # leading modes first.
  w, v = np.linalg.eigh(C / (time.size - 1))

  variance = w.sum()
  eig = np.sqrt(w[::-1][:num])
  eof = v.T[::-1, :][:num, :]

  # Project the data onto the EOFs to build the principal components
  pieces = []
  for inview in View(x.axes).loop_mem():
    chunk = inview.get(x).reshape(-1, nspace)
    pieces.append(np.dot(eof, chunk.T))

  pc = np.concatenate(pieces, axis=1)
  # Normalize the timeseries
  pc /= eig.reshape(num, 1)

  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
Example #9
0
def write_var(ncfile, dataset, unlimited=None, compress=False):
    # {{{
    """Write all variables of *dataset* (and their axes) into *ncfile*.

    Parameters
    ----------
    ncfile : open netCDF file handle -- assumed to support
        createDimension / createVariable / setncatts / filepath
        (looks like a netCDF4 Dataset; confirm at the caller).
    dataset : pygeode Dataset to write.
    unlimited : name of the axis (if any) to create as the unlimited
        dimension.
    compress : if True, enable zlib compression on the variables.
    """
    from pygeode.view import View
    from pygeode.axis import Axis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.tools import combine_axes

    vars = list(dataset.vars)
    # Union of all axes used by the variables
    axes = combine_axes(v.axes for v in vars)

    # Define the dimensions
    for a in axes:
        ncfile.createDimension(a.name,
                               size=(None if a.name == unlimited else len(a)))

    # Define the variables (including axes)
    for var in vars:
        dimensions = [a.name for a in var.axes]
        v = ncfile.createVariable(var.name,
                                  datatype=var.dtype,
                                  dimensions=dimensions,
                                  zlib=compress,
                                  fill_value=var.atts.get('_FillValue', None))
        v.setncatts(var.atts)

    # global attributes
    ncfile.setncatts(dataset.atts)

    # Relative progress of each variable
    # (cumulative sizes give each variable a progress span proportional
    # to its number of elements)
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % ncfile.filepath())

    # number of actual variables (non-axes) for determining our progress
    # NOTE(review): N is computed but never used below -- likely leftover.
    N = len([v for v in vars if not isinstance(v, Axis)])

    # Write the data
    for i, var in enumerate(vars):
        ncvar = ncfile.variables[var.name]
        varpbar = pbar.subset(prog[i], prog[i + 1])

        # Stream the data out in memory-sized chunks
        views = list(View(var.axes).loop_mem())

        for j, v in enumerate(views):
            vpbar = varpbar.part(j, len(views))
            ncvar[v.slices] = v.get(var, pbar=vpbar)
Example #10
0
        def getview(self, view, pbar):
            """Fetch the requested view, reordering the axes so the axes
            in `iaxes` come last and are read in full, then mapping each
            chunk back to the caller's original axis order.

            NOTE(review): relies on `iaxes` and `old_getview` from the
            enclosing closure -- confirm their definitions at the wrap
            site.
            """
            from pygeode.view import View
            import numpy as np
            # Indices of the full axes
            fullaxis_ind = [self.whichaxis(a) for a in iaxes]
            # Prepend the other axes
            ind = [i for i in range(self.naxes) if i not in fullaxis_ind
                   ] + fullaxis_ind
            #      print "ind:", ind
            # Reverse order
            # (rind is the inverse permutation of 'ind': rind[old] = new)
            rind = [-1] * len(ind)
            for i, I in enumerate(ind):
                rind[I] = i
            # Sanity check: 'ind' must be a permutation of all axes
            assert len(ind) == self.naxes and len(set(ind)) == self.naxes
            # Construct a view with this new order of axes, and with the specified axes unsliced.
            axes = tuple([view.axes[i] for i in ind])
            slices = tuple([view.slices[i] for i in ind])
            bigview = View(axes, slices=slices)
            bigview = bigview.unslice(*fullaxis_ind)
            viewloop = list(bigview.loop_mem())
            out = np.empty(view.shape, self.dtype)

            for i, smallview in enumerate(viewloop):
                #        print '??', i
                # Each memory-sized chunk must still span the full axes
                for I in fullaxis_ind:
                    assert smallview.shape[I] == bigview.shape[
                        I], "can't get all of axis '%s' at once" % view.axes[
                            I].name

                # Slicing relative to the original view
                outsl = tuple(smallview.map_to(bigview.clip()).slices)

                # Reorder the axes to the original order
                axes = tuple([smallview.axes[I] for I in rind])
                assert axes == self.axes
                slices = tuple([smallview.slices[I] for I in rind])
                smallview = View(axes, slices=slices)

                # fudge outsl for this new order
                outsl = tuple([outsl[I] for I in rind])

                # Slicing the 'full' axes to get what we originally needed
                insl = [slice(None)] * self.naxes
                for I in fullaxis_ind:
                    insl[I] = view.slices[I]

                # Get the data
                tmp = old_getview(self,
                                  smallview,
                                  pbar=pbar.part(i, len(viewloop)))

                #        print '??', out.shape, '[', outsl, ']', ' = ', tmp.shape, '[', insl, ']'
                out[outsl] = tmp[insl]

            return out
Example #11
0
  def getview (self, view, pbar):
    """Read the requested view from the wrapped variable, then apply the
    fill-value / scale / offset transformation (if any is configured)."""
    from pygeode.view import View
    import numpy as np
    fv = self.fillvalue
    sc = self.scale
    off = self.offset
    # Brute-force mapping of the request onto the inner variable's axes
    # (valid when the axes correspond 1:1)
    src = View(self.var.axes, force_slices=view.slices,
               force_integer_indices=view.integer_indices)
    data = src.get(self.var, pbar=pbar)
    needs_work = not (fv is None and sc is None and off is None)
    if needs_work:
      # Work on a private copy so the source buffer stays untouched
      data = np.copy(data)
    if fv is not None:
      # Record which elements hold the fill value *before* any rescaling
      masked = np.where(data == fv)
    data = np.asarray(data, self.dtype)
    if sc is not None: data *= sc
    if off is not None: data += off
    if fv is not None: data[masked] = float('nan')

    return data
Example #12
0
def EOF_guess (x, num=1, iaxis=None, weight=True, out=None):
  """Single-pass approximation of the leading EOFs, typically used to
  seed an iterative solver.  The heavy lifting is delegated to the
  eofcore extension library."""
  import numpy as np
  from pygeode.var import Var
  from pygeode.view import View
  from pygeode import eofcore as lib

  x, time, space = prep (x, iaxis, weight=weight, out=out)
  del iaxis

  print("working on array shape %s"%(x.shape,))

  # Set up the C-side workspace
  work = lib.start (num, space.size)

  # Output arrays, filled in by lib.endloop below
  eig = np.empty([num], dtype='d')
  eof = np.empty((num,)+space.shape, dtype='d')
  pc = np.empty((num,)+time.shape, dtype='d')

  variance = 0.0  # accumulated sum of squares

  # Feed the data through the one-pass estimator, one chunk at a time
  for inview in View(x.axes).loop_mem():
    chunk = np.ascontiguousarray(inview.get(x), dtype='d')
    assert chunk.size >= space.size, "Spatial pattern is too large"
    nrec = chunk.size // space.size
    lib.process (work, nrec, chunk)

    # Accumulate variance
    variance += (chunk**2).sum()

  # Collect the results, then release the workspace
  lib.endloop (work, eof, eig, pc)
  lib.finish (work)

  # Wrap the stuff
  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
Example #13
0
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):
    # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
  the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the dataset
    divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the mean value can be obtained through ``ds.m``).
    The following quantities can be calculated.

    * 'm': The mean value of X
    * 'p': The probability of the computed value if the population mean was zero
    * 'ci': The confidence interval of the mean at the level specified by alpha

    If the average is taken over all axes of X resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset. 

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`. 
  The p-value and confidence interval are computed for the t-statistic defined in 
  eq (6.61) of von Storch and Zwiers 1999.'''

    import numpy as np
    from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
    from pygeode.view import View

    # Resolve the reduction axes.  Fix: default to *all* axes when none
    # are given, as documented above (previously axes=None raised a
    # TypeError when iterated).
    if axes is None:
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]
    raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # Total number of points reduced over.
    # (np.prod: the np.product alias was removed in NumPy 2.0.)
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    Na = np.zeros(oview.shape, 'd')

    # Start from NaN so untouched cells read as missing
    x[()] = np.nan
    xx[()] = np.nan
    Na[()] = np.nan

    # Accumulate sum, sum of squares and count over memory-sized chunks
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)

        # Sum of weights (kludge to get masking right)
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    # Effective degrees of freedom (optionally rescaled by N_fac)
    if N_fac is not None:
        eN = N // N_fac
        eNa = Na // N_fac
    else:
        eN = N
        eNa = Na

    sdom = np.zeros((oview.shape), 'd')
    p = np.zeros((oview.shape), 'd')
    t = np.zeros((oview.shape), 'd')
    ci = np.zeros((oview.shape), 'd')

    # Standard deviation of the mean, t statistic, p-value and CI.
    # NOTE(review): `tdist` is assumed to be scipy.stats.t imported at
    # module level -- confirm.
    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        # Fix: corrected typo in attribute text ('intervale' -> 'interval')
        ci.atts['longname'] = 'Confidence interval of the mean value of %s' % (
            name, )
        rvs.append(ci)

    return asdataset(rvs)
Example #14
0
def EOF_iter (x, num=1, iaxis=None, subspace = -1, max_iter=1000, weight=True, out=None):
  """
  Iteratively estimate the leading EOFs of `x` by repeatedly applying the
  covariance operator to a trial subspace (a block power method seeded by
  the single-pass EOF_guess result).

  (See svd.SVD for documentation on a similar function, but replace each xxx1 and xxx2 parameter with a single xxx parameter.)
  """
  import numpy as np
  from pygeode import libpath
  from pygeode.view import View
  from math import sqrt
  from pygeode.varoperations import fill
  from pygeode import svdcore as lib

  # Need vector subspace to be at least as large as the number of EOFs extracted.
  if subspace < num: subspace = num

  # Run the single-pass guess to seed the first iteration
  guess_eof, guess_eig, guess_pc = EOF_guess (x, subspace, iaxis, weight=weight, out=None)
  # Convert NaNs to zeros so they don't screw up the matrix operations
  guess_eof = fill (guess_eof, 0)

  x, time, space = prep(var=x, iaxis=iaxis, weight=weight, out=out)
  del iaxis

  eofshape =  (subspace,) + space.shape
  pcshape =  time.shape + (subspace,)

  pcs = np.empty(pcshape,dtype='d')

  oldeofs = np.empty(eofshape,dtype='d')
  # Seed with initial guess (in the weighted space)
  neweofs = apply_weights (guess_eof, weight=weight).get()
  neweofs = np.array(neweofs, dtype='d')  # so we can write
#  neweofs = np.random.rand(*eofshape)

  # Workspace for smaller representative matrix
  work1 = np.empty([subspace,subspace], dtype='d')
  work2 = np.empty([subspace,subspace], dtype='d')

  NX = space.size

  # Variance accumulation (on first iteration only)
  variance = 0.0

  for iter_num in range(1,max_iter+1):

    print('iter_num: %d'%iter_num)

    # Swap buffers: last iteration's estimate becomes this iteration's input
    neweofs, oldeofs = oldeofs, neweofs

    # Reset the accumulation arrays for the next approximations
    neweofs[()] = 0

    # Apply the covariance matrix
    for inview in View(x.axes).loop_mem():
      X = np.ascontiguousarray(inview.get(x), dtype='d')
      assert X.size >= space.size, "spatial pattern is too large"

      # NOTE(review): assumes the record axis is the first axis and that
      # loop_mem() only splits along it -- confirm against prep().
      nt = inview.shape[0]
      time_offset = inview.slices[0].start
      ier = lib.build_eofs (subspace, nt, NX, X, oldeofs,
                            neweofs, pcs[time_offset,...])
      assert ier == 0

      # Compute variance?
      if iter_num == 1:
        variance += (X**2).sum()

    # Useful dot products
    lib.dot(subspace, NX, oldeofs, neweofs, work1)
    lib.dot(subspace, NX, neweofs, neweofs, work2)

    # Compute surrogate matrix (using all available information from this iteration)
    A, residues, rank, s = np.linalg.lstsq(work1,work2,rcond=1e-30)

    # Eigendecomposition on surrogate matrix
    w, P = np.linalg.eig(A)

    # Sort by eigenvalue
    S = np.argsort(w)[::-1]
    w = w[S]
    print(w)
#    assert P.dtype.name == 'float64', P.dtype.name
    P = np.ascontiguousarray(P[:,S], dtype='d')

    # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
    lib.transform(subspace, NX, P, neweofs)

    # Normalize
    lib.normalize (subspace, NX, neweofs)

#    # verify orthogonality
#    for i in range(num):
#      print [np.dot(neweofs[i,...].flatten(), neweofs[j,...].flatten()) for j in range(num)]

    # Stop once the leading modes no longer change between iterations
    if np.allclose(oldeofs[:num,...],neweofs[:num,...], atol=0):
      print('converged after %d iterations'%iter_num)
      break

  assert iter_num != max_iter, "no convergence"

  # Wrap as pygeode vars, and return
  # Only need some of the eofs for output (the rest might not have even converged yet)
  eof = neweofs[:num]
  pc = pcs[...,:num].transpose()

  # Extract the eigenvalues
  # (compute magnitude of pc arrays)
  #TODO: keep eigenvalues as a separate variable in the iteration loop
  eig = np.array([sqrt( (pc[i,...]**2).sum() ) for i in range(pc.shape[0]) ])
  pc = np.dot(np.diag(1/eig), pc)

  return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
Example #15
0
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):
    # {{{
    r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'sm' and 'se' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

    import numpy as np
    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    # Determine which axes are reduced over (shared by X and Y) and which
    # remain in the output
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (start at NaN so untouched cells read missing)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate sums, sums of squares and cross products over chunks
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Broadcast xdata/ydata up to the shape of their product, and
        # propagate NaNs so both series are masked consistently
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    # Convert raw sums into centered second moments
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # NOTE(review): `tdist` is assumed to be scipy.stats.t imported at
    # module level -- confirm.
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # Fix: the RHS previously used sige[dmsk], selecting values with a
    # *different* mask than the LHS assignment -- a shape mismatch (or
    # silent misassignment) whenever nmsk != dmsk.  Both sides must use
    # the same mask.
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    msk = nmsk & dmsk

    # Blank out cells without enough data for a meaningful fit
    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
Example #16
0
def save(filename,
         in_dataset,
         version=3,
         pack=None,
         compress=False,
         cfmeta=True,
         unlimited=None):
    # {{{
    '''Write a dataset to a netCDF file via the netCDF C library.

    Parameters
    ----------
    filename : str
      Path of the file to create.
    in_dataset : :class:`Dataset` (or anything accepted by ``asdataset``)
      The variables (and axes) to save.
    version : int, optional [3]
      NetCDF format version; must be 3 or 4.  Forced to 4 when ``compress``
      is True.
    pack : optional [None]
      Packing specification, forwarded to ``finalize_save``.
    compress : boolean, optional [False]
      If True, create a netCDF4 file and enable deflate compression on every
      variable.
    cfmeta : boolean, optional [True]
      If True, encode CF-compliant metadata (handled by ``finalize_save``).
    unlimited : str, optional [None]
      Name of the axis (if any) to define as the unlimited (record)
      dimension; must be the name of one of the output axes.

    Returns
    -------
    None.  The file is written (and closed) as a side effect.
    '''
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?
    # Deflate compression is only available in the netCDF4 format.
    if compress: version = 4
    assert version in (3, 4)

    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]
    #vars.extend(axes)

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing chunks, keyed by netCDF type code.
    # (All data is written chunk-wise through View.loop_mem, so only the
    # nc_put_vara_* family is needed.)
    chunkf = {
        1: lib.nc_put_vara_schar,
        2: lib.nc_put_vara_text,
        3: lib.nc_put_vara_short,
        4: lib.nc_put_vara_int,
        5: lib.nc_put_vara_float,
        6: lib.nc_put_vara_double,
        7: lib.nc_put_vara_uchar,
        8: lib.nc_put_vara_ushort,
        9: lib.nc_put_vara_uint,
        10: lib.nc_put_vara_longlong,
        11: lib.nc_put_vara_ulonglong
    }

    # Create the file
    if version == 3:
        ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    elif version == 4:
        ret = lib.nc_create(filename.encode('ascii'), 0x1000,
                            byref(fileid))  # 0x1000 = NC_NETCDF4
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    else: raise Exception

    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            if unlimited == a.name:
                # Length 0 marks the dimension as unlimited (NC_UNLIMITED).
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0),
                                     byref(dimids[i]))
            else:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'),
                                     c_long(len(a)), byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        chunks = [None] * len(vars)
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[list(axes).index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t,
                                 var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only works for netcdf4 or (higher?))
            if compress:
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes (ncid of -1 targets the file itself)
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            # modify axes to be netcdf friendly (CF-compliant, etc.)
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file (256 = NC_NOFILL)
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)

        # Relative progress of each variable
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

        #  print "Saving '%s':"%filename
        pbar = PBar(message="Saving '%s':" % filename)
        #  pbar = FakePBar()
        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            dtype = numpy_type[t]

            #    print 'writing', var.name

            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):

                vpbar = varpbar.part(j, len(views))
                #      print '???', repr(str(v))

                # Should always be slices (since we're looping over whole thing contiguously?)
                for sl in v.slices:
                    assert isinstance(sl, slice)
                for sl in v.slices:
                    assert sl.step in (1, None)

                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    data = var.values
                    data = data[
                        start[0]:start[0] +
                        count[0]]  # the above gives us the *whole* axis,
                    # but under extreme conditions we may be looping over smaller pieces
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished
        lib.nc_close(fileid)
Example #17
0
def difference(X,
               Y,
               axes=None,
               alpha=0.05,
               Nx_fac=None,
               Ny_fac=None,
               output='d,p,ci',
               pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional, defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional: defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
    else:
        # BUG FIX: axes defaults to None but was iterated unconditionally
        # below; reduce over all axes shared by X and Y in that case.
        axes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within each variable.
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    # Total number of elements being reduced over (np.product is deprecated;
    # np.prod matches usage elsewhere in this module).
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays
    # (accumulators start as NaN; np.nansum treats them as zero below)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Rescale the effective sample sizes to account for serial correlation.
    if Nx_fac is not None: eNx = Nx // Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny // Ny_fac
    else: eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Degrees of freedom following von Storch and Zwiers eq (6.20)
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    # t-statistic and two-sided p-value from eq (6.19)
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))

    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    # Mark cells with insufficient data as NaN
    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    ds.atts['description'] = 't-test of difference (%s - %s)' % (yn, xn)

    return ds
Example #18
0
def correlate(X, Y, axes=None, output='r2,p', pbar=None):
    # {{{
    r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p':  The p-value; see notes.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding a correlation
  coefficient of equal or greater magnitude (two-sided) to the given result
  under the hypothesis that the true correlation coefficient between X and Y is
  zero. It is computed from the t-statistic given in eq (8.7), in section
  8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        # Restrict the reduction to the user-specified axes; any shared axes
        # not named become output (kept) axes instead.
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    iview = View(inaxes)
    # Indices of the reduction axes within the combined (output + reduced) order
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays
    # (accumulators start as NaN; np.nansum below treats them as zero)
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Accumulate running sums chunk by chunk
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Tile each operand up to the shape of the product so that a NaN in
        # either variable masks the corresponding element in both.
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    # Only cells with at least one valid pair are meaningful
    imsk = (Na > 0)

    # Convert raw sums into (co)variance sums about the mean
    # (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

    # t-statistic denominator, eq (8.7) of von Storch and Zwiers
    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    # NOTE(review): `eps` and `tdist` are presumably module-level
    # (machine epsilon and scipy.stats.t) -- defined outside this view.
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # Two-sided p-value with Na-2 degrees of freedom
    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    # Mark cells with no data or zero variance as NaN
    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write:  xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
Example #19
0
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'B,p'. 'B' and 'beta' are accepted as synonyms.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. The names of the
    variables match the output request string (i.e. if ``ds`` is the returned
    dataset, the linear coefficients of the regression can be obtained by
    ``ds.beta``).

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'B' (or 'beta'): Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regession; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors. 

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now.  'B' (the documented default spelling) is
    # accepted as an alias for 'beta'; previously the default request 'B,p'
    # silently dropped the regression coefficients from the output.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    output = ['beta' if o == 'B' else o for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple_regress. Possible outputs are %s.'
            % str(ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        # Restrict the reduction to the user-specified axes; any other shared
        # axes become output (kept) axes instead.
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    os = oview.shape
    os1 = os + (Nr, )     # one entry per regressor
    os2 = os + (Nr, Nr)   # regressor x regressor (covariance) matrices
    y = np.zeros(os, 'd')
    yy = np.zeros(os, 'd')
    xy = np.zeros(os1, 'd')
    xx = np.zeros(os2, 'd')
    xxinv = np.zeros(os2, 'd')

    # Total number of samples being reduced over
    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            # xTx is symmetric; accumulate only the lower triangle
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Compute inverse of covariance matrix (could be done more intellegently? certainly the python
    # loop over oview does not help)
    xx = xx.reshape(-1, Nr, Nr)
    xxinv = xxinv.reshape(-1, Nr, Nr)
    for i in range(xx.shape[0]):
        xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
    xx = xx.reshape(os2)
    xxinv = xxinv.reshape(os2)

    # Least-squares solution: beta = (xTx)^-1 xTy
    beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
    # Variance explained by the fit
    vare = np.sum(xy * beta, -1)

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    # Standard deviation of each coefficient (von Storch and Zwiers 8.4.2)
    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    # Auxiliary axes labelling the regressors
    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')
    Nd = len(oaxes)

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        # NOTE: the returned variable is named 'R2' (access via ds.R2)
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        # Two-sided p-values with N_eff - Nr degrees of freedom
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        # Move the regressor dimension to the end to match the output axes
        p = np.transpose(np.array(p), [Nd] + list(range(Nd)))
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sigbeta = np.transpose(np.array(sigbeta), [Nd] + list(range(Nd)))
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(os2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
Example #20
0
    def get(self, pbar=None, **kwargs):
        # {{{
        """
    Fetch this variable's values as a raw numpy array.

    Parameters
    ----------
    pbar : boolean (optional)
      If ``True``, a progress bar is shown while the data is retrieved.
      Requires the *python-progressbar* package (not bundled with PyGeode).

    **kwargs : keyword arguments (optional)
      Optional keyword arguments used to subset the variable before its data
      is fetched.  See :func:`Var.__call__` for the subsetting syntax.

    Returns
    -------
    out : numpy.ndarray
      The requested values, as a numpy array.

    Notes
    -----
    A numpy array must fit in memory, whereas a PyGeode variable may be far
    larger than the available RAM -- only request a reasonably sized piece of
    data at a time.  Once extracted, the values are plain numpy data and the
    PyGeode operations no longer apply; re-wrap the array with
    :func:`Var.__init__` if further PyGeode processing is needed.
    """
        from pygeode.view import View
        import numpy as np

        # Apply any keyword-based subsetting, then pull the values through a
        # full view of the (possibly subsetted) variable.
        subset = self(**kwargs)
        values = View(subset.axes).get(subset, pbar=pbar)

        # Hand back an independent copy so callers can't mutate shared data.
        if isinstance(values, np.ndarray):
            values = np.array(values, copy=True)
        return values
Example #21
0
def paired_difference(X,
                      Y,
                      axes=None,
                      alpha=0.05,
                      N_fac=None,
                      output='d,p,ci',
                      pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        # BUG FIX: the message previously said "correlation" (copy-paste from
        # another routine); name this function so the error is actionable.
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        # Restrict the reduction axes to those explicitly requested
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
    else:
        # BUG FIX: ``axes`` is documented as optional, but it was previously
        # iterated below unconditionally, raising TypeError when omitted.
        # Default to all axes shared by X and Y.
        axes = [srcaxes[i] for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Sizes of the reduction axes on each input.
    # (np.product was deprecated and removed in NumPy 2.0; use np.prod.)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays: running sum, sum of squares, and count of
    # non-NaN points of the paired differences.
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y],
                                          oview,
                                          inaxes=srcaxes,
                                          pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative (guards against round-off above)
    dd[dd <= 0.] = 0.

    # Effective sample size, optionally rescaled to account for serial correlation
    if N_fac is not None: eN = N // N_fac
    else: eN = N

    emsk = (eN > 1)

    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Standard error of the mean difference (only defined where eN > 1).
    # (Previously this array was allocated twice; once is enough.)
    den = np.zeros(oview.shape, 'd')
    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    # Two-sided p-value and confidence interval from the t distribution
    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (yn, xn)

    return ds
Example #22
0
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'):
# {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'm,b,p'.

  Returns
  =======
  results : list of :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
    can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero
    * 'sm': Variance in linear coefficient
    * 'se': Variance of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'se' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    # Restrict the reduction axes to those explicitly requested
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  # Output axes come first; reduction axes are appended at the end so that
  # npsum below can collapse the trailing dimensions.
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays (running first and second moments)
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  xy = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    x[outsl] += npsum(xdata, siaxes)
    y[outsl] += npsum(ydata, siaxes)
    xx[outsl] += npsum(xdata**2, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    xy[outsl] += npsum(xdata*ydata, siaxes)

  # Total number of samples along the reduction axes
  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx -= x**2/N
  yy -= y**2/N
  xy -= (x*y)/N

  # Least-squares slope and intercept
  m = xy/xx
  b = (y - m*x)/float(N)

  # Effective degrees of freedom, rescaled by N_fac if requested
  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac
  sige = (yy - m * xy) / (N_eff - 2.)
  sigm = np.sqrt(sige / xx)
  t = np.abs(m) / sigm
  # BUG FIX: the t-test previously used N-2 degrees of freedom, ignoring the
  # N_fac rescaling documented above (and inconsistent with sige, which uses
  # N_eff, and with multiple_regress, which uses N_eff-Nr).
  p = tdist.cdf(t, N_eff-2) * np.sign(m)
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
    ret.append(M)
  if 'b' in output:
    B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
    ret.append(B)
  if 'r' in output:
    ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
  if 'p' in output:
    P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
    ret.append(P)
  if 'sm' in output:
    # NOTE(review): sigm is the standard deviation of the *slope*, though the
    # Var name says 'Intercept'.  Name kept for backward compatibility.
    ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
  if 'se' in output:
    ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

  return ret
Example #23
0
    def get(self, pbar=None, **kwargs):
        # {{{
        """
    Returns the variable's values as a raw numpy array.

    Parameters
    ----------
    pbar : boolean (optional)
      If ``True``, a progress bar is shown while the data is retrieved.
      This requires the *python-progressbar* package (not included with
      PyGeode).

    **kwargs : keyword arguments (optional)
      Any keyword arguments are used to subset the variable before the
      data is fetched; see :func:`Var.__call__` for the subsetting syntax.

    Returns
    -------
    out : numpy.ndarray
      The requested values, as a numpy array.

    Notes
    -----

    Once the data is extracted as a numpy array, the PyGeode operations are
    no longer available on it directly.  If needed, the array can be
    re-wrapped as a PyGeode Var via :func:`Var.__init__`, which is handy
    when some complicated numpy-level manipulation is an intermediate step.

    PyGeode variables can be huge!  They can be larger than the available RAM
    in your computer, or even larger than your hard disk.  Numpy arrays, on
    the other hand, need to fit in memory, so make sure you are only getting
    a reasonable piece of data at a time.

    Examples
    --------
    >>> from pygeode.tutorial import t1
    >>> print(t1.Temp)
    <Var 'Temp'>:
      Units: K  Shape:  (lat,lon)  (31,60)
      Axes:
        lat <Lat>      :  90 S to 90 N (31 values)
        lon <Lon>      :  0 E to 354 E (60 values)
      Attributes:
        {}
      Type:  Add_Var (dtype="float64")
    >>> x = t1.Temp.get()
    >>> print(x)
    [[260.73262556 258.08759192 256.45287123 ... 265.01237988 265.01237988
      263.37765919]
     [261.22683172 258.75813366 257.23239435 ... 265.22126909 265.22126909
      263.69552978]
     [261.98265134 259.69028886 258.27353093 ... 265.69177175 265.69177175
      264.27501382]
     ...
     [261.98265134 264.27501382 265.69177175 ... 258.27353093 258.27353093
      259.69028886]
     [261.22683172 263.69552978 265.22126909 ... 257.23239435 257.23239435
      258.75813366]
     [260.73262556 263.37765919 265.01237988 ... 256.45287123 256.45287123
      258.08759192]]
    """
        import numpy as np
        from pygeode.view import View
        # Apply any keyword subsetting first, then fetch everything the
        # (possibly subsetted) variable covers.
        subset = self.__call__(**kwargs)
        values = View(subset.axes).get(subset, pbar=pbar)
        # Hand back an independent copy so callers cannot mutate any
        # internally-cached buffer.
        if isinstance(values, np.ndarray):
            values = np.array(values, copy=True)
        return values
Example #24
0
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):
# {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes common to the Xs and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'B,p'.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the form
    :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a constant term
    is not included by default. The following parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    If the regression is computed over all axes so that the result is a scalar,
    the above are returned as a tuple of floats in the order specified by
    ``output``. Otherwise they are returned as :class:`Var` instances. The outputs
    'B', 'p', and 'sb' will produce as many outputs as there are regressors. 

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  # Number of regressors
  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    # Restrict the reduction axes to those explicitly requested; anything
    # shared but not requested is moved to the output axes instead.
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes: ri_new.append(ia)
      else: raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  # Output axes first, reduction axes appended at the end; siaxes indexes the
  # trailing (reduction) dimensions for the npsum calls below.
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays: per-output-point accumulators for y, y^2, the
  # regressor-Y products (length Nr) and the regressor cross-products (Nr x Nr).
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr,Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  # Total number of samples along the reduction axes
  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data; only the lower triangle of xx is filled here since the
  # matrix is symmetric.
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl+(i,)] += npsum(xdata[i]*ydata, siaxes)
      for j in range(i+1):
        xx[outsl+(i,j)] += npsum(xdata[i]*xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intelligently? certainly the python
  # loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i,:,:] = np.linalg.inv(xx[i,:,:])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  # Least-squares coefficients: beta = (X^T X)^-1 X^T y, and the portion of
  # the variance explained by the fit.
  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  # Effective degrees of freedom, rescaled by N_fac if requested
  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac

  # Standard deviation of each regression coefficient
  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  # Assemble the requested outputs, as plain numbers when the regression
  # collapses all axes (scalar case) or as Var instances otherwise.
  for o in output:
    if o == 'B':
      if len(oaxes) == 0:
        ret.append(beta)
      else:
        ret.append([Var(oaxes, values=beta[...,i], name='beta_%s' % xns[i]) for i in range(Nr)])
    elif o == 'r':
      # R^2: fraction of the variance in Y explained by the regressors
      vary = (yy - y**2/N)
      R2 = 1 - (yy - vare) / vary
      if len(oaxes) == 0:
        ret.append(R2)
      else:
        ret.append(Var(oaxes, values=R2, name='R2'))
    elif o == 'p':
      # Signed p-values from the t statistic with N_eff - Nr dof
      ps = [tdist.cdf(np.abs(beta[...,i]/sigbeta[i]), N_eff-Nr) * np.sign(beta[...,i]) for i in range(Nr)]
      if len(oaxes) == 0:
        ret.append(ps)
      else:
        ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
    elif o == 'sb':
      if len(oaxes) == 0:
        ret.append(sigbeta)
      else:
        ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
    elif o == 'covb':
      # Full covariance matrix of the coefficients, wrapped on a pair of
      # non-coordinate "regressor" axes.
      from .axis import NonCoordinateAxis as nca
      cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
      cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
      sigmat = np.zeros(os2, 'd')
      for i in range(Nr):
        for j in range(Nr):
          #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
          sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
      ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
    elif o == 'se':
      # Standard deviation of the residuals
      se = np.sqrt((yy - vare) / N_eff)
      if len(oaxes) == 0:
        ret.append(se)
      else:
        ret.append(Var(oaxes, values=se, name='sig_resid'))
    else:
      print('multiple_regress: unrecognized output "%s"' % o)

  return ret
Example #25
0
def SVD (var1, var2, num=1, subspace=-1, iaxis=Time, weight1=True, weight2=True, matrix='cov'):
  """
  Finds coupled EOFs of two fields.  Note that the mean/trend/etc. is NOT
  removed in this routine.

  Parameters
  ----------
  var1, var2 : :class:`Var`
    The variables to analyse.

  num : integer
    The number of EOFs to compute (default is ``1``).

  weight1, weight2 : optional 
    Weights to use for defining orthogonality in the var1, var2 domains,
    respectively.  Patterns X and Y in the var1 domain are orthogonal if the
    sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
    orthogonal if the sum over Z*W*weights2 is 0. Default is to use internal
    weights defined for var1 accessed by :meth:`Var.getweights()`. If set to
    ``False`` no weighting is used.

  matrix : string, optional ['cov']
    Which matrix we are diagonalizing (default is 'cov'). 
     * 'cov': covariance matrix of var1 & var2
     * 'cor': correlation matrix of var1 & var2

  iaxis : Axis identifier
    The principal component / expansion coefficient axis, i.e., the 'time'
    axis. Can be an integer (the axis number, leftmost = 0), the axis name
    (string), or a Pygeode axis class.  If not specified, will try to use
    pygeode.timeaxis.Time, and if that fails, the leftmost axis.

  Returns
  -------
  (eof1, pc1, eof2, pc2): tuple
    * eof1: The coupled eof patterns for var1. 
    * pc1: The principal component / expansion coefficients for var1.
    * eof2: The coupled eof patterns for var2.
    * pc2: The principal component / expansion coefficients for var2.

  Notes
  -----
    Multiple orders of EOFs are concatenated along an 'order' axis.
  """
  import numpy as np
  from pygeode.timeaxis import Time
  from pygeode.var import Var
  from pygeode.view import View
  from pygeode import MAX_ARRAY_SIZE
  from warnings import warn
  from pygeode import svdcore as lib

  # Normalize the matrix-type request
  if matrix in ('cov', 'covariance'): matrix = 'cov'
  elif matrix in ('cor', 'corr', 'correlation'): matrix = 'cor'
  else:
    # BUG FIX: the format string was "'%'", which raises a ValueError the
    # moment an invalid matrix type is actually passed in.
    warn ("invalid matrix type '%s'.  Defaulting to covariance."%matrix, stacklevel=2)
    matrix = 'cov'

  MAX_ITER = 1000

  # Iterate over more EOFs than we need
  # (this helps with convergence)
  # TODO: a more rigorous formula for the optimum number of EOFs to use
  if subspace <= 0: subspace = 2*num + 8
  if subspace < num: subspace = num  # Just in case

  # Remember the names
  prefix1 = var1.name+'_' if var1.name != '' else ''
  prefix2 = var2.name+'_' if var2.name != '' else ''

  # Apply weights?
#  if weight1 is not None: var1 *= weight1.sqrt()
#  if weight2 is not None: var2 *= weight2.sqrt()
  if weight1 is True: weight1 = var1.getweights()
  if weight1 is not False:
    assert not weight1.hasaxis(iaxis), "Can't handle weights along the record axis"
    # Normalize the weights
    W = weight1.sum() / weight1.size
    weight1 /= W
    # Apply the weights
    var1 *= weight1.sqrt()
  if weight2 is True: weight2 = var2.getweights()
  if weight2 is not False:
    assert not weight2.hasaxis(iaxis), "Can't handle weights along the record axis"
    # Normalize the weights
    W = weight2.sum() / weight2.size
    weight2 /= W
    # Apply the weights
    var2 *= weight2.sqrt()


  #TODO: allow multiple iteration axes (i.e., time and ensemble)
#  if iaxis is None:
#    if var1.hasaxis(Time) and var2.hasaxis(Time):
#      iaxis1 = var1.whichaxis(Time)
#      iaxis2 = var2.whichaxis(Time)
#    else:
#      iaxis1 = 0
#      iaxis2 = 0
#  else:
  iaxis1 = var1.whichaxis(iaxis)
  iaxis2 = var2.whichaxis(iaxis)

  assert var1.axes[iaxis1] == var2.axes[iaxis2], "incompatible iteration axes"
  del iaxis  # so we don't use this by accident


  # Special case: can load entire variable in memory
  # This will save some time, especially if the field is stored on disk, or is heavily derived
  if var1.size <= MAX_ARRAY_SIZE:
    print('preloading '+repr(var1))
    var1 = var1.load()
  if var2.size <= MAX_ARRAY_SIZE:
    print('preloading '+repr(var2))
    var2 = var2.load()

  # Use correlation instead of covariance?
  # (normalize by standard deviation)
  if matrix == 'cor':
    print('computing standard deviations')
    std1 = var1.stdev(iaxis1).load()
    std2 = var2.stdev(iaxis2).load()
    # account for grid points with zero standard deviation?
    std1.values = std1.values + (std1.values == 0)
    std2.values = std2.values + (std2.values == 0)
    var1 /= std1
    var2 /= std2


  # Shapes of the eof/pc work arrays: 'subspace' patterns over the spatial
  # axes, and the full record axis for the expansion coefficients.
  eofshape1 =  (subspace,) + var1.shape[:iaxis1] + var1.shape[iaxis1+1:]
  eofshape2 =  (subspace,) + var2.shape[:iaxis2] + var2.shape[iaxis2+1:]

  pcshape1 =  (var1.shape[iaxis1], subspace)
  pcshape2 =  (var2.shape[iaxis2], subspace)

  # number of spatial grid points
  NX1 = var1.size // var1.shape[iaxis1]
  assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
  NX2 = var2.size // var2.shape[iaxis2]
  assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'

  # Total number of timesteps
  NT = var1.shape[iaxis1]
  # Number of timesteps we can do in one fetch
  dt = MAX_ARRAY_SIZE // max(NX1,NX2)

  pcs1 = np.empty(pcshape1,dtype='d')
  pcs2 = np.empty(pcshape2,dtype='d')

  X = np.empty(eofshape2,dtype='d')
  U = np.empty(eofshape1,dtype='d')
  # Seed with sinusoids superimposed on random values
  Y = np.random.rand(*eofshape1)
  V = np.random.rand(*eofshape2)
  from math import pi
  for i in range(subspace):
    Y[i,...].reshape(NX1)[:] += np.cos( np.arange(NX1,dtype='d') / NX1 * 2 * pi * (i+1))
    V[i,...].reshape(NX2)[:] += np.cos( np.arange(NX2,dtype='d') / NX2 * 2 * pi * (i+1))

#  raise Exception

  # Workspace for C code
  UtAX  = np.empty([subspace,subspace], dtype='d')
  XtAtU = np.empty([subspace,subspace], dtype='d')
  VtV   = np.empty([subspace,subspace], dtype='d')
  YtY   = np.empty([subspace,subspace], dtype='d')

  # Views over whole variables
  # (rearranged to be compatible with our output eof arrays)
  view1 = View( (var1.axes[iaxis1],) + var1.axes[:iaxis1] + var1.axes[iaxis1+1:] )
  view2 = View( (var2.axes[iaxis2],) + var2.axes[:iaxis2] + var2.axes[iaxis2+1:] )


  # Power-iteration loop: alternately apply the cross-covariance operator
  # and re-orthogonalize until the leading 'num' patterns stop changing.
  converged = False
  for iter_num in range(1,MAX_ITER+1):

    print('iter_num: %d'%iter_num)

    assert Y.shape == U.shape
    assert X.shape == V.shape
    # Previous iteration's estimates become the inputs for this one
    U, Y = Y, U
    X, V = V, X

    # Reset the accumulation arrays for the next approximations
    Y[()] = 0
    V[()] = 0

    # Apply the covariance/correlation matrix, in chunks of the record axis
    # small enough to fit in memory
    for t in range(0,NT,dt):
      # number of timesteps we actually have
      nt = min(dt,NT-t)

      # Read the data
      chunk1 = view1.modify_slice(0, slice(t,t+nt)).get(var1)
      chunk1 = np.ascontiguousarray(chunk1, dtype='d')
      chunk2 = view2.modify_slice(0, slice(t,t+nt)).get(var2)
      chunk2 = np.ascontiguousarray(chunk2, dtype='d')

      ier = lib.build_svds (subspace, nt, NX1, NX2, chunk1, chunk2,
                            X, Y, pcs2[t,...])
      assert ier == 0
      ier = lib.build_svds (subspace, nt, NX2, NX1, chunk2, chunk1,
                            U, V, pcs1[t,...])
      assert ier == 0


    # Useful dot products
    lib.dot(subspace, NX1, U, Y, UtAX)
    lib.dot(subspace, NX2, V, V, VtV)
    lib.dot(subspace, NX1, Y, U, XtAtU)
    lib.dot(subspace, NX1, Y, Y, YtY)

    # Compute surrogate matrices (using all available information from this iteration)
    A1, residues, rank, s = np.linalg.lstsq(UtAX,VtV,rcond=1e-30)
    A2, residues, rank, s = np.linalg.lstsq(XtAtU,YtY,rcond=1e-30)

    # Eigendecomposition on surrogate matrices
    Dy, Qy = np.linalg.eig(np.dot(A1,A2))
    Dv, Qv = np.linalg.eig(np.dot(A2,A1))

    # Sort by eigenvalue (largest first)
    S = np.argsort(np.real(Dy))[::-1]
    Dy = Dy[S]
    Qy = np.ascontiguousarray(Qy[:,S], dtype='d')
    S = np.argsort(np.real(Dv))[::-1]
    Dv = Dv[S]
    Qv = np.ascontiguousarray(Qv[:,S], dtype='d')

    # get estimate of true eigenvalues
    D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
    print(D)

    # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
    lib.transform(subspace, NX1, Qy, Y)
    lib.transform(subspace, NX2, Qv, V)

    # Normalize
    lib.normalize (subspace, NX1, Y)
    lib.normalize (subspace, NX2, V)

    # Converged only once the leading 'num' patterns of both fields are stable
    if not np.allclose(U[:num,...],Y[:num,...], atol=0): continue
    if not np.allclose(X[:num,...],V[:num,...], atol=0): continue
    print('converged after %d iterations'%iter_num)
    converged = True
    break

  # BUG FIX: previously this checked ``iter_num != MAX_ITER``, which falsely
  # reported failure when convergence happened exactly on the last iteration.
  assert converged, "no convergence"

  # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
  lib.fixcov (subspace, NT, NX2, pcs1, pcs2, V)

  # Wrap as pygeode vars, and return
  # Only need some of the eofs for output (the rest might not have even converged yet)
  orderaxis = order(num)

  eof1 = np.array(Y[:num])
  pc1 = np.array(pcs1[...,:num]).transpose()
  eof1 = Var((orderaxis,)+var1.axes[:iaxis1]+var1.axes[iaxis1+1:], values=eof1)
  pc1 = Var((orderaxis,var1.axes[iaxis1]), values = pc1)

  eof2 = np.array(V[:num])
  pc2 = np.array(pcs2[...,:num]).transpose()
  eof2 = Var((orderaxis,)+var2.axes[:iaxis2]+var2.axes[iaxis2+1:], values=eof2)
  pc2 = Var((orderaxis,var2.axes[iaxis2]), values = pc2)

  # Apply weights?
  if weight1 is not False: eof1 /= weight1.sqrt()
  if weight2 is not False: eof2 /= weight2.sqrt()

  # Use correlation instead of covariance?
  # Re-scale the fields by standard deviation
  if matrix == 'cor':
    eof1 *= std1
    eof2 *= std2

  # Give it a name
  eof1.name = prefix1 + "EOF"
  pc1.name = prefix1 + "PC"
  eof2.name = prefix2 + "EOF"
  pc2.name = prefix2 + "PC"

  return eof1, pc1, eof2, pc2
Example #26
0
def difference(X, Y, axes=None, alpha=0.05, Nx_fac = None, Ny_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  Nx_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and
  confidence interval are computed based on the t-statistic in eq (6.19).'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum, shared_axes
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])

  if axes is None:
    # Default (as documented): reduce over all axes shared by X and Y.
    _, ri = shared_axes(srcaxes, [X.axes, Y.axes])
    axes = [srcaxes[i].name for i in ri]

  riaxes = [whichaxis(srcaxes, n) for n in axes]
  raxes = [a for i, a in enumerate(srcaxes) if i in riaxes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  # Total (NaN-blind) number of elements reduced over, per variable.
  # NOTE: np.product was deprecated and removed in NumPy 2.0; use np.prod.
  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays (accumulators over the reduction axes)
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Per-cell sample counts (NaN-aware); these shadow the scalar totals above,
  # which are only needed for the sanity checks.
  Nx = np.zeros(oview.shape, 'd')
  Ny = np.zeros(oview.shape, 'd')

  # Start from NaN so cells with no valid data stay NaN.
  x[()] = np.nan
  y[()] = np.nan
  xx[()] = np.nan
  yy[()] = np.nan
  Nx[()] = np.nan
  Ny[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0)

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Nx) / (Nx - 1)
  yy = (yy - y**2/Ny) / (Ny - 1)
  x /= Nx
  y /= Ny

  # Effective sample sizes (rescaled to account for serial correlation)
  if Nx_fac is not None: eNx = Nx//Nx_fac
  else: eNx = Nx
  if Ny_fac is not None: eNy = Ny//Ny_fac
  else: eNy = Ny

  # Welch t-statistic: difference, standard error, and effective dof (eq 6.20)
  d = x - y
  den = np.sqrt(xx/eNx + yy/eNy)
  df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

  # Signed p-value and confidence interval from the t distribution (eq 6.19)
  p = tdist.cdf(abs(d/den), df)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, df) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=df, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, df, p, ci
Example #27
0
def to_xarray(dataset):
    """
    Converts a PyGeode Dataset into an xarray Dataset.

    Parameters
    ----------
    dataset : pygeode.Dataset
      The dataset to be converted.

    Returns
    -------
    out : xarray.Dataset
      An object which can be used with the xarray package.
    """
    from pygeode.dataset import asdataset
    from pygeode.formats.cfmeta import encode_cf
    from pygeode.view import View
    from dask.base import tokenize
    import dask.array as da
    import xarray as xr

    # Attach CF metadata on the PyGeode side; xarray re-decodes it below.
    cf_dataset = encode_cf(asdataset(dataset))

    arrays = dict()
    # Convert every axis and variable to an xarray.DataArray.
    for var in list(cf_dataset.axes) + list(cf_dataset.vars):
        dim_names = [axis.name for axis in var.axes]

        # Fast path: the values are already in memory, so wrap them directly.
        if hasattr(var, 'values'):
            arrays[var.name] = xr.DataArray(var.values,
                                            dims=dim_names,
                                            attrs=var.atts,
                                            name=var.name)
            continue

        # Lazy path: build a dask task graph under a unique name.
        dask_name = var.name + "-" + tokenize(var)
        graph = dict()
        # Collect the distinct slices taken along each dimension.  View.loop_mem
        # does not expose its chunking choices, so we reconstruct them from the
        # slices it produces.
        seen_slices = [[] for _ in var.axes]
        full_view = View(var.axes)
        for piece in full_view.loop_mem():
            idx = list(map(tuple, piece.integer_indices))
            for seen, sl in zip(seen_slices, idx):
                if sl not in seen:
                    seen.append(sl)
            # The chunk's coordinates are the positions of its slices in the
            # per-dimension slice orderings.
            chunk_pos = [seen.index(sl) for seen, sl in zip(seen_slices, idx)]
            graph[tuple([dask_name] + chunk_pos)] = (var.getview, piece, False)

        # Chunk sizes are the lengths of the recorded slices.
        chunk_sizes = [[len(sl) for sl in seen] for seen in seen_slices]
        lazy_values = da.Array(graph, dask_name, chunk_sizes, dtype=var.dtype)
        arrays[var.name] = xr.DataArray(lazy_values,
                                        dims=dim_names,
                                        attrs=var.atts,
                                        name=var.name)

    # Assemble the Dataset and decode the CF metadata on the xarray side.
    result = xr.Dataset(arrays, attrs=cf_dataset.atts)
    return xr.conventions.decode_cf(result)
Example #28
0
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21).'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum, shared_axes
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])

  if axes is None:
    # Default (as documented): reduce over all axes shared by X and Y.
    _, ri = shared_axes(srcaxes, [X.axes, Y.axes])
    axes = [srcaxes[i].name for i in ri]

  riaxes = [whichaxis(srcaxes, n) for n in axes]
  raxes = [a for i, a in enumerate(srcaxes) if i in riaxes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  # Total (NaN-blind) element counts along the reduction axes.
  # NOTE: np.product was deprecated and removed in NumPy 2.0; use np.prod.
  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays (accumulators over the reduction axes)
  d = np.zeros(oview.shape, 'd')
  dd = np.zeros(oview.shape, 'd')

  # Per-cell sample count (NaN-aware)
  N = np.zeros(oview.shape, 'd')

  # Start from NaN so cells with no valid data stay NaN.
  d[()] = np.nan
  dd[()] = np.nan
  N[()] = np.nan

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
    # Sum of weights (kludge to get masking right)
    N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  dd = (dd - d**2/N) / (N - 1)
  # BUGFIX: divide by the per-cell NaN-aware count N, not the scalar total Nx
  # (these differ whenever the data contain missing values).
  d /= N

  # Effective sample size (rescaled to account for serial correlation)
  if N_fac is not None: eN = N//N_fac
  else: eN = N

  # One-sample t test on the paired differences (eq 6.21)
  den = np.sqrt(dd/(eN - 1))

  p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
  ci = tdist.ppf(1. - alpha/2, eN - 1) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn: name = xn
  else: name = '%s-%s'%(xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=eN-1, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else: # Degenerate case
    return d, eN-1, p, ci
Example #29
0
def SVD(var1,
        var2,
        num=1,
        subspace=-1,
        iaxis=Time,
        weight1=True,
        weight2=True,
        matrix='cov'):
    """
  Finds coupled EOFs of two fields.  Note that the mean/trend/etc. is NOT
  removed in this routine.

  Parameters
  ----------
  var1, var2 : :class:`Var`
    The variables to analyse.

  num : integer
    The number of EOFs to compute (default is ``1``).

  weight1, weight2 : optional 
    Weights to use for defining orthogonality in the var1, var2 domains,
    respectively.  Patterns X and Y in the var1 domain are orthogonal if the
    sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
    orthogonal if the sum over Z*W*weights2 is 0. Default is to use internal
    weights defined for var1 accessed by :meth:`Var.getweights()`. If set to
    ``False`` no weighting is used.

  matrix : string, optional ['cov']
    Which matrix we are diagonalizing (default is 'cov'). 
     * 'cov': covariance matrix of var1 & var2
     * 'cor': correlation matrix of var1 & var2

  iaxis : Axis identifier
    The principal component / expansion coefficient axis, i.e., the 'time'
    axis. Can be an integer (the axis number, leftmost = 0), the axis name
    (string), or a Pygeode axis class.  If not specified, will try to use
    pygeode.timeaxis.Time, and if that fails, the leftmost axis.

  Returns
  -------
  (eof1, pc1, eof2, pc2): tuple
    * eof1: The coupled eof patterns for var1. 
    * pc1: The principal component / expansion coefficients for var1.
    * eof2: The coupled eof patterns for var2.
    * pc2: The principal component / expansion coefficients for var2.

  Notes
  -----
    Multiple orders of EOFs are concatenated along an 'order' axis.
  """
    import numpy as np
    from pygeode.timeaxis import Time
    from pygeode.var import Var
    from pygeode.view import View
    from pygeode import MAX_ARRAY_SIZE
    from warnings import warn
    from pygeode import svdcore as lib

    # Normalize the matrix-type argument.
    if matrix in ('cov', 'covariance'): matrix = 'cov'
    elif matrix in ('cor', 'corr', 'correlation'): matrix = 'cor'
    else:
        # BUGFIX: the format specifier was '%', which raises ValueError
        # instead of producing the warning message.
        warn("invalid matrix type '%s'.  Defaulting to covariance." % matrix,
             stacklevel=2)
        matrix = 'cov'

    MAX_ITER = 1000

    # Iterate over more EOFs than we need
    # (this helps with convergence)
    # TODO: a more rigorous formula for the optimum number of EOFs to use
    if subspace <= 0: subspace = 2 * num + 8
    if subspace < num: subspace = num  # Just in case

    # Remember the names
    prefix1 = var1.name + '_' if var1.name != '' else ''
    prefix2 = var2.name + '_' if var2.name != '' else ''

    # Apply weights?
    if weight1 is True: weight1 = var1.getweights()
    if weight1 is not False:
        assert not weight1.hasaxis(
            iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight1.sum() / weight1.size
        weight1 /= W
        # Apply the weights
        var1 *= weight1.sqrt()
    if weight2 is True: weight2 = var2.getweights()
    if weight2 is not False:
        assert not weight2.hasaxis(
            iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight2.sum() / weight2.size
        weight2 /= W
        # Apply the weights
        var2 *= weight2.sqrt()

    #TODO: allow multiple iteration axes (i.e., time and ensemble)
    iaxis1 = var1.whichaxis(iaxis)
    iaxis2 = var2.whichaxis(iaxis)

    assert var1.axes[iaxis1] == var2.axes[
        iaxis2], "incompatible iteration axes"
    del iaxis  # so we don't use this by accident

    # Special case: can load entire variable in memory
    # This will save some time, especially if the field is stored on disk, or is heavily derived
    if var1.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var1))
        var1 = var1.load()
    if var2.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var2))
        var2 = var2.load()

    # Use correlation instead of covariance?
    # (normalize by standard deviation)
    if matrix == 'cor':
        print('computing standard deviations')
        std1 = var1.stdev(iaxis1).load()
        std2 = var2.stdev(iaxis2).load()
        # account for grid points with zero standard deviation?
        std1.values = std1.values + (std1.values == 0)
        std2.values = std2.values + (std2.values == 0)
        var1 /= std1
        var2 /= std2

    # Shapes of the EOF and PC work arrays (subspace leading / trailing).
    eofshape1 = (subspace, ) + var1.shape[:iaxis1] + var1.shape[iaxis1 + 1:]
    eofshape2 = (subspace, ) + var2.shape[:iaxis2] + var2.shape[iaxis2 + 1:]

    pcshape1 = (var1.shape[iaxis1], subspace)
    pcshape2 = (var2.shape[iaxis2], subspace)

    # number of spatial grid points
    NX1 = var1.size // var1.shape[iaxis1]
    assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
    NX2 = var2.size // var2.shape[iaxis2]
    assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'

    # Total number of timesteps
    NT = var1.shape[iaxis1]
    # Number of timesteps we can do in one fetch
    dt = MAX_ARRAY_SIZE // max(NX1, NX2)

    pcs1 = np.empty(pcshape1, dtype='d')
    pcs2 = np.empty(pcshape2, dtype='d')

    X = np.empty(eofshape2, dtype='d')
    U = np.empty(eofshape1, dtype='d')
    # Seed with sinusoids superimposed on random values
    Y = np.random.rand(*eofshape1)
    V = np.random.rand(*eofshape2)
    from math import pi
    for i in range(subspace):
        Y[i, ...].reshape(NX1)[:] += np.cos(
            np.arange(NX1, dtype='d') / NX1 * 2 * pi * (i + 1))
        V[i, ...].reshape(NX2)[:] += np.cos(
            np.arange(NX2, dtype='d') / NX2 * 2 * pi * (i + 1))

    # Workspace for C code
    UtAX = np.empty([subspace, subspace], dtype='d')
    XtAtU = np.empty([subspace, subspace], dtype='d')
    VtV = np.empty([subspace, subspace], dtype='d')
    YtY = np.empty([subspace, subspace], dtype='d')

    # Views over whole variables
    # (rearranged to be compatible with our output eof arrays)
    view1 = View((var1.axes[iaxis1], ) + var1.axes[:iaxis1] +
                 var1.axes[iaxis1 + 1:])
    view2 = View((var2.axes[iaxis2], ) + var2.axes[:iaxis2] +
                 var2.axes[iaxis2 + 1:])

    # Power-iteration loop; stops as soon as the leading `num` patterns
    # are unchanged between iterations.
    converged = False
    for iter_num in range(1, MAX_ITER + 1):

        print('iter_num: %d' % iter_num)

        assert Y.shape == U.shape
        assert X.shape == V.shape
        # Swap: previous iteration's estimates become the inputs.
        U, Y = Y, U
        X, V = V, X

        # Reset the accumulation arrays for the next approximations
        Y[()] = 0
        V[()] = 0

        # Apply the covariance/correlation matrix
        for t in range(0, NT, dt):
            # number of timesteps we actually have
            nt = min(dt, NT - t)

            # Read the data
            chunk1 = view1.modify_slice(0, slice(t, t + nt)).get(var1)
            chunk1 = np.ascontiguousarray(chunk1, dtype='d')
            chunk2 = view2.modify_slice(0, slice(t, t + nt)).get(var2)
            chunk2 = np.ascontiguousarray(chunk2, dtype='d')

            ier = lib.build_svds(subspace, nt, NX1, NX2, chunk1, chunk2, X, Y,
                                 pcs2[t, ...])
            assert ier == 0
            ier = lib.build_svds(subspace, nt, NX2, NX1, chunk2, chunk1, U, V,
                                 pcs1[t, ...])
            assert ier == 0

        # Useful dot products
        lib.dot(subspace, NX1, U, Y, UtAX)
        lib.dot(subspace, NX2, V, V, VtV)
        lib.dot(subspace, NX1, Y, U, XtAtU)
        lib.dot(subspace, NX1, Y, Y, YtY)

        # Compute surrogate matrices (using all available information from this iteration)
        A1, residues, rank, s = np.linalg.lstsq(UtAX, VtV, rcond=1e-30)
        A2, residues, rank, s = np.linalg.lstsq(XtAtU, YtY, rcond=1e-30)

        # Eigendecomposition on surrogate matrices
        Dy, Qy = np.linalg.eig(np.dot(A1, A2))
        Dv, Qv = np.linalg.eig(np.dot(A2, A1))

        # Sort by eigenvalue (largest first)
        S = np.argsort(np.real(Dy))[::-1]
        Dy = Dy[S]
        Qy = np.ascontiguousarray(Qy[:, S], dtype='d')
        S = np.argsort(np.real(Dv))[::-1]
        Dv = Dv[S]
        Qv = np.ascontiguousarray(Qv[:, S], dtype='d')

        # get estimate of true eigenvalues
        D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
        print(D)

        # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
        lib.transform(subspace, NX1, Qy, Y)
        lib.transform(subspace, NX2, Qv, V)

        # Normalize
        lib.normalize(subspace, NX1, Y)
        lib.normalize(subspace, NX2, V)

        if not np.allclose(U[:num, ...], Y[:num, ...], atol=0): continue
        if not np.allclose(X[:num, ...], V[:num, ...], atol=0): continue
        print('converged after %d iterations' % iter_num)
        converged = True
        break

    # BUGFIX: use an explicit flag -- comparing iter_num to MAX_ITER
    # wrongly rejected a run that converged exactly on the last iteration.
    assert converged, "no convergence"

    # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
    lib.fixcov(subspace, NT, NX2, pcs1, pcs2, V)

    # Wrap as pygeode vars, and return
    # Only need some of the eofs for output (the rest might not have even converged yet)
    orderaxis = order(num)

    eof1 = np.array(Y[:num])
    pc1 = np.array(pcs1[..., :num]).transpose()
    eof1 = Var((orderaxis, ) + var1.axes[:iaxis1] + var1.axes[iaxis1 + 1:],
               values=eof1)
    pc1 = Var((orderaxis, var1.axes[iaxis1]), values=pc1)

    eof2 = np.array(V[:num])
    pc2 = np.array(pcs2[..., :num]).transpose()
    eof2 = Var((orderaxis, ) + var2.axes[:iaxis2] + var2.axes[iaxis2 + 1:],
               values=eof2)
    pc2 = Var((orderaxis, var2.axes[iaxis2]), values=pc2)

    # Apply weights?
    # (undo the sqrt-weighting applied to the input fields)
    if weight1 is not False: eof1 /= weight1.sqrt()
    if weight2 is not False: eof2 /= weight2.sqrt()

    # Use correlation instead of covariance?
    # Re-scale the fields by standard deviation
    if matrix == 'cor':
        eof1 *= std1
        eof2 *= std2

    # Give it a name
    eof1.name = prefix1 + "EOF"
    pc1.name = prefix1 + "PC"
    eof2.name = prefix2 + "EOF"
    pc2.name = prefix2 + "PC"

    return eof1, pc1, eof2, pc2
Example #30
0
def correlate(X, Y, axes=None, pbar=None):
# {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  rho, p : :class:`Var`
    The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding the given
  result under the hypothesis that the true correlation coefficient between X
  and Y is zero. It is computed from the t-statistic given in eq (8.7), in
  section 8.2.3, and assumes normally distributed quantities.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Put all the axes being reduced over at the end 
  # so that we can reshape 
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    # Restrict the reduction to the requested axes; everything else
    # (including shared axes not listed) becomes an output axis.
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes: 
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new
    
  # Output axes first, reduction axes last (see reshape note above).
  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes) 
  iview = View(inaxes) 
  # Positions of the reduction axes within `inaxes`.
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays (NaN-initialized accumulators over the reduction axes)
  x  = np.zeros(oview.shape, 'd')*np.nan
  y  = np.zeros(oview.shape, 'd')*np.nan
  xx = np.zeros(oview.shape, 'd')*np.nan
  yy = np.zeros(oview.shape, 'd')*np.nan
  xy = np.zeros(oview.shape, 'd')*np.nan
  Na = np.zeros(oview.shape, 'd')*np.nan

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  # Accumulate sums, sums of squares, cross products and valid-sample counts.
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata*ydata

    # Broadcast xdata/ydata up to the shape of the product, then propagate
    # NaNs so that a missing value in either field masks both.
    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl]  = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Sum of weights
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  eps = 1e-14
  # Mask of cells with at least one valid sample (NaN counts stay masked out).
  imsk = ~(Na < eps)

  # Convert raw sums into (unnormalized) central moments.
  xx[imsk] -= (x*x)[imsk]/Na[imsk]
  yy[imsk] -= (y*y)[imsk]/Na[imsk]
  xy[imsk] -= (x*y)[imsk]/Na[imsk]

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx*yy)[imsk])
  rho[den > 0.] = xy[den > 0.] / np.sqrt(xx*yy)[den > 0.]

  den = 1 - rho**2
  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  den[den < eps] = eps

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  # t-statistic (eq 8.7); the p-value carries the sign of rho by convention.
  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.)/den[imsk])
  p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])
  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X' # Note: could write:  xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
  P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
  return Rho, P
Example #31
0
def isnonzero(X, axes=None, alpha=0.05, N_fac = None, pbar=None):
# {{{
  r'''Computes the mean value and statistics of X, against the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the dataset
    divided by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Three quantities are computed:

    * The mean value of X
    * The probability of the computed value if the population mean was zero
    * The confidence interval of the mean at the level specified by alpha

    If the average is taken over all axes of X resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`.
  The p-value and confidence interval are computed for the t-statistic defined in
  eq (6.61) of von Storch and Zwiers 1999.'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  if axes is None:
    # Default (as documented): reduce over all axes of X.
    riaxes = list(range(len(X.axes)))
  else:
    riaxes = [X.whichaxis(n) for n in axes]
  raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
  oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
  oview = View(oaxes)

  # Total (NaN-blind) number of elements reduced over.
  # NOTE: np.product was deprecated and removed in NumPy 2.0; use np.prod.
  N = np.prod([len(X.axes[i]) for i in riaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert N > 1, '%s has only one element along the reduction axes' % X.name

  # Construct work arrays (NaN-initialized accumulators)
  x = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  Na = np.zeros(oview.shape, 'd')

  x[()] = np.nan
  xx[()] = np.nan
  Na[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Na[outsl] = np.nansum([Na[outsl], npnansum(1. + xdata*0., riaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Na) / (Na - 1)
  x /= Na

  # Effective sample sizes (rescaled to account for serial correlation)
  if N_fac is not None:
    eN = N//N_fac
    eNa = Na//N_fac
  else:
    eN = N
    eNa = Na

  # Standard deviation of the mean; t test against zero (eq 6.61)
  sdom = np.sqrt(xx/eNa)

  p = tdist.cdf(abs(x/sdom), eNa - 1)*np.sign(x)
  ci = tdist.ppf(1. - alpha/2, eNa - 1) * sdom

  name = X.name if X.name != '' else 'X'

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    X = Var(oaxes, values=x, name=name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([X, P, CI])
  else: # Degenerate case
    return x, p, ci
Example #32
0
def to_xarray(dataset):
  """
  Converts a PyGeode Dataset into an xarray Dataset.

  Parameters
  ----------
  dataset : pygeode.Dataset
    The dataset to be converted.

  Returns
  -------
  out : xarray.Dataset
    An object which can be used with the xarray package.
  """
  from pygeode.dataset import asdataset
  from pygeode.formats.cfmeta import encode_cf
  from pygeode.view import View
  from dask.base import tokenize
  import dask.array as da
  import xarray as xr
  dataset = asdataset(dataset)
  # Encode the axes/variables with CF metadata.
  dataset = encode_cf(dataset)
  out = dict()
  # Loop over each axis and variable.
  for var in list(dataset.axes) + list(dataset.vars):
    # Generate a unique name to identify it with dask.
    name = var.name + "-" + tokenize(var)
    dsk = dict()
    dims = [a.name for a in var.axes]

    # Special case: already have the values in memory.
    if hasattr(var,'values'):
      out[var.name] = xr.DataArray(var.values, dims=dims, attrs=var.atts, name=var.name)
      continue

    # Keep track of all the slices that were made over each dimension.
    # This information will be used to determine the "chunking" that was done
    # on the variable from inview.loop_mem().
    slice_order = [[] for a in var.axes]
    # Break up the variable into portions that are small enough to fit
    # in memory.  These will become the "chunks" for dask.
    inview = View(var.axes)
    for outview in inview.loop_mem():
      # BUGFIX: materialize as a list of tuples.  The original code used
      # map(), which in Python 3 returns a one-shot iterator; it would be
      # exhausted by the first zip() below, leaving 'ind' empty and
      # producing wrong dask graph keys.
      integer_indices = [tuple(sl) for sl in outview.integer_indices]
      # Determine *how* loop_mem is splitting the axes, and define the chunk
      # sizes accordingly.
      # A little indirect, but loop_mem doesn't make its chunking choices
      # available to the caller.
      for o, sl in zip(slice_order, integer_indices):
        if sl not in o:
          o.append(sl)
      # Position of this chunk along each dimension of the dask array.
      ind = [o.index(sl) for o, sl in zip(slice_order, integer_indices)]
      # Add this chunk to the dask array.
      key = tuple([name] + ind)
      dsk[key] = (var.getview, outview, False)
    # Construct the dask array.
    # BUGFIX: chunk sizes must be concrete sequences of ints, not lazy
    # Python 3 map objects, for da.Array to interpret them correctly.
    chunks = [tuple(len(sl) for sl in o) for o in slice_order]
    arr = da.Array(dsk, name, chunks, dtype=var.dtype)
    # Wrap this into an xarray.DataArray (with metadata and named axes).
    out[var.name] = xr.DataArray(arr, dims=dims, attrs=var.atts, name=var.name)
  # Build the final xarray.Dataset.
  out = xr.Dataset(out, attrs=dataset.atts)
  # Re-decode the CF metadata on the xarray side.
  out = xr.conventions.decode_cf(out)
  return out