def prep (var, iaxis, weight, out):
    """Prepare a variable for EOF analysis.

    Validates the ``out`` request through ``whichout``, passes ``var``
    through ``apply_weights``, and splits the axes of the result into the
    record ("time") axes and the remaining ("space") axes.

    Parameters
    ----------
    var : Var
        Input variable; must have at least two axes.
    iaxis : int, Axis subclass, list/tuple of those, or None
        Record axis/axes.  When None, the ``Time`` axis is used if ``var``
        has one; otherwise a warning is issued and axis 0 is used.
    weight
        Forwarded unchanged to ``apply_weights``.
    out
        Output specification; only checked here via ``whichout``.

    Returns
    -------
    tuple
        ``(var, time, space)`` where ``time`` and ``space`` are ``View``
        objects built from the record axes and from all other axes.  ``var``
        is loaded into memory when its size is at most ``MAX_ARRAY_SIZE``.
    """
    from pygeode.timeaxis import Time
    from pygeode.var import Var
    from pygeode.axis import Axis
    from pygeode.view import View
    from warnings import warn
    from pygeode import MAX_ARRAY_SIZE

    assert isinstance(var, Var)
    assert var.naxes >= 2, "need at least 2 axes"

    # The parsed output request is not needed here; the call only validates it.
    whichout(out)

    var = apply_weights(var, weight=weight)

    # Work out which axis/axes hold the records.
    record = iaxis
    if record is None:
        if var.hasaxis(Time):
            record = Time
        else:
            warn("No explicit record axis provided. Using the first axis.", stacklevel=2)
            record = 0

    # Always carry the record axes as a sequence
    # (there may be more than one, e.g. time and ensemble).
    if not isinstance(record, (list, tuple)):
        assert isinstance(record, int) or issubclass(record, Axis), 'unknown iaxis type %s'%type(record)
        record = [record]

    # Integer positions first, then the axis objects themselves.
    record_ids = [var.whichaxis(a) for a in record]
    space_ids = [i for i in range(var.naxes) if i not in record_ids]
    record_axes = [var.axes[i] for i in record_ids]
    space_axes = [var.axes[i] for i in space_ids]

    # Views bundle the axes together and provide 'shape' and 'size'.
    time = View(axes=record_axes)
    space = View(axes=space_axes)

    # Preload the data, if it is small enough.
    if var.size <= MAX_ARRAY_SIZE:
        var = var.load()

    return var, time, space
def getview (self, view, pbar):
    """Build the data for ``view`` while always fetching some axes whole.

    The axes listed in ``iaxes`` are requested unsliced from ``old_getview``;
    the slices that ``view`` asked for along those axes are applied to each
    returned chunk afterwards.  ``iaxes`` and ``old_getview`` are not defined
    in this function -- presumably they come from an enclosing scope
    (TODO confirm against the surrounding code).

    Parameters
    ----------
    view : View
        The requested view; the result has shape ``view.shape``.
    pbar
        Progress bar; ``pbar.part(i, n)`` is handed to each chunk request.

    Returns
    -------
    numpy.ndarray
        Array of shape ``view.shape`` and dtype ``self.dtype``.
    """
    from pygeode.view import View
    import numpy as np

    # Indices of the axes that must be fetched whole
    fullaxis_ind = [self.whichaxis(a) for a in iaxes]
    # Put the other axes first, the full axes last
    ind = [i for i in range(self.naxes) if i not in fullaxis_ind] + fullaxis_ind
    # Inverse permutation (maps the new order back to the original order)
    rind = [-1] * len(ind)
    for i, I in enumerate(ind):
        rind[I] = i
    assert len(ind) == self.naxes and len(set(ind)) == self.naxes

    # Construct a view with this new order of axes, and with the specified
    # axes unsliced.
    axes = tuple([view.axes[i] for i in ind])
    slices = tuple([view.slices[i] for i in ind])
    bigview = View(axes, slices=slices)
    bigview = bigview.unslice(*fullaxis_ind)

    viewloop = list(bigview.loop_mem())
    out = np.empty(view.shape, self.dtype)

    # Slicing of the 'full' axes that recovers what was originally requested.
    # This does not depend on the chunk, so it is built once.  It must be a
    # tuple: NumPy no longer treats a list of slices as a multidimensional
    # index.
    insl = [slice(None)] * self.naxes
    for I in fullaxis_ind:
        insl[I] = view.slices[I]
    insl = tuple(insl)

    for i, smallview in enumerate(viewloop):
        for I in fullaxis_ind:
            assert smallview.shape[I] == bigview.shape[I], "can't get all of axis '%s' at once"%view.axes[I].name

        # Slicing relative to the original view
        outsl = tuple(smallview.map_to(bigview.clip()).slices)

        # Reorder the axes to the original order
        axes = tuple([smallview.axes[I] for I in rind])
        assert axes == self.axes
        slices = tuple([smallview.slices[I] for I in rind])
        smallview = View(axes, slices=slices)

        # Reorder outsl the same way
        outsl = tuple([outsl[I] for I in rind])

        # Get the data
        tmp = old_getview(self, smallview, pbar=pbar.part(i, len(viewloop)))

        out[outsl] = tmp[insl]

    return out
def getview(self, view, pbar):
    """Assemble the requested view from the files listed in ``self._table``.

    The output starts out filled with NaN.  For every ``(filename, opener,
    axes)`` entry whose axes intersect the requested axes, the variable named
    ``self._varname`` is read over the intersection and copied into the
    matching positions of the output.

    Parameters
    ----------
    view : View
        The requested view; the result has shape ``view.shape``.
    pbar
        Progress bar; updated with the percentage of points read so far.

    Returns
    -------
    numpy.ndarray
        Array of shape ``view.shape`` and dtype ``self.dtype``; positions not
        covered by any file keep the NaN fill.
    """
    import numpy as np
    from pygeode.view import View, simplify

    out = np.empty(view.shape, dtype=self.dtype)
    out[()] = float('nan')

    out_axes = view.clip().axes

    # Loop over all available files.
    N = 0  # Number of points covered so far
    for filename, opener, axes in self._table:
        subaxes = [
            self._axis_manager._get_axis_intersection([a1, a2])
            for a1, a2 in zip(out_axes, axes)
        ]
        # Nothing to read from this file if any axis has no overlap.
        if any(len(a) == 0 for a in subaxes):
            continue

        reorder = []
        mask = []
        for a1, a2 in zip(out_axes, subaxes):
            # Figure out where the input chunk fits into the output
            pos = np.searchsorted(a2.values, a1.values)
            # Mask out elements that we don't actually have in the chunk.
            # Explicit bool dtype keeps this a boolean mask even when empty.
            m = np.array(
                [r < len(a2.values) and a2.values[r] == v
                 for r, v in zip(pos, a1.values)],
                dtype=bool)
            # Convert mask to integer indices
            m = np.arange(len(m))[m]
            # and then to a slice (where possible)
            m = simplify(m)
            pos = pos[m]
            # Try to simplify the re-ordering array
            if np.all(pos == np.sort(pos)):
                pos = simplify(pos)
            reorder.append(pos)
            mask.append(m)

        var = [v for v in opener(filename) if v.name == self._varname][0]
        chunk = View(subaxes).get(var)

        # Note: this may break if there is more than one axis with integer indices.
        assert len([r for r in reorder if isinstance(r, (tuple, np.ndarray))]) <= 1, "Unhandled advanced indexing case."
        assert len([m for m in mask if isinstance(m, (tuple, np.ndarray))]) <= 1, "Unhandled advanced indexing case."

        # Index with tuples: NumPy no longer interprets a list of per-axis
        # indices as a multidimensional index.
        out[tuple(mask)] = chunk[tuple(reorder)]

        N = N + chunk.size
        pbar.update(100. * N / out.size)

    return out
def getview (self, view, pbar):
    """Return the data of the wrapped variable ``self.var`` for ``view``.

    The slices and integer indices of ``view`` are forced onto the axes of
    ``self.var`` (a brute-force mapping that should work when the two sets of
    axes are in 1:1 correspondence), and the data is read through that view.
    """
    from pygeode.view import View
    import numpy as np

    inner_view = View(
        self.var.axes,
        force_slices=view.slices,
        force_integer_indices=view.integer_indices,
    )
    return inner_view.get(self.var, pbar=pbar)
def write_xdr(var, wfile):
    """Write the values of ``var`` to ``wfile``, preceded by a length header.

    The header is ``var.size`` packed twice as big-endian 32-bit integers.
    The values follow, chunk by chunk, each encoded by the ``lib.*toStr``
    routine selected from the chunk's DAP type (looked up in ``np2dap``).

    Parameters
    ----------
    var
        Object with ``size`` and either a ``values`` array or ``axes``/
        ``naxes`` from which memory-sized views can be built.
    wfile
        Writable binary file-like object.

    Raises
    ------
    TypeError
        If a chunk maps to a DAP type with no encoder.  Previously this fell
        through and wrote whatever ``s`` last held, or failed with an
        unbound-local error.
    """
    import struct
    import numpy as np
    from pygeode.view import View

    wfile.write(struct.pack('!2l', var.size, var.size))

    # Break the values into memory-friendly chunks
    if hasattr(var, 'values'):
        values_iter = [var.values]
    else:
        viewloop = View(var.axes).loop_mem()
        #TODO: make this more general - should we be futzing around with the axes at this level
        # Break it up even further along the first axis, so a long read over
        # the whole dataset is not started in one go.
        if var.naxes > 2:
            viewloop = [
                v.modify_slice(0, [index])
                for v in viewloop
                for index in v.integer_indices[0]
            ]
        # Trap and handle any I/O errors (lazily, one chunk at a time)
        values_iter = (get_data_trap_io(v, var) for v in viewloop)

    for values in values_iter:
        daptype = np2dap[values.dtype.name]
        if daptype in ('Byte', 'String'):
            encoded = lib.int8toStr(np.ascontiguousarray(values, 'uint8'))
        elif daptype in ('UInt16', 'Int16', 'UInt32', 'Int32'):
            # All of these are sent as 32-bit integers.
            encoded = lib.int32toStr(np.ascontiguousarray(values, 'int32'))
        elif daptype == 'Float32':
            encoded = lib.float32toStr(np.ascontiguousarray(values, 'float32'))
        elif daptype == 'Float64':
            encoded = lib.float64toStr(np.ascontiguousarray(values, 'float64'))
        else:
            raise TypeError("unsupported DAP type '%s'" % daptype)
        wfile.write(encoded)
def write_xdr(var, wfile):
    """Write the values of ``var`` to ``wfile``, preceded by a length header.

    The header is ``var.size`` packed twice as big-endian 32-bit integers.
    The values follow, chunk by chunk, each encoded by the ``lib.*toStr``
    routine selected from the chunk's DAP type (looked up in ``np2dap``).

    Parameters
    ----------
    var
        Object with ``size`` and either a ``values`` array or ``axes``/
        ``naxes`` from which memory-sized views can be built.
    wfile
        Writable binary file-like object.

    Raises
    ------
    TypeError
        If a chunk maps to a DAP type with no encoder.
    """
    import struct
    import numpy as np
    from pygeode.view import View

    # The format needs a type code after the repeat count: '!2' alone is
    # rejected by struct.pack.  'l' is a 4-byte integer under '!'.
    wfile.write(struct.pack('!2l', var.size, var.size))

    # Break the values into memory-friendly chunks
    if hasattr(var, 'values'):
        values_iter = [var.values]
    else:
        viewloop = View(var.axes).loop_mem()
        #TODO: make this more general - should we be futzing around with the axes at this level
        # Break it up even further along the first axis, so a long read over
        # the whole dataset is not started in one go.
        if var.naxes > 2:
            viewloop = [
                v.modify_slice(0, [index])
                for v in viewloop
                for index in v.integer_indices[0]
            ]
        # Trap and handle any I/O errors (lazily, one chunk at a time)
        values_iter = (get_data_trap_io(v, var) for v in viewloop)

    for values in values_iter:
        daptype = np2dap[values.dtype.name]
        if daptype in ('Byte', 'String'):
            encoded = lib.int8toStr(np.ascontiguousarray(values, 'uint8'))
        elif daptype in ('UInt16', 'Int16', 'UInt32', 'Int32'):
            # All of these are sent as 32-bit integers.
            encoded = lib.int32toStr(np.ascontiguousarray(values, 'int32'))
        elif daptype == 'Float32':
            encoded = lib.float32toStr(np.ascontiguousarray(values, 'float32'))
        elif daptype == 'Float64':
            encoded = lib.float64toStr(np.ascontiguousarray(values, 'float64'))
        else:
            # Previously this fell through and reused a stale/unbound value.
            raise TypeError("unsupported DAP type '%s'" % daptype)
        wfile.write(encoded)
def getview (self, view, pbar):
    """Assemble the requested view from the files listed in ``self._table``.

    The output starts out filled with NaN.  For every ``(filename, opener,
    axes)`` entry whose axes intersect the requested axes, the variable named
    ``self._varname`` is read over the intersection and copied into the
    matching positions of the output.

    Parameters
    ----------
    view : View
        The requested view; the result has shape ``view.shape``.
    pbar
        Progress bar; updated with the percentage of points read so far.

    Returns
    -------
    numpy.ndarray
        Array of shape ``view.shape`` and dtype ``self.dtype``; positions not
        covered by any file keep the NaN fill.
    """
    import numpy as np
    from pygeode.view import View, simplify

    out = np.empty(view.shape, dtype=self.dtype)
    out[()] = float('nan')

    out_axes = view.clip().axes

    # Loop over all available files.
    N = 0  # Number of points covered so far
    for filename, opener, axes in self._table:
        subaxes = [self._axis_manager._get_axis_intersection([a1, a2])
                   for a1, a2 in zip(out_axes, axes)]
        # Skip files that have no overlap with the request along some axis.
        if any(len(a) == 0 for a in subaxes):
            continue

        reorder = []
        mask = []
        for a1, a2 in zip(out_axes, subaxes):
            # Figure out where the input chunk fits into the output
            pos = np.searchsorted(a2.values, a1.values)
            # Mask out elements that we don't actually have in the chunk.
            # Explicit bool dtype keeps this a boolean mask even when empty.
            m = np.array(
                [r < len(a2.values) and a2.values[r] == v
                 for r, v in zip(pos, a1.values)],
                dtype=bool)
            # Convert mask to integer indices
            m = np.arange(len(m))[m]
            # and then to a slice (where possible)
            m = simplify(m)
            pos = pos[m]
            # Try to simplify the re-ordering array
            if np.all(pos == np.sort(pos)):
                pos = simplify(pos)
            reorder.append(pos)
            mask.append(m)

        var = [v for v in opener(filename) if v.name == self._varname][0]
        chunk = View(subaxes).get(var)

        # Note: this may break if there is more than one axis with integer indices.
        assert len([r for r in reorder if isinstance(r,(tuple,np.ndarray))]) <= 1, "Unhandled advanced indexing case."
        assert len([m for m in mask if isinstance(m,(tuple,np.ndarray))]) <= 1, "Unhandled advanced indexing case."

        # Index with tuples: NumPy no longer interprets a list of per-axis
        # indices as a multidimensional index.
        out[tuple(mask)] = chunk[tuple(reorder)]

        N = N + chunk.size
        pbar.update(100.*N/out.size)

    return out
def EOF_cov (x, num=1, iaxis=None, weight=True, out=None):
    """Compute EOFs from an explicitly accumulated covariance matrix.

    The input is prepared with ``prep``, the space-by-space covariance is
    summed over memory-sized chunks, and its eigen-decomposition
    (``numpy.linalg.eigh``) gives the leading ``num`` patterns.  The data is
    then projected onto those patterns to obtain the timeseries, and the
    results are handed to ``finalize``.

    Parameters
    ----------
    x : Var
        Input variable.
    num : int
        Number of EOFs to keep.
    iaxis, weight, out
        Forwarded to ``prep`` / ``finalize``.
    """
    import numpy as np
    from pygeode.view import View

    x, time, space = prep (x, iaxis, weight=weight, out=out)

    nspace = space.size

    # Accumulate the covariance matrix chunk by chunk
    cov = np.zeros([nspace, nspace], dtype='d')
    for chunk_view in View(x.axes).loop_mem():
        X = chunk_view.get(x)
        assert X.size >= space.size, "Spatial pattern is too large"
        X = X.reshape(-1, nspace)
        cov += np.dot(X.transpose(), X)

    # Eigenvalues come back in ascending order; reverse to get the largest first
    evals, evecs = np.linalg.eigh(cov/(time.size-1))
    variance = evals.sum()
    eig = np.sqrt(evals[::-1][:num])
    eof = evecs.transpose()[::-1,:][:num,:]

    # Project each chunk onto the patterns to get the timeseries
    pieces = []
    for chunk_view in View(x.axes).loop_mem():
        X = chunk_view.get(x).reshape(-1, nspace)
        pieces.append(np.dot(eof, X.transpose()))
    pc = np.concatenate(pieces, axis=1)

    # Normalize the timeseries by the corresponding eig value
    pc /= eig.reshape(num,1)

    return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
def write_var(ncfile, dataset, unlimited=None, compress=False):  # {{{
    """Define and write every variable of ``dataset`` into ``ncfile``.

    Dimensions are created for the combined axes of all variables, each
    variable is created with its attributes, the dataset attributes are set
    as global attributes, and the data is then written in memory-sized
    chunks while a progress bar is advanced.

    Parameters
    ----------
    ncfile
        Open output file object providing ``createDimension``,
        ``createVariable``, ``setncatts``, ``variables`` and ``filepath``.
    dataset
        Object with ``vars`` and ``atts``.
    unlimited : str, optional
        Name of the axis whose dimension is created with ``size=None``.
    compress : bool
        Passed as ``zlib`` when creating each variable.
    """
    from pygeode.view import View
    from pygeode.axis import Axis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.tools import combine_axes

    varlist = list(dataset.vars)
    all_axes = combine_axes(v.axes for v in varlist)

    # Define the dimensions
    for axis in all_axes:
        dim_size = None if axis.name == unlimited else len(axis)
        ncfile.createDimension(axis.name, size=dim_size)

    # Define the variables (including axes)
    for var in varlist:
        ncvar = ncfile.createVariable(
            var.name,
            datatype=var.dtype,
            dimensions=[a.name for a in var.axes],
            zlib=compress,
            fill_value=var.atts.get('_FillValue', None),
        )
        ncvar.setncatts(var.atts)

    # global attributes
    ncfile.setncatts(dataset.atts)

    # Relative progress of each variable, as cumulative percentages
    sizes = [v.size for v in varlist]
    prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % ncfile.filepath())

    # Write the data
    for i, var in enumerate(varlist):
        ncvar = ncfile.variables[var.name]
        varpbar = pbar.subset(prog[i], prog[i + 1])
        chunks = list(View(var.axes).loop_mem())
        nchunks = len(chunks)
        for j, chunk in enumerate(chunks):
            ncvar[chunk.slices] = chunk.get(var, pbar=varpbar.part(j, nchunks))
def getview(self, view, pbar):
    """Build the data for ``view`` while always fetching some axes whole.

    The axes listed in ``iaxes`` are requested unsliced from ``old_getview``;
    the slices that ``view`` asked for along those axes are applied to each
    returned chunk afterwards.  ``iaxes`` and ``old_getview`` are not defined
    in this function -- presumably they come from an enclosing scope
    (TODO confirm against the surrounding code).

    Parameters
    ----------
    view : View
        The requested view; the result has shape ``view.shape``.
    pbar
        Progress bar; ``pbar.part(i, n)`` is handed to each chunk request.

    Returns
    -------
    numpy.ndarray
        Array of shape ``view.shape`` and dtype ``self.dtype``.
    """
    from pygeode.view import View
    import numpy as np

    # Indices of the axes that must be fetched whole
    fullaxis_ind = [self.whichaxis(a) for a in iaxes]
    # Put the other axes first, the full axes last
    ind = [i for i in range(self.naxes)
           if i not in fullaxis_ind] + fullaxis_ind
    # Inverse permutation (maps the new order back to the original order)
    rind = [-1] * len(ind)
    for i, I in enumerate(ind):
        rind[I] = i
    assert len(ind) == self.naxes and len(set(ind)) == self.naxes

    # Construct a view with this new order of axes, and with the specified
    # axes unsliced.
    axes = tuple([view.axes[i] for i in ind])
    slices = tuple([view.slices[i] for i in ind])
    bigview = View(axes, slices=slices)
    bigview = bigview.unslice(*fullaxis_ind)

    viewloop = list(bigview.loop_mem())
    out = np.empty(view.shape, self.dtype)

    # Slicing of the 'full' axes that recovers what was originally requested.
    # It is the same for every chunk, so build it once -- and as a tuple,
    # because NumPy no longer treats a list of slices as a multidimensional
    # index.
    insl = [slice(None)] * self.naxes
    for I in fullaxis_ind:
        insl[I] = view.slices[I]
    insl = tuple(insl)

    for i, smallview in enumerate(viewloop):
        for I in fullaxis_ind:
            assert smallview.shape[I] == bigview.shape[I], "can't get all of axis '%s' at once" % view.axes[I].name

        # Slicing relative to the original view
        outsl = tuple(smallview.map_to(bigview.clip()).slices)

        # Reorder the axes to the original order
        axes = tuple([smallview.axes[I] for I in rind])
        assert axes == self.axes
        slices = tuple([smallview.slices[I] for I in rind])
        smallview = View(axes, slices=slices)

        # Reorder outsl the same way
        outsl = tuple([outsl[I] for I in rind])

        # Get the data
        tmp = old_getview(self, smallview, pbar=pbar.part(i, len(viewloop)))

        out[outsl] = tmp[insl]

    return out
def getview (self, view, pbar):
    """Read from the wrapped variable and apply fill/scale/offset decoding.

    The data of ``self.var`` is fetched for ``view`` and converted to
    ``self.dtype``.  When ``self.scale`` / ``self.offset`` are set the data
    is multiplied / shifted accordingly, and positions that originally
    equalled ``self.fillvalue`` are set to NaN at the end.
    """
    from pygeode.view import View
    import numpy as np

    fillvalue, scale, offset = self.fillvalue, self.scale, self.offset
    has_fill = fillvalue is not None

    # Do a brute-force mapping of the indices to the internal axes
    # (should work if the axes are in 1:1 correspondence)
    inner_view = View(self.var.axes, force_slices=view.slices, force_integer_indices=view.integer_indices)
    data = inner_view.get(self.var, pbar=pbar)

    # Work on a private copy whenever the values are going to be modified.
    if has_fill or scale is not None or offset is not None:
        data = np.copy(data)

    # Locate the fill values before the dtype conversion and scaling.
    if has_fill:
        w = np.where(data==fillvalue)

    data = np.asarray(data, self.dtype)

    if scale is not None:
        data *= scale
    if offset is not None:
        data += offset
    if has_fill:
        data[w] = float('nan')

    return data
def EOF_guess (x, num=1, iaxis=None, weight=True, out=None):
    """Estimate EOFs in a single pass over the data.

    The input is prepared with ``prep`` and fed chunk by chunk to the
    ``pygeode.eofcore`` library, which fills in the patterns, the ``eig``
    array and the timeseries.  The sum of squares of the data is accumulated
    as the variance.  The results are handed to ``finalize``.

    Parameters
    ----------
    x : Var
        Input variable.
    num : int
        Number of EOFs to compute.
    iaxis, weight, out
        Forwarded to ``prep`` / ``finalize``.
    """
    import numpy as np
    from pygeode.view import View
    from pygeode import eofcore as lib

    x, time, space = prep (x, iaxis, weight=weight, out=out)

    print("working on array shape %s"%(x.shape,))

    # Initialize workspace
    work = lib.start (num, space.size)
    try:
        eof = np.empty((num,)+space.shape, dtype='d')
        eig = np.empty([num], dtype='d')
        pc = np.empty((num,)+time.shape, dtype='d')

        # Variance accumulation
        variance = 0.0

        # Loop over chunks of the data
        for inview in View(x.axes).loop_mem():
            X = np.ascontiguousarray(inview.get(x), dtype='d')
            assert X.size >= space.size, "Spatial pattern is too large"
            nrec = X.size // space.size
            lib.process (work, nrec, X)

            # Accumulate variance
            variance += (X**2).sum()

        # Get result
        lib.endloop (work, eof, eig, pc)
    finally:
        # Free workspace, even if reading or processing a chunk failed
        lib.finish (work)

    # Wrap the stuff
    return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):  # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
    the hypothesis that it is 0.

    Parameters
    ==========
    X : :class:`Var`
      Variable to average.

    axes : list, optional
      Axes over which to compute the mean; if nothing is specified, the mean is
      computed over all axes.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'm,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar is created.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if
      ``ds`` is the returned dataset, the mean value can be obtained through
      ``ds.m``). The following quantities can be calculated.

      * 'm': The mean value of X
      * 'p': The probability of the computed value if the population mean was zero
      * 'ci': The confidence interval of the mean at the level specified by alpha

      The requested quantities are wrapped as :class:`Var` objects and
      returned through ``asdataset``.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed for
    the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    # Indices of the axes to reduce over; default to every axis of X.
    if axes is None:
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # np.prod replaces the np.product alias, which newer NumPy no longer has.
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
        # Sum of weights (kludge to get masking right)
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    # Effective sample count, optionally rescaled by N_fac
    if N_fac is not None:
        eNa = Na // N_fac
    else:
        eNa = Na

    sdom = np.zeros((oview.shape), 'd')
    p = np.zeros((oview.shape), 'd')
    t = np.zeros((oview.shape), 'd')
    ci = np.zeros((oview.shape), 'd')

    # Standard deviation of the mean
    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence intervale of the mean value of %s' % (name, )
        rvs.append(ci)

    return asdataset(rvs)
def EOF_iter (x, num=1, iaxis=None, subspace = -1, max_iter=1000, weight=True, out=None):
    """
    Iteratively refine a set of EOFs, starting from the EOF_guess estimate.

    (See svd.SVD for documentation on a similar function, but replace each
    xxx1 and xxx2 parameter with a single xxx parameter.)

    Each iteration applies the data to the current patterns through the
    svdcore library, solves a small ``subspace`` x ``subspace`` surrogate
    eigenproblem, and rotates/normalizes the patterns.  Iteration stops when
    the leading ``num`` patterns no longer change (np.allclose with atol=0);
    reaching ``max_iter`` without that trips an assertion.
    """
    import numpy as np
    from pygeode import libpath
    from pygeode.view import View
    from math import sqrt
    from pygeode.varoperations import fill
    from pygeode import svdcore as lib

    # The working subspace must hold at least as many vectors as EOFs requested.
    if subspace < num:
        subspace = num

    # Seed the first iteration with the single-pass guess
    guess_eof, guess_eig, guess_pc = EOF_guess (x, subspace, iaxis, weight=weight, out=None)
    # NaNs would contaminate the matrix operations; replace them with zeros
    guess_eof = fill (guess_eof, 0)

    x, time, space = prep(var=x, iaxis=iaxis, weight=weight, out=out)

    eofshape = (subspace,) + space.shape
    pcshape = time.shape + (subspace,)

    pcs = np.empty(pcshape,dtype='d')
    oldeofs = np.empty(eofshape,dtype='d')

    # Initial patterns, in the weighted space; copied so they are writeable
    neweofs = np.array(apply_weights (guess_eof, weight=weight).get(), dtype='d')

    # Workspace for the smaller representative matrices
    work1 = np.empty([subspace,subspace], dtype='d')
    work2 = np.empty([subspace,subspace], dtype='d')

    NX = space.size

    # Sum of squares of the data (accumulated on the first iteration only)
    variance = 0.0

    for iter_num in range(1,max_iter+1):

        print('iter_num: %d'%iter_num)

        # Last round's result becomes the input; the other buffer is reused
        # (zeroed) to accumulate the next approximation.
        neweofs, oldeofs = oldeofs, neweofs
        neweofs[()] = 0

        # Apply the covariance matrix, one chunk of data at a time
        for chunk_view in View(x.axes).loop_mem():
            X = np.ascontiguousarray(chunk_view.get(x), dtype='d')
            assert X.size >= space.size, "spatial pattern is too large"

            nt = chunk_view.shape[0]
            time_offset = chunk_view.slices[0].start
            ier = lib.build_eofs (subspace, nt, NX, X, oldeofs, neweofs, pcs[time_offset,...])
            assert ier == 0

            if iter_num == 1:
                variance += (X**2).sum()

        # Dot products between old/new and new/new patterns
        lib.dot(subspace, NX, oldeofs, neweofs, work1)
        lib.dot(subspace, NX, neweofs, neweofs, work2)

        # Surrogate matrix (using all available information from this iteration)
        A, residues, rank, s = np.linalg.lstsq(work1,work2,rcond=1e-30)

        # Eigendecomposition of the surrogate matrix, sorted by descending eigenvalue
        w, P = np.linalg.eig(A)
        order = np.argsort(w)[::-1]
        w = w[order]
        print(w)
        P = np.ascontiguousarray(P[:,order], dtype='d')

        # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
        lib.transform(subspace, NX, P, neweofs)

        # Normalize
        lib.normalize (subspace, NX, neweofs)

        if np.allclose(oldeofs[:num,...],neweofs[:num,...], atol=0):
            print('converged after %d iterations'%iter_num)
            break

        assert iter_num != max_iter, "no convergence"

    # Only the first `num` patterns are returned (the rest might not have
    # converged yet)
    eof = neweofs[:num]
    pc = pcs[...,:num].transpose()

    # Magnitude of each pc array, used as the eig output and to normalize pc
    #TODO: keep eigenvalues as a separate variable in the iteration loop
    eig = np.array([sqrt( (pc[i,...]**2).sum() ) for i in range(pc.shape[0]) ])
    pc = np.dot(np.diag(1/eig), pc)

    return finalize (x, time, space, eof, eig, pc, variance, weight=weight, out=out)
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):  # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes common to X and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma. Defaults to 'm,b,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar is created.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the linear coefficient of the regression can be
      obtained by ``ds.m``).

      A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
      following parameters can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': p-value of regression; see notes.
      * 'sm': Standard deviation of linear coefficient estimate
      * 'se': Standard deviation of residuals

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognised names.
    KeyError
      If an entry of ``axes`` is not shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.3. The p-value 'p' is computed using the t-statistic given in
    section 8.3.8, and confidence intervals for the slope and intercept can be
    computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
    respectively). The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        # Shared axes that were not requested become output axes
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions of the reduction axes within `inaxes` (they come last)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the broadcast shape so that a point missing
        # in either one can be masked out of both.
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        missing = np.isnan(xydata)
        xdata[missing] = np.nan
        ydata[missing] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~missing, siaxes)], 0)

    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    # Turn the raw sums into centred sums of squares / cross products
    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    # sige holds the residual variance here; sigm is derived from it before
    # sige itself is converted to a standard deviation.
    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    # Same mask on both sides: mixing nmsk and dmsk either fails on a shape
    # mismatch or assigns values to the wrong points.
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    # Blank out points where the estimates are not defined
    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts['description'] = 'linear regression parameters for %s regressed against %s' % (
        yn, xn)

    return ds
def save(filename, in_dataset, version=3, pack=None, compress=False, cfmeta=True, unlimited=None):  # {{{
    """Write a dataset to a netCDF file through the C netCDF library (``lib``).

    The dataset is passed through ``asdataset`` and ``finalize_save``, the
    file is created, and then, in order: dimensions are defined, variables
    (including non-dummy axes) are defined, attributes are written, define
    mode is left, and the data is written in memory-sized chunks.  The file
    is closed in all cases once it has been created.

    Parameters
    ----------
    filename : str
        Path of the file to create (encoded as ASCII for the C call).
    in_dataset
        Anything accepted by ``asdataset``.
    version : int
        netCDF format version, 3 or 4.  Forced to 4 when ``compress`` is set.
    pack, cfmeta
        Forwarded to ``finalize_save``.
    compress : bool
        When true, deflate is enabled on every variable.
    unlimited : str, optional
        Name of an axis whose dimension is defined with length 0
        (presumably the netCDF "unlimited" length -- confirm against the
        netCDF C API).

    Raises
    ------
    IOError
        If the file cannot be created.
    AssertionError
        On invalid arguments, or if any later netCDF call returns non-zero.
    """
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?  (compression needs the version-4 format)
    if compress: version = 4

    assert version in (3, 4)

    # Receives the file id from nc_create
    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing entire array, keyed by netCDF type code
    # (not used further down in this function)
    allf = {
        1: lib.nc_put_var_schar,
        2: lib.nc_put_var_text,
        3: lib.nc_put_var_short,
        4: lib.nc_put_var_int,
        5: lib.nc_put_var_float,
        6: lib.nc_put_var_double,
        7: lib.nc_put_var_uchar,
        8: lib.nc_put_var_ushort,
        9: lib.nc_put_var_uint,
        10: lib.nc_put_var_longlong,
        11: lib.nc_put_var_ulonglong
    }

    # Functions for writing chunks, keyed by netCDF type code
    chunkf = {
        1: lib.nc_put_vara_schar,
        2: lib.nc_put_vara_text,
        3: lib.nc_put_vara_short,
        4: lib.nc_put_vara_int,
        5: lib.nc_put_vara_float,
        6: lib.nc_put_vara_double,
        7: lib.nc_put_vara_uchar,
        8: lib.nc_put_vara_ushort,
        9: lib.nc_put_vara_uint,
        10: lib.nc_put_vara_longlong,
        11: lib.nc_put_vara_ulonglong
    }

    # Create the file
    if version == 3:
        ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    elif version == 4:
        ret = lib.nc_create(filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    else: raise Exception  # unreachable after the version assert above

    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            if unlimited == a.name:
                # Length 0 for the axis requested as unlimited
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
            else:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        chunks = [None] * len(vars)  # not used further down in this function
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            # netCDF type code for this variable's dtype
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[list(axes).index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only works for netcdf4 or (higher?))
            if compress:
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes (variable id -1)
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            # modify axes to be netcdf friendly (CF-compliant, etc.)
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file
        # (256 is presumably the NC_NOFILL mode constant -- confirm)
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)

        # Relative progress of each variable, as cumulative percentages
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100
        pbar = PBar(message="Saving '%s':" % filename)

        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # numpy dtype the data must have for the writer of type code t
            dtype = numpy_type[t]

            # number of actual variables (non-axes) for determining our progress
            # (computed but not used below)
            N = len([v for v in vars if not isinstance(v, Axis)])
            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):

                vpbar = varpbar.part(j, len(views))
                # Should always be slices (since we're looping over whole thing contiguously?)
                for sl in v.slices: assert isinstance(sl, slice)
                for sl in v.slices: assert sl.step in (1, None)

                # Start offsets and lengths of this chunk, as C arrays
                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    data = var.values
                    data = data[
                        start[0]:start[0] + count[0]]  # the above gives us the *whole* axis,
                    # but under extreme conditions we may be looping over smaller pieces
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished: always close the file, even if a step above failed
        lib.nc_close(fileid)
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None, output='d,p,ci', pbar=None): # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list, optional, defaults to None
      Axes over which to compute means; if nothing is specified, the mean
      is computed over all axes common to X and Y.

    alpha : float, optional; defaults to 0.05
      Confidence level for which to compute confidence interval.

    Nx_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom
      of X; the effective number will be given by the number estimated from
      the dataset divided by ``Nx_fac``.

    Ny_fac : integer, optional: defaults to None
      A factor by which to rescale the estimated number of degrees of freedom
      of Y; the effective number will be given by the number estimated from
      the dataset divided by ``Ny_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` is created.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the average of the difference can be obtained
      by ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified
        by ``alpha``

      'df', 'p' and 'ci' are NaN wherever the test statistic cannot be
      formed (fewer than two effective samples, or zero variance).

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognised names.
    KeyError
      If a requested axis is not shared by X and Y.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20)
    of von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are
    scaled by Nx_fac and Ny_fac, respectively. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the number of effective degrees of freedom are not calculated explicitly
    by this routine. The p-value and confidence interval are computed based
    on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # No explicit request: reduce over every axis shared by X and Y.
        # (Previously the code below iterated over ``axes`` directly and
        # failed with a TypeError for the documented default of None.)
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions of the reduction axes within X and Y individually
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    # Total sample count along the reduction axes (np.prod: np.product was
    # removed in NumPy 2.0)
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)
    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)
    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Effective sample sizes
    if Nx_fac is not None:
        eNx = Nx // Nx_fac
    else:
        eNx = Nx
    if Ny_fac is not None:
        eNy = Ny // Ny_fac
    else:
        eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    # Denominator of the degrees-of-freedom estimate
    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    # From here on ``den`` holds the standard error of the difference
    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    # Statistics are undefined wherever the test could not be formed
    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    # The difference is X - Y, so X is named first (as in the longnames)
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
def correlate(X, Y, axes=None, output='r2,p', pbar=None): # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'r2,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` is created.

    Returns
    =======
    results : :class:`Dataset`
      The names of the variables match the output request string (i.e. if
      ``ds`` is the returned dataset, the correlation coefficient can be
      obtained through ``ds.r2``).

      * 'r2': The correlation coefficient :math:`\rho_{XY}`
      * 'p': The p-value; see notes.

      Both are NaN where no paired (non-NaN) samples exist or where either
      variable has zero variance.

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognised names.
    KeyError
      If a requested axis is not shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
    correlation coefficient of equal or greater magnitude (two-sided) to the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Reduction axes are the trailing axes of the data handed out by loopover
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays (NaN marks "nothing accumulated yet")
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        # Expand both inputs to the (broadcast) shape of their product
        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)

        # Only use samples where both X and Y are present
        nanmsk = np.isnan(xydata)
        xdata[nanmsk] = np.nan
        ydata[nanmsk] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum([Na[outsl], npnansum(~nanmsk, siaxes)], 0)

    # Remove the means from the accumulated second moments
    imsk = (Na > 0)

    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    # den > 0 implies imsk, so den already holds sqrt(xx * yy) here
    rho[dmsk] = xy[dmsk] / den[dmsk]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    # Undefined where there is no data or no variance
    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write:  xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None): # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

    Parameters
    ==========
    Xs : list of :class:`Var` instances
      Variables to treat as independent regressors. Must have at least one
      axis in common with each other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the
      Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, it
      is computed over all axes common to the Xs and Y.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'B,p'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` is created.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. A fit
      of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note
      that a constant term is not included by default. The following
      parameters can be requested:

      * 'beta' (or its alias 'B'): Linear coefficients :math:`\beta_i` of
        each regressor; stored as ``beta``
      * 'r2': Fraction of the variance in Y explained by all Xs
        (:math:`R^2`); stored as ``R2``
      * 'p': p-value of regression; see notes.
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      The outputs 'beta', 'p', and 'sb' carry a trailing ``regressor`` axis
      with one entry per regressor; 'covb' carries ``regressor`` and
      ``regressor2``.

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognised names.
    KeyError
      If a requested axis is not shared by the Xs and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers
    1999, section 8.4. The p-value 'p' is computed using the t-statistic
    appropriate for the multi-variate normal estimator
    :math:`\hat{\vec{a}}` given in section 8.4.2; it corresponds to the
    probability of obtaining the regression coefficient under the null
    hypothesis that there is no linear relationship. Note this may not be the
    best way to determine if a given parameter is contributing a significant
    fraction to the explained variance of Y. The variances 'se' and 'sb' are
    :math:`\hat{\sigma}_E` and the square root of the diagonal elements of
    :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and Zwiers,
    respectively. The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now. 'B' is the documented (and default) spelling
    # of the coefficient output, so map it onto the internal name 'beta'
    # instead of silently discarding it.
    ovars = ['beta', 'r2', 'p', 'sb', 'covb', 'se']
    aliases = {'B': 'beta'}
    output = [aliases.get(o, o) for o in output.split(',')]
    output = [o for o in output if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple_regress. Possible outputs are %s.'
            % str(ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes:
                ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    # Reduction axes are the trailing axes of the data handed out by loopover
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr, )
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape1, 'd')
    xx = np.zeros(oshape2, 'd')
    xxinv = np.zeros(oshape2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data (list(Xs) so that a tuple of regressors also works)
    for outsl, datatuple in loopover(list(Xs) + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Compute inverse of covariance matrix (could be done more intelligently?
    # certainly the python loop over oview does not help)
    xx_flat = xx.reshape(-1, Nr, Nr)
    xxinv = xxinv.reshape(-1, Nr, Nr)
    for i in range(xx_flat.shape[0]):
        xxinv[i, :, :] = np.linalg.inv(xx_flat[i, :, :])
    xxinv = xxinv.reshape(oshape2)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    # One array (shaped like the output view) per regressor
    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')

    rvs = []

    if 'beta' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. * (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        # Stacked shape is (Nr,) + oshape; the regressor axis belongs last.
        # (The former transpose with axes [Nd, 0, ..., Nd-1] only produced
        # that layout when there were fewer than two output axes.)
        p = np.moveaxis(np.array(p), 0, -1)
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sigbeta = np.moveaxis(np.array(sigbeta), 0, -1)
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(oshape2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
def get(self, pbar=None, **kwargs): # {{{
    """
    Return the values of this variable as a raw numpy array.

    Parameters
    ----------
    pbar : boolean (optional)
      If ``True``, will display a progress bar while the data is being
      retrieved. This requires the *python-progressbar* package (not
      included with PyGeode). The value is forwarded unchanged to
      ``View.get``.

    **kwargs : keyword arguments (optional)
      Keyword arguments used to subset the variable before the data is
      read; they are passed straight to ``self.__call__``. See
      :func:`Var.__call__` for the subsetting syntax.

    Returns
    -------
    out : numpy.ndarray
      The requested values. When the underlying view returns a numpy
      array, a copy of it is returned.

    Notes
    -----
    Once the data is a plain numpy array, the PyGeode functions can no
    longer be applied to it directly; use :func:`Var.__init__` to wrap the
    array in a PyGeode Var again if needed.

    PyGeode variables can be larger than the available memory, whereas a
    numpy array has to fit in memory, so only request a reasonably sized
    piece of data at a time.

    Examples
    --------
    >>> from pygeode.tutorial import t1
    >>> x = t1.Temp.get()
    """
    import numpy as np
    from pygeode.view import View

    # Apply the keyword subsetting first, then read the whole subset
    subset = self.__call__(**kwargs)
    values = View(subset.axes).get(subset, pbar=pbar)

    # Anything that is not an ndarray is handed back untouched
    if not isinstance(values, np.ndarray):
        return values
    # Hand back a copy of the array obtained from the view
    return np.array(values, copy=True)
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None, output='d,p,ci', pbar=None): # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
    of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
    shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must share all axes over which the means are
      being computed.

    axes : list, optional
      Axes over which to compute means; if nothing is specified, the mean
      is computed over all axes common to X and Y.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom
      of X and Y; the effective number will be given by the number estimated
      from the dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'd,p,ci'.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` is created.

    Returns
    =======
    results : :class:`Dataset`
      The returned variables are specified by the ``output`` argument. The
      names of the variables match the output request string (i.e. if ``ds``
      is the returned dataset, the average of the difference can be obtained
      by ``ds.d``). The following four quantities can be computed:

      * 'd': The difference in the means, X - Y
      * 'df': The effective number of degrees of freedom, :math:`df`
      * 'p': The p-value; see notes.
      * 'ci': The confidence interval of the difference at the level specified
        by ``alpha``

      'p' and 'ci' are NaN wherever the test statistic cannot be formed
      (fewer than two effective samples, or zero variance).

    Raises
    ======
    ValueError
      If ``output`` contains none of the recognised names.
    KeyError
      If a requested axis is not shared by X and Y.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
    is used to test the hypothesis. The number of degrees of freedom is the
    sample size scaled by N_fac, less one. This provides a means of taking
    into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated
    explicitly by this routine. The p-value and confidence interval are
    computed based on the t-statistic in eq (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is None:
        # No explicit request: reduce over every axis shared by X and Y.
        # (Previously the code below iterated over ``axes`` directly and
        # failed with a TypeError for the documented default of None.)
        axes = [srcaxes[i].name for i in riaxes]
    else:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Positions and total sample counts of the reduction axes in X and Y
    # (np.prod: np.product was removed in NumPy 2.0)
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and nx_total == ny_total, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays (NaN marks "nothing accumulated yet")
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')

    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    # Effective sample size
    if N_fac is not None:
        eN = N // N_fac
    else:
        eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Standard error of the mean difference
    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Where no test could be formed the statistics are undefined; a p-value
    # of 0 there would wrongly read as maximal significance.
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    # The difference is X - Y, so X is named first (as in the longnames)
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'): # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to regress. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, it
      is computed over all axes common to X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` is created.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by
      a comma. Defaults to 'm,b,p'.

    Returns
    =======
    results : list of :class:`Var` instances.
      The return values are specified by the ``output`` argument and are
      always returned in the order listed here. A fit of the form
      :math:`Y = m X + b + \epsilon` is assumed, and the following parameters
      can be returned:

      * 'm': Linear coefficient of the regression
      * 'b': Constant coefficient of the regression
      * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
      * 'p': The cumulative t-distribution evaluated at the t-statistic of
        the slope, multiplied by the sign of the slope
      * 'sm': Standard deviation of the linear coefficient
      * 'se': Standard deviation of the residuals

    Raises
    ======
    KeyError
      If a requested axis is not shared by X and Y.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers
    1999, section 8.3. The value 'p' is computed using the t-statistic given
    in section 8.3.8, and confidence intervals for the slope and intercept can
    be computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
    respectively). The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Reduction axes are the trailing axes of the data handed out by loopover
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    y = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    xy = np.zeros(oview.shape, 'd')
    yy = np.zeros(oview.shape, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        x[outsl] += npsum(xdata, siaxes)
        y[outsl] += npsum(ydata, siaxes)
        xx[outsl] += npsum(xdata**2, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        xy[outsl] += npsum(xdata * ydata, siaxes)

    # Number of samples along the reduction axes
    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx -= x**2 / N
    yy -= y**2 / N
    xy -= (x * y) / N

    # Slope and intercept
    m = xy / xx
    b = (y - m * x) / float(N)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    sige = (yy - m * xy) / (N_eff - 2.)
    sigm = np.sqrt(sige / xx)
    t = np.abs(m) / sigm
    # Use the same effective degrees of freedom as the variance estimate
    # above; with N_fac=None this is identical to N - 2.
    p = tdist.cdf(t, N_eff - 2) * np.sign(m)

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    output = output.split(',')
    ret = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
        ret.append(M)
    if 'b' in output:
        B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
        ret.append(B)
    if 'r' in output:
        ret.append(Var(oaxes, values=xy**2 / (xx * yy), name='R2(%s vs. %s)' % (yn, xn)))
    if 'p' in output:
        P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
        ret.append(P)
    if 'sm' in output:
        ret.append(Var(oaxes, values=sigm, name='Sig. Intercept (%s vs. %s != 0)' % (yn, xn)))
    if 'se' in output:
        ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s != 0)' % (yn, xn)))

    return ret
def get(self, pbar=None, **kwargs):  # {{{
    """
    Return the values of this variable as a plain numpy array.

    Parameters
    ----------
    pbar : boolean (optional)
      If ``True``, will display a progress bar while the data is being
      retrieved.  This requires the *python-progressbar* package (not
      included with PyGeode).

    **kwargs : keyword arguments (optional)
      Keyword arguments used to subset the variable before the data is
      fetched.  They are forwarded unchanged to :func:`Var.__call__`, which
      provides the same keyword subsetting.

    Returns
    -------
    out : numpy.ndarray
      The requested values, as a numpy array.

    Notes
    -----
    A numpy array cannot be passed directly to further PyGeode operations.
    If you need to do some complicated work through the numpy interface as
    an intermediate step, you can re-wrap the resulting array as a PyGeode
    Var with :func:`Var.__init__`.

    PyGeode variables can be huge (larger than available RAM, or even larger
    than your hard disk), whereas a numpy array has to fit in memory.  Make
    sure you only request a reasonably sized piece of data at a time.

    Examples
    --------
    >>> from pygeode.tutorial import t1
    >>> print(t1.Temp)
    <Var 'Temp'>:
      Units: K  Shape:  (lat,lon)  (31,60)
      Axes:
        lat <Lat>      :  90 S to 90 N (31 values)
        lon <Lon>      :  0 E to 354 E (60 values)
      Attributes:
        {}
      Type:  Add_Var (dtype="float64")
    >>> x = t1.Temp.get()
    >>> print(x)
    [[260.73262556 258.08759192 256.45287123 ... 265.01237988 265.01237988
      263.37765919]
     [261.22683172 258.75813366 257.23239435 ... 265.22126909 265.22126909
      263.69552978]
     [261.98265134 259.69028886 258.27353093 ... 265.69177175 265.69177175
      264.27501382]
     ...
     [261.98265134 264.27501382 265.69177175 ... 258.27353093 258.27353093
      259.69028886]
     [261.22683172 263.69552978 265.22126909 ... 257.23239435 257.23239435
      258.75813366]
     [260.73262556 263.37765919 265.01237988 ... 256.45287123 256.45287123
      258.08759192]]
    """
    import numpy as np
    from pygeode.view import View

    # Apply any keyword subsetting first, then pull the values through a
    # view spanning every axis of the (possibly subsetted) variable.
    subset = self.__call__(**kwargs)
    full_view = View(subset.axes)
    values = full_view.get(subset, pbar=pbar)

    # Anything that is not an ndarray is handed back untouched.
    if not isinstance(values, np.ndarray):
        return values

    # Hand back an explicit copy of the array.
    return np.array(values, copy=True)
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'):  # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

    Parameters
    ==========
    Xs : list of :class:`Var` instances
      Variables to treat as independent regressors. Must have at least one axis
      in common with each other and with Y.

    Y : :class:`Var`
      The dependent variable. Must have at least one axis in common with the Xs.

    axes : list, optional
      Axes over which to compute the regression; if nothing is specified, the
      regression is computed over all axes common to the Xs and Y. A
      ``KeyError`` is raised if a requested axis is not shared by the Xs and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` instance is created.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided (floor division) by ``N_fac``.

    output : string, optional
      A string determining which parameters are returned; see list of possible
      outputs in the Returns section. The specifications must be separated by a
      comma; whitespace around each specification is ignored. Defaults to
      'B,p'.

    Returns
    =======
    results : list
      One entry per recognized specification in ``output``, in the order
      requested. A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon`
      is assumed. Note that a constant term is not included by default. The
      following parameters can be returned:

      * 'B': Linear coefficients :math:`\beta_i` of each regressor
      * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
      * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
      * 'sb': Standard deviation of each linear coefficient
      * 'covb': Covariance matrix of the linear coefficients
      * 'se': Standard deviation of residuals

      If the regression is computed over all axes so that the result is a
      scalar, the values are returned as plain numpy values (except 'covb',
      which is always wrapped as a :class:`Var` with two regressor axes).
      Otherwise they are returned as :class:`Var` instances. The outputs 'B',
      'p', and 'sb' each contribute one entry holding as many values as there
      are regressors. An unrecognized specification is reported with a printed
      message and skipped.

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers 1999,
    section 8.4. The p-value 'p' is computed using the t-statistic appropriate
    for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
    8.4.2; note this may not be the best way to determine if a given parameter is
    contributing a significant fraction to the explained variance of Y. The
    variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
    diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)^{-1}` in von Storch
    and Zwiers, respectively. The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes:
                ri_new.append(ia)
            else:
                raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
        # Shared axes that were not requested become output axes.
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    # Positions (within inaxes) of the axes that get summed over.
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

    # Construct work arrays
    oshape = oview.shape
    oshape1 = oshape + (Nr,)
    oshape2 = oshape + (Nr, Nr)
    y = np.zeros(oshape, 'd')
    yy = np.zeros(oshape, 'd')
    xy = np.zeros(oshape1, 'd')
    xx = np.zeros(oshape2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i,)] += npsum(xdata[i] * ydata, siaxes)
            # Only the lower triangle is accumulated; xTx is symmetric.
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Invert xTx at every output point. np.linalg.inv operates on the last
    # two dimensions and broadcasts over the leading ones, so no explicit
    # Python loop over the output points is needed.
    xxinv = np.linalg.inv(xx)

    beta = np.sum(xy.reshape(oshape + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None:
        N_eff = N
    else:
        N_eff = N // N_fac

    sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var

    # Tolerate whitespace around the comma-separated specifiers ('B, p').
    output = [o.strip() for o in output.split(',')]
    ret = []

    for o in output:
        if o == 'B':
            if len(oaxes) == 0:
                ret.append(beta)
            else:
                ret.append([Var(oaxes, values=beta[..., i], name='beta_%s' % xns[i]) for i in range(Nr)])
        elif o == 'r':
            vary = (yy - y**2 / N)
            R2 = 1 - (yy - vare) / vary
            if len(oaxes) == 0:
                ret.append(R2)
            else:
                ret.append(Var(oaxes, values=R2, name='R2'))
        elif o == 'p':
            ps = [tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr) * np.sign(beta[..., i]) for i in range(Nr)]
            if len(oaxes) == 0:
                ret.append(ps)
            else:
                ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
        elif o == 'sb':
            if len(oaxes) == 0:
                ret.append(sigbeta)
            else:
                ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
        elif o == 'covb':
            from .axis import NonCoordinateAxis as nca
            cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
            cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
            sigmat = np.zeros(oshape2, 'd')
            for i in range(Nr):
                for j in range(Nr):
                    sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
            ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
        elif o == 'se':
            se = np.sqrt((yy - vare) / N_eff)
            if len(oaxes) == 0:
                ret.append(se)
            else:
                ret.append(Var(oaxes, values=se, name='sig_resid'))
        else:
            print('multiple_regress: unrecognized output "%s"' % o)

    return ret
def SVD (var1, var2, num=1, subspace=-1, iaxis=Time, weight1=True, weight2=True, matrix='cov'):
    """
    Finds coupled EOFs of two fields.  Note that the mean/trend/etc. is NOT
    removed in this routine.

    Parameters
    ----------
    var1, var2 : :class:`Var`
      The variables to analyse.

    num : integer
      The number of EOFs to compute (default is ``1``).

    subspace : integer, optional
      The number of EOFs carried through the iteration.  If zero or negative
      (the default), ``2*num + 8`` is used.  It is never allowed to be smaller
      than ``num``.

    weight1, weight2 : optional
      Weights to use for defining orthogonality in the var1, var2 domains,
      respectively.  Patterns X and Y in the var1 domain are orthogonal if the
      sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
      orthogonal if the sum over Z*W*weights2 is 0. Default (``True``) is to
      use the internal weights of the corresponding variable, obtained from
      :meth:`Var.getweights()`.  If set to ``False`` no weighting is used.
      Weights may not be defined along the record axis.

    matrix : string, optional ['cov']
      Which matrix we are diagonalizing (default is 'cov').
       * 'cov': covariance matrix of var1 & var2
       * 'cor': correlation matrix of var1 & var2

      An unrecognized value triggers a warning and falls back to 'cov'.

    iaxis : Axis identifier
      The principal component / expansion coefficient axis, i.e., the 'time'
      axis.  Can be an integer (the axis number, leftmost = 0), the axis name
      (string), or a Pygeode axis class.  Defaults to the ``Time`` axis class.
      The axis must be identical in var1 and var2.

    Returns
    -------
    (eof1, pc1, eof2, pc2): tuple
      * eof1: The coupled eof patterns for var1.
      * pc1: The principal component / expansion coefficients for var1.
      * eof2: The coupled eof patterns for var2.
      * pc2: The principal component / expansion coefficients for var2.

    Notes
    -----
    Multiple orders of EOFs are concatenated along an 'order' axis.

    An ``AssertionError`` ("no convergence") is raised if the iteration has
    not converged after 1000 iterations.
    """
    import numpy as np
    from pygeode.var import Var
    from pygeode.view import View
    from pygeode import MAX_ARRAY_SIZE
    from warnings import warn
    from pygeode import svdcore as lib

    if matrix in ('cov', 'covariance'):
        matrix = 'cov'
    elif matrix in ('cor', 'corr', 'correlation'):
        matrix = 'cor'
    else:
        # The format specifier must be '%s'; a bare '%' followed by a quote is
        # an invalid conversion and would raise instead of warning.
        warn ("invalid matrix type '%s'. Defaulting to covariance."%matrix, stacklevel=2)
        matrix = 'cov'

    MAX_ITER = 1000

    # Iterate over more EOFs than we need
    # (this helps with convergence)
    # TODO: a more rigorous formula for the optimum number of EOFs to use
    if subspace <= 0: subspace = 2*num + 8
    if subspace < num: subspace = num  # Just in case

    # Remember the names
    prefix1 = var1.name+'_' if var1.name != '' else ''
    prefix2 = var2.name+'_' if var2.name != '' else ''

    # Apply weights?
    if weight1 is True: weight1 = var1.getweights()
    if weight1 is not False:
        assert not weight1.hasaxis(iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight1.sum() / weight1.size
        weight1 /= W
        # Apply the weights
        var1 *= weight1.sqrt()
    if weight2 is True: weight2 = var2.getweights()
    if weight2 is not False:
        assert not weight2.hasaxis(iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight2.sum() / weight2.size
        weight2 /= W
        # Apply the weights
        var2 *= weight2.sqrt()

    #TODO: allow multiple iteration axes (i.e., time and ensemble)
    iaxis1 = var1.whichaxis(iaxis)
    iaxis2 = var2.whichaxis(iaxis)

    assert var1.axes[iaxis1] == var2.axes[iaxis2], "incompatible iteration axes"
    del iaxis  # so we don't use this by accident

    # Special case: can load entire variable in memory
    # This will save some time, especially if the field is stored on disk, or is heavily derived
    if var1.size <= MAX_ARRAY_SIZE:
        print('preloading '+repr(var1))
        var1 = var1.load()
    if var2.size <= MAX_ARRAY_SIZE:
        print('preloading '+repr(var2))
        var2 = var2.load()

    # Use correlation instead of covariance?
    # (normalize by standard deviation)
    if matrix == 'cor':
        print('computing standard deviations')
        std1 = var1.stdev(iaxis1).load()
        std2 = var2.stdev(iaxis2).load()
        # account for grid points with zero standard deviation?
        # (adds 1 where the stdev is exactly 0, so the division below is safe)
        std1.values = std1.values + (std1.values == 0)
        std2.values = std2.values + (std2.values == 0)
        var1 /= std1
        var2 /= std2

    eofshape1 = (subspace,) + var1.shape[:iaxis1] + var1.shape[iaxis1+1:]
    eofshape2 = (subspace,) + var2.shape[:iaxis2] + var2.shape[iaxis2+1:]

    pcshape1 = (var1.shape[iaxis1], subspace)
    pcshape2 = (var2.shape[iaxis2], subspace)

    # number of spatial grid points
    NX1 = var1.size // var1.shape[iaxis1]
    assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
    NX2 = var2.size // var2.shape[iaxis2]
    assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'

    # Total number of timesteps
    NT = var1.shape[iaxis1]
    # Number of timesteps we can do in one fetch
    dt = MAX_ARRAY_SIZE // max(NX1,NX2)

    pcs1 = np.empty(pcshape1,dtype='d')
    pcs2 = np.empty(pcshape2,dtype='d')

    X = np.empty(eofshape2,dtype='d')
    U = np.empty(eofshape1,dtype='d')
    # Seed with sinusoids superimposed on random values
    Y = np.random.rand(*eofshape1)
    V = np.random.rand(*eofshape2)
    from math import pi
    for i in range(subspace):
        Y[i,...].reshape(NX1)[:] += np.cos( np.arange(NX1,dtype='d') / NX1 * 2 * pi * (i+1))
        V[i,...].reshape(NX2)[:] += np.cos( np.arange(NX2,dtype='d') / NX2 * 2 * pi * (i+1))

    # Workspace for C code
    UtAX = np.empty([subspace,subspace], dtype='d')
    XtAtU = np.empty([subspace,subspace], dtype='d')
    VtV = np.empty([subspace,subspace], dtype='d')
    YtY = np.empty([subspace,subspace], dtype='d')

    # Views over whole variables
    # (rearranged to be compatible with our output eof arrays)
    view1 = View( (var1.axes[iaxis1],) + var1.axes[:iaxis1] + var1.axes[iaxis1+1:] )
    view2 = View( (var2.axes[iaxis2],) + var2.axes[:iaxis2] + var2.axes[iaxis2+1:] )

    # Tracked explicitly so that converging on the very last permitted
    # iteration is not mistaken for a failure to converge.
    converged = False

    for iter_num in range(1,MAX_ITER+1):

        print('iter_num: %d'%iter_num)

        assert Y.shape == U.shape
        assert X.shape == V.shape
        # The previous estimates (Y, V) become the inputs (U, X) for this pass.
        U, Y = Y, U
        X, V = V, X

        # Reset the accumulation arrays for the next approximations
        Y[()] = 0
        V[()] = 0

        # Apply the covariance/correlation matrix
        for t in range(0,NT,dt):
            # number of timesteps we actually have
            nt = min(dt,NT-t)

            # Read the data
            chunk1 = view1.modify_slice(0, slice(t,t+nt)).get(var1)
            chunk1 = np.ascontiguousarray(chunk1, dtype='d')
            chunk2 = view2.modify_slice(0, slice(t,t+nt)).get(var2)
            chunk2 = np.ascontiguousarray(chunk2, dtype='d')

            ier = lib.build_svds (subspace, nt, NX1, NX2, chunk1, chunk2,
                                  X, Y, pcs2[t,...])
            assert ier == 0
            ier = lib.build_svds (subspace, nt, NX2, NX1, chunk2, chunk1,
                                  U, V, pcs1[t,...])
            assert ier == 0

        # Useful dot products
        lib.dot(subspace, NX1, U, Y, UtAX)
        lib.dot(subspace, NX2, V, V, VtV)
        lib.dot(subspace, NX1, Y, U, XtAtU)
        lib.dot(subspace, NX1, Y, Y, YtY)

        # Compute surrogate matrices (using all available information from this iteration)
        A1, residues, rank, s = np.linalg.lstsq(UtAX,VtV,rcond=1e-30)
        A2, residues, rank, s = np.linalg.lstsq(XtAtU,YtY,rcond=1e-30)

        # Eigendecomposition on surrogate matrices
        Dy, Qy = np.linalg.eig(np.dot(A1,A2))
        Dv, Qv = np.linalg.eig(np.dot(A2,A1))

        # Sort by eigenvalue (largest first)
        S = np.argsort(np.real(Dy))[::-1]
        Dy = Dy[S]
        Qy = np.ascontiguousarray(Qy[:,S], dtype='d')
        S = np.argsort(np.real(Dv))[::-1]
        Dv = Dv[S]
        Qv = np.ascontiguousarray(Qv[:,S], dtype='d')

        # get estimate of true eigenvalues
        D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
        print(D)

        # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
        lib.transform(subspace, NX1, Qy, Y)
        lib.transform(subspace, NX2, Qv, V)

        # Normalize
        lib.normalize (subspace, NX1, Y)
        lib.normalize (subspace, NX2, V)

        # Converged once the leading `num` patterns of both fields stop changing.
        if (np.allclose(U[:num,...],Y[:num,...], atol=0)
                and np.allclose(X[:num,...],V[:num,...], atol=0)):
            print('converged after %d iterations'%iter_num)
            converged = True
            break

    assert converged, "no convergence"

    # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
    lib.fixcov (subspace, NT, NX2, pcs1, pcs2, V)

    # Wrap as pygeode vars, and return
    # Only need some of the eofs for output (the rest might not have even converged yet)
    orderaxis = order(num)

    eof1 = np.array(Y[:num])
    pc1 = np.array(pcs1[...,:num]).transpose()
    eof1 = Var((orderaxis,)+var1.axes[:iaxis1]+var1.axes[iaxis1+1:], values=eof1)
    pc1 = Var((orderaxis,var1.axes[iaxis1]), values = pc1)

    eof2 = np.array(V[:num])
    pc2 = np.array(pcs2[...,:num]).transpose()
    eof2 = Var((orderaxis,)+var2.axes[:iaxis2]+var2.axes[iaxis2+1:], values=eof2)
    pc2 = Var((orderaxis,var2.axes[iaxis2]), values = pc2)

    # Apply weights?
    # (undo the sqrt-weight scaling applied to the inputs)
    if weight1 is not False: eof1 /= weight1.sqrt()
    if weight2 is not False: eof2 /= weight2.sqrt()

    # Use correlation instead of covariance?
    # Re-scale the fields by standard deviation
    if matrix == 'cor':
        eof1 *= std1
        eof2 *= std2

    # Give it a name
    eof1.name = prefix1 + "EOF"
    pc1.name = prefix1 + "PC"
    eof2.name = prefix2 + "EOF"
    pc2.name = prefix2 + "PC"

    return eof1, pc1, eof2, pc2
def difference(X, Y, axes, alpha=0.05, Nx_fac = None, Ny_fac = None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means. This argument is required; every entry
      must identify an axis of the combined axes of X and Y.

    alpha : float
      Confidence level for which to compute confidence interval.

    Nx_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom of
      X; the effective number will be given by the number estimated from the
      dataset divided by ``Nx_fac``.

    Ny_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom of
      Y; the effective number will be given by the number estimated from the
      dataset divided by ``Ny_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` instance is created.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The difference in the means, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The probability of the computed difference if the population difference was zero
      * The confidence interval of the difference at the level specified by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20) of
    von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
    Nx_fac and Ny_fac, respectively. This provides a means of taking into account
    serial correlation in the data (see sections 6.6.7-9), but the number of
    effective degrees of freedom are not calculated explicitly by this routine.
    The p-value and confidence interval are computed based on the t-statistic in
    eq (6.19).'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Reduction axes as indices into each variable's own axes, and the total
    # number of elements each variable has along them.
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    nx_total = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    ny_total = np.prod([len(Y.axes[i]) for i in iyaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert nx_total > 1, '%s has only one element along the reduction axes' % X.name
    assert ny_total > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays, initialized to NaN; the accumulation below uses
    # np.nansum, so a NaN entry is replaced by the first contribution it gets.
    x = np.full(oview.shape, np.nan, dtype='d')
    y = np.full(oview.shape, np.nan, dtype='d')
    xx = np.full(oview.shape, np.nan, dtype='d')
    yy = np.full(oview.shape, np.nan, dtype='d')
    Nx = np.full(oview.shape, np.nan, dtype='d')
    Ny = np.full(oview.shape, np.nan, dtype='d')

    # Accumulate data
    for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)
        # Sum of weights (kludge to get masking right)
        Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata*0., ixaxes)], 0)

    for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)
        # Sum of weights (kludge to get masking right)
        Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata*0., iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx = (xx - x**2/Nx) / (Nx - 1)
    yy = (yy - y**2/Ny) / (Ny - 1)
    x /= Nx
    y /= Ny

    if Nx_fac is not None: eNx = Nx//Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny//Ny_fac
    else: eNy = Ny

    d = x - y
    den = np.sqrt(xx/eNx + yy/eNy)
    df = (xx/eNx + yy/eNy)**2 / ((xx/eNx)**2/(eNx - 1) + (yy/eNy)**2/(eNy - 1))

    p = tdist.cdf(abs(d/den), df)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, df) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn: name = xn
    else: name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=df, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else:  # Degenerate case
        return d, df, p, ci
def to_xarray(dataset):
    """
    Build an xarray Dataset from a PyGeode Dataset.

    Parameters
    ----------
    dataset : pygeode.Dataset
      The dataset to be converted.

    Returns
    -------
    out : xarray.Dataset
      An object which can be used with the xarray package.
    """
    from pygeode.dataset import asdataset
    from pygeode.formats.cfmeta import encode_cf
    from pygeode.view import View
    from dask.base import tokenize
    import dask.array as da
    import xarray as xr

    # Make sure we have a dataset, with CF metadata encoded on its
    # axes and variables.
    encoded = encode_cf(asdataset(dataset))

    converted = {}

    # Axes and variables are both converted, axes first.
    for item in list(encoded.axes) + list(encoded.vars):

        # Unique name under which dask identifies this array.
        dask_name = item.name + "-" + tokenize(item)
        dim_names = [axis.name for axis in item.axes]

        # Values already in memory: wrap them directly.
        if hasattr(item, 'values'):
            converted[item.name] = xr.DataArray(
                item.values, dims=dim_names, attrs=item.atts, name=item.name)
            continue

        # For every dimension, the distinct index sets produced by loop_mem(),
        # in the order first encountered.  loop_mem() does not expose how it
        # splits the axes, so the dask chunking is reconstructed from these.
        seen_per_axis = [[] for _ in item.axes]
        graph = {}

        # Each piece from loop_mem() is small enough to fit in memory, and
        # becomes one dask chunk.
        for piece in View(item.axes).loop_mem():
            position = []
            for seen, indices in zip(seen_per_axis, map(tuple, piece.integer_indices)):
                if indices not in seen:
                    seen.append(indices)
                position.append(seen.index(indices))
            # Task for this chunk: fetch the piece through getview.
            graph[(dask_name,) + tuple(position)] = (item.getview, piece, False)

        # Chunk lengths along each dimension.
        chunk_sizes = [[len(indices) for indices in seen] for seen in seen_per_axis]
        lazy = da.Array(graph, dask_name, chunk_sizes, dtype=item.dtype)

        # Attach the metadata and dimension names.
        converted[item.name] = xr.DataArray(
            lazy, dims=dim_names, attrs=item.atts, name=item.name)

    # Assemble the xarray Dataset, then decode the CF metadata on that side.
    result = xr.Dataset(converted, attrs=encoded.atts)
    return xr.conventions.decode_cf(result)
def paired_difference(X, Y, axes, alpha=0.05, N_fac = None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
    of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
    shape.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to difference. Must have at least one axis in common.

    axes : list
      Axes over which to compute means. This argument is required; every entry
      must identify an axis of the combined axes of X and Y.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective sample size is the number of paired (non-missing) values
      divided (floor division) by ``N_fac``.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a
      :class:`pygeode.progress.PBar` instance is created.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Four quantities are computed:

      * The difference in the means, X - Y
      * The effective number of degrees of freedom, :math:`df`
      * The probability of the computed difference if the population difference was zero
      * The confidence interval of the difference at the level specified by alpha

      If the average is taken over all axes of X and Y resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
    hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
    provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
    the appropriate number of effective degrees of freedom are not calculated explicitly by this
    routine. The p-value and confidence interval are computed based on the t-statistic in eq
    (6.21).'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npnansum
    from pygeode.view import View

    srcaxes = combine_axes([X, Y])
    riaxes = [whichaxis(srcaxes, n) for n in axes]
    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Reduction axes as indices into each variable's own axes, and the total
    # number of elements each variable has along them.
    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays, initialized to NaN; the accumulation below uses
    # np.nansum, so a NaN entry is replaced by the first contribution it gets.
    d = np.full(oview.shape, np.nan, dtype='d')
    dd = np.full(oview.shape, np.nan, dtype='d')
    N = np.full(oview.shape, np.nan, dtype='d')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)
        # Sum of weights (kludge to get masking right)
        N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata*0., ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    dd = (dd - d**2/N) / (N - 1)
    # Divide by the number of values actually accumulated at each point (N),
    # not the full sample size, so the mean stays consistent with the variance
    # above when some pairs are missing.
    d /= N

    if N_fac is not None: eN = N//N_fac
    else: eN = N

    # NOTE(review): dd is already the (N - 1)-normalized sample variance, and
    # the standard error here divides by (eN - 1) rather than eN -- confirm
    # against eq (6.21) of von Storch and Zwiers.
    den = np.sqrt(dd/(eN - 1))

    p = tdist.cdf(abs(d/den), eN - 1)*np.sign(d)
    ci = tdist.ppf(1. - alpha/2, eN - 1) * den

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'
    if xn == yn: name = xn
    else: name = '%s-%s'%(xn, yn)

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        D = Var(oaxes, values=d, name=name)
        DF = Var(oaxes, values=eN-1, name='df_%s' % name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([D, DF, P, CI])
    else:  # Degenerate case
        return d, eN-1, p, ci
def SVD(var1, var2, num=1, subspace=-1, iaxis=Time, weight1=True, weight2=True, matrix='cov'):
    """
    Finds coupled EOFs of two fields. Note that the mean/trend/etc. is NOT
    removed in this routine.

    Parameters
    ----------
    var1, var2 : :class:`Var`
      The variables to analyse.

    num : integer
      The number of EOFs to compute (default is ``1``).

    subspace : integer, optional
      The number of EOFs carried through the iteration. If zero or negative
      (the default), ``2*num + 8`` is used. It is never allowed to be smaller
      than ``num``.

    iaxis : Axis identifier
      The principal component / expansion coefficient axis, i.e., the 'time'
      axis. It is passed to :meth:`Var.whichaxis` of both variables, and the
      two axes found must compare equal. Default is pygeode.timeaxis.Time.

    weight1, weight2 : optional
      Weights to use for defining orthogonality in the var1, var2 domains,
      respectively. Patterns X and Y in the var1 domain are orthogonal if the
      sum over X*Y*weights1 is 0. Patterns Z and W in the var2 domain are
      orthogonal if the sum over Z*W*weights2 is 0. Default (``True``) is to
      use the internal weights of the corresponding variable, as returned by
      :meth:`Var.getweights()`. If set to ``False`` no weighting is used.
      Weights must not be defined along the record axis.

    matrix : string, optional ['cov']
      Which matrix we are diagonalizing (default is 'cov').
       * 'cov': covariance matrix of var1 & var2
       * 'cor': correlation matrix of var1 & var2
      Any other value triggers a warning and falls back to 'cov'.

    Returns
    -------
    (eof1, pc1, eof2, pc2): tuple
      * eof1: The coupled eof patterns for var1.
      * pc1: The principal component / expansion coefficients for var1.
      * eof2: The coupled eof patterns for var2.
      * pc2: The principal component / expansion coefficients for var2.

    Notes
    -----
    Multiple orders of EOFs are concatenated along an 'order' axis.
    An AssertionError ("no convergence") is raised if the leading ``num``
    patterns have not stopped changing after 1000 iterations.
    """
    import numpy as np
    from pygeode.timeaxis import Time
    from pygeode.var import Var
    from pygeode.view import View
    from pygeode import MAX_ARRAY_SIZE
    from warnings import warn
    from pygeode import svdcore as lib
    from math import pi

    if matrix in ('cov', 'covariance'):
        matrix = 'cov'
    elif matrix in ('cor', 'corr', 'correlation'):
        matrix = 'cor'
    else:
        # NOTE: the format specifier must be '%s'; a bare '%' followed by a
        # quote raises ValueError instead of issuing the warning.
        warn("invalid matrix type '%s'. Defaulting to covariance." % matrix, stacklevel=2)
        matrix = 'cov'

    MAX_ITER = 1000

    # Iterate over more EOFs than we need
    # (this helps with convergence)
    # TODO: a more rigorous formula for the optimum number of EOFs to use
    if subspace <= 0:
        subspace = 2 * num + 8
    if subspace < num:
        subspace = num  # Just in case

    # Remember the names
    prefix1 = var1.name + '_' if var1.name != '' else ''
    prefix2 = var2.name + '_' if var2.name != '' else ''

    # Apply weights?
    if weight1 is True:
        weight1 = var1.getweights()
    if weight1 is not False:
        assert not weight1.hasaxis(iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight1.sum() / weight1.size
        weight1 /= W
        # Apply the weights
        var1 *= weight1.sqrt()
    if weight2 is True:
        weight2 = var2.getweights()
    if weight2 is not False:
        assert not weight2.hasaxis(iaxis), "Can't handle weights along the record axis"
        # Normalize the weights
        W = weight2.sum() / weight2.size
        weight2 /= W
        # Apply the weights
        var2 *= weight2.sqrt()

    # TODO: allow multiple iteration axes (i.e., time and ensemble)
    iaxis1 = var1.whichaxis(iaxis)
    iaxis2 = var2.whichaxis(iaxis)

    assert var1.axes[iaxis1] == var2.axes[iaxis2], "incompatible iteration axes"
    del iaxis  # so we don't use this by accident

    # Special case: can load entire variable in memory
    # This will save some time, especially if the field is stored on disk, or is heavily derived
    if var1.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var1))
        var1 = var1.load()
    if var2.size <= MAX_ARRAY_SIZE:
        print('preloading ' + repr(var2))
        var2 = var2.load()

    # Use correlation instead of covariance?
    # (normalize by standard deviation)
    if matrix == 'cor':
        print('computing standard deviations')
        std1 = var1.stdev(iaxis1).load()
        std2 = var2.stdev(iaxis2).load()
        # account for grid points with zero standard deviation
        # (adds 1 where the stdev is 0, so the division below is safe)
        std1.values = std1.values + (std1.values == 0)
        std2.values = std2.values + (std2.values == 0)
        var1 /= std1
        var2 /= std2

    eofshape1 = (subspace, ) + var1.shape[:iaxis1] + var1.shape[iaxis1 + 1:]
    eofshape2 = (subspace, ) + var2.shape[:iaxis2] + var2.shape[iaxis2 + 1:]

    pcshape1 = (var1.shape[iaxis1], subspace)
    pcshape2 = (var2.shape[iaxis2], subspace)

    # number of spatial grid points
    NX1 = var1.size // var1.shape[iaxis1]
    assert NX1 <= MAX_ARRAY_SIZE, 'field is too large!'
    NX2 = var2.size // var2.shape[iaxis2]
    assert NX2 <= MAX_ARRAY_SIZE, 'field is too large!'

    # Total number of timesteps
    NT = var1.shape[iaxis1]
    # Number of timesteps we can do in one fetch
    dt = MAX_ARRAY_SIZE // max(NX1, NX2)

    pcs1 = np.empty(pcshape1, dtype='d')
    pcs2 = np.empty(pcshape2, dtype='d')

    X = np.empty(eofshape2, dtype='d')
    U = np.empty(eofshape1, dtype='d')
    # Seed with sinusoids superimposed on random values
    Y = np.random.rand(*eofshape1)
    V = np.random.rand(*eofshape2)
    for i in range(subspace):
        Y[i, ...].reshape(NX1)[:] += np.cos(np.arange(NX1, dtype='d') / NX1 * 2 * pi * (i + 1))
        V[i, ...].reshape(NX2)[:] += np.cos(np.arange(NX2, dtype='d') / NX2 * 2 * pi * (i + 1))

    # Workspace for C code
    UtAX = np.empty([subspace, subspace], dtype='d')
    XtAtU = np.empty([subspace, subspace], dtype='d')
    VtV = np.empty([subspace, subspace], dtype='d')
    YtY = np.empty([subspace, subspace], dtype='d')

    # Views over whole variables
    # (rearranged to be compatible with our output eof arrays)
    view1 = View((var1.axes[iaxis1], ) + var1.axes[:iaxis1] + var1.axes[iaxis1 + 1:])
    view2 = View((var2.axes[iaxis2], ) + var2.axes[:iaxis2] + var2.axes[iaxis2 + 1:])

    # Track convergence explicitly: comparing iter_num against MAX_ITER would
    # wrongly report failure when convergence happens on the last iteration.
    converged = False
    for iter_num in range(1, MAX_ITER + 1):
        print('iter_num: %d' % iter_num)

        assert Y.shape == U.shape
        assert X.shape == V.shape
        # Previous estimates become the inputs of this iteration
        U, Y = Y, U
        X, V = V, X

        # Reset the accumulation arrays for the next approximations
        Y[()] = 0
        V[()] = 0

        # Apply the covariance/correlation matrix
        for t in range(0, NT, dt):
            # number of timesteps we actually have
            nt = min(dt, NT - t)

            # Read the data
            chunk1 = view1.modify_slice(0, slice(t, t + nt)).get(var1)
            chunk1 = np.ascontiguousarray(chunk1, dtype='d')
            chunk2 = view2.modify_slice(0, slice(t, t + nt)).get(var2)
            chunk2 = np.ascontiguousarray(chunk2, dtype='d')

            ier = lib.build_svds(subspace, nt, NX1, NX2, chunk1, chunk2, X, Y, pcs2[t, ...])
            assert ier == 0
            ier = lib.build_svds(subspace, nt, NX2, NX1, chunk2, chunk1, U, V, pcs1[t, ...])
            assert ier == 0

        # Useful dot products
        lib.dot(subspace, NX1, U, Y, UtAX)
        lib.dot(subspace, NX2, V, V, VtV)
        lib.dot(subspace, NX1, Y, U, XtAtU)
        lib.dot(subspace, NX1, Y, Y, YtY)

        # Compute surrogate matrices (using all available information from this iteration)
        A1, residues, rank, s = np.linalg.lstsq(UtAX, VtV, rcond=1e-30)
        A2, residues, rank, s = np.linalg.lstsq(XtAtU, YtY, rcond=1e-30)

        # Eigendecomposition on surrogate matrices
        Dy, Qy = np.linalg.eig(np.dot(A1, A2))
        Dv, Qv = np.linalg.eig(np.dot(A2, A1))

        # Sort by eigenvalue (largest first)
        S = np.argsort(np.real(Dy))[::-1]
        Dy = Dy[S]
        Qy = np.ascontiguousarray(Qy[:, S], dtype='d')
        S = np.argsort(np.real(Dv))[::-1]
        Dv = Dv[S]
        Qv = np.ascontiguousarray(Qv[:, S], dtype='d')

        # get estimate of true eigenvalues
        D = np.sqrt(Dy)  # should also = np.sqrt(Dv) in theory
        print(D)

        # Translate the surrogate eigenvectors to an estimate of the true eigenvectors
        lib.transform(subspace, NX1, Qy, Y)
        lib.transform(subspace, NX2, Qv, V)

        # Normalize
        lib.normalize(subspace, NX1, Y)
        lib.normalize(subspace, NX2, V)

        # Converged once the leading `num` patterns of both fields stop changing
        if (np.allclose(U[:num, ...], Y[:num, ...], atol=0)
                and np.allclose(X[:num, ...], V[:num, ...], atol=0)):
            print('converged after %d iterations' % iter_num)
            converged = True
            break

    assert converged, "no convergence"

    # Flip the sign of the var2 EOFs and PCs so that the covariance is positive
    lib.fixcov(subspace, NT, NX2, pcs1, pcs2, V)

    # Wrap as pygeode vars, and return
    # Only need some of the eofs for output (the rest might not have even converged yet)
    orderaxis = order(num)

    eof1 = np.array(Y[:num])
    pc1 = np.array(pcs1[..., :num]).transpose()
    eof1 = Var((orderaxis, ) + var1.axes[:iaxis1] + var1.axes[iaxis1 + 1:], values=eof1)
    pc1 = Var((orderaxis, var1.axes[iaxis1]), values=pc1)

    eof2 = np.array(V[:num])
    pc2 = np.array(pcs2[..., :num]).transpose()
    eof2 = Var((orderaxis, ) + var2.axes[:iaxis2] + var2.axes[iaxis2 + 1:], values=eof2)
    pc2 = Var((orderaxis, var2.axes[iaxis2]), values=pc2)

    # Undo the sqrt(weight) scaling that was applied to the inputs
    if weight1 is not False:
        eof1 /= weight1.sqrt()
    if weight2 is not False:
        eof2 /= weight2.sqrt()

    # Use correlation instead of covariance?
    # Re-scale the fields by standard deviation
    if matrix == 'cor':
        eof1 *= std1
        eof2 *= std2

    # Give it a name
    eof1.name = prefix1 + "EOF"
    pc1.name = prefix1 + "PC"
    eof2.name = prefix2 + "EOF"
    pc2.name = prefix2 + "PC"

    return eof1, pc1, eof2, pc2
def correlate(X, Y, axes=None, pbar=None):  # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
      Variables to correlate. Must have at least one axis in common.

    axes : list, optional
      Axes over which to compute correlation; if nothing is specified, the
      correlation is computed over all axes shared by X and Y.

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    rho, p : :class:`Var`
      The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.

    Raises
    ======
    KeyError
      If an entry of ``axes`` is not an axis shared by X and Y.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in eq
    (8.7), in section 8.2.3, and assumes normally distributed quantities.'''
    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        requested = []
        for a in axes:
            idx = whichaxis(srcaxes, a)
            if i_not_shared(idx, riaxes):
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
            requested.append(idx)
        # Shared axes that were not requested are kept in the output
        oiaxes.extend([r for r in riaxes if r not in requested])
        riaxes = requested

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    iview = View(inaxes)
    # Positions of the reduction axes within `inaxes` (they come last)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays (NaN-filled accumulators)
    def nan_accumulator():
        return np.zeros(oview.shape, 'd')*np.nan

    x = nan_accumulator()
    y = nan_accumulator()
    xx = nan_accumulator()
    yy = nan_accumulator()
    xy = nan_accumulator()
    Na = nan_accumulator()

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata*ydata

        # Expand both inputs to the broadcast shape of their product
        xreps = [full // part for full, part in zip(xydata.shape, xdata.shape)]
        yreps = [full // part for full, part in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xreps)
        ydata = np.tile(ydata, yreps)

        # A point missing in either input is treated as missing in both
        missing = np.isnan(xydata)
        xdata[missing] = np.nan
        ydata[missing] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum([Na[outsl], npnansum(~missing, siaxes)], 0)

    eps = 1e-14
    imsk = ~(Na < eps)

    # Remove the means from the second moments where there is data
    count = Na[imsk]
    xx[imsk] -= (x*x)[imsk]/count
    yy[imsk] -= (y*y)[imsk]/count
    xy[imsk] -= (x*y)[imsk]/count

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx*yy)[imsk])
    positive = den > 0.
    rho[positive] = xy[positive] / np.sqrt(xx*yy)[positive]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.)/den[imsk])
    p[imsk] = tdist.cdf(t[imsk], Na[imsk]-2) * np.sign(rho[imsk])

    # Points without any data are reported as missing
    nodata = ~imsk
    p[nodata] = np.nan
    rho[nodata] = np.nan

    # Construct and return variables
    if X.name != '':
        xn = X.name
    else:
        xn = 'X'
    if Y.name != '':
        yn = Y.name
    else:
        yn = 'Y'

    from pygeode.var import Var
    Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
    P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
    return Rho, P


def i_not_shared(index, shared):
    # True when `index` is not one of the shared-axis indices.
    return index not in shared
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, pbar=None):  # {{{
    r'''Computes the mean value and statistics of X, against the hypothesis
    that it is 0.

    Parameters
    ==========
    X : :class:`Var`
      Variable to average.

    axes : list, optional
      Axes over which to compute the mean; if nothing is specified, the mean is
      computed over all axes.

    alpha : float
      Confidence level for which to compute confidence interval.

    N_fac : integer
      A factor by which to rescale the estimated number of degrees of freedom;
      the effective number will be given by the number estimated from the
      dataset divided by ``N_fac`` (floor division).

    pbar : progress bar, optional
      A progress bar object. If nothing is provided, a progress bar will be
      displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : tuple or :class:`Dataset` instance.
      Three quantities are computed:

      * The mean value of X
      * The probability of the computed value if the population mean was zero
      * The confidence interval of the mean at the level specified by alpha

      If the average is taken over all axes of X resulting in a scalar,
      the above values are returned as a tuple in the order given. If not, the
      results are provided as :class:`Var` objects in a dataset.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed for
    the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''
    from pygeode.tools import loopover, npnansum
    from pygeode.view import View

    # Indices (within X.axes) of the axes being reduced; default is all of them
    if axes is None:
        riaxes = list(range(len(X.axes)))
    else:
        riaxes = [X.whichaxis(n) for n in axes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    # np.prod rather than np.product: the latter alias was removed in NumPy 2.0
    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    Na = np.zeros(oview.shape, 'd')

    # Filled via [()] so that a 0-d output shape stays an array
    x[()] = np.nan
    xx[()] = np.nan
    Na[()] = np.nan

    # Accumulate data
    for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
        # Sum of weights (kludge to get masking right):
        # NaN samples stay NaN and are skipped, valid samples count as 1
        Na[outsl] = np.nansum([Na[outsl], npnansum(1. + xdata*0., riaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx = (xx - x**2/Na) / (Na - 1)
    x /= Na

    # Effective number of samples, optionally scaled down by N_fac
    if N_fac is not None:
        eNa = Na//N_fac
    else:
        eNa = Na

    # Standard deviation of the mean, signed p-value and confidence interval
    sdom = np.sqrt(xx/eNa)
    p = tdist.cdf(abs(x/sdom), eNa - 1)*np.sign(x)
    ci = tdist.ppf(1. - alpha/2, eNa - 1) * sdom

    name = X.name if X.name != '' else 'X'

    if len(oaxes) > 0:
        from pygeode import Var, Dataset
        mean_var = Var(oaxes, values=x, name=name)
        P = Var(oaxes, values=p, name='p_%s' % name)
        CI = Var(oaxes, values=ci, name='CI_%s' % name)
        return Dataset([mean_var, P, CI])
    else:  # Degenerate case
        return x, p, ci
def to_xarray(dataset):
    """
    Converts a PyGeode Dataset into an xarray Dataset.

    Parameters
    ----------
    dataset : pygeode.Dataset
      The dataset to be converted.

    Returns
    -------
    out : xarray.Dataset
      An object which can be used with the xarray package. Variables that
      already hold their values in memory are wrapped directly; all others
      are wrapped as dask arrays whose chunks are fetched through
      ``var.getview``.
    """
    from pygeode.dataset import asdataset
    from pygeode.formats.cfmeta import encode_cf
    from pygeode.view import View
    from dask.base import tokenize
    import dask.array as da
    import xarray as xr

    dataset = asdataset(dataset)
    # Encode the axes/variables with CF metadata.
    dataset = encode_cf(dataset)
    out = dict()
    # Loop over each axis and variable.
    for var in list(dataset.axes) + list(dataset.vars):
        # Generate a unique name to identify it with dask.
        name = var.name + "-" + tokenize(var)
        dsk = dict()
        dims = [a.name for a in var.axes]

        # Special case: already have the values in memory.
        if hasattr(var, 'values'):
            out[var.name] = xr.DataArray(var.values, dims=dims, attrs=var.atts, name=var.name)
            continue

        # Keep track of all the slices that were made over each dimension.
        # This information will be used to determine the "chunking" that was done
        # on the variable from inview.loop_mem().
        slice_order = [[] for a in var.axes]

        # Break up the variable into portions that are small enough to fit
        # in memory.  These will become the "chunks" for dask.
        inview = View(var.axes)
        for outview in inview.loop_mem():
            # Materialize as a list: this is iterated twice below, and a bare
            # map() iterator would already be exhausted on the second pass.
            integer_indices = [tuple(ix) for ix in outview.integer_indices]
            # Determine *how* loop_mem is splitting the axes, and define the chunk
            # sizes accordingly.
            # A little indirect, but loop_mem doesn't make its chunking choices
            # available to the caller.
            for seen, sl in zip(slice_order, integer_indices):
                if sl not in seen:
                    seen.append(sl)
            ind = [seen.index(sl) for seen, sl in zip(slice_order, integer_indices)]
            # Add this chunk to the dask array.
            key = tuple([name] + ind)
            dsk[key] = (var.getview, outview, False)

        # Construct the dask array.
        # Chunk sizes must be concrete integers (one tuple per dimension).
        chunks = tuple(tuple(len(sl) for sl in seen) for seen in slice_order)
        arr = da.Array(dsk, name, chunks, dtype=var.dtype)
        # Wrap this into an xarray.DataArray (with metadata and named axes).
        out[var.name] = xr.DataArray(arr, dims=dims, attrs=var.atts, name=var.name)

    # Build the final xarray.Dataset.
    out = xr.Dataset(out, attrs=dataset.atts)
    # Re-decode the CF metadata on the xarray side.
    out = xr.conventions.decode_cf(out)
    return out