def write_var(ncfile, dataset, unlimited=None, compress=False): # {{{
  from pygeode.view import View
  from pygeode.axis import Axis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.tools import combine_axes

  vars = list(dataset.vars)
  axes = combine_axes(v.axes for v in vars)

  # Define the dimensions
  for a in axes:
    ncfile.createDimension(a.name, size=(None if a.name == unlimited else len(a)))

  # Define the variables (including axes)
  for var in vars:
    dimensions = [a.name for a in var.axes]
    v = ncfile.createVariable(var.name, datatype=var.dtype, dimensions=dimensions,
                              zlib=compress, fill_value=var.atts.get('_FillValue', None))
    v.setncatts(var.atts)

  # global attributes
  ncfile.setncatts(dataset.atts)

  # Relative progress of each variable
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100
  pbar = PBar(message="Saving '%s':" % ncfile.filepath())

  # number of actual variables (non-axes) for determining our progress
  N = len([v for v in vars if not isinstance(v, Axis)])

  # Write the data
  for i, var in enumerate(vars):
    ncvar = ncfile.variables[var.name]
    varpbar = pbar.subset(prog[i], prog[i + 1])

    views = list(View(var.axes).loop_mem())
    for j, v in enumerate(views):
      vpbar = varpbar.part(j, len(views))
      ncvar[v.slices] = v.get(var, pbar=vpbar)
# }}}
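# A minimal usage sketch for write_var() above. Assumptions: the ncfile
# argument is a netCDF4.Dataset (which provides the createDimension /
# createVariable / setncatts / filepath methods used above), and `ds` is an
# existing pygeode Dataset with a 'time' axis. This is illustrative, not part
# of the library.
def _example_write_var(ds):
  from netCDF4 import Dataset as NCFile
  ncfile = NCFile('out.nc', 'w')
  # 'time' becomes the unlimited (record) dimension; zlib compression enabled
  write_var(ncfile, ds, unlimited='time', compress=True)
  ncfile.close()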
def clim_detrend(var, yrlen, itime=-1, sig=False): # {{{
  '''clim_detrend() - returns a detrended time series, removing a separate
  linear trend for each day of the year.'''
  from pygeode.timeaxis import Time
  from pygeode.var import Var
  from . import stats
  from numpy import arange

  if itime == -1:
    itime = var.whichaxis(Time)
  tlen = var.shape[itime]

  # Reshape the series into (year, day-of-year); composite() and flatten()
  # are helpers defined elsewhere in this module.
  vary = composite(var, itime, list(range(0, tlen, yrlen)), yrlen)
  yrs = vary.axes[itime]
  yrs.values = arange(len(yrs)).astype(yrs.dtype)

  print('Computing regression')
  from pygeode.progress import PBar
  m, b, p = stats.regress(yrs, vary, pbar=PBar())

  varz = flatten(vary - (m * yrs + b), itime + 1)
  varz.axes = var.axes
  # Since the axes have been modified after initialization, redo the init to get
  # shortcuts to the axes names
  Var.__init__(varz, varz.axes, varz.dtype)

  if var.name != '':
    varz.name = var.name + "'"

  if sig:
    return m, b, varz, p
  else:
    return m, b, varz
# }}}
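# A usage sketch for clim_detrend(), under the assumption that `T` is a
# pygeode Var with a daily Time axis covering whole years (yrlen=365 for a
# 365-day calendar). The helper names and data are hypothetical.
def _example_clim_detrend(T):
  # m and b hold the per-day-of-year trend and intercept; Tp is the
  # detrended series (named "T'" if T has a name).
  m, b, Tp = clim_detrend(T, yrlen=365)
  return Tp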
def scan_files(self, files, opener):
  from os.path import getmtime, normpath
  from pygeode.progress import PBar

  table = self.table

  # Special case: no files given
  if len(files) == 0: return

  self.selected_files.extend(files)

  # Strip out extra separators, etc. from the filenames.
  # Otherwise, if the files are scanned a second time with different
  # separators, it may cause the same file to be included more than once.
  files = [normpath(f) for f in files]

  if self.filename is not None:
    pbar = PBar(message="Scanning files for %s" % self.filename)
  else:
    pbar = PBar(message="Scanning files")

  # Construct / add to the table
  for i, f in enumerate(files):
    pbar.update(i * 100. / len(files))
    if f in table:
      # File has changed since last time?
      if int(getmtime(f)) > self.mtime:
        # Remove existing info
        del table[f]
      else:
        # Otherwise, we've already dealt with the file, so skip it.
        continue

    # Always use the latest modification time to represent the valid time of
    # the whole table.
    self.mtime = max(self.mtime, int(getmtime(f)))

    # Record all variables from the file.
    entries = []
    table[f] = entries
    for var in opener(f):
      axes = self.axis_manager.lookup_axes(var.axes)
      entries.append((var.name, axes, var.atts))

    self.modified_table = True

  pbar.update(100)
def load(self, pbar=True): # {{{
  '''Returns a version of this variable with all data loaded into memory.

  Parameters
  ----------
  pbar : boolean
    If True, display a progress bar while loading data.
  '''
  from pygeode.progress import PBar
  if hasattr(self, 'values'): return self
  if pbar is True:
    pbar = PBar(message="Loading %s:" % repr(self))
  var = Var(self.axes, values=self.get(pbar=pbar))
  copy_meta(self, var)
  return var
# }}}
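# A usage sketch for load() above. It is a Var method, so the assumption is
# simply that `v` is any (possibly lazy) pygeode Var; load() is a no-op when
# the values are already in memory.
def _example_load(v):
  vl = v.load()      # reads all data once, with a progress bar
  return vl.get()    # subsequent accesses hit the in-memory array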
def correlate(X, Y, axes=None, pbar=None): # {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes shared by X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  rho, p : :class:`Var`
    The correlation coefficient :math:`\rho_{XY}` and p-value, respectively.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and
  Zwiers 1999, section 8.2.2. The p-value is the probability of finding the
  given result under the hypothesis that the true correlation coefficient
  between X and Y is zero. It is computed from the t-statistic given in eq
  (8.7), in section 8.2.3, and assumes normally distributed quantities.'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Put all the axes being reduced over at the end
  # so that we can reshape
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  iview = View(inaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays
  x = np.zeros(oview.shape, 'd') * np.nan
  y = np.zeros(oview.shape, 'd') * np.nan
  xx = np.zeros(oview.shape, 'd') * np.nan
  yy = np.zeros(oview.shape, 'd') * np.nan
  xy = np.zeros(oview.shape, 'd') * np.nan
  Na = np.zeros(oview.shape, 'd') * np.nan

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata * ydata

    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Sum of weights
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  eps = 1e-14
  imsk = ~(Na < eps)

  xx[imsk] -= (x * x)[imsk] / Na[imsk]
  yy[imsk] -= (y * y)[imsk] / Na[imsk]
  xy[imsk] -= (x * y)[imsk] / Na[imsk]

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx * yy)[imsk])
  rho[den > 0.] = xy[den > 0.] / np.sqrt(xx * yy)[den > 0.]

  den = 1 - rho**2
  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  den[den < eps] = eps

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
  p[imsk] = tdist.cdf(t[imsk], Na[imsk] - 2) * np.sign(rho[imsk])
  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X'  # Note: could write: xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  Rho = Var(oaxes, values=rho, name='C(%s, %s)' % (xn, yn))
  P = Var(oaxes, values=p, name='P(C(%s,%s) != 0)' % (xn, yn))
  return Rho, P
# }}}
def correlate(X, Y, axes=None, output='r2,p', pbar=None): # {{{
  r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p': The p-value; see notes.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and
  Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
  correlation coefficient of equal or greater magnitude (two-sided) to the
  given result under the hypothesis that the true correlation coefficient
  between X and Y is zero. It is computed from the t-statistic given in eq
  (8.7), in section 8.2.3, and assumes normally distributed quantities.'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['r2', 'p']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from correlation. Possible outputs are %s.' % str(ovars))

  # Put all the axes being reduced over at the end
  # so that we can reshape
  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  iview = View(inaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  # Construct work arrays
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata * ydata

    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Count of non-NaN data points
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  imsk = (Na > 0)

  xx[imsk] -= (x * x)[imsk] / Na[imsk]
  yy[imsk] -= (y * y)[imsk] / Na[imsk]
  xy[imsk] -= (x * y)[imsk] / Na[imsk]

  # Ensure variances are non-negative
  xx[xx <= 0.] = 0.
  yy[yy <= 0.] = 0.

  # Compute correlation coefficient, t-statistic, p-value
  den = np.zeros(oview.shape, 'd')
  rho = np.zeros(oview.shape, 'd')

  den[imsk] = np.sqrt((xx * yy)[imsk])
  dmsk = (den > 0.)

  rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

  den = 1 - rho**2
  # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
  eps = 1e-14
  den[den < eps] = eps

  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
  p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

  p[~imsk] = np.nan
  rho[~imsk] = np.nan

  p[~dmsk] = np.nan
  rho[~dmsk] = np.nan

  # Construct and return variables
  xn = X.name if X.name != '' else 'X'  # Note: could write: xn = X.name or 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'r2' in output:
    r2 = Var(oaxes, values=rho, name='r2')
    r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (xn, yn)
    rvs.append(r2)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for correlation coefficient between %s and %s' % (xn, yn)
    rvs.append(p)

  ds = asdataset(rvs)
  ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

  return ds
# }}}
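# A usage sketch for the Dataset-returning correlate() directly above.
# Assumptions: pygeode.axis.NamedAxis is available for building generic
# labelled axes, and both variables share 'time' and 'lat' axes; reducing over
# 'time' leaves r2 and p defined on 'lat'. Purely illustrative synthetic data.
def _example_correlate():
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import NamedAxis
  time = NamedAxis(np.arange(500), name='time')
  lat = NamedAxis(np.linspace(-60., 60., 5), name='lat')
  xv = np.random.randn(500, 5)
  X = Var((time, lat), values=xv, name='X')
  Y = Var((time, lat), values=0.5 * xv + np.random.randn(500, 5), name='Y')
  ds = correlate(X, Y, axes=['time'], output='r2,p')
  return ds.r2.get(), ds.p.get()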
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None, output='d,p,ci', pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y
  must have the same shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from
    the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
  is used to test the hypothesis. The number of degrees of freedom is the
  sample size scaled by N_fac, less one. This provides a means of taking into
  account serial correlation in the data (see sections 6.6.7-9), but the
  appropriate number of effective degrees of freedom are not calculated
  explicitly by this routine. The p-value and confidence interval are computed
  based on the t-statistic in eq (6.21).'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['d', 'df', 'p', 'ci']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from paired_difference. Possible outputs are %s.' % str(ovars))

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  # If no axes were specified, reduce over all shared axes
  if axes is None:
    axes = [srcaxes[i].name for i in riaxes]

  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])

  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays
  d = np.full(oview.shape, np.nan, 'd')
  dd = np.full(oview.shape, np.nan, 'd')
  N = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

    # Count of non-NaN data points
    N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  imsk = (N > 1)
  dd[imsk] -= (d * d)[imsk] / N[imsk]
  dd[imsk] /= (N[imsk] - 1)
  d[imsk] /= N[imsk]

  # Ensure variance is non-negative
  dd[dd <= 0.] = 0.

  if N_fac is not None:
    eN = N // N_fac
  else:
    eN = N

  emsk = (eN > 1)

  den = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
  dmsk = (den > 0.)

  p[dmsk] = np.abs(d[dmsk] / den[dmsk])
  p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
  ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

  # Construct dataset to return
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'd' in output:
    d = Var(oaxes, values=d, name='d')
    d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
    rvs.append(d)

  if 'df' in output:
    df = Var(oaxes, values=eN - 1, name='df')
    df.atts['longname'] = 'Degrees of freedom used for t-test'
    rvs.append(df)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for t-test of paired difference (%s - %s)' % (xn, yn)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (alpha, xn, yn)
    rvs.append(ci)

  ds = asdataset(rvs)
  ds.atts['alpha'] = alpha
  ds.atts['N_fac'] = N_fac
  ds.atts['description'] = 't-test of paired difference (%s - %s)' % (yn, xn)

  return ds
# }}}
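# A usage sketch for the Dataset-returning paired_difference() directly above.
# Assumption: X and Y are pygeode Vars with identical shapes (e.g. the same
# field from two model runs), paired element-by-element along 'time'.
def _example_paired_difference(X, Y):
  ds = paired_difference(X, Y, axes=['time'], alpha=0.05, output='d,p,ci')
  # d: mean difference; p: two-sided p-value; ci: half-width of the
  # (1 - alpha) confidence interval
  return ds.d, ds.p, ds.ci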
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None, output='d,p,ci', pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional; defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled
  by Nx_fac and Ny_fac, respectively. This provides a means of taking into
  account serial correlation in the data (see sections 6.6.7-9), but the
  number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the
  t-statistic in eq (6.19).'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['d', 'df', 'p', 'ci']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from difference. Possible outputs are %s.' % str(ovars))

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  # If no axes were specified, reduce over all shared axes
  if axes is None:
    axes = [srcaxes[i].name for i in riaxes]

  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  # Construct work arrays (Nx and Ny are reused as per-cell counts below)
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  Nx = np.full(oview.shape, np.nan, 'd')
  Ny = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

    # Count of non-NaN data points
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

    # Count of non-NaN data points
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  imsk = (Nx > 1) & (Ny > 1)
  xx[imsk] -= (x * x)[imsk] / Nx[imsk]
  xx[imsk] /= (Nx[imsk] - 1)
  x[imsk] /= Nx[imsk]
  yy[imsk] -= (y * y)[imsk] / Ny[imsk]
  yy[imsk] /= (Ny[imsk] - 1)
  y[imsk] /= Ny[imsk]

  # Ensure variances are non-negative
  xx[xx <= 0.] = 0.
  yy[yy <= 0.] = 0.

  if Nx_fac is not None:
    eNx = Nx // Nx_fac
  else:
    eNx = Nx
  if Ny_fac is not None:
    eNy = Ny // Ny_fac
  else:
    eNy = Ny

  emsk = (eNx > 1) & (eNy > 1)

  # Compute difference
  d = x - y

  den = np.zeros(oview.shape, 'd')
  df = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  # Convert to variance of the mean of each sample
  xx[emsk] /= eNx[emsk]
  yy[emsk] /= eNy[emsk]

  den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
  dmsk = (den > 0.)

  df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

  den[emsk] = np.sqrt(xx[emsk] + yy[emsk])
  dmsk &= (den > 0.)

  p[dmsk] = np.abs(d[dmsk] / den[dmsk])
  p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))
  ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

  df[~dmsk] = np.nan
  p[~dmsk] = np.nan
  ci[~dmsk] = np.nan

  # Construct dataset to return
  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'd' in output:
    d = Var(oaxes, values=d, name='d')
    d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
    rvs.append(d)

  if 'df' in output:
    df = Var(oaxes, values=df, name='df')
    df.atts['longname'] = 'Degrees of freedom used for t-test'
    rvs.append(df)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (xn, yn)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (alpha, xn, yn)
    rvs.append(ci)

  ds = asdataset(rvs)
  ds.atts['alpha'] = alpha
  ds.atts['Nx_fac'] = Nx_fac
  ds.atts['Ny_fac'] = Ny_fac
  ds.atts['description'] = 't-test of difference (%s - %s)' % (yn, xn)

  return ds
# }}}
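# A usage sketch for the Dataset-returning difference() directly above.
# Assumption: X and Y share a 'time' axis but may have different lengths
# (e.g. two climatologies with unequal sample sizes); the *_fac values here
# are arbitrary user choices to account for serial correlation.
def _example_difference(X, Y):
  ds = difference(X, Y, axes=['time'], alpha=0.05, Nx_fac=2, Ny_fac=2,
                  output='d,df,p,ci')
  return ds.d, ds.df, ds.p, ds.ci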
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None): # {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The return values are specified by the ``output`` argument. The names of
    the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the linear coefficients of the regression can be
    obtained by ``ds.B``). A fit of the form
    :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a
    constant term is not included by default. The following parameters can be
    returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in
  section 8.4.2; it corresponds to the probability of obtaining the
  regression coefficient under the null hypothesis that there is no linear
  relationship. Note this may not be the best way to determine if a given
  parameter is contributing a significant fraction to the explained variance
  of Y. The variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square
  root of the diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in
  von Storch and Zwiers, respectively. The data is assumed to be normally
  distributed.'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  # Split output request now (note: the output tokens must match the names
  # checked below; the original listed 'beta' here, which silently dropped the
  # default 'B' request)
  ovars = ['B', 'r2', 'p', 'sb', 'covb', 'se']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from multiple_regress. Possible outputs are %s.' % str(ovars))

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes:
        ri_new.append(ia)
      else:
        raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = tuple([srcaxes[i] for i in oiaxes])
  inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr, Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl + (i,)] += npsum(xdata[i] * ydata, siaxes)
      for j in range(i + 1):
        xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intelligently?
  # certainly the python loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None:
    N_eff = N
  else:
    N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from .var import Var
  from .dataset import asdataset
  from .axis import NonCoordinateAxis

  ra = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor')
  ra2 = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor2')
  Nd = len(oaxes)

  rvs = []

  if 'B' in output:
    B = Var(oaxes + (ra,), values=beta, name='B')
    B.atts['longname'] = 'regression coefficient'
    rvs.append(B)

  if 'r2' in output:
    vary = (yy - y**2 / N)
    R2 = 1 - (yy - vare) / vary
    R2 = Var(oaxes, values=R2, name='r2')
    R2.atts['longname'] = 'fraction of variance explained'
    rvs.append(R2)

  if 'p' in output:
    p = [2. * (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr)) for i in range(Nr)]
    # Move the regressor axis to the end (the original permutation
    # [Nd] + range(Nd) only did this correctly for Nd <= 1)
    p = np.transpose(np.array(p), list(range(1, Nd + 1)) + [0])
    p = Var(oaxes + (ra,), values=p, name='p')
    p.atts['longname'] = 'p-values'
    rvs.append(p)

  if 'sb' in output:
    sigbeta = np.transpose(np.array(sigbeta), list(range(1, Nd + 1)) + [0])
    sb = Var(oaxes + (ra,), values=sigbeta, name='sb')
    sb.atts['longname'] = 'standard deviation of linear coefficients'
    rvs.append(sb)

  if 'covb' in output:
    sigmat = np.zeros(os2, 'd')
    for i in range(Nr):
      for j in range(Nr):
        #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
        sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
    covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
    covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
    rvs.append(covb)

  if 'se' in output:
    se = np.sqrt((yy - vare) / N_eff)
    se = Var(oaxes, values=se, name='se')
    se.atts['longname'] = 'standard deviation of residual'
    rvs.append(se)

  ds = asdataset(rvs)
  ds.atts['description'] = 'multiple linear regression parameters for %s regressed against %s' % (yn, xns)

  return ds
# }}}
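# A usage sketch for the Dataset-returning multiple_regress() directly above.
# Assumptions: pygeode.axis.NamedAxis is available, and Y is built as a known
# combination of two regressors, so the recovered coefficients should come out
# near 2 and -1. Synthetic, illustrative data only.
def _example_multiple_regress():
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import NamedAxis
  time = NamedAxis(np.arange(200), name='time')
  x1 = np.random.randn(200)
  x2 = np.random.randn(200)
  X1 = Var((time,), values=x1, name='X1')
  X2 = Var((time,), values=x2, name='X2')
  Y = Var((time,), values=2. * x1 - x2 + 0.1 * np.random.randn(200), name='Y')
  ds = multiple_regress([X1, X2], Y, axes=['time'], output='B,p')
  return ds.B.get(), ds.p.get()  # one entry per regressor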
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None): # {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the linear coefficient of the regression can be obtained
    by ``ds.m``). A fit of the form :math:`Y = m X + b + \epsilon` is assumed,
    and the following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
  respectively). The data is assumed to be normally distributed.'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
  from pygeode.view import View

  # Split output request now
  ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from regression. Possible outputs are %s.' % str(ovars))

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays
  x = np.full(oview.shape, np.nan, 'd')
  y = np.full(oview.shape, np.nan, 'd')
  xx = np.full(oview.shape, np.nan, 'd')
  yy = np.full(oview.shape, np.nan, 'd')
  xy = np.full(oview.shape, np.nan, 'd')
  Na = np.full(oview.shape, np.nan, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    xydata = xdata * ydata

    xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
    ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
    xdata = np.tile(xdata, xbc)
    ydata = np.tile(ydata, ybc)
    xdata[np.isnan(xydata)] = np.nan
    ydata[np.isnan(xydata)] = np.nan

    # It seems np.nansum does not broadcast its arguments automatically
    # so there must be a better way of doing this...
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
    xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

    # Sum of weights
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

  if N_fac is None:
    N_eff = Na - 2.
  else:
    N_eff = Na / N_fac - 2.

  nmsk = (N_eff > 0.)

  xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
  yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
  xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

  dmsk = (xx > 0.)

  m = np.zeros(oview.shape, 'd')
  b = np.zeros(oview.shape, 'd')
  r2 = np.zeros(oview.shape, 'd')

  m[dmsk] = xy[dmsk] / xx[dmsk]
  b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

  r2den = xx * yy
  d2msk = (r2den > 0.)

  r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

  sige = np.zeros(oview.shape, 'd')
  sigm = np.zeros(oview.shape, 'd')
  t = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')

  sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
  sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
  sige[nmsk] = np.sqrt(sige[nmsk])  # was sige[dmsk]; both sides must use the same mask

  t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
  p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

  msk = nmsk & dmsk

  m[~msk] = np.nan
  b[~msk] = np.nan
  sige[~msk] = np.nan
  sigm[~msk] = np.nan
  p[~msk] = np.nan

  msk = nmsk & d2msk
  r2[~msk] = np.nan

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='m')
    M.atts['longname'] = 'slope'
    rvs.append(M)

  if 'b' in output:
    B = Var(oaxes, values=b, name='b')
    B.atts['longname'] = 'intercept'
    rvs.append(B)

  if 'r2' in output:
    R2 = Var(oaxes, values=r2, name='r2')
    R2.atts['longname'] = 'fraction of variance explained'
    rvs.append(R2)

  if 'p' in output:
    P = Var(oaxes, values=p, name='p')
    P.atts['longname'] = 'p-value'
    rvs.append(P)

  if 'sm' in output:
    SM = Var(oaxes, values=sigm, name='sm')
    SM.atts['longname'] = 'standard deviation of slope parameter'
    rvs.append(SM)

  if 'se' in output:
    SE = Var(oaxes, values=sige, name='se')
    SE.atts['longname'] = 'standard deviation of residual'
    rvs.append(SE)

  ds = asdataset(rvs)
  ds.atts['description'] = 'linear regression parameters for %s regressed against %s' % (yn, xn)

  return ds
# }}}
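# A usage sketch for regress() above. Assumptions: pygeode.axis.NamedAxis is
# available, and Y is constructed so the fitted slope/intercept should come
# out near 3 and 1 at every latitude. Synthetic, illustrative data only.
def _example_regress():
  import numpy as np
  from pygeode.var import Var
  from pygeode.axis import NamedAxis
  time = NamedAxis(np.arange(300), name='time')
  lat = NamedAxis(np.linspace(-45., 45., 4), name='lat')
  xv = np.random.randn(300, 4)
  X = Var((time, lat), values=xv, name='X')
  Y = Var((time, lat), values=3. * xv + 1. + 0.5 * np.random.randn(300, 4), name='Y')
  ds = regress(X, Y, axes=['time'], output='m,b,r2,p')
  return ds.m.get(), ds.b.get()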
def save(filename, var, iaxis=None, fps=15, palette='bw', minmax=None):
  from pygeode.axis import TAxis
  from pygeode.var import Var
  from pygeode.progress import PBar
  import tempfile, shutil
  from PIL import Image  # was 'import Image' (pre-fork PIL)
  import numpy as np
  import os

  assert isinstance(var, Var)

  # Remove any degenerate dimensions, make sure the axes are in a consistent order
  var = var.squeeze().sorted()
  assert var.naxes == 3, "can only work with 3D data"

  if iaxis is None:
    iaxis = var.whichaxis(TAxis)
  assert iaxis >= 0, "no time axis found"

  tmpdir = tempfile.mkdtemp(prefix='pygeode_mpeg')

  sl = [slice(None)] * 3

  # Get max & min values of the whole dataset
  if minmax is None:
    #TODO: calculate both of these at once, with a progress bar to help the process
    vmin = float(var.min())
    vmax = float(var.max())
  else:
    assert len(minmax) == 2, "invalid minmax argument"
    vmin, vmax = minmax

  print("Saving %s:" % filename)
  pbar = PBar()

  # Loop over each timestep, generate a temporary image file
  for i in range(len(var.axes[iaxis])):
    fpbar = pbar.part(i, len(var.axes[iaxis]))
    sl[iaxis] = i
    # Get data, flip y axis, add an 'RGB' axis
    data = var[sl].squeeze()[::-1, :, np.newaxis]
    data = (data - vmin) / (vmax - vmin) * 255

    if palette == 'bw':
      # Same data for R, G, and B channels
      data = np.concatenate([data, data, data], axis=2)
    elif palette == 'rainbow':
      # Piecewise linear palette
      part1 = data <= 85
      part2 = (85 < data) & (data <= 170)
      part3 = 170 < data
      b = np.zeros(data.shape)
      b[part1] = 255
      b[part2] = 255 - (data[part2] - 85) * 3
      g = np.zeros(data.shape)
      g[part1] = data[part1] * 3
      g[part2] = 255
      g[part3] = 255 - (data[part3] - 170) * 3
      r = np.zeros(data.shape)
      r[part2] = (data[part2] - 85) * 3
      r[part3] = 255
      data = np.concatenate([r, g, b], axis=2)

    # Encode as an 8-bit array
    data = np.asarray(np.round(data), 'uint8')

    # Save
    framefile = tmpdir + "/frame%04d.jpg" % i
    Image.fromarray(data, "RGB").save(framefile, quality=95)
#    os.system("display " + framefile)
#    break
    fpbar.update(100)

  shape = list(var.shape)
  shape = shape[:iaxis] + shape[iaxis + 1:]
  h, w = shape

  # Make the movie file
  os.system("mencoder mf://%s/*.jpg -mf w=%s:h=%s:type=jpg:fps=%s \
    -ovc lavc -lavcopts vcodec=mpeg4:vbitrate=8000 -oac copy \
    -o %s" % (tmpdir, w, h, fps, filename))

  # Clean up files
  shutil.rmtree(tmpdir)
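# A usage sketch for the animation writer above. Assumptions: `T` is a 3D
# pygeode Var (time, lat, lon), Pillow is installed, and the `mencoder`
# binary is on the PATH, since the frames are stitched together by shelling
# out to mencoder.
def _example_save_movie(T):
  # Fixed color scale so frames are comparable across the animation
  save('temperature.avi', T, fps=10, palette='rainbow', minmax=(240., 310.))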
def paired_difference(X, Y, axes, alpha=0.05, N_fac=None, pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y
  must have the same shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list
    Axes over which to compute means.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from
    the dataset divided by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test
  is used to test the hypothesis. The number of degrees of freedom is the
  sample size scaled by N_fac, less one. This provides a means of taking into
  account serial correlation in the data (see sections 6.6.7-9), but the
  appropriate number of effective degrees of freedom are not calculated
  explicitly by this routine. The p-value and confidence interval are computed
  based on the t-statistic in eq (6.21).'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  raxes = [a for i, a in enumerate(srcaxes) if i in riaxes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])
  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays
  d = np.zeros(oview.shape, 'd')
  dd = np.zeros(oview.shape, 'd')
  N = np.zeros(oview.shape, 'd')

  d[()] = np.nan
  dd[()] = np.nan
  N[()] = np.nan

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
    ddata = xdata.astype('d') - ydata.astype('d')
    d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
    dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

    # Sum of weights (kludge to get masking right)
    N[outsl] = np.nansum([N[outsl], npnansum(1. + ddata * 0., ixaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  dd = (dd - d**2 / N) / (N - 1)
  d /= Nx

  if N_fac is not None:
    eN = N // N_fac
  else:
    eN = N
  #print('average eff. N = %.1f' % eN.mean())

  den = np.sqrt(dd / (eN - 1))

  p = tdist.cdf(abs(d / den), eN - 1) * np.sign(d)
  ci = tdist.ppf(1. - alpha / 2, eN - 1) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn:
    name = xn
  else:
    name = '%s-%s' % (xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=eN - 1, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else:  # Degenerate case
    return d, eN - 1, p, ci
# }}}
def save(filename, in_dataset, version=3, pack=None, compress=False, cfmeta=True, unlimited=None): # {{{
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename, str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3, 4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]
  #vars.extend(axes)

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), "vars must have unique names: %s" % [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing entire array
  allf = {1: lib.nc_put_var_schar, 2: lib.nc_put_var_text,
          3: lib.nc_put_var_short, 4: lib.nc_put_var_int,
          5: lib.nc_put_var_float, 6: lib.nc_put_var_double,
          7: lib.nc_put_var_uchar, 8: lib.nc_put_var_ushort,
          9: lib.nc_put_var_uint, 10: lib.nc_put_var_longlong,
          11: lib.nc_put_var_ulonglong}

  # Functions for writing chunks
  chunkf = {1: lib.nc_put_vara_schar, 2: lib.nc_put_vara_text,
            3: lib.nc_put_vara_short, 4: lib.nc_put_vara_int,
            5: lib.nc_put_vara_float, 6: lib.nc_put_vara_double,
            7: lib.nc_put_vara_uchar, 8: lib.nc_put_vara_ushort,
            9: lib.nc_put_vara_uint, 10: lib.nc_put_vara_longlong,
            11: lib.nc_put_vara_ulonglong}

  # Create the file
  if version == 3:
    ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create(filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception  # unreachable; version is checked above

  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i, a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    chunks = [None] * len(vars)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[list(axes).index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only works for netcdf4 or (higher?))
      if compress:
        ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes(fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      # modify axes to be netcdf friendly (CF-compliant, etc.)
      put_attributes(fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef(fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % filename)

    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

      # number of actual variables (non-axes) for determining our progress
      N = len([v for v in vars if not isinstance(v, Axis)])
      varpbar = pbar.subset(prog[i], prog[i + 1])

      views = list(View(var.axes).loop_mem())
      for j, v in enumerate(views):
        vpbar = varpbar.part(j, len(views))
        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1, None)

        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long * var.naxes)(*start)
        count = (c_long * var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          data = data[start[0]:start[0] + count[0]]  # the above gives us the *whole* axis,
                                                     # but under extreme conditions we may be looping over smaller pieces
          vpbar.update(100)
        else:
          data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)
# }}}
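# A usage sketch for the ctypes-based netCDF writer above. Assumptions: `ds`
# is a pygeode Dataset, and the module-level bindings this function relies on
# (lib, nc_type, numpy_type, put_attributes) are in scope, i.e. it is called
# through the formats module rather than copied standalone.
def _example_save_netcdf(ds):
  # Write a compressed NetCDF-4 file with 'time' as the unlimited dimension
  # (compress=True forces version 4).
  save('out.nc', ds, compress=True, unlimited='time')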
def difference(X, Y, axes, alpha=0.05, Nx_fac=None, Ny_fac=None, pbar=None): # {{{
  r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list
    Axes over which to compute means.

  alpha : float
    Confidence level for which to compute confidence interval.

  Nx_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Four quantities are computed:

    * The difference in the means, X - Y
    * The effective number of degrees of freedom, :math:`df`
    * The probability of the computed difference if the population difference was zero
    * The confidence interval of the difference at the level specified by alpha

    If the average is taken over all axes of X and Y resulting in a scalar,
    the above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled
  by Nx_fac and Ny_fac, respectively. This provides a means of taking into
  account serial correlation in the data (see sections 6.6.7-9), but the
  number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the
  t-statistic in eq (6.19).'''
  import numpy as np
  from scipy.stats import t as tdist
  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  riaxes = [whichaxis(srcaxes, n) for n in axes]
  raxes = [a for i, a in enumerate(srcaxes) if i in riaxes]
  oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
  oview = View(oaxes)

  ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
  Nx = np.prod([len(X.axes[i]) for i in ixaxes])
  iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
  Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert Nx > 1, '%s has only one element along the reduction axes' % X.name
  assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

  # Construct work arrays (Nx and Ny are reused as per-cell counts below)
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')
  Nx = np.zeros(oview.shape, 'd')
  Ny = np.zeros(oview.shape, 'd')

  x[()] = np.nan
  y[()] = np.nan
  xx[()] = np.nan
  yy[()] = np.nan
  Nx[()] = np.nan
  Ny[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

    # Sum of weights (kludge to get masking right)
    Nx[outsl] = np.nansum([Nx[outsl], npnansum(1. + xdata * 0., ixaxes)], 0)

  for outsl, (ydata,) in loopover([Y], oview, pbar=pbar):
    ydata = ydata.astype('d')
    y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
    yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

    # Sum of weights (kludge to get masking right)
    Ny[outsl] = np.nansum([Ny[outsl], npnansum(1. + ydata * 0., iyaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2 / Nx) / (Nx - 1)
  yy = (yy - y**2 / Ny) / (Ny - 1)
  x /= Nx
  y /= Ny

  if Nx_fac is not None:
    eNx = Nx // Nx_fac
  else:
    eNx = Nx
  if Ny_fac is not None:
    eNy = Ny // Ny_fac
  else:
    eNy = Ny
  #print('average eff. Nx = %.1f, average eff. Ny = %.1f' % (eNx.mean(), eNy.mean()))

  d = x - y
  den = np.sqrt(xx / eNx + yy / eNy)
  df = (xx / eNx + yy / eNy)**2 / ((xx / eNx)**2 / (eNx - 1) + (yy / eNy)**2 / (eNy - 1))

  p = tdist.cdf(abs(d / den), df) * np.sign(d)
  ci = tdist.ppf(1. - alpha / 2, df) * den

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'
  if xn == yn:
    name = xn
  else:
    name = '%s-%s' % (xn, yn)

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    D = Var(oaxes, values=d, name=name)
    DF = Var(oaxes, values=df, name='df_%s' % name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([D, DF, P, CI])
  else:  # Degenerate case
    return d, df, p, ci
# }}}
def multiple_regress(Xs, Y, axes=None, pbar=None, N_fac=None, output='B,p'): # {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes common to the Xs and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'B,p'.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the
    form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed. Note that a
    constant term is not included by default. The following parameters can be
    returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero for each regressor
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    If the regression is computed over all axes so that the result is a
    scalar, the above are returned as a tuple of floats in the order
    specified by ``output``. Otherwise they are returned as :class:`Var`
    instances. The outputs 'B', 'p', and 'sb' will produce as many outputs as
    there are regressors.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in
  section 8.4.2; note this may not be the best way to determine if a given
  parameter is contributing a significant fraction to the explained variance
  of Y. The variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square
  root of the diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)^{-1}`
  in von Storch and Zwiers, respectively. The data is assumed to be normally
  distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes: ri_new.append(ia)
      else: raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr, Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl+(i,)] += npsum(xdata[i]*ydata, siaxes)
      for j in range(i+1):
        xx[outsl+(i,j)] += npsum(xdata[i]*xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intelligently?
  # certainly the python loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i,:,:] = np.linalg.inv(xx[i,:,:])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  for o in output:
    if o == 'B':
      if len(oaxes) == 0:
        ret.append(beta)
      else:
        ret.append([Var(oaxes, values=beta[...,i], name='beta_%s' % xns[i]) for i in range(Nr)])
    elif o == 'r':
      vary = (yy - y**2/N)
      R2 = 1 - (yy - vare) / vary
      if len(oaxes) == 0:
        ret.append(R2)
      else:
        ret.append(Var(oaxes, values=R2, name='R2'))
    elif o == 'p':
      ps = [tdist.cdf(np.abs(beta[...,i]/sigbeta[i]), N_eff-Nr) * np.sign(beta[...,i]) for i in range(Nr)]
      if len(oaxes) == 0:
        ret.append(ps)
      else:
        ret.append([Var(oaxes, values=ps[i], name='p_%s' % xns[i]) for i in range(Nr)])
    elif o == 'sb':
      if len(oaxes) == 0:
        ret.append(sigbeta)
      else:
        ret.append([Var(oaxes, values=sigbeta[i], name='sig_%s' % xns[i]) for i in range(Nr)])
    elif o == 'covb':
      from .axis import NonCoordinateAxis as nca
      cr1 = nca(values=list(range(Nr)), regressor1=[X.name for X in Xs], name='regressor1')
      cr2 = nca(values=list(range(Nr)), regressor2=[X.name for X in Xs], name='regressor2')
      sigmat = np.zeros(os2, 'd')
      for i in range(Nr):
        for j in range(Nr):
          #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
          sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
      ret.append(Var(oaxes + [cr1, cr2], values=sigmat, name='smat'))
    elif o == 'se':
      se = np.sqrt((yy - vare) / N_eff)
      if len(oaxes) == 0:
        ret.append(se)
      else:
        ret.append(Var(oaxes, values=se, name='sig_resid'))
    else:
      print('multiple_regress: unrecognized output "%s"' % o)

  return ret
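# Usage sketch for multiple_regress() (illustrative only; X1, X2 and Y are
# assumed Vars sharing a 'time' axis plus at least one other axis, so the
# results come back as Vars):
def _example_multiple_regress(X1, X2, Y):
  (b1, b2), (p1, p2) = multiple_regress([X1, X2], Y, axes=['time'], output='B,p')
  # One coefficient and one p-value per regressor. No intercept is fitted
  # unless a constant Var is passed explicitly as an additional regressor.
  return b1, b2, p1, p2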
def regress(X, Y, axes=None, pbar=None, N_fac=None, output='m,b,p'): # {{{
  r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes common to X and Y.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'm,b,p'.

  Returns
  =======
  results : list of :class:`Var` instances.
    The return values are specified by the ``output`` argument. A fit of the
    form :math:`Y = m X + b + \epsilon` is assumed, and the following
    parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': Probability of this fit if the true linear coefficient was zero
    * 'sm': Standard deviation of the linear coefficient
    * 'se': Standard deviation of the residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'sm' and 'se' (:math:`\hat{\sigma}_E/\sqrt{S_{XX}}` and
  :math:`\hat{\sigma}_E` in von Storch and Zwiers, respectively). The data is
  assumed to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  srcaxes = combine_axes([X, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      i = whichaxis(srcaxes, a)
      if i not in riaxes:
        raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' % (a, X.name, Y.name))
      ri_new.append(i)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = [srcaxes[i] for i in oiaxes]
  inaxes = oaxes + [srcaxes[i] for i in riaxes]
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (X.name, Y.name)

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  y = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  xy = np.zeros(oview.shape, 'd')
  yy = np.zeros(oview.shape, 'd')

  # Accumulate data
  for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
    xdata = xdata.astype('d')
    ydata = ydata.astype('d')
    x[outsl] += npsum(xdata, siaxes)
    y[outsl] += npsum(ydata, siaxes)
    xx[outsl] += npsum(xdata**2, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    xy[outsl] += npsum(xdata*ydata, siaxes)

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx -= x**2/N
  yy -= y**2/N
  xy -= (x*y)/N

  m = xy/xx
  b = (y - m*x)/float(N)

  if N_fac is None: N_eff = N
  else: N_eff = N // N_fac

  sige = (yy - m * xy) / (N_eff - 2.)
  sigm = np.sqrt(sige / xx)
  t = np.abs(m) / sigm
  # Use the effective number of degrees of freedom here as well, for
  # consistency with sige above.
  p = tdist.cdf(t, N_eff - 2) * np.sign(m)

  xn = X.name if X.name != '' else 'X'
  yn = Y.name if Y.name != '' else 'Y'

  from pygeode.var import Var
  output = output.split(',')
  ret = []

  if 'm' in output:
    M = Var(oaxes, values=m, name='%s vs. %s' % (yn, xn))
    ret.append(M)
  if 'b' in output:
    B = Var(oaxes, values=b, name='Intercept (%s vs. %s)' % (yn, xn))
    ret.append(B)
  if 'r' in output:
    ret.append(Var(oaxes, values=xy**2/(xx*yy), name='R2(%s vs. %s)' % (yn, xn)))
  if 'p' in output:
    P = Var(oaxes, values=p, name='P(%s vs. %s != 0)' % (yn, xn))
    ret.append(P)
  if 'sm' in output:
    ret.append(Var(oaxes, values=sigm, name='Sig. Slope (%s vs. %s)' % (yn, xn)))
  if 'se' in output:
    ret.append(Var(oaxes, values=np.sqrt(sige), name='Sig. Resid. (%s vs. %s)' % (yn, xn)))

  return ret
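# Usage sketch for regress() (illustrative only; X and Y are assumed Vars
# sharing a 'time' axis plus at least one other axis):
def _example_regress(X, Y):
  m, b, p = regress(X, Y, axes=['time'], output='m,b,p')
  # p is the signed one-sided t cdf, so a two-sided test at the 5% level
  # corresponds to abs(p[:]) > 0.975.
  return m, b, p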
def save (filename, in_dataset, version=3, pack=None, compress=False, cfmeta = True, unlimited=None): # {{{
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename,str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3,4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a,DummyAxis)]

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), "vars must have unique names: %s" % [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing entire array
  allf = {1:lib.nc_put_var_schar, 2:lib.nc_put_var_text, 3:lib.nc_put_var_short,
          4:lib.nc_put_var_int, 5:lib.nc_put_var_float, 6:lib.nc_put_var_double,
          7:lib.nc_put_var_uchar, 8:lib.nc_put_var_ushort, 9:lib.nc_put_var_uint,
          10:lib.nc_put_var_longlong, 11:lib.nc_put_var_ulonglong}

  # Functions for writing chunks
  chunkf = {1:lib.nc_put_vara_schar, 2:lib.nc_put_vara_text, 3:lib.nc_put_vara_short,
            4:lib.nc_put_vara_int, 5:lib.nc_put_vara_float, 6:lib.nc_put_vara_double,
            7:lib.nc_put_vara_uchar, 8:lib.nc_put_vara_ushort, 9:lib.nc_put_vara_uint,
            10:lib.nc_put_vara_longlong, 11:lib.nc_put_vara_ulonglong}

  # Create the file
  if version == 3:
    ret = lib.nc_create (filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create (filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception("unsupported netcdf version %s" % version)

  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i,a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    chunks = [None] * len(vars)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[list(axes).index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var (fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only supported for netcdf4)
      if compress:
        ret = lib.nc_def_var_deflate (fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes (fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      # modify axes to be netcdf friendly (CF-compliant, etc.)
      put_attributes (fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill (fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef (fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % filename)

    # number of actual variables (non-axes) for determining our progress
    N = len([v for v in vars if not isinstance(v,Axis)])

    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

      varpbar = pbar.subset(prog[i], prog[i+1])

      views = list(View(var.axes).loop_mem())
      for j,v in enumerate(views):
        vpbar = varpbar.part(j, len(views))
        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1,None)

        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long*var.naxes)(*start)
        count = (c_long*var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          data = data[start[0]:start[0]+count[0]]
          # the above gives us the *whole* axis,
          # but under extreme conditions we may be looping over smaller pieces
          vpbar.update(100)
        else:
          data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)
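# Usage sketch for save() (illustrative only; `ds` is an assumed pygeode
# Dataset and the filename is a placeholder):
def _example_save_netcdf(ds):
  # compress=True forces a netcdf-4 file; 'time' is written as the unlimited
  # (record) dimension so the file can later be appended along it.
  save('example_out.nc', ds, compress=True, unlimited='time')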
def plotvar (var, **kwargs): # {{{
  ''' plotvar(var, title, clevs, cmap, ax, ifig, hold)

  Produces a plot of the pygeode variable var. The routine can plot 1d or 2d
  data; degenerate axes (of length 1) are ignored; their value is displayed
  in the title of the plot. If the axes are longitude and latitude, the
  Basemap package is used to plot the variable on a map of the world. If one
  of the axes is a ZAxis, it is plotted on the y-axis, logarithmically if
  appropriate.

  keyword arguments:
    title: Title of the plot
    ax: A matplotlib axes object on which to produce the plot
    lblx: Show x-axis titles and labels
    lbly: Show y-axis titles and labels
    scaleAx: Scale values with coordinate value (for logarithmic axes only)
    colorbar: Show colorbar
    clevs: Filled contour levels; if None, no filled contours are plotted
    cmap: A colormap passed on to the contour pylab function
    clines: Outlined levels; if None, no contour lines are plotted
    perx: Roll values in x axis (appropriate for periodic axes)
    ifig: Index of the matplotlib figure on which to produce the plot
    hold: If True, don't clear the contents of the axis
    wait: If True, don't invoke the show() command (the plotting main loop is
      not called, so subsequent pygeode commands can be invoked)
  '''
  from matplotlib.pyplot import figure, show, ion, ioff, draw, cm, clf, isinteractive
  from numpy import ma
  from numpy import isnan, isinf, where
  from pygeode.progress import PBar
  from copy import copy

  # Get # of dimensions - can only do 1D or 2D
  nd = len([s for s in var.shape if s > 1])
  assert nd > 0, "the specified data has no dimensions. Nothing to plot!"
  assert nd == 1 or nd == 2, "can only plot 1D or 2D arrays. Try slicing along some dimensions."

  axes = var.axes
  ret = None

  # Create title if none has been specified
  title = kwargs.pop('title', None)
  if title is None:
    title = _buildvartitle(axes, var.name, **var.plotatts)

  pbar = kwargs.pop('pbar', True)
  if pbar is True:
    pbar = PBar(message='Loading plot values from %s:' % repr(var))
    values = var.get(pbar=pbar).squeeze()
  else:
    values = var.get().squeeze()

  # Mask out missing values (NaN)
  values = ma.masked_where(isnan(values), values)

  # Apply linear rescaling for plotting
  values = _scalevalues(values, **var.plotatts)

  # Scaling by coordinate value preserves integral for log-scaling
  scaleAx = kwargs.pop('scaleAx', False) # for line plots
  scaleX = kwargs.pop('scaleX', False)   # for surface plots
  scaleY = kwargs.pop('scaleY', False)   # for surface plots
  # Log scale for values (not axis)
  logVal = kwargs.pop('logVal', False)

  wasint = isinteractive()
  ioff()

  ax = kwargs.pop('ax', None)
  ifig = kwargs.pop('ifig', None)
  hold = kwargs.pop('hold', False)
  wait = kwargs.pop('wait', False)
  if ax is None:
    if ifig is None:
      fig = figure()
    else:
      fig = figure(ifig)
    if not hold: clf()
    ax = fig.add_subplot(111)
  else:
    fig = ax.figure

  if not hold and title:
    ax.set_title(title)

  # 1D case:
  if nd == 1:
    from pygeode.axis import ZAxis, Pres, Hybrid
    xaxis = [copy(a) for a in axes if len(a) > 1][0]

    # adjust axis scaling
    xaxis.values = xaxis.values*xaxis.plotatts.get('scalefactor',1) + xaxis.plotatts.get('offset',0)

    # Scaling by coordinate value preserves integral for log-scaling
    if (scaleAx and xaxis.plotatts.get('plotscale', 'linear') == 'log'
        and var.plotatts.get('preserve', 'value') == 'area'):
      values = values * xaxis.values

    # Vertical?
    if isinstance(xaxis, ZAxis):
      lblx = kwargs.pop('lblx', False) # preserve previous behaviour
      lbly = kwargs.pop('lbly', True)

      ax.plot(values, xaxis.values, **kwargs)

      if logVal or var.plotatts.get('plotscale', 'linear') == 'log':
        ax.set_xscale('log')    # value axis
      else:
        ax.set_xscale('linear') # value axis
      ax.set_yscale(xaxis.plotatts.get('plotscale', 'linear')) # coordinate

      ylims = min(xaxis.values), max(xaxis.values)
      ax.set_ylim(ylims[::xaxis.plotatts['plotorder']])

      # coordinate axis
      ax.yaxis.set_major_formatter(xaxis.formatter())
      if lbly:
        loc = xaxis.locator()
        if loc is not None: ax.yaxis.set_major_locator(loc)
        ax.set_ylabel(_buildaxistitle(**xaxis.plotatts))
      # value axis
      if lblx:
        ax.set_xlabel(_buildaxistitle(name=var.name, **var.plotatts))
    else:
      lblx = kwargs.pop('lblx', True)
      lbly = kwargs.pop('lbly', False) # preserve previous behaviour

      ax.plot(xaxis.values, values, **kwargs)

      if logVal or var.plotatts.get('plotscale', 'linear') == 'log':
        ax.set_yscale('log')    # value axis
      else:
        ax.set_yscale('linear') # value axis
      ax.set_xscale(xaxis.plotatts['plotscale']) # coordinate

      xlims = min(xaxis.values), max(xaxis.values)
      ax.set_xlim(xlims[::xaxis.plotatts['plotorder']])

      # coordinate axis
      ax.xaxis.set_major_formatter(xaxis.formatter())
      if lblx:
        loc = xaxis.locator()
        if loc is not None: ax.xaxis.set_major_locator(loc)
        ax.set_xlabel(_buildaxistitle(**xaxis.plotatts))
      # value axis
      if lbly:
        ax.set_ylabel(_buildaxistitle(name=var.name, **var.plotatts))

  # 2D case:
  elif nd == 2:
    from numpy import meshgrid, concatenate, log10
    from matplotlib.pyplot import contourf, colorbar, xlim, ylim, xlabel, ylabel, gca
    from pygeode.axis import Lat, Lon, ZAxis, Pres, Hybrid, SpectralM, SpectralN

    # Patch for some versions of matplotlib, which leave gaps between polygons
    kwargs.setdefault('antialiased', False)

    yaxis, xaxis = [copy(a) for a in axes if len(a) > 1]

    # adjust x-axis scaling
    xaxis.values = xaxis.values*xaxis.plotatts.get('scalefactor',1) + xaxis.plotatts.get('offset',0)
    # adjust y-axis scaling
    yaxis.values = yaxis.values*yaxis.plotatts.get('scalefactor',1) + yaxis.plotatts.get('offset',0)

    # Transpose vertical axis?
    if isinstance(xaxis, ZAxis):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis
    if isinstance(xaxis, SpectralN) and isinstance(yaxis, SpectralM):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis
    if isinstance(xaxis, Lat) and isinstance(yaxis, Lon):
      values = values.transpose()
      xaxis, yaxis = yaxis, xaxis

    perx = kwargs.pop('perx', False)
    if perx:
      # wrap the x axis around by one (assumes uniform spacing)
      xvals = concatenate([xaxis.values, [xaxis.values[-1] + (xaxis.values[1] - xaxis.values[0])]])
      yvals = yaxis.values
      meshx, meshy = meshgrid (xvals, yvals)
    else:
      xvals = xaxis.values
      yvals = yaxis.values
      meshx, meshy = meshgrid (xvals, yvals)

    # Scaling by coordinate value preserves integral for log-scaling
    if (scaleX and xaxis.plotatts.get('plotscale', 'linear') == 'log'
        and var.plotatts.get('preserve', 'value') == 'area'):
      values = values * meshx
    if (scaleY and yaxis.plotatts.get('plotscale', 'linear') == 'log'
        and var.plotatts.get('preserve', 'value') == 'area'):
      values = values * meshy

    # scaling of field values
    if logVal:
      values = log10(values)

    clevs = kwargs.pop('clevs', 21)
    clines = kwargs.pop('clines', None)
    cbar = kwargs.pop('colorbar', {'orientation':'vertical'})
    pcolor = kwargs.pop('pcolor', False)
    mask = kwargs.pop('mask', None)
    if mask is not None:
      values = ma.masked_where(mask(values), values)

    if perx:
      # wrap the first column around to close the periodic x axis
      # (the original neglected to assign the result and concatenated
      # along the wrong axis)
      values = concatenate([values, values[:, 0:1]], axis=1)

    # Map?
    Basemap = None
    if kwargs.pop('map', True):
      # New toolkit path
      try:
        from mpl_toolkits.basemap import Basemap
      except ImportError: pass
      # Old toolkit path
      if Basemap is None:
        try:
          from matplotlib.toolkits.basemap import Basemap
        except ImportError: pass

    if isinstance(xaxis, Lon) and isinstance(yaxis, Lat) and Basemap is not None:
      from numpy import arange
      # pop some arguments related to projection grid labelling
      projargs = dict(kwargs.pop('projection', {}))
      # meridians (lines of constant longitude) to draw
      meridians = projargs.pop('meridians', [-180,-90,0,90,180,270,360])
      # parallels (lines of constant latitude) to draw
      parallels = projargs.pop('parallels', [-90,-60,-30,0,30,60,90])
      # show labels for meridians and parallels in given location
      # labels[0]: left, labels[1]: right, labels[2]: top, labels[3]: bottom
      labels = projargs.pop('labels', [1,0,0,1])
      # default axes boundaries
      bnds = {'llcrnrlat':yvals.min(), 'urcrnrlat':yvals.max(),
              'llcrnrlon':xvals.min(), 'urcrnrlon':xvals.max()}
      # default projection
      proj = {'projection':'cyl', 'resolution':'l'}
      # read projection arguments
      proj.update(projargs)
      if proj['projection'] in ['cyl', 'merc', 'mill', 'gall']:
        bnds.update(proj)
        proj.update(bnds)

      # construct projection axis
      m = Basemap(ax=ax, **proj)
      m.drawcoastlines(ax=ax)
      # draw meridians and parallels (using arguments from above)
      m.drawmeridians(meridians, labels=labels, ax=ax)
      m.drawparallels(parallels, labels=labels, ax=ax)
      m.drawmapboundary()

      # Transform mesh
      px, py = m(meshx, meshy)

      cont = None
      # Colour individual grid boxes? (no contours)
      if pcolor:
        clevs = None # can't have both
        cont = m.pcolor(px, py, values, **kwargs)
        ret = cont
      # Filled contours?
      if clevs is not None:
        cont = m.contourf(px, py, values, clevs, **kwargs)
        ret = cont
      # Colour bar?
      if cbar and cont is not None:
        fig.colorbar(cont, ax=ax, **cbar)
      # Contour lines?
      if clines is not None:
        ret = m.contour(px, py, values, clines, colors='k')
    else:
      cont = None
      # Colour individual grid boxes? (no contours)
      if pcolor:
        clevs = None # can't have both
        cont = ax.pcolor(meshx, meshy, values, **kwargs)
        ret = cont
      # Filled contours?
      if clevs is not None:
        cont = ax.contourf(meshx, meshy, values, clevs, **kwargs)
        ret = cont
      # Colour bar?
      if cbar and cont is not None:
        fig.colorbar(cont, ax=ax, **cbar)
      # Contour lines?
      if clines is not None:
        ret = ax.contour(meshx, meshy, values, clines, colors='k')

    # Disable autoscale. Otherwise, if we set a log scale below, then
    # the range of our axes will get screwed up.
    # (This is a 'feature' of matplotlib!)
    # http://www.mail-archive.com/[email protected]/msg10527.html
    ax.set_autoscale_on(False)

    # Set the axis limits
    ax.set_xscale(xaxis.plotatts['plotscale'])
    xlims = min(xvals), max(xvals)
    ax.set_xlim(xlims[::xaxis.plotatts['plotorder']])

    ax.set_yscale(yaxis.plotatts['plotscale'])
    ylims = min(yaxis.values), max(yaxis.values)
    ax.set_ylim(ylims[::yaxis.plotatts['plotorder']])

    # Set x and y labels and formatters
    if kwargs.pop('lblx', True):
      ax.set_xlabel(_buildaxistitle(**xaxis.plotatts))
      ax.xaxis.set_major_formatter(xaxis.formatter())
      loc = xaxis.locator()
      if loc is not None: ax.xaxis.set_major_locator(loc)
    else:
      ax.set_xticklabels('')

    if kwargs.pop('lbly', True):
      ax.set_ylabel(_buildaxistitle(**yaxis.plotatts))
      ax.yaxis.set_major_formatter(yaxis.formatter())
      loc = yaxis.locator()
      if loc is not None: ax.yaxis.set_major_locator(loc)
    else:
      ax.set_yticklabels('')

  if wasint: ion()
  draw()
  if not wait: show()

  if ret is not None: return ret
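# Usage sketch for plotvar() (illustrative only; `T` is an assumed Var on
# (lat, lon) axes):
def _example_plotvar(T):
  # Filled contours on the default cylindrical map with a horizontal
  # colorbar; wait=True skips show() so subsequent commands still run.
  plotvar(T, clevs=21, colorbar={'orientation':'horizontal'}, wait=True)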
def save(filename, var, iaxis=None, fps=15, palette='bw', minmax=None):
  from pygeode.axis import TAxis
  from pygeode.var import Var
  from pygeode.progress import PBar
  import tempfile, shutil
  from PIL import Image
  import numpy as np
  import os

  assert isinstance(var, Var)

  # Remove any degenerate dimensions, make sure the axes are in a consistent order
  var = var.squeeze().sorted()
  assert var.naxes == 3, "can only work with 3D data"

  if iaxis is None:
    iaxis = var.whichaxis(TAxis)
  assert iaxis >= 0, "no time axis found"

  tmpdir = tempfile.mkdtemp(prefix='pygeode_mpeg')

  sl = [slice(None)] * 3

  # Get max & min values of the whole dataset
  if minmax is None:
    #TODO: calculate both of these at once, with a progress bar to help the process
    vmin = float(var.min())
    vmax = float(var.max())
  else:
    assert len(minmax) == 2, "invalid minmax argument"
    vmin, vmax = minmax

  print("Saving %s:" % filename)
  pbar = PBar()

  # Loop over each timestep, generate a temporary image file
  for i in range(len(var.axes[iaxis])):
    fpbar = pbar.part(i, len(var.axes[iaxis]))
    sl[iaxis] = i
    # Get data, flip y axis, add an 'RGB' axis
    data = var[sl].squeeze()[::-1, :, np.newaxis]
    data = (data - vmin) / (vmax - vmin) * 255

    if palette == 'bw':
      # Same data for R, G, and B channels
      data = np.concatenate([data, data, data], axis=2)
    elif palette == 'rainbow':
      # Piecewise linear palette
      part1 = data <= 85
      part2 = (85 < data) & (data <= 170)
      part3 = 170 < data
      b = np.zeros(data.shape)
      b[part1] = 255
      b[part2] = 255 - (data[part2] - 85) * 3
      g = np.zeros(data.shape)
      g[part1] = data[part1] * 3
      g[part2] = 255
      g[part3] = 255 - (data[part3] - 170) * 3
      r = np.zeros(data.shape)
      r[part2] = (data[part2] - 85) * 3
      r[part3] = 255
      data = np.concatenate([r, g, b], axis=2)

    # Encode as an 8-bit array
    data = np.asarray(np.round(data), 'uint8')

    # Save the frame
    framefile = tmpdir + "/frame%04d.jpg" % i
    Image.fromarray(data, "RGB").save(framefile, quality=95)
    fpbar.update(100)

  shape = list(var.shape)
  shape = shape[:iaxis] + shape[iaxis + 1:]
  h, w = shape

  # Make the movie file
  os.system("mencoder mf://%s/*.jpg -mf w=%s:h=%s:type=jpg:fps=%s \
    -ovc lavc -lavcopts vcodec=mpeg4:vbitrate=8000 -oac copy \
    -o %s" % (tmpdir, w, h, fps, filename))

  # Clean up files
  shutil.rmtree(tmpdir)
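# Usage sketch for the animation save() above (illustrative only; `T` is an
# assumed Var with (time, lat, lon) axes; mencoder must be on the PATH):
def _example_save_movie(T):
  # Fixing minmax avoids two extra passes over the data to find the range,
  # and keeps the colour scale constant from frame to frame.
  save('example_anim.avi', T, fps=10, palette='rainbow', minmax=(200., 320.))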
def check_dataset (dataset):
  from pygeode.view import View
  from pygeode.tools import combine_axes
  from pygeode.progress import PBar
  from pygeode.dataset import asdataset
  import numpy as np

  # Make sure we have a dataset (in case we're sent a simple list of vars)
  dataset = asdataset(dataset)

  vars = list(dataset.vars)

  # Include axes in the list of vars (to check these values too)
  axes = combine_axes(v.axes for v in vars)
  vars.extend(axes)

  # Relative progress of each variable
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Checking %s for I/O errors:" % repr(dataset))

  failed_indices = {}
  error_messages = {}

  # Loop over the data
  for i,var in enumerate(vars):
    varpbar = pbar.subset(prog[i], prog[i+1])
    # Scan the outer axis (record axis?) for failures.
    N = var.shape[0]
    failed_indices[var.name] = []
    error_messages[var.name] = []
    for j in range(N):
      vpbar = varpbar.part(j, N)
      try:
        # Try fetching the data, see if something fails
        var[j] if var.naxes == 1 else var[j,...]
      except Exception as e:
        failed_indices[var.name].append(j)
        error_messages[var.name].append(str(e))
      vpbar.update(100)

  # Print summary information for each variable
  everything_ok = True
  for var in vars:
    indices = failed_indices[var.name]
    messages = error_messages[var.name]
    if len(indices) == 0: continue
    everything_ok = False
    print("\nFailures encountered with variable '%s':" % var.name)
    # Group together record indices that give the same error message
    unique_messages = []
    aggregated_indices = []
    for ind,msg in zip(indices,messages):
      if len(unique_messages) == 0 or msg != unique_messages[-1]:
        unique_messages.append(msg)
        aggregated_indices.append([ind])
      else:
        aggregated_indices[-1].append(ind)
    # Print each error message encountered (and the record indices that give the error)
    for ind,msg in zip(aggregated_indices,unique_messages):
      # Group together consecutive records (instead of printing each record separately)
      groups = []
      for i in ind:
        if len(groups) == 0 or i-1 not in groups[-1]:
          groups.append([i])
        else:
          groups[-1].append(i)
      for g in groups:
        print("=> at %s:\n %s" % (var.axes[0].slice[g[0]:g[-1]+1], msg))

  if not everything_ok:
    raise Exception("Problem encountered with the dataset.")
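# Usage sketch for check_dataset() (illustrative only; `ds` is an assumed
# Dataset, e.g. one opened from a batch of netcdf files):
def _example_check(ds):
  # Prints a per-variable summary of any records that fail to load, then
  # raises if anything failed; returns quietly when the dataset is clean.
  check_dataset(ds)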
def isnonzero(X, axes, alpha=0.05, N_fac=None, pbar=None): # {{{
  r'''Computes the mean value and statistics of X, against the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list
    Axes over which to compute the mean.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple or :class:`Dataset` instance.
    Three quantities are computed:

    * The mean value of X
    * The probability of the computed value if the population mean was zero
    * The confidence interval of the mean at the level specified by alpha

    If the average is taken over all axes of X resulting in a scalar, the
    above values are returned as a tuple in the order given. If not, the
    results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in
  :meth:`difference`. The p-value and confidence interval are computed for
  the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  riaxes = [X.whichaxis(n) for n in axes]
  raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
  oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
  oview = View(oaxes)

  N = np.prod([len(X.axes[i]) for i in riaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert N > 1, '%s has only one element along the reduction axes' % X.name

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  Na = np.zeros(oview.shape, 'd')
  x[()] = np.nan
  xx[()] = np.nan
  Na[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Na[outsl] = np.nansum([Na[outsl], npnansum(1. + xdata*0., riaxes)], 0)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx = (xx - x**2/Na) / (Na - 1)
  x /= Na

  if N_fac is not None:
    eN = N//N_fac
    eNa = Na//N_fac
  else:
    eN = N
    eNa = Na
  # print('eff. N = %.1f' % eN)

  sdom = np.sqrt(xx/eNa)

  p = tdist.cdf(abs(x/sdom), eNa - 1)*np.sign(x)
  ci = tdist.ppf(1. - alpha/2, eNa - 1) * sdom

  name = X.name if X.name != '' else 'X'

  if len(oaxes) > 0:
    from pygeode import Var, Dataset
    X = Var(oaxes, values=x, name=name)
    P = Var(oaxes, values=p, name='p_%s' % name)
    CI = Var(oaxes, values=ci, name='CI_%s' % name)
    return Dataset([X, P, CI])
  else: # Degenerate case
    return x, p, ci
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None): # {{{
  r'''Computes the mean value of X and statistics relevant for a test against
  the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean
    is computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section. The specifications must be separated by a
    comma. Defaults to 'm,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The names of the variables match the output request string (i.e. if
    ``ds`` is the returned dataset, the mean value can be obtained through
    ``ds.m``). The following quantities can be calculated.

    * 'm': The mean value of X
    * 'p': The probability of the computed value if the population mean was zero
    * 'ci': The confidence interval of the mean at the level specified by alpha

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in
  :meth:`difference`. The p-value and confidence interval are computed for
  the t-statistic defined in eq (6.61) of von Storch and Zwiers 1999.'''

  from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
  from pygeode.view import View

  # Default to averaging over all axes (the docstring promises this, but the
  # original code assumed `axes` was always provided)
  if axes is None:
    riaxes = list(range(len(X.axes)))
  else:
    riaxes = [X.whichaxis(n) for n in axes]
  raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
  oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
  oview = View(oaxes)

  N = np.prod([len(X.axes[i]) for i in riaxes])

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert N > 1, '%s has only one element along the reduction axes' % X.name

  # Construct work arrays
  x = np.zeros(oview.shape, 'd')
  xx = np.zeros(oview.shape, 'd')
  Na = np.zeros(oview.shape, 'd')
  x[()] = np.nan
  xx[()] = np.nan
  Na[()] = np.nan

  # Accumulate data
  for outsl, (xdata,) in loopover([X], oview, pbar=pbar):
    xdata = xdata.astype('d')
    x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
    xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)
    # Sum of weights (kludge to get masking right)
    Na[outsl] = np.nansum([Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

  imsk = (Na > 0.)

  # remove the mean (NOTE: numerically unstable if mean >> stdev)
  xx[imsk] -= x[imsk]**2 / Na[imsk]
  xx[imsk] = xx[imsk] / (Na[imsk] - 1)
  x[imsk] /= Na[imsk]

  if N_fac is not None:
    eN = N // N_fac
    eNa = Na // N_fac
  else:
    eN = N
    eNa = Na

  sdom = np.zeros(oview.shape, 'd')
  p = np.zeros(oview.shape, 'd')
  t = np.zeros(oview.shape, 'd')
  ci = np.zeros(oview.shape, 'd')

  sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
  dmsk = (sdom > 0.)

  t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
  p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
  ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

  name = X.name if X.name != '' else 'X'

  from pygeode.var import Var
  from pygeode.dataset import asdataset

  rvs = []

  if 'm' in output:
    m = Var(oaxes, values=x, name='m')
    m.atts['longname'] = 'Mean value of %s' % (name,)
    rvs.append(m)

  if 'p' in output:
    p = Var(oaxes, values=p, name='p')
    p.atts['longname'] = 'p-value for the test that %s is 0' % (name,)
    rvs.append(p)

  if 'ci' in output:
    ci = Var(oaxes, values=ci, name='ci')
    ci.atts['longname'] = 'Confidence interval of the mean value of %s' % (name,)
    rvs.append(ci)

  return asdataset(rvs)
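# Usage sketch for isnonzero() (illustrative only; `d` is an assumed Var of
# paired differences with a 'time' axis):
def _example_isnonzero(d):
  ds = isnonzero(d, axes=['time'], alpha=0.05, output='m,p,ci')
  # ds.m is the mean; it differs from zero at the 95% level wherever
  # ds.p[:] < 0.05, and ds.ci gives the half-width of the confidence interval.
  return ds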