def tidy_axes(dataset, unlimited=None):
# {{{
  from pygeode.tools import combine_axes
  from pygeode.axis import DummyAxis
  from pygeode.dataset import asdataset

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  # Look at the original dataset to check the original type of the axes
  # (because finalize_save may force everything to be NamedAxis).
  vars = vars + [a for a in axes if not isinstance(dataset[a.name], DummyAxis)]

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), \
    "vars must have unique names: %s" % [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  return asdataset(vars)
# }}}
def serve (path, dataset, port=8080):
  from pygeode.server.web import MyServer_threaded2
  import threading
  from pygeode.dataset import asdataset

  # Remove extra /'s
  while '//' in path: path = path.replace('//','/')
  # Remove any leading and trailing /
  if path.startswith('/'): path = path[1:]
  if path.endswith('/'): path = path[:-1]
  # Break up into directories and 'file' name
  parts = path.split('/')
  dirnames = parts[:-1]
  fname = parts[-1]

  # Check if we have a server available already
  if port not in SERVERS.serverdict:
    # Make a new server, with an empty root directory
    root = DAP_Dir()
    server = MyServer_threaded2(port, root)
    threading.Thread(target=server.serve_forever).start()
    print("Started an OPeNDAP server listening on port %s"%port)
    SERVERS.serverdict[port] = server
  else:
    server = SERVERS.serverdict[port]

  # Get the working directory
  cwd = server.root_handler

  # Make sure the full path is available
  for dname in dirnames:
    if dname not in cwd.nodes:
      cwd.nodes[dname] = DAP_Dir()
    cwd = cwd.nodes[dname]
    assert hasattr(cwd,'nodes'), "'%s' is already defined as something other than a directory?"%dname

  # Share the file
  assert fname not in cwd.nodes, "'%s' is already being served"%path
  cwd.nodes[fname] = DAPHandler(asdataset(dataset))
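# Example usage -- a minimal sketch, not part of the original module.  The
# path 'demo/T' is hypothetical; it assumes the module's server machinery
# (SERVERS, DAP_Dir, DAPHandler) is available as defined above.
def _example_serve():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  T = Var((Lat(np.arange(-90, 91, 30)),), values=np.random.randn(7), name='T')
  # After this call the data is browsable by any OPeNDAP client at
  # http://localhost:8080/demo/T
  serve('demo/T', T, port=8080)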
def make_dataset (ncfile):
# {{{
  from pygeode.dataset import asdataset
  # Construct all the variables, put in a list
  vars = list(map(make_var, list(ncfile.variables.values())))
  # Construct a dataset from these Vars
  dataset = asdataset(vars)
  dataset.atts = make_atts(ncfile)
  return dataset
# }}}
def finalize_save(dataset, cfmeta = True, pack = None):
# {{{
  from pygeode.formats import cfmeta as cf
  from pygeode.dataset import asdataset

  # Only pack if pack is true
  if pack:
    if hasattr(pack, '__len__'):  # Assume this is a list of variables to pack
      vars = [PackVar(v) if v.name in pack else v for v in dataset.vars]
    else:
      vars = [PackVar(v) for v in dataset.vars]
    dset = asdataset(vars)
    dset.atts = dataset.atts.copy()
  else:
    dset = dataset

  # Encode standard axes back into netcdf metadata?
  if cfmeta is True:
    return cf.encode_cf(dset)
  else:
    return asdataset(dset)
# }}}
def ensemble(*varlists):
  """
  Creates an ensemble out of a set of similar variables.
  The corresponding variables must have the same axes and the same names.
  If a set of Vars is passed as input, a single ensemble Var is returned.
  If a set of Datasets is passed as input, a single Dataset is returned,
  consisting of an ensemble of each of the internal Vars.  Each input
  Dataset must have matching Vars.
  """
  from pygeode.var import Var
  from pygeode.dataset import Dataset, asdataset
  from pygeode.tools import common_dict

  datasets = [asdataset(v) for v in varlists]
  varnames = [v.name for v in datasets[0].vars]

  # Make sure we have the same varnames in each dataset
  for dataset in datasets:
    assert set(dataset.vardict.keys()) == set(varnames), "inconsistent variable names between datasets"

  # Make sure the varlists are all in the same order
  for i, dataset in enumerate(datasets):
    varlist = [dataset[varname] for varname in varnames]
    datasets[i] = Dataset(varlist, atts=dataset.atts)

  for varname in varnames:
    var0 = datasets[0][varname]
    for dataset in datasets:
      var = dataset[varname]
      # Make sure the axes are the same between ensemble vars
      assert var.axes == var0.axes, "inconsistent axes for %s" % varname

  # Collect the ensembles together
  ensembles = []
  for varname in varnames:
    ensemble = EnsembleVar([dataset[varname] for dataset in datasets])
    ensembles.append(ensemble)

  # Global attributes
  atts = common_dict(dataset.atts for dataset in datasets)

  if isinstance(varlists[0], Dataset):
    return Dataset(ensembles, atts=atts)
  if isinstance(varlists[0], Var):
    assert len(ensembles) == 1
    return ensembles[0]
  return ensembles
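# Example usage -- a minimal sketch, not part of the original module.  Three
# hypothetical realizations of the same field are combined along a new
# ensemble axis.
def _example_ensemble():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  lat = Lat(np.arange(-90, 91, 30))
  runs = [Var((lat,), values=np.random.randn(7), name='T') for _ in range(3)]
  T_ens = ensemble(*runs)   # a single Var with a new ensemble axis prepended
  print(T_ens)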
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist:
    assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy()  # passed on to Axis constructor
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates: continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"

      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
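# Example usage -- a minimal sketch, not part of the original module.  A
# variable on a latitude axis is encoded onto a generic NamedAxis whose
# attributes carry the CF metadata (standard_name, units, axis).
def _example_encode_cf():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  lat = Lat(np.arange(-90, 91, 30))
  T = Var((lat,), values=np.random.randn(7), name='T')
  cf = encode_cf(T)
  # Expect something like {'standard_name': 'latitude', 'units': 'degrees_north', 'axis': 'Y'}
  print(cf.axisdict['lat'].atts)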
def to_xarray(dataset):
  """
  Converts a PyGeode Dataset into an xarray Dataset.

  Parameters
  ----------
  dataset : pygeode.Dataset
    The dataset to be converted.

  Returns
  -------
  out : xarray.Dataset
    An object which can be used with the xarray package.
  """
  from pygeode.dataset import asdataset
  from pygeode.formats.cfmeta import encode_cf
  from pygeode.view import View
  from dask.base import tokenize
  import dask.array as da
  import xarray as xr
  dataset = asdataset(dataset)
  # Encode the axes/variables with CF metadata.
  dataset = encode_cf(dataset)
  out = dict()
  # Loop over each axis and variable.
  for var in list(dataset.axes) + list(dataset.vars):
    # Generate a unique name to identify it with dask.
    name = var.name + "-" + tokenize(var)
    dsk = dict()
    dims = [a.name for a in var.axes]

    # Special case: already have the values in memory.
    if hasattr(var, 'values'):
      out[var.name] = xr.DataArray(var.values, dims=dims, attrs=var.atts, name=var.name)
      continue

    # Keep track of all the slices that were made over each dimension.
    # This information will be used to determine the "chunking" that was done
    # on the variable from inview.loop_mem().
    slice_order = [[] for a in var.axes]
    chunks = []
    # Break up the variable into portions that are small enough to fit
    # in memory.  These will become the "chunks" for dask.
    inview = View(var.axes)
    for outview in inview.loop_mem():
      integer_indices = list(map(tuple, outview.integer_indices))
      # Determine *how* loop_mem is splitting the axes, and define the chunk
      # sizes accordingly.
      # A little indirect, but loop_mem doesn't make its chunking choices
      # available to the caller.
      for o, sl in zip(slice_order, integer_indices):
        if sl not in o:
          o.append(sl)
      ind = [o.index(sl) for o, sl in zip(slice_order, integer_indices)]
      # Add this chunk to the dask array.
      key = tuple([name] + ind)
      dsk[key] = (var.getview, outview, False)
    # Construct the dask array.
    chunks = [list(map(len, sl)) for sl in slice_order]
    arr = da.Array(dsk, name, chunks, dtype=var.dtype)
    # Wrap this into an xarray.DataArray (with metadata and named axes).
    out[var.name] = xr.DataArray(arr, dims=dims, attrs=var.atts, name=var.name)
  # Build the final xarray.Dataset.
  out = xr.Dataset(out, attrs=dataset.atts)
  # Re-decode the CF metadata on the xarray side.
  out = xr.conventions.decode_cf(out)
  return out
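# Example usage -- a minimal sketch, not part of the original module.
# Variables whose values are already in memory (as here) are wrapped
# directly; file-backed variables become lazy dask arrays, read on demand.
def _example_to_xarray():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  lat = Lat(np.arange(-90, 91, 30))
  T = Var((lat,), values=np.random.randn(7), name='T')
  xds = to_xarray(T)
  print(xds['T'].mean(dim='lat').values)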
def open(filename, value_override={}, dimtypes={}, namemap={}, varlist=[], cfmeta=True):
  from numpy import empty
  from ctypes import c_long, byref
  from pygeode.axis import DummyAxis
  from pygeode.dataset import asdataset
  from pygeode.formats import finalize_open

  f = HDF4_File(filename)

  num_datasets = c_long()
  num_global_attrs = c_long()
  ret = lib.SDfileinfo(f.sd_id, byref(num_datasets), byref(num_global_attrs))
  assert ret == 0
  num_datasets = num_datasets.value
  num_global_attrs = num_global_attrs.value

  global_atts = get_attributes(f.sd_id, num_global_attrs)

  # Get the HDF vars
  SD_arr = [None] * num_datasets
  for i in range(num_datasets):
    SD_arr[i] = HDF4_SD(f, i)

  # If there are 2 vars of the name XXXX and XXXX:EOSGRID, then
  # ignore the first one and use the latter one.
  # (Based on some GMAO files from the IPY dataset)
  SD_arr = [sd for sd in SD_arr if sd.name.endswith(':EOSGRID')
            or not any(sd2.name == sd.name + ':EOSGRID' for sd2 in SD_arr)]

  # Find any 'axes'
  # (look for unique 1D vars which contain a particular dimension id)
  sd_1d = [sd for sd in SD_arr if sd.rank == 1]
  # Determine which dimensions map to a unique 1D array
  dimids = [sd.dimids[0] for sd in sd_1d]
  dimsds = [s for s in sd_1d if dimids.count(s.dimids[0]) == 1 or s.iscoord == 1]

  # Load axis values
  for s in dimsds:
    s.values = empty(s.shape, numpy_type[s.type])
    load_values(s.sds_id, [0], s.shape, s.values)

  # Create axis objects
  from pygeode.axis import NamedAxis
  axes = [None] * len(dimsds)
  for i, s in enumerate(dimsds):
    # Append attributes for the axis
    atts = get_attributes(s.sds_id, s.natts)
    axes[i] = NamedAxis(s.values, s.name, atts=atts)

  # Reference axes by dimension ids
  axis_lookup = {}
  for i, a in enumerate(axes):
    axis_lookup[dimids[i]] = a

  # Add dummy axes for dimensions without coordinate info.
  for s in SD_arr:
    for d in s.dimids:
      if d not in axis_lookup:
        dimname, dimsize, dimtype, dim_natts = get_dim_info(d)
        axis_lookup[d] = DummyAxis(dimsize, dimname)

  # Create var objects
  vars = [None] * len(SD_arr)
  for i, s in enumerate(SD_arr):
    axes = [axis_lookup[d] for d in s.dimids]
    vars[i] = HDF4_Var(s, axes)
  vars = [v for v in vars if v.sd not in dimsds]

  # Return a dataset
  d = asdataset(vars)
  d.atts = global_atts
  return finalize_open(d, dimtypes, namemap, varlist, cfmeta)
def open(filename, value_override={}, dimtypes={}, namemap={}, varlist=[], cfmeta=True):
# {{{
  ''' open (filename, [value_override = {}, dimtypes = {}, namemap = {}, varlist = [] ])

  Returns a Dataset of PyGeode variables contained in the specified files. The
  axes of the variables are created from the dimensions of the NetCDF file.
  NetCDF variables in the file that do not correspond to dimensions are
  imported as PyGeode variables.

  filename - NetCDF file to open
  value_override - an optional dictionary with replacement values for one or
            more variables.  The only known use for this dictionary is to
            avoid loading in values from a severely scattered variable (such
            as a 'time' axis or other slowest-varying dimension).
  dimtypes - a dictionary mapping dimension names to axis classes.  The keys
            should be axis names as defined in the NetCDF file; values should
            be one of:
            1) an axis instance,
            2) an axis class, or
            3) a tuple of an axis class and a dictionary with keyword
               arguments to pass to that axis' constructor
            If no dictionary is included, an attempt is made to automatically
            identify the axis types.
  namemap - an optional dictionary to map NetCDF variable names (keys) to
            PyGeode variable names (values); also works for axes/dimensions
  varlist - a list containing the variables that should be loaded into the
            data set (if the list is empty, all NetCDF variables will be
            loaded)
  Note: The identifiers used in varlist and dimtypes are the original names
        used in the NetCDF file, not the names given in namemap.'''
  from os.path import exists
  from ctypes import c_int, byref
  from pygeode.dataset import asdataset
  from pygeode.formats import finalize_open
  from pygeode.axis import Axis

  if not filename.startswith('http://'):
    assert exists(filename), 'File open failed. "%s" does not exist.' % filename

  # Read variable dimensions and metadata from the file
  f = NCFile(filename)
  f.open()
  try:
    fileid = f.fileid

    # Get number of variables
    nvars = c_int()
    ret = lib.nc_inq_nvars(fileid, byref(nvars))
    assert ret == 0, lib.nc_strerror(ret)
    nvars = nvars.value

    # Construct all the variables, put in a list
    vars = [NCVar(f, i) for i in range(nvars)]

    # Construct a dataset from these Vars
    dataset = asdataset(vars)
    dataset.atts = get_attributes(fileid, -1)
  finally:
    f.close()

  # Add the object stuff from dimtypes to value_override, so we don't trigger
  # a load operation on those dims.
  # (We could use any values here, since they'll be overridden again later,
  # but we might as well use something relevant.)
  value_override = dict(value_override)  # don't use the default (static) empty dict
  for k, v in list(dimtypes.items()):
    if isinstance(v, Axis):
      value_override[k] = v.values

  #### Filters to apply to the data ####

  # Override values from the source?
  if len(value_override) > 0:
    dataset = override_values(dataset, value_override)

  # Set up the proper axes (get coordinate values / metadata from a 1D
  # variable with the same name as the dimension)
  dataset = dims2axes(dataset)

  return finalize_open(dataset, dimtypes, namemap, varlist, cfmeta)
# }}}
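# Example usage -- a minimal sketch, not part of the original module.  The
# file name and the dimension/variable names below are hypothetical.
def _example_open():
  from pygeode.axis import Pres
  # Force the 'plev' dimension onto a pressure axis, and rename 't2m' -> 'T2M'.
  ds = open('model_output.nc', dimtypes={'plev': Pres}, namemap={'t2m': 'T2M'})
  print(ds)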
def save(filename, in_dataset, version=3, pack=None, compress=False, cfmeta=True, unlimited=None):
# {{{
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename, str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3, 4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), \
    "vars must have unique names: %s" % [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing entire array
  allf = {1: lib.nc_put_var_schar,    2: lib.nc_put_var_text,
          3: lib.nc_put_var_short,    4: lib.nc_put_var_int,
          5: lib.nc_put_var_float,    6: lib.nc_put_var_double,
          7: lib.nc_put_var_uchar,    8: lib.nc_put_var_ushort,
          9: lib.nc_put_var_uint,    10: lib.nc_put_var_longlong,
         11: lib.nc_put_var_ulonglong}

  # Functions for writing chunks
  chunkf = {1: lib.nc_put_vara_schar,    2: lib.nc_put_vara_text,
            3: lib.nc_put_vara_short,    4: lib.nc_put_vara_int,
            5: lib.nc_put_vara_float,    6: lib.nc_put_vara_double,
            7: lib.nc_put_vara_uchar,    8: lib.nc_put_vara_ushort,
            9: lib.nc_put_vara_uint,    10: lib.nc_put_vara_longlong,
           11: lib.nc_put_vara_ulonglong}

  # Create the file
  if version == 3:
    ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create(filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception

  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i, a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    chunks = [None] * len(vars)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[list(axes).index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only works for netcdf4 or (higher?))
      if compress:
        ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes(fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      # modify axes to be netcdf friendly (CF-compliant, etc.)
      put_attributes(fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef(fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

    pbar = PBar(message="Saving '%s':" % filename)

    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

      # number of actual variables (non-axes) for determining our progress
      N = len([v for v in vars if not isinstance(v, Axis)])
      varpbar = pbar.subset(prog[i], prog[i + 1])

      views = list(View(var.axes).loop_mem())
      for j, v in enumerate(views):
        vpbar = varpbar.part(j, len(views))

        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1, None)

        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long * var.naxes)(*start)
        count = (c_long * var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          # The above gives us the *whole* axis, but under extreme conditions
          # we may be looping over smaller pieces.
          data = data[start[0]:start[0] + count[0]]
          vpbar.update(100)
        else:
          data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % \
          (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)
# }}}
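# Example usage -- a minimal sketch, not part of the original module.  The
# output file name is hypothetical; compress=True forces a NetCDF-4 file.
# A record (unlimited) dimension can be requested with, e.g.,
# unlimited='time', if the dataset has a 'time' axis.
def _example_save():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  T = Var((Lat(np.arange(-90, 91, 30)),), values=np.random.randn(7), name='T')
  save('output.nc', T, compress=True)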
def check_dataset (dataset):
  from pygeode.view import View
  from pygeode.tools import combine_axes
  from pygeode.progress import PBar
  from pygeode.dataset import asdataset
  import numpy as np

  # Make sure we have a dataset (in case we're sent a simple list of vars)
  dataset = asdataset(dataset)

  vars = list(dataset.vars)
  # Include axes in the list of vars (to check these values too)
  axes = combine_axes(v.axes for v in vars)
  vars.extend(axes)

  # Relative progress of each variable
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Checking %s for I/O errors:"%repr(dataset))

  failed_indices = {}
  error_messages = {}

  # Loop over the data
  for i,var in enumerate(vars):
    varpbar = pbar.subset(prog[i], prog[i+1])
    # Scan the outer axis (record axis?) for failures.
    N = var.shape[0]
    failed_indices[var.name] = []
    error_messages[var.name] = []
    for j in range(N):
      vpbar = varpbar.part(j, N)
      try:
        # Try fetching the data, see if something fails
        var[j] if var.naxes == 1 else var[j,...]
      except Exception as e:
        failed_indices[var.name].append(j)
        error_messages[var.name].append(str(e))
      vpbar.update(100)

  # Print summary information for each variable
  everything_ok = True
  for var in vars:
    indices = failed_indices[var.name]
    messages = error_messages[var.name]
    if len(indices) == 0: continue
    everything_ok = False
    print("\nFailures encountered with variable '%s':"%var.name)
    # Group together record indices that give the same error message
    unique_messages = []
    aggregated_indices = []
    for ind,msg in zip(indices,messages):
      if len(unique_messages) == 0 or msg != unique_messages[-1]:
        unique_messages.append(msg)
        aggregated_indices.append([ind])
      else:
        aggregated_indices[-1].append(ind)
    # Print each error message encountered (and the record indices that give the error)
    for ind,msg in zip(aggregated_indices,unique_messages):
      # Group together records that are consecutive (instead of printing each record separately)
      groups = []
      for i in ind:
        if len(groups) == 0 or i-1 not in groups[-1]:
          groups.append([i])
        else:
          groups[-1].append(i)
      for g in groups:
        print("=> at %s:\n %s"% (var.axes[0].slice[g[0]:g[-1]+1], msg))

  if not everything_ok:
    raise Exception("Problem encountered with the dataset.")
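# Example usage -- a minimal sketch, not part of the original module; the
# file name is hypothetical.  This scans every record of every variable (and
# axis) and reports any read failures before starting a long processing job.
def _example_check_dataset():
  ds = open('archive.nc')   # the open() routine defined above
  check_dataset(ds)         # raises Exception if any record fails to load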
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
# {{{
  r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors.  Must have at least one
    axis in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable.  Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the
    correlation is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible
    outputs in the Returns section.  The specifications must be separated by
    a comma.  Defaults to 'B,p'.

  pbar : progress bar, optional
    A progress bar object.  If nothing is provided, a progress bar will be
    displayed if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset`
    The return values are specified by the ``output`` argument.  The names of
    the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the linear coefficients of the regression can be
    obtained by ``ds.B``).

    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default.  The following
    parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors.

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4.  The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in
  section 8.4.2; it corresponds to the probability of obtaining the
  regression coefficient under the null hypothesis that there is no linear
  relationship.  Note this may not be the best way to determine if a given
  parameter is contributing a significant fraction to the explained variance
  of Y.  The variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the
  square root of the diagonal elements of :math:`\hat{\sigma}^2_E
  (\chi^T\chi)` in von Storch and Zwiers, respectively.  The data is assumed
  to be normally distributed.'''

  from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
  from pygeode.view import View

  # Split output request now
  ovars = ['B', 'r2', 'p', 'sb', 'covb', 'se']
  output = [o for o in output.split(',') if o in ovars]
  if len(output) < 1:
    raise ValueError('No valid outputs are requested from regression. Possible outputs are %s.' % str(ovars))

  Nr = len(Xs)

  Xaxes = combine_axes(Xs)

  srcaxes = combine_axes([Xaxes, Y])
  oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
  if axes is not None:
    ri_new = []
    for a in axes:
      ia = whichaxis(srcaxes, a)
      if ia in riaxes:
        ri_new.append(ia)
      else:
        raise KeyError('One of the Xs or Y does not have the axis %s.' % a)
    oiaxes.extend([r for r in riaxes if r not in ri_new])
    riaxes = ri_new

  oaxes = tuple([srcaxes[i] for i in oiaxes])
  inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
  oview = View(oaxes)
  siaxes = list(range(len(oaxes), len(srcaxes)))

  if pbar is None:
    from pygeode.progress import PBar
    pbar = PBar()

  assert len(riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (Y.name)

  # Construct work arrays
  os = oview.shape
  os1 = os + (Nr,)
  os2 = os + (Nr, Nr)
  y = np.zeros(os, 'd')
  yy = np.zeros(os, 'd')
  xy = np.zeros(os1, 'd')
  xx = np.zeros(os2, 'd')
  xxinv = np.zeros(os2, 'd')

  N = np.prod([len(srcaxes[i]) for i in riaxes])

  # Accumulate data
  for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
    ydata = datatuple[-1].astype('d')
    xdata = [datatuple[i].astype('d') for i in range(Nr)]
    y[outsl] += npsum(ydata, siaxes)
    yy[outsl] += npsum(ydata**2, siaxes)
    for i in range(Nr):
      xy[outsl + (i,)] += npsum(xdata[i] * ydata, siaxes)
      for j in range(i + 1):
        xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

  # Fill in opposite side of xTx
  for i in range(Nr):
    for j in range(i):
      xx[..., j, i] = xx[..., i, j]

  # Compute inverse of covariance matrix (could be done more intelligently?
  # certainly the python loop over oview does not help)
  xx = xx.reshape(-1, Nr, Nr)
  xxinv = xxinv.reshape(-1, Nr, Nr)
  for i in range(xx.shape[0]):
    xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
  xx = xx.reshape(os2)
  xxinv = xxinv.reshape(os2)

  beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
  vare = np.sum(xy * beta, -1)

  if N_fac is None:
    N_eff = N
  else:
    N_eff = N // N_fac

  sigbeta = [np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)]

  xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
  yn = Y.name if Y.name != '' else 'Y'

  from .var import Var
  from .dataset import asdataset
  from .axis import NonCoordinateAxis

  ra = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor')
  ra2 = NonCoordinateAxis(values=np.arange(Nr), regressor=xns, name='regressor2')
  Nd = len(oaxes)

  rvs = []

  if 'B' in output:
    B = Var(oaxes + (ra,), values=beta, name='B')
    B.atts['longname'] = 'regression coefficient'
    rvs.append(B)

  if 'r2' in output:
    vary = (yy - y**2 / N)
    R2 = 1 - (yy - vare) / vary
    R2 = Var(oaxes, values=R2, name='R2')
    R2.atts['longname'] = 'fraction of variance explained'
    rvs.append(R2)

  if 'p' in output:
    p = [2. * (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr)) for i in range(Nr)]
    p = np.transpose(np.array(p), [Nd] + list(range(Nd)))
    p = Var(oaxes + (ra,), values=p, name='p')
    p.atts['longname'] = 'p-values'
    rvs.append(p)

  if 'sb' in output:
    sigbeta = np.transpose(np.array(sigbeta), [Nd] + list(range(Nd)))
    sb = Var(oaxes + (ra,), values=sigbeta, name='sb')
    sb.atts['longname'] = 'standard deviation of linear coefficients'
    rvs.append(sb)

  if 'covb' in output:
    sigmat = np.zeros(os2, 'd')
    for i in range(Nr):
      for j in range(Nr):
        #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
        sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
    covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
    covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
    rvs.append(covb)

  if 'se' in output:
    se = np.sqrt((yy - vare) / N_eff)
    se = Var(oaxes, values=se, name='se')
    se.atts['longname'] = 'standard deviation of residual'
    rvs.append(se)

  ds = asdataset(rvs)
  ds.atts['description'] = 'multiple linear regression parameters for %s regressed against %s' % (yn, xns)

  return ds
# }}}
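# Example usage -- a minimal sketch, not part of the original module.  A
# synthetic Y is regressed against two regressors; 'B' has one entry per
# regressor along the 'regressor' axis.
def _example_multiple_regress():
  import numpy as np
  from pygeode.axis import NamedAxis
  from pygeode.var import Var
  t = NamedAxis(np.arange(100.), 'sample')
  X1 = Var((t,), values=np.random.randn(100), name='X1')
  X2 = Var((t,), values=np.random.randn(100), name='X2')
  Y = Var((t,), values=2.0*X1[:] - 0.5*X2[:] + 0.1*np.random.randn(100), name='Y')
  res = multiple_regress([X1, X2], Y, output='B,p')
  print(res.B[:])   # close to [2.0, -0.5]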
def decode_cf (dataset, ignore=[]):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Axis, NamedAxis, Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, Station, DummyAxis, NonCoordinateAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode import timeutils
  from warnings import warn
  import re

  dataset = asdataset(dataset)

  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # Decode string variables
  for i,var in enumerate(varlist):
    if var.name.endswith("_name") and var.dtype.name in ("string8","bytes8") and var.axes[-1].name.endswith("_strlen"):
      varlist[i] = decode_string_var(var)

  # data for auxiliary arrays
  auxdict = {}
  for name in axisdict.keys(): auxdict[name] = {}

  # fill values / scale / offset (if applicable)
  fillvalues = {}
  scales = {}
  offsets = {}
  for v in varlist:
    name = v.name
    fillvalues[name] = None
    scales[name] = None
    offsets[name] = None

  for name,a in list(axisdict.items()):

    # Skip over this axis?
    if name in ignore: continue

    atts = a.atts.copy()
    plotatts = a.plotatts.copy()  # just carry along and pass to new Axis instance

    # Find any auxiliary arrays
    aux = auxdict[name]
    if 'ancillary_variables' in atts:
      _anc = atts.pop('ancillary_variables')
      remove_from_dataset = []  # vars to remove from the dataset
      for auxname in _anc.split(' '):
        assert any(v.name == auxname for v in varlist), "ancillary variable '%s' not found"%auxname
        newname = auxname
        # Remove the axis name prefix, if it was used
        if newname.startswith(name+'_'): newname = newname[len(name)+1:]
        aux[newname] = [v for v in varlist if v.name == auxname].pop().get()
        # Don't need this as a var anymore
        remove_from_dataset.append(auxname)

      # Remove some stuff
      varlist = [v for v in varlist if v.name not in remove_from_dataset]

    # Determine the best Axis subclass to use
    cls = type(a)

    # Generic 'axis' identifiers first
    if 'axis' in atts:
      _axis = atts.pop('axis')
      if _axis == 'X': cls = XAxis
      if _axis == 'Y': cls = YAxis
      if _axis == 'Z': cls = ZAxis
      if _axis == 'T': cls = TAxis

    # Check specific standard names, and also units?
    #TODO: don't *pop* the standard_name, units, etc. until the end of this routine - in case we didn't end up mapping them to an axis
    _ln = atts.get('long_name', a.name).lower()
    _st = atts.get('standard_name',_ln).lower()
    _units = atts.pop('units','')
    if _st == 'latitude' or _units == 'degrees_north': cls = Lat
    if _st == 'longitude' or _units == 'degrees_east': cls = Lon
    if _st == 'air_pressure' or _units in ('hPa','mbar'):
      cls = Pres
      # Don't need this in the metadata anymore (it will be put back in encode_cf)
      atts.pop('positive',None)

    if _st == 'atmosphere_hybrid_sigma_pressure_coordinate':
      #TODO: check formula_terms??
      #TODO: for ccc2nc files, look for long_name == "Model Level", use_AB = <formula>,
      #      A & B embedded as metadata or as data arrays not attached to ancillary_variables
      if 'A' in aux and 'B' in aux:
        cls = Hybrid
      else:
        warn ("Cannot create a proper Hybrid vertical axis, since 'A' and 'B' coefficients aren't found.")

    if _st == 'station': cls = Station

    if (_st == 'time' or cls == TAxis or _units.startswith('days since') or _units.startswith('hours since') or _units.startswith('minutes since') or _units.startswith('seconds since')) and ' since ' in _units:
      _calendar = atts.pop('calendar', 'standard')
      if _calendar in ('standard', 'gregorian', 'proleptic_gregorian'): cls = StandardTime
      elif _calendar in ('365_day', 'noleap', '365day'): cls = ModelTime365
      elif _calendar in ('360_day', '360day'): cls = ModelTime360
      elif _calendar in ('none',): cls = Yearless
      else:
        warn ("unknown calendar '%s'"%_calendar)
        continue

      # Extract the time resolution (day, hour, etc), and the reference date
      res, date = re.match(r"([a-z]+)\s+since\s+(.*)", _units).groups()
      # Pluralize the increment (i.e. day->days)?
      if not res.endswith('s'): res += 's'

      # Extract the rest of the date
      date = date.rstrip()
      year, month, day, hour, minute, second = 0,1,1,0,0,0
      if len(date) > 0: year, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: month, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: day, date = re.match(r"(\d+)\s*(.*)", date).groups()
      if date.startswith('T'): date = date[1:]
      if len(date) > 0: hour, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0: minute, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0 and date[0] != ' ': second, date = re.match(r"(\d+)(.*)", date).groups()

      # convert from strings to integers
      #TODO: milliseconds? time zone?
      year, month, day, hour, minute, second = list(map(int, [year, month, day, hour, minute, float(second)]))

      # Create the time axis
      startdate = {'year':year, 'month':month, 'day':day, 'hour':hour, 'minute':minute, 'second':second}
      axisdict[name] = cls(a.values, startdate=startdate, units=res, name=name, atts=atts)
      # Special case: start year=0 implies a climatology
      #NOTE: 'climatology' attribute not used, since we don't currently keep
      #      track of the interval that was used for the climatology.
      if year == 0:
        # Don't climatologize(?) the axis if there's more than a year
        if not all(axisdict[name].year == 0):
          warn ("cfmeta: data starts at year 0 (which usually indicates a climatology), but there's more than one year's worth of data!  Keeping it on a regular calendar.", stacklevel=3)
          continue
        axisdict[name] = timeutils.modify(axisdict[name], exclude='year')
      continue  # we've constructed the time axis, so move onto the next axis

    # Check for a match from the custom axes (from add-ons).
    if _st in custom_axes:
      cls = custom_axes[_st]

    # Find any other information that should be put inside this axis.
    # Look for anything that's identified as a coordinate or ancillary
    # variable, and that has this axis as its only dimension.
    dependencies = set()
    for var in varlist:
      if var.hasaxis(a.name):
        dependencies.update(var.atts.get('coordinates','').split())
        dependencies.update(var.atts.get('ancillary_variables','').split())
    # Look up these dependencies.  Only consider 1D information, since we
    # don't yet have a way to associate multidimensional arrays as auxarrays
    # in an axis.
    dependencies = [v for v in varlist if v.name in dependencies and v.naxes == 1 and v.hasaxis(a.name)]
    # If we found any such information, then this is no longer a simple
    # "dummy" axis.
    if issubclass(cls, DummyAxis) and len(dependencies) > 0:
      cls = NonCoordinateAxis
    # Attach the information from these dependent variables as auxiliary arrays.
    aux.update((dep.name,dep.get()) for dep in dependencies)
    # Anything that got attached to this axis should be removed from the
    # list of variables, since it's just extra info specific to the axis.
    varlist = [v for v in varlist if v.name not in aux]

    # put the units back (if we didn't use them)?
    if cls in [Axis, NamedAxis, XAxis, YAxis, ZAxis, TAxis] and _units != '':
      atts['units'] = _units

    # create new axis instance if need be.
    if cls != type(a):
      axisdict[name] = cls(values=a.values, name=name, atts=atts, **aux)

  # Apply these new axes to the variables
  # Check for fill values, etc.
  # Extract to a list first, then back to a dataset
  # (ensures the dataset axis list is up to date)
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    atts = oldvar.atts.copy()
    plotatts = oldvar.plotatts.copy()
    fillvalue = [atts.pop(f,None) for f in ('FillValue', '_FillValue', 'missing_value')]
    fillvalue = [_f for _f in fillvalue if _f]
    fillvalue = fillvalue[0] if len(fillvalue) > 0 else None
    scale = atts.pop('scale_factor', None)
    offset = atts.pop('add_offset', None)
    varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes],
                             name=name, fillvalue=fillvalue, scale=scale, offset=offset,
                             atts=atts, plotatts=plotatts)

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
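# Example round trip -- a minimal sketch, not part of the original module.
# decode_cf() rebuilds typed PyGeode axes (Lat, Lon, time axes, ...) from the
# generic NamedAxis + CF attributes produced by encode_cf().
def _example_decode_cf():
  import numpy as np
  from pygeode.axis import Lat
  from pygeode.var import Var
  lat = Lat(np.arange(-90, 91, 30))
  T = Var((lat,), values=np.random.randn(7), name='T')
  roundtrip = decode_cf(encode_cf(T))
  print(type(roundtrip.axisdict['lat']))   # expect pygeode.axis.Lat again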
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):  # {{{
    r'''Computes least-squares linear regression of Y against X.

    Parameters
    ==========
    X, Y : :class:`Var`
        Variables to regress. Must have at least one axis in common.

    axes : list, optional
        Axes over which to compute the regression; if nothing is specified,
        the regression is computed over all axes common to X and Y.

    N_fac : integer
        A factor by which to rescale the estimated number of degrees of
        freedom; the effective number will be given by the number estimated
        from the dataset divided by ``N_fac``.

    output : string, optional
        A string determining which parameters are returned; see list of
        possible outputs in the Returns section. The specifications must be
        separated by a comma. Defaults to 'm,b,p'.

    pbar : progress bar, optional
        A progress bar object. If nothing is provided, a progress bar will
        be displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
        The returned variables are specified by the ``output`` argument. The
        names of the variables match the output request string (i.e. if
        ``ds`` is the returned dataset, the linear coefficient of the
        regression can be obtained by ``ds.m``).

        A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
        following parameters can be returned:

        * 'm': Linear coefficient of the regression
        * 'b': Constant coefficient of the regression
        * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
        * 'p': p-value of regression; see notes.
        * 'sm': Standard deviation of linear coefficient estimate
        * 'se': Standard deviation of residuals

    Notes
    =====
    The statistics described are computed following von Storch and Zwiers
    1999, section 8.3. The p-value 'p' is computed using the t-statistic
    given in section 8.3.8, and confidence intervals for the slope and
    intercept can be computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
    :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers,
    respectively). The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    sige[nmsk] = np.sqrt(sige[nmsk])

    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts['description'] = 'linear regression parameters for %s regressed against %s' % (yn, xn)

    return ds
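# ---------------------------------------------------------------------------
# Illustrative usage sketch (an editorial addition, not part of PyGeode):
# regress() applied to synthetic variables. The axis names 'lat'/'time' and
# the variables X, Y are invented for this example.
def _example_regress():
    import numpy as np
    from pygeode.axis import NamedAxis
    from pygeode.var import Var
    np.random.seed(0)
    lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
    time = NamedAxis(values=np.arange(100, dtype='d'), name='time')
    xv = np.tile(np.linspace(0., 1., 100), (3, 1))        # shape (3, 100)
    yv = 2. * xv + 0.5 + 0.01 * np.random.randn(3, 100)   # Y ~ 2 X + 0.5
    X = Var([lat, time], values=xv, name='X')
    Y = Var([lat, time], values=yv, name='Y')
    ds = regress(X, Y, axes=['time'], output='m,b')
    print(ds.m.get())  # close to 2 at each latitude
    print(ds.b.get())  # close to 0.5 at each latitude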
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):  # {{{
    r'''Computes the mean value of X and statistics relevant for a test
    against the hypothesis that it is 0.

    Parameters
    ==========
    X : :class:`Var`
        Variable to average.

    axes : list, optional
        Axes over which to compute the mean; if nothing is specified, the
        mean is computed over all axes.

    alpha : float
        Confidence level for which to compute confidence interval.

    N_fac : integer
        A factor by which to rescale the estimated number of degrees of
        freedom; the effective number will be given by the number estimated
        from the dataset divided by ``N_fac``.

    output : string, optional
        A string determining which parameters are returned; see list of
        possible outputs in the Returns section. The specifications must be
        separated by a comma. Defaults to 'm,p'.

    pbar : progress bar, optional
        A progress bar object. If nothing is provided, a progress bar will
        be displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
        The names of the variables match the output request string (i.e. if
        ``ds`` is the returned dataset, the mean value can be obtained
        through ``ds.m``). The following quantities can be calculated.

        * 'm': The mean value of X
        * 'p': The probability of the computed value if the population mean
          was zero
        * 'ci': The confidence interval of the mean at the level specified
          by alpha

        The results are returned as :class:`Var` objects in a dataset, in
        the order given above.

    See Also
    ========
    difference

    Notes
    =====
    The number of effective degrees of freedom can be scaled as in
    :meth:`difference`. The p-value and confidence interval are computed
    for the t-statistic defined in eq (6.61) of von Storch and Zwiers
    1999.'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
    from pygeode.view import View

    # Reduce over all axes if none are specified
    if axes is None:
        axes = [a.name for a in X.axes]

    riaxes = [X.whichaxis(n) for n in axes]
    raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays
    x = np.zeros(oview.shape, 'd')
    xx = np.zeros(oview.shape, 'd')
    Na = np.zeros(oview.shape, 'd')

    x[()] = np.nan
    xx[()] = np.nan
    Na[()] = np.nan

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)

        # Sum of weights (kludge to get masking right)
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    imsk = (Na > 0.)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    if N_fac is not None:
        eN = N // N_fac
        eNa = Na // N_fac
    else:
        eN = N
        eNa = Na

    sdom = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence interval of the mean value of %s' % (name, )
        rvs.append(ci)

    return asdataset(rvs)
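# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): a one-sample
# test with isnonzero() on noisy data whose true mean is 0.5; all names below
# are invented for the example.
def _example_isnonzero():
    import numpy as np
    from pygeode.axis import NamedAxis
    from pygeode.var import Var
    np.random.seed(1)
    lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
    time = NamedAxis(values=np.arange(200, dtype='d'), name='time')
    X = Var([lat, time], values=0.5 + np.random.randn(3, 200), name='X')
    ds = isnonzero(X, axes=['time'], alpha=0.05, output='m,p,ci')
    print(ds.m.get())   # sample means near 0.5
    print(ds.p.get())   # small p-values: the mean is significantly nonzero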
def save(filename, in_dataset, version=3, pack=None, compress=False,
         cfmeta=True, unlimited=None):  # {{{
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?
    if compress: version = 4
    assert version in (3, 4)

    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]
    #vars.extend(axes)

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(vars), (
        "vars must have unique names: %s" % [v.name for v in vars])

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing entire array
    allf = {1: lib.nc_put_var_schar, 2: lib.nc_put_var_text,
            3: lib.nc_put_var_short, 4: lib.nc_put_var_int,
            5: lib.nc_put_var_float, 6: lib.nc_put_var_double,
            7: lib.nc_put_var_uchar, 8: lib.nc_put_var_ushort,
            9: lib.nc_put_var_uint, 10: lib.nc_put_var_longlong,
            11: lib.nc_put_var_ulonglong}

    # Functions for writing chunks
    chunkf = {1: lib.nc_put_vara_schar, 2: lib.nc_put_vara_text,
              3: lib.nc_put_vara_short, 4: lib.nc_put_vara_int,
              5: lib.nc_put_vara_float, 6: lib.nc_put_vara_double,
              7: lib.nc_put_vara_uchar, 8: lib.nc_put_vara_ushort,
              9: lib.nc_put_vara_uint, 10: lib.nc_put_vara_longlong,
              11: lib.nc_put_vara_ulonglong}

    # Create the file
    if version == 3:
        ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    elif version == 4:
        ret = lib.nc_create(filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    else:
        raise ValueError("unsupported netcdf version: %s" % version)

    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            if unlimited == a.name:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
            else:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        chunks = [None] * len(vars)
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[list(axes).index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only works for netcdf4)
            if compress:
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            # modify axes to be netcdf friendly (CF-compliant, etc.)
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

        # Relative progress of each variable
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

        pbar = PBar(message="Saving '%s':" % filename)
        # pbar = FakePBar()

        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            dtype = numpy_type[t]

            # number of actual variables (non-axes) for determining our progress
            N = len([v for v in vars if not isinstance(v, Axis)])
            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):
                vpbar = varpbar.part(j, len(views))
                # Should always be slices (since we're looping over the whole
                # thing contiguously?)
                for sl in v.slices:
                    assert isinstance(sl, slice)
                for sl in v.slices:
                    assert sl.step in (1, None)

                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    data = var.values
                    data = data[start[0]:start[0] + count[0]]
                    # the above gives us the *whole* axis,
                    # but under extreme conditions we may be looping over smaller pieces
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished
        lib.nc_close(fileid)
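# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): writing a
# small variable with save(). The filename 'example.nc' is an assumption for
# the demonstration.
def _example_save():
    import numpy as np
    from pygeode.axis import Lat
    from pygeode.var import Var
    lat = Lat(np.arange(-90., 91., 30.))
    v = Var([lat], values=np.cos(np.radians(lat.values)), name='coslat')
    # compress=True implies a netcdf4 file, as handled above
    save('example.nc', v, compress=True)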
def open(filename, value_override={}, dimtypes={}, namemap={}, varlist=[],
         cfmeta=True):  # {{{
    '''open (filename, [value_override = {}, dimtypes = {}, namemap = {}, varlist = [] ])

    Returns a Dataset of PyGeode variables contained in the specified files.
    The axes of the variables are created from the dimensions of the NetCDF
    file. NetCDF variables in the file that do not correspond to dimensions
    are imported as PyGeode variables.

    filename - NetCDF file to open
    value_override - an optional dictionary with replacement values for one
        or more variables. The only known use for this dictionary is to avoid
        loading in values from a severely scattered variable (such as a
        'time' axis or other slowest-varying dimension).
    dimtypes - a dictionary mapping dimension names to axis classes. The keys
        should be axis names as defined in the NetCDF file; values should be
        one of:
        1) an axis instance,
        2) an axis class, or
        3) a tuple of an axis class and a dictionary with keyword arguments
           to pass to that axis' constructor
        If no dictionary is included, an attempt is made to automatically
        identify the axis types.
    namemap - an optional dictionary to map NetCDF variable names (keys) to
        PyGeode variable names (values); also works for axes/dimensions
    varlist - a list containing the variables that should be loaded into the
        data set (if the list is empty, all NetCDF variables will be loaded)

    Note: The identifiers used in varlist and dimtypes are the original names
    used in the NetCDF file, not the names given in namemap.'''

    from os.path import exists
    from ctypes import c_int, byref
    from pygeode.dataset import asdataset
    from pygeode.formats import finalize_open
    from pygeode.axis import Axis

    if not filename.startswith('http://'):
        assert exists(filename), 'File open failed. "%s" does not exist.' % filename

    # Read variable dimensions and metadata from the file
    f = NCFile(filename)
    f.open()
    try:
        fileid = f.fileid

        # Get number of variables
        nvars = c_int()
        ret = lib.nc_inq_nvars(fileid, byref(nvars))
        assert ret == 0, lib.nc_strerror(ret)
        nvars = nvars.value

        # Construct all the variables, put in a list
        vars = [NCVar(f, i) for i in range(nvars)]

        # Construct a dataset from these Vars
        dataset = asdataset(vars)
        dataset.atts = get_attributes(fileid, -1)

    finally:
        f.close()

    # Add the object stuff from dimtypes to value_override, so we don't
    # trigger a load operation on those dims.
    # (We could use any values here, since they'll be overridden again later,
    # but we might as well use something relevant.)
    value_override = dict(value_override)  # don't use the default (static) empty dict
    for k, v in list(dimtypes.items()):
        if isinstance(v, Axis):
            value_override[k] = v.values

    #### Filters to apply to the data ####

    # Override values from the source?
    if len(value_override) > 0:
        dataset = override_values(dataset, value_override)

    # Set up the proper axes (get coordinate values / metadata from a 1D
    # variable with the same name as the dimension)
    dataset = dims2axes(dataset)

    return finalize_open(dataset, dimtypes, namemap, varlist, cfmeta)
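# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): reopening the
# file written by _example_save() and forcing the 'lat' dimension onto a Lat
# axis via dimtypes. 'example.nc' is an assumed filename.
def _example_open():
    from pygeode.axis import Lat
    ds = open('example.nc', dimtypes={'lat': Lat})
    print(ds)
    print(ds.coslat.get())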
def to_xarray(dataset):
    """
    Converts a PyGeode Dataset into an xarray Dataset.

    Parameters
    ----------
    dataset : pygeode.Dataset
        The dataset to be converted.

    Returns
    -------
    out : xarray.Dataset
        An object which can be used with the xarray package.
    """
    from pygeode.dataset import asdataset
    from pygeode.formats.cfmeta import encode_cf
    from pygeode.view import View
    from dask.base import tokenize
    import dask.array as da
    import xarray as xr

    dataset = asdataset(dataset)
    # Encode the axes/variables with CF metadata.
    dataset = encode_cf(dataset)
    out = dict()
    # Loop over each axis and variable.
    for var in list(dataset.axes) + list(dataset.vars):
        # Generate a unique name to identify it with dask.
        name = var.name + "-" + tokenize(var)
        dsk = dict()
        dims = [a.name for a in var.axes]

        # Special case: already have the values in memory.
        if hasattr(var, 'values'):
            out[var.name] = xr.DataArray(var.values, dims=dims, attrs=var.atts, name=var.name)
            continue

        # Keep track of all the slices that were made over each dimension.
        # This information will be used to determine the "chunking" that was
        # done on the variable from inview.loop_mem().
        slice_order = [[] for a in var.axes]
        chunks = []
        # Break up the variable into portions that are small enough to fit
        # in memory. These will become the "chunks" for dask.
        inview = View(var.axes)
        for outview in inview.loop_mem():
            # Materialize the indices; a bare map() iterator would be consumed
            # by the first zip() below and leave nothing for the second.
            integer_indices = list(map(tuple, outview.integer_indices))

            # Determine *how* loop_mem is splitting the axes, and define the
            # chunk sizes accordingly.
            # A little indirect, but loop_mem doesn't make its chunking
            # choices available to the caller.
            for o, sl in zip(slice_order, integer_indices):
                if sl not in o:
                    o.append(sl)
            ind = [o.index(sl) for o, sl in zip(slice_order, integer_indices)]

            # Add this chunk to the dask array.
            key = tuple([name] + ind)
            dsk[key] = (var.getview, outview, False)

        # Construct the dask array.
        chunks = [list(map(len, sl)) for sl in slice_order]
        arr = da.Array(dsk, name, chunks, dtype=var.dtype)

        # Wrap this into an xarray.DataArray (with metadata and named axes).
        out[var.name] = xr.DataArray(arr, dims=dims, attrs=var.atts, name=var.name)

    # Build the final xarray.Dataset.
    out = xr.Dataset(out, attrs=dataset.atts)

    # Re-decode the CF metadata on the xarray side.
    out = xr.conventions.decode_cf(out)

    return out
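# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): round-tripping
# an in-memory variable through to_xarray(); requires the optional xarray and
# dask packages. All names are invented for the example.
def _example_to_xarray():
    import numpy as np
    from pygeode.axis import Lat
    from pygeode.var import Var
    lat = Lat(np.arange(-90., 91., 30.))
    v = Var([lat], values=np.cos(np.radians(lat.values)), name='coslat')
    xds = to_xarray(v)
    print(xds)
    print(float(xds['coslat'].mean()))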
def paired_difference(X, Y, axes=None, alpha=0.05, N_fac=None,
                      output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that
    individual elements of X and Y can be directly paired. In contrast to
    :func:`difference`, X and Y must have the same shape.

    Parameters
    ==========
    X, Y : :class:`Var`
        Variables to difference. Must share all axes over which the means
        are being computed.

    axes : list, optional
        Axes over which to compute means; if nothing is specified, the mean
        is computed over all axes common to X and Y.

    alpha : float
        Confidence level for which to compute confidence interval.

    N_fac : integer
        A factor by which to rescale the estimated number of degrees of
        freedom of X and Y; the effective number will be given by the number
        estimated from the dataset divided by ``N_fac``.

    output : string, optional
        A string determining which parameters are returned; see list of
        possible outputs in the Returns section. The specifications must be
        separated by a comma. Defaults to 'd,p,ci'.

    pbar : progress bar, optional
        A progress bar object. If nothing is provided, a progress bar will
        be displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
        The returned variables are specified by the ``output`` argument. The
        names of the variables match the output request string (i.e. if
        ``ds`` is the returned dataset, the average of the difference can be
        obtained by ``ds.d``). The following four quantities can be
        computed:

        * 'd': The difference in the means, X - Y
        * 'df': The effective number of degrees of freedom, :math:`df`
        * 'p': The p-value; see notes.
        * 'ci': The confidence interval of the difference at the level
          specified by ``alpha``

    See Also
    ========
    isnonzero
    difference

    Notes
    =====
    Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample
    t test is used to test the hypothesis. The number of degrees of freedom
    is the sample size scaled by N_fac, less one. This provides a means of
    taking into account serial correlation in the data (see sections
    6.6.7-9), but the appropriate number of effective degrees of freedom are
    not calculated explicitly by this routine. The p-value and confidence
    interval are computed based on the t-statistic in eq (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
    else:
        # Reduce over all shared axes if none are specified
        axes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert ixaxes == iyaxes and Nx == Ny, \
        'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes=srcaxes, pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    if N_fac is not None:
        eN = N // N_fac
    else:
        eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of paired difference (%s - %s)' % (xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
            alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (xn, yn)

    return ds
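# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode):
# paired_difference() on two variables sharing a common noise component,
# which the pairing removes; all names are invented. Compare with the
# unpaired difference() example further below.
def _example_paired_difference():
    import numpy as np
    from pygeode.axis import NamedAxis
    from pygeode.var import Var
    np.random.seed(2)
    lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
    time = NamedAxis(values=np.arange(200, dtype='d'), name='time')
    common = np.random.randn(3, 200)   # shared variability, cancels in X - Y
    X = Var([lat, time], values=common + 1.0 + 0.1 * np.random.randn(3, 200), name='X')
    Y = Var([lat, time], values=common, name='Y')
    ds = paired_difference(X, Y, axes=['time'], output='d,p,ci')
    print(ds.d.get())   # close to 1 everywhere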
def correlate(X, Y, axes=None, output='r2,p', pbar=None):  # {{{
    r'''Computes correlation between variables X and Y.

    Parameters
    ==========
    X, Y : :class:`Var`
        Variables to correlate. Must have at least one axis in common.

    axes : list, optional
        Axes over which to compute correlation; if nothing is specified, the
        correlation is computed over all axes shared by X and Y.

    output : string, optional
        A string determining which parameters are returned; see list of
        possible outputs in the Returns section. The specifications must be
        separated by a comma. Defaults to 'r2,p'.

    pbar : progress bar, optional
        A progress bar object. If nothing is provided, a progress bar will
        be displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
        The names of the variables match the output request string (i.e. if
        ``ds`` is the returned dataset, the correlation coefficient can be
        obtained through ``ds.r2``).

        * 'r2': The correlation coefficient :math:`\rho_{XY}`
        * 'p': The p-value; see notes.

    Notes
    =====
    The coefficient :math:`\rho_{XY}` is computed following von Storch and
    Zwiers 1999, section 8.2.2. The p-value is the probability of finding a
    correlation coefficient of equal or greater magnitude (two-sided) to the
    given result under the hypothesis that the true correlation coefficient
    between X and Y is zero. It is computed from the t-statistic given in
    eq (8.7), in section 8.2.3, and assumes normally distributed
    quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    iview = View(inaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    imsk = (Na > 0)

    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid
    # divide-by-zero warnings
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for correlation coefficient between %s and %s' % (xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
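# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): correlate()
# on two synthetic variables with a built-in linear relationship; all names
# are invented for the example.
def _example_correlate():
    import numpy as np
    from pygeode.axis import NamedAxis
    from pygeode.var import Var
    np.random.seed(3)
    lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
    time = NamedAxis(values=np.arange(200, dtype='d'), name='time')
    xv = np.random.randn(3, 200)
    yv = xv + 0.5 * np.random.randn(3, 200)   # correlated with X
    X = Var([lat, time], values=xv, name='X')
    Y = Var([lat, time], values=yv, name='Y')
    ds = correlate(X, Y, axes=['time'], output='r2,p')
    print(ds.r2.get())  # strong positive correlation at each latitude
    print(ds.p.get())   # correspondingly small p-values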
def difference(X, Y, axes=None, alpha=0.05, Nx_fac=None, Ny_fac=None,
               output='d,p,ci', pbar=None):  # {{{
    r'''Computes the mean value and statistics of X - Y.

    Parameters
    ==========
    X, Y : :class:`Var`
        Variables to difference. Must have at least one axis in common.

    axes : list, optional; defaults to None
        Axes over which to compute means; if nothing is specified, the mean
        is computed over all axes common to X and Y.

    alpha : float, optional; defaults to 0.05
        Confidence level for which to compute confidence interval.

    Nx_fac : integer, optional; defaults to None
        A factor by which to rescale the estimated number of degrees of
        freedom of X; the effective number will be given by the number
        estimated from the dataset divided by ``Nx_fac``.

    Ny_fac : integer, optional; defaults to None
        A factor by which to rescale the estimated number of degrees of
        freedom of Y; the effective number will be given by the number
        estimated from the dataset divided by ``Ny_fac``.

    output : string, optional
        A string determining which parameters are returned; see list of
        possible outputs in the Returns section. The specifications must be
        separated by a comma. Defaults to 'd,p,ci'.

    pbar : progress bar, optional
        A progress bar object. If nothing is provided, a progress bar will
        be displayed if the calculation takes sufficiently long.

    Returns
    =======
    results : :class:`Dataset`
        The returned variables are specified by the ``output`` argument. The
        names of the variables match the output request string (i.e. if
        ``ds`` is the returned dataset, the average of the difference can be
        obtained by ``ds.d``). The following four quantities can be
        computed:

        * 'd': The difference in the means, X - Y
        * 'df': The effective number of degrees of freedom, :math:`df`
        * 'p': The p-value; see notes.
        * 'ci': The confidence interval of the difference at the level
          specified by ``alpha``

    See Also
    ========
    isnonzero
    paired_difference

    Notes
    =====
    The effective number of degrees of freedom is estimated using eq (6.20)
    of von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are
    scaled by Nx_fac and Ny_fac, respectively. This provides a means of
    taking into account serial correlation in the data (see sections
    6.6.7-9), but the number of effective degrees of freedom are not
    calculated explicitly by this routine. The p-value and confidence
    interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new
    else:
        # Reduce over all shared axes if none are specified
        axes = [srcaxes[i].name for i in riaxes]

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    Nx = np.prod([len(X.axes[i]) for i in ixaxes])
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum([Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum([Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)
    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)
    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    if Nx_fac is not None:
        eNx = Nx // Nx_fac
    else:
        eNx = Nx
    if Ny_fac is not None:
        eNy = Ny // Ny_fac
    else:
        eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)
    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])
    dmsk &= (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts['longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
            alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    ds.atts['description'] = 't-test of difference (%s - %s)' % (xn, yn)

    return ds
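# ---------------------------------------------------------------------------
# Illustrative sketch (editorial addition, not part of PyGeode): difference()
# treating X and Y as independent samples along 'time'; all names are
# invented for the example.
def _example_difference():
    import numpy as np
    from pygeode.axis import NamedAxis
    from pygeode.var import Var
    np.random.seed(4)
    lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
    time = NamedAxis(values=np.arange(200, dtype='d'), name='time')
    X = Var([lat, time], values=1.0 + np.random.randn(3, 200), name='X')
    Y = Var([lat, time], values=np.random.randn(3, 200), name='Y')
    ds = difference(X, Y, axes=['time'], output='d,p,ci')
    print(ds.d.get())   # close to 1 everywhere
    print(ds.ci.get())  # half-width of the (1 - alpha) confidence interval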