Example #1
def tidy_axes(dataset, unlimited=None):
    # {{{
    from pygeode.tools import combine_axes
    from pygeode.axis import DummyAxis
    from pygeode.dataset import asdataset

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    # Look at original dataset to check original type of axes (because
    # finalize_save may force everything to be NamedAxis).
    vars = vars + [
        a for a in axes if not isinstance(dataset[a.name], DummyAxis)
    ]

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    return asdataset(vars)
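A minimal usage sketch (not from the original source): the variable and axes below are hypothetical, assuming a standard pygeode installation.

import numpy as np
from pygeode.axis import Lat, Lon
from pygeode.var import Var
from pygeode.dataset import asdataset

# Hypothetical input: one variable on a lat/lon grid.
lat = Lat(np.arange(-90, 91, 45))
lon = Lon(np.arange(0, 360, 90))
temp = Var((lat, lon), values=np.zeros((5, 4)), name='temp')

# tidy_axes appends the Lat/Lon axes to the variable list,
# ready for writing to netcdf.
ds = tidy_axes(asdataset([temp]))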
Example #2
def serve (path, dataset, port=8080):
  from pygeode.server.web import MyServer_threaded2
  import threading
  from pygeode.dataset import asdataset
  # Remove extra /'s
  while '//' in path: path = path.replace('//','/')
  # Remove any leading and trailing /
  if path.startswith('/'): path = path[1:]
  if path.endswith('/'): path = path[:-1]
  # Break up into directories and 'file' name
  parts = path.split('/')
  dirnames = parts[:-1]
  fname = parts[-1]
  # Check if we have a server available already
  if port not in SERVERS.serverdict:
    # Make a new server, with an empty root directory
    root = DAP_Dir()
    server = MyServer_threaded2(port, root)
    threading.Thread(target=server.serve_forever).start()
    print("Started an OPeNDAP server listening on port %s"%port)
    SERVERS.serverdict[port] = server
  else:
    server = SERVERS.serverdict[port]
  # Get the working directory
  cwd = server.root_handler
  # Make sure the full path is available
  for dname in dirnames:
    if dname not in cwd.nodes:
      cwd.nodes[dname] = DAP_Dir()
    cwd = cwd.nodes[dname]
    assert hasattr(cwd,'nodes'), "'%s' is already defined as something other than a directory?"%dname

  # Share the file
  assert fname not in cwd.nodes, "'%s' is already being served"%path
  cwd.nodes[fname] = DAPHandler(asdataset(dataset))
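A hypothetical call, assuming this module's DAP machinery (SERVERS, DAP_Dir, DAPHandler) is available as above; 't1' is the sample dataset assumed to ship with pygeode.

from pygeode.tutorial import t1  # assumed sample dataset
serve('demo/t1', t1, port=8080)
# The dataset should then be reachable at http://localhost:8080/demo/t1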
Example #3
def make_dataset (ncfile):
# {{{
  from pygeode.dataset import asdataset
  # Construct all the variables, put in a list
  vars = list(map(make_var, list(ncfile.variables.values())))
  
  # Construct a dataset from these Vars
  dataset = asdataset(vars)
  dataset.atts = make_atts(ncfile)
  return dataset
Example #4
def finalize_save(dataset, cfmeta = True, pack = None):
# {{{
  from pygeode.formats import cfmeta as cf
  from pygeode.dataset import asdataset

  # Only pack if pack is true
  if pack:
    if hasattr(pack, '__len__'): # Assume this is a list of variables to pack
      vars = [PackVar(v) if v.name in pack else v for v in dataset.vars]
    else:
      vars = [PackVar(v) for v in dataset.vars]
    dset = asdataset(vars)
    dset.atts = dataset.atts.copy()
  else:
    dset = dataset

  # Encode standard axes back into netcdf metadata?
  if cfmeta is True:
    return cf.encode_cf(dset)
  else:
    return asdataset(dset)
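Illustrative calls (assuming 'dataset' is an existing pygeode Dataset and 'temp' is one of its variables); pack accepts either a boolean or a list of variable names.

ds_all = finalize_save(dataset, cfmeta=True, pack=True)      # pack every variable
ds_some = finalize_save(dataset, cfmeta=True, pack=['temp']) # pack only 'temp'
ds_plain = finalize_save(dataset, cfmeta=False)              # no packing, no CF encoding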
Example #5
def ensemble(*varlists):
    """
  Creates an ensemble out of a set of similar variables.
  The corresponding variables must have the same axes and the same names.
  If Vars are passed as inputs, a single ensemble Var is returned.
  If Datasets are passed as inputs, a single Dataset is returned, consisting of an ensemble of the internal Vars.  Each input dataset must have matching variables.
  """
    from pygeode.var import Var
    from pygeode.dataset import Dataset, asdataset
    from pygeode.tools import common_dict
    datasets = [asdataset(v) for v in varlists]

    varnames = [v.name for v in datasets[0].vars]

    # Make sure we have the same varnames in each dataset
    for dataset in datasets:
        assert set(dataset.vardict.keys()) == set(
            varnames), "inconsistent variable names between datasets"

    # Make sure the varlists are all in the same order
    for i, dataset in enumerate(datasets):
        varlist = [dataset[varname] for varname in varnames]
        datasets[i] = Dataset(varlist, atts=dataset.atts)

    for varname in varnames:
        var0 = datasets[0][varname]
        for dataset in datasets:
            var = dataset[varname]
            # Make sure the axes are the same between ensemble vars
            assert var.axes == var0.axes, "inconsistent axes for %s" % varname

    # Collect the ensembles together
    ensembles = []
    for varname in varnames:
        ensemble = EnsembleVar([dataset[varname] for dataset in datasets])
        ensembles.append(ensemble)

    # Global attributes
    atts = common_dict(dataset.atts for dataset in datasets)
    if isinstance(varlists[0], Dataset): return Dataset(ensembles, atts=atts)
    if isinstance(varlists[0], Var):
        assert len(ensembles) == 1
        return ensembles[0]

    return ensembles
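A synthetic sketch of the Var case (names and values are illustrative): two realizations of the same field, on the same axis, collapse into a single ensemble Var.

import numpy as np
from pygeode.axis import Lat
from pygeode.var import Var

lat = Lat(np.arange(-90, 91, 45))
run1 = Var((lat,), values=np.random.randn(5), name='temp')
run2 = Var((lat,), values=np.random.randn(5), name='temp')

ens = ensemble(run1, run2)  # a single Var with a new ensemble axis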
Example #6
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates: continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
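A short sketch of the intended effect, assuming the sample dataset from pygeode.tutorial (which carries a time axis): after encoding, the time axis becomes a NamedAxis with CF-style metadata.

from pygeode.tutorial import t2  # assumed sample dataset with a time axis
cf_ds = encode_cf(t2)
print(cf_ds.atts['Conventions'])  # 'CF-1.0'
print(cf_ds.time.atts['units'])   # e.g. 'days since 2011-01-01 00:00:00'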
Example #7
def to_xarray(dataset):
    """
  Converts a PyGeode Dataset into an xarray Dataset.

  Parameters
  ----------
  dataset : pygeode.Dataset
    The dataset to be converted.

  Returns
  -------
  out : xarray.Dataset
    An object which can be used with the xarray package.
  """
    from pygeode.dataset import asdataset
    from pygeode.formats.cfmeta import encode_cf
    from pygeode.view import View
    from dask.base import tokenize
    import dask.array as da
    import xarray as xr
    dataset = asdataset(dataset)
    # Encode the axes/variables with CF metadata.
    dataset = encode_cf(dataset)
    out = dict()
    # Loop over each axis and variable.
    for var in list(dataset.axes) + list(dataset.vars):
        # Generate a unique name to identify it with dask.
        name = var.name + "-" + tokenize(var)
        dsk = dict()
        dims = [a.name for a in var.axes]

        # Special case: already have the values in memory.
        if hasattr(var, 'values'):
            out[var.name] = xr.DataArray(var.values,
                                         dims=dims,
                                         attrs=var.atts,
                                         name=var.name)
            continue

        # Keep track of all the slices that were made over each dimension.
        # This information will be used to determine the "chunking" that was done
        # on the variable from inview.loop_mem().
        slice_order = [[] for a in var.axes]
        chunks = []
        # Break up the variable into portions that are small enough to fit
        # in memory.  These will become the "chunks" for dask.
        inview = View(var.axes)
        for outview in inview.loop_mem():
            integer_indices = list(map(tuple, outview.integer_indices))
            # Determine *how* loop_mem is splitting the axes, and define the chunk
            # sizes accordingly.
            # A little indirect, but loop_mem doesn't make its chunking choices
            # available to the caller.
            for o, sl in zip(slice_order, integer_indices):
                if sl not in o:
                    o.append(sl)
            ind = [o.index(sl) for o, sl in zip(slice_order, integer_indices)]
            # Add this chunk to the dask array.
            key = tuple([name] + ind)
            dsk[key] = (var.getview, outview, False)
        # Construct the dask array.
        chunks = [list(map(len, sl)) for sl in slice_order]
        arr = da.Array(dsk, name, chunks, dtype=var.dtype)
        # Wrap this into an xarray.DataArray (with metadata and named axes).
        out[var.name] = xr.DataArray(arr,
                                     dims=dims,
                                     attrs=var.atts,
                                     name=var.name)
    # Build the final xarray.Dataset.
    out = xr.Dataset(out, attrs=dataset.atts)
    # Re-decode the CF metadata on the xarray side.
    out = xr.conventions.decode_cf(out)
    return out
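Minimal usage sketch (requires xarray and dask; 't1' is the sample dataset assumed to ship with pygeode):

from pygeode.tutorial import t1
xds = to_xarray(t1)
print(xds)  # an xarray.Dataset with lazily-evaluated (dask-backed) variables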
Example #8
def open(filename,
         value_override={},
         dimtypes={},
         namemap={},
         varlist=[],
         cfmeta=True):
    from numpy import empty
    from ctypes import c_long, byref
    from pygeode.axis import DummyAxis
    from pygeode.dataset import asdataset
    from pygeode.formats import finalize_open

    f = HDF4_File(filename)

    num_datasets = c_long()
    num_global_attrs = c_long()
    ret = lib.SDfileinfo(f.sd_id, byref(num_datasets), byref(num_global_attrs))
    assert ret == 0
    num_datasets = num_datasets.value
    num_global_attrs = num_global_attrs.value
    global_atts = get_attributes(f.sd_id, num_global_attrs)

    # Get the HDF vars
    SD_arr = [None] * num_datasets
    for i in range(num_datasets):
        SD_arr[i] = HDF4_SD(f, i)

    # If there are 2 vars with names XXXX and XXXX:EOSGRID, then
    # ignore the first one and use the latter.
    # (Based on some GMAO files from the IPY dataset)
    SD_arr = [
        sd for sd in SD_arr if sd.name.endswith(':EOSGRID') or not any(
            sd2.name == sd.name + ':EOSGRID' for sd2 in SD_arr)
    ]

    # Find any 'axes'
    # (look for unique 1D vars which contain a particular dimension id)
    sd_1d = [sd for sd in SD_arr if sd.rank == 1]
    # Determine which dimensions map to a unique 1D array
    dimids = [sd.dimids[0] for sd in sd_1d]
    dimsds = [
        s for s in sd_1d if dimids.count(s.dimids[0]) == 1 or s.iscoord == 1
    ]

    # Load axis values
    for s in dimsds:
        s.values = empty(s.shape, numpy_type[s.type])
        load_values(s.sds_id, [0], s.shape, s.values)

    #for s in dimsds: print s; print s.values

    # Create axis objects
    from pygeode.axis import NamedAxis
    axes = [None] * len(dimsds)
    for i, s in enumerate(dimsds):
        # Append attributes for the axis
        atts = get_attributes(s.sds_id, s.natts)
        #    if len(atts) > 0: axes[i].atts = atts
        axes[i] = NamedAxis(s.values, s.name, atts=atts)

    # Reference axes by dimension ids
    axis_lookup = {}
    for i, a in enumerate(axes):
        axis_lookup[dimids[i]] = a

    # Add dummy axes for dimensions without coordinate info.
    for s in SD_arr:
        for d in s.dimids:
            if d not in axis_lookup:
                dimname, dimsize, dimtype, dim_natts = get_dim_info(d)
                axis_lookup[d] = DummyAxis(dimsize, dimname)

    # Create var objects
    vars = [None] * len(SD_arr)
    for i, s in enumerate(SD_arr):
        axes = [axis_lookup[d] for d in s.dimids]
        vars[i] = HDF4_Var(s, axes)
    vars = [v for v in vars if v.sd not in dimsds]

    # Return a dataset
    d = asdataset(vars)
    d.atts = global_atts

    return finalize_open(d, dimtypes, namemap, varlist, cfmeta)
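A hypothetical call; 'granule.hdf' is an illustrative filename.

ds = open('granule.hdf')
print(ds)  # variables on NamedAxis/DummyAxis dimensions, with global attributes attached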
Example #9
def open(filename,
         value_override={},
         dimtypes={},
         namemap={},
         varlist=[],
         cfmeta=True):
    # {{{
    ''' open (filename, [value_override = {}, dimtypes = {}, namemap = {}, varlist = [] ])

  Returns a Dataset of PyGeode variables contained in the specified files. The axes of the 
  variables are created from the dimensions of the NetCDF file. NetCDF variables in the file that do
  not correspond to dimensions are imported as PyGeode variables.

  filename - NetCDF file to open
  value_override - an optional dictionary with replacement values for one or more variables.
           The only known use for this dictionary is to avoid loading in values from a severely
           scattered variable (such as a 'time' axis or other slowest-varying dimension).
  dimtypes - a dictionary mapping dimension names to axis classes. The keys should be axis names
              as defined in the NetCDF file; values should be one of:
              1) an axis instance, 
              2) an axis class, or 
              3) a tuple of an axis class and a dictionary with keyword arguments to pass 
                to that axis' constructor              
              If no dictionary is included, an attempt is made to automatically identify the axis 
              types.
  namemap - an optional dictionary to map NetCDF variable names (keys) to PyGeode variable names
            (values); also works for axes/dimensions
  varlist - a list containing the variables that should be loaded into the data set (if the list is
            empty, all NetCDF variables will be loaded)
  Note: The identifiers used in varlist and dimtypes are the original names used in the NetCDF file, 
        not the names given in namemap.'''

    from os.path import exists
    from ctypes import c_int, byref
    from pygeode.dataset import asdataset
    from pygeode.formats import finalize_open
    from pygeode.axis import Axis
    if not filename.startswith('http://'):
        assert exists(
            filename), 'File open failed. "%s" does not exist.' % filename

    # Read variable dimensions and metadata from the file
    f = NCFile(filename)
    f.open()
    try:
        fileid = f.fileid

        # Get number of variables
        nvars = c_int()
        ret = lib.nc_inq_nvars(fileid, byref(nvars))
        assert ret == 0, lib.nc_strerror(ret)

        nvars = nvars.value

        # Construct all the variables, put in a list
        vars = [NCVar(f, i) for i in range(nvars)]

        # Construct a dataset from these Vars
        dataset = asdataset(vars)
        dataset.atts = get_attributes(fileid, -1)

    finally:
        f.close()

    # Add the object stuff from dimtypes to value_override, so we don't trigger a
    # load operation on those dims.
    # (We could use any values here, since they'll be overridden again later,
    #  but we might as well use something relevant).
    value_override = dict(value_override)  # don't use the default (static) empty dict
    for k, v in list(dimtypes.items()):
        if isinstance(v, Axis):
            value_override[k] = v.values

    #### Filters to apply to the data ####

    # Override values from the source?
    if len(value_override) > 0:
        dataset = override_values(dataset, value_override)

    # Set up the proper axes (get coordinate values / metadata from a 1D variable
    # with the same name as the dimension)
    dataset = dims2axes(dataset)

    return finalize_open(dataset, dimtypes, namemap, varlist, cfmeta)
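Hypothetical calls ('sample.nc' and the dimension names are illustrative; the keyword arguments are documented in the docstring above):

from pygeode.axis import Pres
ds = open('sample.nc')                           # auto-detect axis types
ds = open('sample.nc', dimtypes={'lev': Pres})   # force 'lev' to be a pressure axis
ds = open('sample.nc', namemap={'t': 'time'}, varlist=['t', 'temp'])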
Example #10
def save(filename,
         in_dataset,
         version=3,
         pack=None,
         compress=False,
         cfmeta=True,
         unlimited=None):
    # {{{
    from ctypes import c_int, c_long, byref
    from pygeode.view import View
    from pygeode.tools import combine_axes, point
    from pygeode.axis import Axis, DummyAxis
    import numpy as np
    from pygeode.progress import PBar, FakePBar
    from pygeode.formats import finalize_save
    from pygeode.dataset import asdataset

    assert isinstance(filename, str)

    in_dataset = asdataset(in_dataset)
    dataset = finalize_save(in_dataset, cfmeta, pack)

    # Version?
    if compress: version = 4
    assert version in (3, 4)

    fileid = c_int()

    vars = list(dataset.vars)
    # The output axes
    axes = combine_axes(v.axes for v in vars)

    # Include axes in the list of vars (for writing to netcdf).
    # Exclude axes which don't have any intrinsic values.
    vars = vars + [a for a in axes if not isinstance(a, DummyAxis)]
    #vars.extend(axes)

    # Variables (and axes) must all have unique names
    assert len(set([v.name for v in vars])) == len(
        vars), "vars must have unique names: %s" % [v.name for v in vars]

    if unlimited is not None:
        assert unlimited in [a.name for a in axes]

    # Functions for writing entire array
    allf = {
        1: lib.nc_put_var_schar,
        2: lib.nc_put_var_text,
        3: lib.nc_put_var_short,
        4: lib.nc_put_var_int,
        5: lib.nc_put_var_float,
        6: lib.nc_put_var_double,
        7: lib.nc_put_var_uchar,
        8: lib.nc_put_var_ushort,
        9: lib.nc_put_var_uint,
        10: lib.nc_put_var_longlong,
        11: lib.nc_put_var_ulonglong
    }

    # Functions for writing chunks
    chunkf = {
        1: lib.nc_put_vara_schar,
        2: lib.nc_put_vara_text,
        3: lib.nc_put_vara_short,
        4: lib.nc_put_vara_int,
        5: lib.nc_put_vara_float,
        6: lib.nc_put_vara_double,
        7: lib.nc_put_vara_uchar,
        8: lib.nc_put_vara_ushort,
        9: lib.nc_put_vara_uint,
        10: lib.nc_put_vara_longlong,
        11: lib.nc_put_vara_ulonglong
    }

    # Create the file
    if version == 3:
        ret = lib.nc_create(filename.encode('ascii'), 0, byref(fileid))
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    elif version == 4:
        ret = lib.nc_create(filename.encode('ascii'), 0x1000,
                            byref(fileid))  # 0x1000 = NC_NETCDF4
        if ret != 0: raise IOError(lib.nc_strerror(ret))
    else: raise Exception

    try:
        # Define the dimensions
        dimids = [None] * len(axes)
        for i, a in enumerate(axes):
            dimids[i] = c_int()
            if unlimited == a.name:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'), c_long(0),
                                     byref(dimids[i]))
            else:
                ret = lib.nc_def_dim(fileid, a.name.encode('ascii'),
                                     c_long(len(a)), byref(dimids[i]))
            assert ret == 0, lib.nc_strerror(ret)

        # Define the variables (including axes)
        chunks = [None] * len(vars)
        varids = [None] * len(vars)
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            # Generate the array of dimension ids for this var
            d = [dimids[list(axes).index(a)] for a in var.axes]
            # Make it C-compatible
            d = (c_int * var.naxes)(*d)
            varids[i] = c_int()
            ret = lib.nc_def_var(fileid, var.name.encode('ascii'), t,
                                 var.naxes, d, byref(varids[i]))
            assert ret == 0, lib.nc_strerror(ret)
            # Compress the data? (only supported for netcdf-4)
            if compress:
                ret = lib.nc_def_var_deflate(fileid, varids[i], 1, 1, 2)
                assert ret == 0, lib.nc_strerror(ret)

        # Write the attributes

        # global attributes
        put_attributes(fileid, -1, dataset.atts, version)

        # variable attributes
        for i, var in enumerate(vars):
            # modify axes to be netcdf friendly (CF-compliant, etc.)
            put_attributes(fileid, varids[i], var.atts, version)

        # Don't pre-fill the file
        oldmode = c_int()
        ret = lib.nc_set_fill(fileid, 256, byref(oldmode))
        assert ret == 0, "Can't set fill mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)
        # Finished defining the variables, about to start writing the values
        ret = lib.nc_enddef(fileid)
        assert ret == 0, "Error leaving define mode: %s (error %d)" % (
            lib.nc_strerror(ret), ret)

        # Relative progress of each variable
        sizes = [v.size for v in vars]
        prog = np.cumsum([0.] + sizes) / np.sum(sizes) * 100

        #  print "Saving '%s':"%filename
        pbar = PBar(message="Saving '%s':" % filename)
        #  pbar = FakePBar()
        # Write the data
        for i, var in enumerate(vars):
            t = nc_type[version][var.dtype.name]
            dtype = numpy_type[t]

            #    print 'writing', var.name

            # number of actual variables (non-axes) for determining our progress
            N = len([v for v in vars if not isinstance(v, Axis)])
            varpbar = pbar.subset(prog[i], prog[i + 1])

            views = list(View(var.axes).loop_mem())
            for j, v in enumerate(views):

                vpbar = varpbar.part(j, len(views))
                #      print '???', repr(str(v))

                # Should always be slices (since we're looping over whole thing contiguously?)
                for sl in v.slices:
                    assert isinstance(sl, slice)
                for sl in v.slices:
                    assert sl.step in (1, None)

                start = [sl.start for sl in v.slices]
                count = [sl.stop - sl.start for sl in v.slices]

                start = (c_long * var.naxes)(*start)
                count = (c_long * var.naxes)(*count)

                if isinstance(var, Axis):
                    assert len(start) == len(count) == 1
                    data = var.values
                    # var.values gives the *whole* axis, but under extreme
                    # conditions we may be looping over smaller pieces.
                    data = data[start[0]:start[0] + count[0]]
                    vpbar.update(100)
                else:
                    data = v.get(var, pbar=vpbar)

                # Ensure the data is stored contiguously in memory
                data = np.ascontiguousarray(data, dtype=dtype)
                ret = chunkf[t](fileid, varids[i], start, count, point(data))
                assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (
                    var.name, lib.nc_strerror(ret), ret)

    finally:
        # Finished
        lib.nc_close(fileid)
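Hypothetical calls (filenames are illustrative; 'dataset' is an existing pygeode Dataset with a 'time' axis):

save('out_v3.nc', dataset)                     # classic netcdf-3 file
save('out_v4.nc', dataset, compress=True)      # implies version 4, with deflate compression
save('out_rec.nc', dataset, unlimited='time')  # make 'time' the record (unlimited) dimension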
Example #11
def decode_cf (dataset, ignore=[]):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Axis, NamedAxis, Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode import timeutils
  from warnings import warn
  import re

#  dataset = asdataset(dataset, copy=True)
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # data for auxiliary arrays
  auxdict = {}
  for name in axisdict.keys(): auxdict[name] = {}

  # fill values / scale / offset (if applicable)
  fillvalues = {}
  scales = {}
  offsets = {}
  for v in varlist:
    name = v.name
    fillvalues[name] = None
    scales[name] = None
    offsets[name] = None

  for name,a in axisdict.items():

    # Skip over this axis?
    if name in ignore: continue

    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # just carry along and pass to the new Axis instance

    # Find any auxiliary arrays
    aux = auxdict[name]
    if 'ancillary_variables' in atts:
      _anc = atts.pop('ancillary_variables')
      remove_from_dataset = []  # vars to remove from the dataset
      for auxname in _anc.split(' '):
        assert any(v.name == auxname for v in varlist), "ancillary variable '%s' not found"%auxname
        newname = auxname
        # Remove the axis name prefix, if it was used
        if newname.startswith(name+'_'): newname = newname[len(name)+1:]
        aux[newname] = [v for v in varlist if v.name == auxname].pop().get()
        # Don't need this as a var anymore
        remove_from_dataset.append(auxname)

      # Remove some stuff
      varlist = [v for v in varlist if v.name not in remove_from_dataset]

    # Determine the best Axis subclass to use
#    cls = NamedAxis
    cls = type(a)

    # Generic 'axis' identifiers first
    if 'axis' in atts:
      _axis = atts.pop('axis')
      if _axis == 'X': cls = XAxis
      if _axis == 'Y': cls = YAxis
      if _axis == 'Z': cls = ZAxis
      if _axis == 'T': cls = TAxis

    # Check specific standard names, and also units?
    #TODO: don't *pop* the standard_name, units, etc. until the end of this routine - in case we didn't end up mapping them to an axis
    _ln = atts.get('long_name', a.name).lower()
    _st = atts.get('standard_name',_ln).lower()
    _units = atts.pop('units','')
    if _st == 'latitude' or _units == 'degrees_north': cls = Lat
    if _st == 'longitude' or _units == 'degrees_east': cls = Lon
    if _st == 'air_pressure' or _units in ('hPa','mbar'):
      cls = Pres
      # Don't need this in the metadata anymore (it will be put back in encode_cf)
      atts.pop('positive',None)

    if _st == 'atmosphere_hybrid_sigma_pressure_coordinate':
      #TODO: check formula_terms??
      #TODO: for ccc2nc files, look for long_name == "Model Level", use_AB = <formula>,
      #       A & B embedded as metadata or as data arrays not attached to ancillary_variables
      if 'A' in aux and 'B' in aux:
        cls = Hybrid
      else:
        warn ("Cannot create a proper Hybrid vertical axis, since 'A' and 'B' coefficients aren't found.")
    if (_st == 'time' or cls == TAxis or _units.startswith('days since') or _units.startswith('hours since') or _units.startswith('minutes since') or _units.startswith('seconds since')) and ' since ' in _units:
      _calendar = atts.pop('calendar', 'standard')
      if _calendar in ('standard', 'gregorian', 'proleptic_gregorian'): cls = StandardTime
      elif _calendar in ('365_day', 'noleap', '365day'): cls = ModelTime365
      elif _calendar in ('360_day', '360day'): cls = ModelTime360
      elif _calendar in ('none',): cls = Yearless
      else:
        warn ("unknown calendar '%s'"%_calendar)
        continue
      # Extract the time resolution (day, hour, etc), and the reference date
      res, date = re.match(r"([a-z]+)\s+since\s+(.*)", _units).groups()
      # Pluralize the increment (i.e. day->days)?
      if not res.endswith('s'): res += 's'
      # Extract the rest of the date
      date = date.rstrip()
      year, month, day, hour, minute, second = 0,1,1,0,0,0
      if len(date) > 0: year, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: month, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: day, date = re.match(r"(\d+)\s*(.*)", date).groups()
      if len(date) > 0: hour, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0: minute, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0 and date[0] != ' ': second, date = re.match(r"(\d+)(.*)", date).groups()
      # convert from strings to integers
      #TODO: milliseconds? time zone?
      year, month, day, hour, minute, second = map(int, [year, month, day, hour, minute, float(second)])
      # Create the time axis
      startdate={'year':year, 'month':month, 'day':day, 'hour':hour, 'minute':minute, 'second':second}
      axisdict[name] = cls(a.values, startdate=startdate, units=res, name=name, atts=atts)
      # Special case: start year=0 implies a climatology
      #NOTE: 'climatology' attribute not used, since we don't currently keep
      #      track of the interval that was used for the climatology.
      if year == 0:
        # Don't climatologize(?) the axis if there's more than a year
        if not all(axisdict[name].year == 0):
          warn ("cfmeta: data starts at year 0 (which usually indicates a climatology), but there's more than one year's worth of data!  Keeping it on a regular calendar.", stacklevel=3)
          continue
        axisdict[name] = timeutils.modify(axisdict[name], exclude='year')
      continue  # we've constructed the time axis, so move onto the next axis

    # put the units back (if we didn't use them)?
    if cls in [Axis, NamedAxis, XAxis, YAxis, ZAxis, TAxis] and _units != '': atts['units'] = _units

    # create new axis instance if need be (only if a is a generic axis, to prevent replacement of custom axes)
    # TODO: don't do this check.  This filter *should* be called before any
    # custom axis overrides, so we *should* be able to assume we only have
    # generic Axis objects at this point (at least, from the netcdf_new module)
    if (type(a) in (Axis, NamedAxis, XAxis, YAxis, ZAxis, TAxis)) and (cls != type(a)): 
      axisdict[name] = cls(values=a.values, name=name, atts=atts, **aux)

  # Apply these new axes to the variables
  # Check for fill values, etc.
  # Extract to a list first, then back to a dataset
  # (ensures the dataset axis list is up to date)
  for i,oldvar in enumerate(list(varlist)):
#    name = [n for n,v in dataset.vardict.iteritems() if v is oldvar].pop()
    name = oldvar.name
    atts = oldvar.atts.copy()
    plotatts = oldvar.plotatts.copy()
    fillvalue = [atts.pop(f,None) for f in ('FillValue', '_FillValue', 'missing_value')]
    fillvalue = [f for f in fillvalue if f is not None]
    fillvalue = fillvalue[0] if len(fillvalue) > 0 else None
    scale = atts.pop('scale_factor', None)
    offset = atts.pop('add_offset', None)

    varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes],
                    name=name, fillvalue=fillvalue, scale=scale, offset=offset, atts=atts, plotatts=plotatts)

  dataset = Dataset(varlist, atts=global_atts)

  return dataset
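A short sketch of the round trip with encode_cf (the variable names are illustrative): decoding restores proper axis classes from the CF metadata.

cf_ds = encode_cf(dataset)   # time axis flattened to a NamedAxis with 'units' metadata
ds = decode_cf(cf_ds)        # time axis reconstructed as a StandardTime (or similar) axis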
Example #12
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  #TODO

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in axisdict.items():
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
Example #13
def check_dataset (dataset):
  from pygeode.view import View
  from pygeode.tools import combine_axes
  from pygeode.progress import PBar
  from pygeode.dataset import asdataset
  import numpy as np

  # Make sure we have a dataset (in case we're sent a simple list of vars)
  dataset = asdataset(dataset)

  vars = list(dataset.vars)

  # Include axes in the list of vars (to check these values too)
  axes = combine_axes(v.axes for v in vars)
  vars.extend(axes)

  # Relative progress of each variable
  sizes = [v.size for v in vars]
  prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  pbar = PBar(message="Checking %s for I/O errors:"%repr(dataset))

  failed_indices = {}
  error_messages = {}

  # Loop over the data
  for i,var in enumerate(vars):

    varpbar = pbar.subset(prog[i], prog[i+1])

    # Scan the outer axis (record axis?) for failures.
    N = var.shape[0]
    failed_indices[var.name] = []
    error_messages[var.name] = []

    for j in range(N):
      vpbar = varpbar.part(j, N)
      try:
        # Try fetching the data, see if something fails
        var[j] if var.naxes == 1 else var[j,...]
      except Exception as e:
        failed_indices[var.name].append(j)
        error_messages[var.name].append(str(e))
      vpbar.update(100)

  # Print summary information for each variable
  everything_ok = True
  for var in vars:
    indices = failed_indices[var.name]
    messages = error_messages[var.name]
    if len(indices) == 0: continue

    everything_ok = False

    print "\nFailures encountered with variable '%s':"%var.name

    # Group together record indices that give the same error message
    unique_messages = []
    aggregated_indices = []
    for ind,msg in zip(indices,messages):
      if len(unique_messages) == 0 or msg != unique_messages[-1]:
        unique_messages.append(msg)
        aggregated_indices.append([ind])
      else:
        aggregated_indices[-1].append(ind)

    # Print each error message encountered (and the record indices that give the error)
    for ind,msg in zip(aggregated_indices,unique_messages):

      # Group together records that are consecutive (instead of printing each record separately)
      groups = []
      for i in ind:
        if len(groups) == 0 or i-1 not in groups[-1]:
          groups.append([i])
        else:
          groups[-1].append(i)
      for g in groups:
        print "=> at %s:\n    %s"% (var.axes[0].slice[g[0]:g[-1]+1], msg)

  if not everything_ok: raise Exception("Problem encountered with the dataset.")
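Typical usage sketch ('sample.nc' is an illustrative filename): scan a freshly opened file for unreadable records.

from pygeode.formats import netcdf
ds = netcdf.open('sample.nc')
check_dataset(ds)  # raises an Exception if any record fails to load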
Example #14
def multiple_regress(Xs, Y, axes=None, N_fac=None, output='B,p', pbar=None):
    # {{{
    r'''Computes least-squares multiple regression of Y against variables Xs.

  Parameters
  ==========
  Xs : list of :class:`Var` instances
    Variables to treat as independent regressors. Must have at least one axis
    in common with each other and with Y.

  Y : :class:`Var`
    The dependent variable. Must have at least one axis in common with the Xs.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, the regression
    is computed over all axes common to the Xs and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'B,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : tuple of floats or :class:`Var` instances.
    The return values are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficients of the regression can be obtained by ``ds.beta``). 
    
    A fit of the form :math:`Y = \sum_i \beta_i X_i + \epsilon` is assumed.
    Note that a constant term is not included by default. The following
    parameters can be returned:

    * 'B': Linear coefficients :math:`\beta_i` of each regressor
    * 'r2': Fraction of the variance in Y explained by all Xs (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sb': Standard deviation of each linear coefficient
    * 'covb': Covariance matrix of the linear coefficients
    * 'se': Standard deviation of residuals

    The outputs 'B', 'p', and 'sb' will produce as many outputs as there are
    regressors. 

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.4. The p-value 'p' is computed using the t-statistic appropriate
  for the multi-variate normal estimator :math:`\hat{\vec{a}}` given in section
  8.4.2; it corresponds to the probability of obtaining the regression
  coefficient under the null hypothesis that there is no linear relationship.
  Note this may not be the best way to determine if a given parameter is
  contributing a significant fraction to the explained variance of Y.  The
  variances 'se' and 'sb' are :math:`\hat{\sigma}_E` and the square root of the
  diagonal elements of :math:`\hat{\sigma}^2_E (\chi^T\chi)` in von Storch and
  Zwiers, respectively.  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npsum
    from pygeode.view import View

    # Split output request now
    ovars = ['B', 'r2', 'p', 'sb', 'covb', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from multiple_regress. Possible outputs are %s.'
            % str(ovars))

    Nr = len(Xs)

    Xaxes = combine_axes(Xs)

    srcaxes = combine_axes([Xaxes, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [Xaxes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            ia = whichaxis(srcaxes, a)
            if ia in riaxes: ri_new.append(ia)
            else:
                raise KeyError(
                    'One of the Xs or Y does not have the axis %s.' % a)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = tuple([srcaxes[i] for i in oiaxes])
    inaxes = oaxes + tuple([srcaxes[i] for i in riaxes])
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(
        riaxes) > 0, 'Regressors and %s share no axes to be regressed over' % (
            Y.name)

    # Construct work arrays
    os = oview.shape
    os1 = os + (Nr, )
    os2 = os + (Nr, Nr)
    y = np.zeros(os, 'd')
    yy = np.zeros(os, 'd')
    xy = np.zeros(os1, 'd')
    xx = np.zeros(os2, 'd')
    xxinv = np.zeros(os2, 'd')

    N = np.prod([len(srcaxes[i]) for i in riaxes])

    # Accumulate data
    for outsl, datatuple in loopover(Xs + [Y], oview, inaxes, pbar=pbar):
        ydata = datatuple[-1].astype('d')
        xdata = [datatuple[i].astype('d') for i in range(Nr)]
        y[outsl] += npsum(ydata, siaxes)
        yy[outsl] += npsum(ydata**2, siaxes)
        for i in range(Nr):
            xy[outsl + (i, )] += npsum(xdata[i] * ydata, siaxes)
            for j in range(i + 1):
                xx[outsl + (i, j)] += npsum(xdata[i] * xdata[j], siaxes)

    # Fill in opposite side of xTx
    for i in range(Nr):
        for j in range(i):
            xx[..., j, i] = xx[..., i, j]

    # Compute the inverse of the covariance matrix (could be done more intelligently?
    # certainly the python loop over oview does not help)
    xx = xx.reshape(-1, Nr, Nr)
    xxinv = xxinv.reshape(-1, Nr, Nr)
    for i in range(xx.shape[0]):
        xxinv[i, :, :] = np.linalg.inv(xx[i, :, :])
    xx = xx.reshape(os2)
    xxinv = xxinv.reshape(os2)

    beta = np.sum(xy.reshape(os + (1, Nr)) * xxinv, -1)
    vare = np.sum(xy * beta, -1)

    if N_fac is None: N_eff = N
    else: N_eff = N // N_fac

    sigbeta = [
        np.sqrt((yy - vare) * xxinv[..., i, i] / N_eff) for i in range(Nr)
    ]

    xns = [X.name if X.name != '' else 'X%d' % i for i, X in enumerate(Xs)]
    yn = Y.name if Y.name != '' else 'Y'

    from .var import Var
    from .dataset import asdataset
    from .axis import NonCoordinateAxis

    ra = NonCoordinateAxis(values=np.arange(Nr),
                           regressor=xns,
                           name='regressor')
    ra2 = NonCoordinateAxis(values=np.arange(Nr),
                            regressor=xns,
                            name='regressor2')
    Nd = len(oaxes)

    rvs = []

    if 'B' in output:
        B = Var(oaxes + (ra, ), values=beta, name='beta')
        B.atts['longname'] = 'regression coefficient'
        rvs.append(B)

    if 'r2' in output:
        vary = (yy - y**2 / N)
        R2 = 1 - (yy - vare) / vary
        R2 = Var(oaxes, values=R2, name='R2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        p = [
            2. *
            (1. - tdist.cdf(np.abs(beta[..., i] / sigbeta[i]), N_eff - Nr))
            for i in range(Nr)
        ]
        p = np.transpose(np.array(p), [Nd] + list(range(Nd)))
        p = Var(oaxes + (ra, ), values=p, name='p')
        p.atts['longname'] = 'p-values'
        rvs.append(p)

    if 'sb' in output:
        sigbeta = np.transpose(np.array(sigbeta), [Nd] + list(range(Nd)))
        sb = Var(oaxes + (ra, ), values=sigbeta, name='sb')
        sb.atts['longname'] = 'standard deviation of linear coefficients'
        rvs.append(sb)

    if 'covb' in output:
        sigmat = np.zeros(os2, 'd')
        for i in range(Nr):
            for j in range(Nr):
                #sigmat[..., i, j] = np.sqrt((yy - vare) * xxinv[..., i, j] / N_eff)
                sigmat[..., i, j] = (yy - vare) * xxinv[..., i, j] / N_eff
        covb = Var(oaxes + (ra, ra2), values=sigmat, name='covb')
        covb.atts['longname'] = 'Covariance matrix of the linear coefficients'
        rvs.append(covb)

    if 'se' in output:
        se = np.sqrt((yy - vare) / N_eff)
        se = Var(oaxes, values=se, name='se')
        se.atts['longname'] = 'standard deviation of residual'
        rvs.append(se)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'multiple linear regression parameters for %s regressed against %s' % (
            yn, xns)

    return ds
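A synthetic sketch (all names and values are illustrative): regress y on two regressors sharing a time axis; the recovered coefficients should be near (2, -1).

import numpy as np
from pygeode.axis import TAxis
from pygeode.var import Var

t = TAxis(np.arange(200))
x1 = Var((t,), values=np.random.randn(200), name='x1')
x2 = Var((t,), values=np.random.randn(200), name='x2')
y = Var((t,), values=2.0*x1.get() - x2.get() + 0.1*np.random.randn(200), name='y')

ds = multiple_regress([x1, x2], y, output='B,p')
print(ds.beta[:], ds.p[:])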
Example #15
def decode_cf (dataset, ignore=[]):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Axis, NamedAxis, Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, Station, DummyAxis, NonCoordinateAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode import timeutils
  from warnings import warn
  import re

#  dataset = asdataset(dataset, copy=True)
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # Decode string variables
  for i,var in enumerate(varlist):
    if var.name.endswith("_name") and var.dtype.name in ("string8","bytes8") and var.axes[-1].name.endswith("_strlen"):
      varlist[i] = decode_string_var(var)

  # data for auxiliary arrays
  auxdict = {}
  for name in axisdict.keys(): auxdict[name] = {}

  # fill values / scale / offset (if applicable)
  fillvalues = {}
  scales = {}
  offsets = {}
  for v in varlist:
    name = v.name
    fillvalues[name] = None
    scales[name] = None
    offsets[name] = None

  for name,a in list(axisdict.items()):

    # Skip over this axis?
    if name in ignore: continue

    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # just carry along and pass to new Axis instance

    # Find any auxiliary arrays
    aux = auxdict[name]
    if 'ancillary_variables' in atts:
      _anc = atts.pop('ancillary_variables')
      remove_from_dataset = []  # vars to remove from the dataset
      for auxname in _anc.split(' '):
        assert any(v.name == auxname for v in varlist), "ancillary variable '%s' not found"%auxname
        newname = auxname
        # Remove the axis name prefix, if it was used
        if newname.startswith(name+'_'): newname = newname[len(name)+1:]
        aux[newname] = [v for v in varlist if v.name == auxname].pop().get()
        # Don't need this as a var anymore
        remove_from_dataset.append(auxname)

      # Remove some stuff
      varlist = [v for v in varlist if v.name not in remove_from_dataset]

    # Determine the best Axis subclass to use
#    cls = NamedAxis
    cls = type(a)

    # Generic 'axis' identifiers first
    if 'axis' in atts:
      _axis = atts.pop('axis')
      if _axis == 'X': cls = XAxis
      if _axis == 'Y': cls = YAxis
      if _axis == 'Z': cls = ZAxis
      if _axis == 'T': cls = TAxis

    # Check specific standard names, and also units?
    #TODO: don't *pop* the standard_name, units, etc. until the end of this routine - in case we didn't end up mapping them to an axis
    _ln = atts.get('long_name', a.name).lower()
    _st = atts.get('standard_name',_ln).lower()
    _units = atts.pop('units','')
    if _st == 'latitude' or _units == 'degrees_north': cls = Lat
    if _st == 'longitude' or _units == 'degrees_east': cls = Lon
    if _st == 'air_pressure' or _units in ('hPa','mbar'):
      cls = Pres
      # Don't need this in the metadata anymore (it will be put back in encode_cf)
      atts.pop('positive',None)

    if _st == 'atmosphere_hybrid_sigma_pressure_coordinate':
      #TODO: check formula_terms??
      #TODO: for ccc2nc files, look for long_name == "Model Level", use_AB = <formula>,
      #       A & B embedded as metadata or as data arrays not attached to ancillary_variables
      if 'A' in aux and 'B' in aux:
        cls = Hybrid
      else:
        warn ("Cannot create a proper Hybrid vertical axis, since 'A' and 'B' coefficients aren't found.")

    if _st == 'station':
      cls = Station

    if (_st == 'time' or cls == TAxis or _units.startswith('days since') or _units.startswith('hours since') or _units.startswith('minutes since') or _units.startswith('seconds since')) and ' since ' in _units:
      _calendar = atts.pop('calendar', 'standard')
      if _calendar in ('standard', 'gregorian', 'proleptic_gregorian'): cls = StandardTime
      elif _calendar in ('365_day', 'noleap', '365day'): cls = ModelTime365
      elif _calendar in ('360_day', '360day'): cls = ModelTime360
      elif _calendar in ('none',): cls = Yearless
      else:
        warn ("unknown calendar '%s'"%_calendar)
        continue
      # Extract the time resolution (day, hour, etc), and the reference date
      res, date = re.match(r"([a-z]+)\s+since\s+(.*)", _units).groups()
      # Pluralize the increment (i.e. day->days)?
      if not res.endswith('s'): res += 's'
      # Extract the rest of the date
      date = date.rstrip()
      year, month, day, hour, minute, second = 0,1,1,0,0,0
      if len(date) > 0: year, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: month, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: day, date = re.match(r"(\d+)\s*(.*)", date).groups()
      if date.startswith('T'): date = date[1:]
      if len(date) > 0: hour, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0: minute, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0 and date[0] != ' ': second, date = re.match(r"(\d+)(.*)", date).groups()
      # convert from strings to integers
      #TODO: milliseconds? time zone?
      year, month, day, hour, minute, second = list(map(int, [year, month, day, hour, minute, float(second)]))
      # Create the time axis
      startdate={'year':year, 'month':month, 'day':day, 'hour':hour, 'minute':minute, 'second':second}
      axisdict[name] = cls(a.values, startdate=startdate, units=res, name=name, atts=atts)
      # Special case: start year=0 implies a climatology
      #NOTE: 'climatology' attribute not used, since we don't currently keep
      #      track of the interval that was used for the climatology.
      if year == 0:
        # Don't climatologize(?) the axis if there's more than a year
        if not all(axisdict[name].year == 0):
          warn ("cfmeta: data starts at year 0 (which usually indicates a climatology), but there's more than one year's worth of data!  Keeping it on a regular calendar.", stacklevel=3)
          continue
        axisdict[name] = timeutils.modify(axisdict[name], exclude='year')
      continue  # we've constructed the time axis, so move onto the next axis

    # Check for a match from the custom axes (from add-ons).
    if _st in custom_axes:
      cls = custom_axes[_st]

    # Find any other information that should be put inside this axis.
    # Look for anything that's identified as a coordinate or ancillary
    # variable, and that has this axis as its only dimension.
    dependencies = set()
    for var in varlist:
      if var.hasaxis(a.name):
        dependencies.update(var.atts.get('coordinates','').split())
        dependencies.update(var.atts.get('ancillary_variables','').split())
    # Look up these dependencies.  Only consider 1D information, since we
    # don't yet have a way to associate multidimensional arrays as auxarrays
    # in an axis.
    dependencies = [v for v in varlist if v.name in dependencies and v.naxes == 1 and v.hasaxis(a.name)]

    # If we found any such information, then this is no longer a simple
    # "dummy" axis.
    if issubclass(cls, DummyAxis) and len(dependencies) > 0:
      cls = NonCoordinateAxis

    # Attach the information from these dependent variables as auxiliary arrays.
    aux.update((dep.name,dep.get()) for dep in dependencies)

    # Anything that got attached to this axis should be removed from the
    # list of variables, since it's just extra info specific to the axis.
    varlist = [v for v in varlist if v.name not in aux]


    # put the units back (if we didn't use them)?
    if cls in [Axis, NamedAxis, XAxis, YAxis, ZAxis, TAxis] and _units != '': atts['units'] = _units

    # create new axis instance if need be.
    if cls != type(a):
      axisdict[name] = cls(values=a.values, name=name, atts=atts, **aux)

  # Apply these new axes to the variables
  # Check for fill values, etc.
  # Extract to a list first, then back to a dataset
  # (ensures the dataset axis list is up to date)
  for i,oldvar in enumerate(list(varlist)):
#    name = [n for n,v in six.iteritems(dataset.vardict) if v is oldvar].pop()
    name = oldvar.name
    atts = oldvar.atts.copy()
    plotatts = oldvar.plotatts.copy()
    fillvalue = [atts.pop(f,None) for f in ('FillValue', '_FillValue', 'missing_value')]
    fillvalue = [_f for _f in fillvalue if _f]
    fillvalue = fillvalue[0] if len(fillvalue) > 0 else None
    scale = atts.pop('scale_factor', None)
    offset = atts.pop('add_offset', None)

    varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes],
                    name=name, fillvalue=fillvalue, scale=scale, offset=offset, atts=atts, plotatts=plotatts)

  dataset = Dataset(varlist, atts=global_atts)

  return dataset
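
A minimal, self-contained sketch of decode_cf in action (a sketch only: it assumes NamedAxis and Var accept the keyword arguments used elsewhere in these examples, and that asdataset accepts a plain list of Vars):

import numpy as np
from pygeode.axis import NamedAxis
from pygeode.var import Var

# A raw 'time' axis carrying only CF metadata (units string is illustrative)
t = NamedAxis(values=np.arange(4, dtype='d'), name='time',
              atts={'units': 'days since 2000-01-01 00:00:00'})
v = Var((t,), values=np.arange(4, dtype='d'), name='v')

ds = decode_cf([v])
print(ds.time)   # expected: a StandardTime axis reconstructed from the units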
Example #20
0
def regress(X, Y, axes=None, N_fac=None, output='m,b,p', pbar=None):
    # {{{
    r'''Computes least-squares linear regression of Y against X.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to regress. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute the regression; if nothing is specified, it is
    computed over all axes common to X and Y.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom; the effective
    number will be given by the number estimated from the dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,b,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names of the 
    variables match the output request string (i.e. if ``ds`` is the returned dataset, the 
    linear coefficient of the regression can be obtained by ``ds.m``). 
    
    A fit of the form :math:`Y = m X + b + \epsilon` is assumed, and the
    following parameters can be returned:

    * 'm': Linear coefficient of the regression
    * 'b': Constant coefficient of the regression
    * 'r2': Fraction of the variance in Y explained by X (:math:`R^2`)
    * 'p': p-value of regression; see notes.
    * 'sm': Standard deviation of linear coefficient estimate
    * 'se': Standard deviation of residuals

  Notes
  =====
  The statistics described are computed following von Storch and Zwiers 1999,
  section 8.3. The p-value 'p' is computed using the t-statistic given in
  section 8.3.8, and confidence intervals for the slope and intercept can be
  computed from 'se' and 'sm' (:math:`\hat{\sigma}_E` and
  :math:`\hat{\sigma}_E/\sqrt{S_{XX}}` in von Storch and Zwiers, respectively).
  The data is assumed to be normally distributed.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['m', 'b', 'r2', 'p', 'sm', 'se']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from regression. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert len(riaxes) > 0, '%s and %s share no axes to be regressed over' % (
        X.name, Y.name)

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Sum of weights
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    if N_fac is None:
        N_eff = Na - 2.
    else:
        N_eff = Na / N_fac - 2.

    nmsk = (N_eff > 0.)

    xx[nmsk] -= (x * x)[nmsk] / Na[nmsk]
    yy[nmsk] -= (y * y)[nmsk] / Na[nmsk]
    xy[nmsk] -= (x * y)[nmsk] / Na[nmsk]

    dmsk = (xx > 0.)

    m = np.zeros(oview.shape, 'd')
    b = np.zeros(oview.shape, 'd')
    r2 = np.zeros(oview.shape, 'd')

    m[dmsk] = xy[dmsk] / xx[dmsk]
    b[nmsk] = (y[nmsk] - m[nmsk] * x[nmsk]) / Na[nmsk]

    r2den = xx * yy
    d2msk = (r2den > 0.)

    r2[d2msk] = xy[d2msk]**2 / r2den[d2msk]

    sige = np.zeros(oview.shape, 'd')
    sigm = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    sige[nmsk] = (yy[nmsk] - m[nmsk] * xy[nmsk]) / N_eff[nmsk]
    sigm[dmsk] = np.sqrt(sige[dmsk] / xx[dmsk])
    sige[nmsk] = np.sqrt(sige[nmsk])
    t[dmsk] = np.abs(m[dmsk]) / sigm[dmsk]
    p[nmsk] = 2. * (1. - tdist.cdf(t[nmsk], N_eff[nmsk]))

    msk = nmsk & dmsk

    m[~msk] = np.nan
    b[~msk] = np.nan
    sige[~msk] = np.nan
    sigm[~msk] = np.nan
    p[~msk] = np.nan

    msk = nmsk & d2msk
    r2[~msk] = np.nan

    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        M = Var(oaxes, values=m, name='m')
        M.atts['longname'] = 'slope'
        rvs.append(M)

    if 'b' in output:
        B = Var(oaxes, values=b, name='b')
        B.atts['longname'] = 'intercept'
        rvs.append(B)

    if 'r2' in output:
        R2 = Var(oaxes, values=r2, name='r2')
        R2.atts['longname'] = 'fraction of variance explained'
        rvs.append(R2)

    if 'p' in output:
        P = Var(oaxes, values=p, name='p')
        P.atts['longname'] = 'p-value'
        rvs.append(P)

    if 'sm' in output:
        SM = Var(oaxes, values=sigm, name='sm')
        SM.atts['longname'] = 'standard deviation of slope parameter'
        rvs.append(SM)

    if 'se' in output:
        SE = Var(oaxes, values=sige, name='se')
        SE.atts['longname'] = 'standard deviation of residual'
        rvs.append(SE)

    ds = asdataset(rvs)
    ds.atts[
        'description'] = 'linear regression parameters for %s regressed against %s' % (
            yn, xn)

    return ds
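
A short, self-contained usage sketch (data construction follows the Var/NamedAxis patterns used elsewhere in these examples; all names are hypothetical):

import numpy as np
from pygeode.axis import NamedAxis
from pygeode.var import Var

t = NamedAxis(values=np.arange(500, dtype='d'), name='time')
lat = NamedAxis(values=np.array([-45., 0., 45.]), name='lat')
X = Var((t, lat), values=np.random.randn(500, 3), name='X')
Y = Var((t, lat), values=0.5 * X.get() + np.random.randn(500, 3), name='Y')

ds = regress(X, Y, axes=['time'], output='m,b,p')
print(ds.m.get())   # slope at each latitude, expected near 0.5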
Example #21
0
def isnonzero(X, axes=None, alpha=0.05, N_fac=None, output='m,p', pbar=None):
    # {{{
    r'''Computes the mean value of X and statistics relevant for a test against
  the hypothesis that it is 0.

  Parameters
  ==========
  X : :class:`Var`
    Variable to average.

  axes : list, optional
    Axes over which to compute the mean; if nothing is specified, the mean is
    computed over all axes.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom;
    the effective number will be given by the number estimated from the dataset
    divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'm,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the mean value can be obtained through ``ds.m``).
    The following quantities can be calculated.

    * 'm': The mean value of X
    * 'p': The probability of the computed value if the population mean was zero
    * 'ci': The confidence interval of the mean at the level specified by alpha

    The results are provided as :class:`Var` objects in a dataset.

  See Also
  ========
  difference

  Notes
  =====
  The number of effective degrees of freedom can be scaled as in :meth:`difference`. 
  The p-value and confidence interval are computed for the t-statistic defined in 
  eq (6.61) of von Storch and Zwiers 1999.'''

    from pygeode.tools import combine_axes, whichaxis, loopover, npsum, npnansum
    from pygeode.view import View

    # Default to computing the mean over all axes if none were specified
    if axes is None:
        axes = [a.name for a in X.axes]
    riaxes = [X.whichaxis(n) for n in axes]
    raxes = [a for i, a in enumerate(X.axes) if i in riaxes]
    oaxes = [a for i, a in enumerate(X.axes) if i not in riaxes]
    oview = View(oaxes)

    N = np.prod([len(X.axes[i]) for i in riaxes])

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert N > 1, '%s has only one element along the reduction axes' % X.name

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, riaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, riaxes)], 0)

        # Sum of weights (kludge to get masking right)
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xdata), riaxes)], 0)

    imsk = (Na > 1.)  # need at least two points for the sample variance below

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    xx[imsk] -= x[imsk]**2 / Na[imsk]
    xx[imsk] = xx[imsk] / (Na[imsk] - 1)

    x[imsk] /= Na[imsk]

    if N_fac is not None:
        eN = N // N_fac
        eNa = Na // N_fac
    else:
        eN = N
        eNa = Na

    sdom = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    t = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    sdom[imsk] = np.sqrt(xx[imsk] / eNa[imsk])
    dmsk = (sdom > 0.)

    t[dmsk] = np.abs(x[dmsk]) / sdom[dmsk]
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], eNa[imsk] - 1))
    ci[imsk] = tdist.ppf(1. - alpha / 2, eNa[imsk] - 1) * sdom[imsk]

    name = X.name if X.name != '' else 'X'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'm' in output:
        m = Var(oaxes, values=x, name='m')
        m.atts['longname'] = 'Mean value of %s' % (name, )
        rvs.append(m)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value of test %s is 0' % (name, )
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence interval of the mean value of %s' % (
                name, )
        rvs.append(ci)

    return asdataset(rvs)
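
A usage sketch (reusing the hypothetical X from the regress sketch above):

# ds = isnonzero(X, axes=['time'], alpha=0.05, output='m,p,ci')
# ds.m.get()   # per-latitude mean, with ds.p and ds.ci alongside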
Example #22
0
def save (filename, in_dataset, version=3, pack=None, compress=False, cfmeta = True, unlimited=None):
# {{{
  from ctypes import c_int, c_long, byref
  from pygeode.view import View
  from pygeode.tools import combine_axes, point
  from pygeode.axis import Axis, DummyAxis
  import numpy as np
  from pygeode.progress import PBar, FakePBar
  from pygeode.formats import finalize_save
  from pygeode.dataset import asdataset

  assert isinstance(filename,str)

  in_dataset = asdataset(in_dataset)
  dataset = finalize_save(in_dataset, cfmeta, pack)

  # Version?
  if compress: version = 4
  assert version in (3,4)

  fileid = c_int()

  vars = list(dataset.vars)
  # The output axes
  axes = combine_axes(v.axes for v in vars)

  # Include axes in the list of vars (for writing to netcdf).
  # Exclude axes which don't have any intrinsic values.
  vars = vars + [a for a in axes if not isinstance(a,DummyAxis)]
  #vars.extend(axes)

  # Variables (and axes) must all have unique names
  assert len(set([v.name for v in vars])) == len(vars), "vars must have unique names: %s"% [v.name for v in vars]

  if unlimited is not None:
    assert unlimited in [a.name for a in axes]

  # Functions for writing entire array
  allf = {1:lib.nc_put_var_schar, 2:lib.nc_put_var_text, 3:lib.nc_put_var_short,
       4:lib.nc_put_var_int, 5:lib.nc_put_var_float,
       6:lib.nc_put_var_double, 7:lib.nc_put_var_uchar,
       8:lib.nc_put_var_ushort, 9:lib.nc_put_var_uint,
      10:lib.nc_put_var_longlong, 11:lib.nc_put_var_ulonglong}

  # Functions for writing chunks
  chunkf = {1:lib.nc_put_vara_schar, 2:lib.nc_put_vara_text, 3:lib.nc_put_vara_short,
       4:lib.nc_put_vara_int, 5:lib.nc_put_vara_float,
       6:lib.nc_put_vara_double, 7:lib.nc_put_vara_uchar,
       8:lib.nc_put_vara_ushort, 9:lib.nc_put_vara_uint,
      10:lib.nc_put_vara_longlong, 11:lib.nc_put_vara_ulonglong}


  # Create the file
  if version == 3:
    ret = lib.nc_create (filename.encode('ascii'), 0, byref(fileid))
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  elif version == 4:
    ret = lib.nc_create (filename.encode('ascii'), 0x1000, byref(fileid))  # 0x1000 = NC_NETCDF4
    if ret != 0: raise IOError(lib.nc_strerror(ret))
  else: raise Exception

  try:
    # Define the dimensions
    dimids = [None] * len(axes)
    for i,a in enumerate(axes):
      dimids[i] = c_int()
      if unlimited == a.name:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(0), byref(dimids[i]))
      else:
        ret = lib.nc_def_dim (fileid, a.name.encode('ascii'), c_long(len(a)), byref(dimids[i]))
      assert ret == 0, lib.nc_strerror(ret)

    # Define the variables (including axes)
    chunks = [None] * len(vars)
    varids = [None] * len(vars)
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      # Generate the array of dimension ids for this var
      d = [dimids[list(axes).index(a)] for a in var.axes]
      # Make it C-compatible
      d = (c_int * var.naxes)(*d)
      varids[i] = c_int()
      ret = lib.nc_def_var (fileid, var.name.encode('ascii'), t, var.naxes, d, byref(varids[i]))
      assert ret == 0, lib.nc_strerror(ret)
      # Compress the data? (only supported by the netcdf4 format)
      if compress:
        ret = lib.nc_def_var_deflate (fileid, varids[i], 1, 1, 2)
        assert ret == 0, lib.nc_strerror(ret)

    # Write the attributes

    # global attributes
    put_attributes (fileid, -1, dataset.atts, version)

    # variable attributes
    for i, var in enumerate(vars):
      # modify axes to be netcdf friendly (CF-compliant, etc.)
      put_attributes (fileid, varids[i], var.atts, version)

    # Don't pre-fill the file
    oldmode = c_int()
    ret = lib.nc_set_fill (fileid, 256, byref(oldmode))
    assert ret == 0, "Can't set fill mode: %s (error %d)" % (lib.nc_strerror(ret), ret)
    # Finished defining the variables, about to start writing the values
    ret = lib.nc_enddef (fileid)
    assert ret == 0, "Error leaving define mode: %s (error %d)" % (lib.nc_strerror(ret), ret)

    # Relative progress of each variable
    sizes = [v.size for v in vars]
    prog = np.cumsum([0.]+sizes) / np.sum(sizes) * 100

  #  print "Saving '%s':"%filename
    pbar = PBar(message="Saving '%s':"%filename)
  #  pbar = FakePBar()
    # Write the data
    for i, var in enumerate(vars):
      t = nc_type[version][var.dtype.name]
      dtype = numpy_type[t]

  #    print 'writing', var.name

      # number of actual variables (non-axes) for determining our progress
      N = len([v for v in vars if not isinstance(v,Axis)])
      varpbar = pbar.subset(prog[i], prog[i+1])

      views = list(View(var.axes).loop_mem())
      for j,v in enumerate(views):

        vpbar = varpbar.part(j, len(views))
  #      print '???', repr(str(v))

        # Should always be slices (since we're looping over whole thing contiguously?)
        for sl in v.slices: assert isinstance(sl, slice)
        for sl in v.slices: assert sl.step in (1,None)

        start = [sl.start for sl in v.slices]
        count = [sl.stop - sl.start for sl in v.slices]

        start = (c_long*var.naxes)(*start)
        count = (c_long*var.naxes)(*count)

        if isinstance(var, Axis):
          assert len(start) == len(count) == 1
          data = var.values
          data = data[start[0]:start[0]+count[0]] # the above gives us the *whole* axis,
                                                  # but under extreme conditions we may be looping over smaller pieces
          vpbar.update(100)
        else: data = v.get(var, pbar=vpbar)

        # Ensure the data is stored contiguously in memory
        data = np.ascontiguousarray(data, dtype=dtype)
        ret = chunkf[t](fileid, varids[i], start, count, point(data))
        assert ret == 0, "Error writing var '%s' to netcdf: %s (error %d)" % (var.name, lib.nc_strerror(ret), ret)

  finally:
    # Finished
    lib.nc_close(fileid)
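
A usage sketch (the file name, dataset, and axis name are hypothetical; compress=True forces the netcdf4 format, per the version logic above):

# save('output.nc', ds)                                   # plain netcdf3
# save('output.nc', ds, compress=True, unlimited='time')  # netcdf4, record dim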
Example #23
0
def open(filename, value_override = {}, dimtypes = {}, namemap = {},  varlist = [], cfmeta = True):
# {{{
  ''' open (filename, [value_override = {}, dimtypes = {}, namemap = {}, varlist = [] ])

  Returns a Dataset of PyGeode variables contained in the specified files. The axes of the 
  variables are created from the dimensions of the NetCDF file. NetCDF variables in the file that do
  not correspond to dimensions are imported as PyGeode variables.

  filename - NetCDF file to open
  value_override - an optional dictionary with replacement values for one or more variables.
           The only known use for this dictionary is to avoid loading in values from a severely
           scattered variable (such as a 'time' axis or other slowest-varying dimension).
  dimtypes - a dictionary mapping dimension names to axis classes. The keys should be axis names
              as defined in the NetCDF file; values should be one of:
              1) an axis instance, 
              2) an axis class, or 
              3) a tuple of an axis class and a dictionary with keyword arguments to pass 
                to that axis' constructor              
              If no dictionary is included, an attempt is made to automatically identify the axis 
              types.
  namemap - an optional dictionary to map NetCDF variable names (keys) to PyGeode variable names
            (values); also works for axes/dimensions
  varlist - a list containing the variables that should be loaded into the data set (if the list is
            empty, all NetCDF variables will be loaded)
  Note: The identifiers used in varlist and dimtypes are the original names used in the NetCDF file, 
        not the names given in namemap.'''

  from os.path import exists
  from ctypes import c_int, byref
  from pygeode.dataset import asdataset
  from pygeode.formats import finalize_open
  from pygeode.axis import Axis
  if not filename.startswith('http://'):
    assert exists(filename), 'File open failed. "%s" does not exist.' % filename


  # Read variable dimensions and metadata from the file
  f = NCFile(filename)
  f.open()
  try:
    fileid = f.fileid

    # Get number of variables
    nvars = c_int()
    ret = lib.nc_inq_nvars(fileid, byref(nvars))
    assert ret == 0, lib.nc_strerror(ret)

    nvars = nvars.value

    # Construct all the variables, put in a list
    vars = [NCVar(f,i) for i in range(nvars)]

    # Construct a dataset from these Vars
    dataset = asdataset(vars)
    dataset.atts = get_attributes (fileid, -1)

  finally:
    f.close()

  # Add the object stuff from dimtypes to value_override, so we don't trigger a
  # load operation on those dims.
  # (We could use any values here, since they'll be overridden again later,
  #  but we might as well use something relevant).
  value_override = dict(value_override)  # don't use  the default (static) empty dict
  for k,v in list(dimtypes.items()):
    if isinstance(v,Axis):
      value_override[k] = v.values

  #### Filters to apply to the data ####

  # Override values from the source?
  if len(value_override) > 0:
    dataset = override_values(dataset, value_override)

  # Set up the proper axes (get coordinate values / metadata from a 1D variable
  # with the same name as the dimension)
  dataset = dims2axes(dataset)

  return finalize_open(dataset, dimtypes, namemap, varlist, cfmeta)
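
A usage sketch (the file name, variable names, and axis mapping are all hypothetical):

# ds = open('model_run.nc')
# ds = open('model_run.nc', varlist=['Temp'], namemap={'Temp': 'T'})
# Force the 'lev' dimension to be interpreted as a pressure axis:
# from pygeode.axis import Pres
# ds = open('model_run.nc', dimtypes={'lev': Pres})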
Example #24
0
def to_xarray(dataset):
  """
  Converts a PyGeode Dataset into an xarray Dataset.

  Parameters
  ----------
  dataset : pygeode.Dataset
    The dataset to be converted.

  Returns
  -------
  out : xarray.Dataset
    An object which can be used with the xarray package.
  """
  from pygeode.dataset import asdataset
  from pygeode.formats.cfmeta import encode_cf
  from pygeode.view import View
  from dask.base import tokenize
  import dask.array as da
  import xarray as xr
  dataset = asdataset(dataset)
  # Encode the axes/variables with CF metadata.
  dataset = encode_cf(dataset)
  out = dict()
  # Loop over each axis and variable.
  for var in list(dataset.axes) + list(dataset.vars):
    # Generate a unique name to identify it with dask.
    name = var.name + "-" + tokenize(var)
    dsk = dict()
    dims = [a.name for a in var.axes]

    # Special case: already have the values in memory.
    if hasattr(var,'values'):
      out[var.name] = xr.DataArray(var.values, dims=dims, attrs=var.atts, name=var.name)
      continue

    # Keep track of all the slices that were made over each dimension.
    # This information will be used to determine the "chunking" that was done
    # on the variable from inview.loop_mem().
    slice_order = [[] for a in var.axes]
    chunks = []
    # Break up the variable into portions that are small enough to fit
    # in memory.  These will become the "chunks" for dask.
    inview = View(var.axes)
    for outview in inview.loop_mem():
      # Materialize the indices; a lazy map object would be exhausted after
      # its first use in the zips below.
      integer_indices = list(map(tuple, outview.integer_indices))
      # Determine *how* loop_mem is splitting the axes, and define the chunk
      # sizes accordingly.
      # A little indirect, but loop_mem doesn't make its chunking choices
      # available to the caller.
      for o, sl in zip(slice_order, integer_indices):
        if sl not in o:
          o.append(sl)
      ind = [o.index(sl) for o, sl in zip(slice_order, integer_indices)]
      # Add this chunk to the dask array.
      key = tuple([name] + ind)
      dsk[key] = (var.getview, outview, False)
    # Construct the dask array.
    chunks = [tuple(map(len, sl)) for sl in slice_order]  # dask wants concrete chunk sizes
    arr = da.Array(dsk, name, chunks, dtype=var.dtype)
    # Wrap this into an xarray.DataArray (with metadata and named axes).
    out[var.name] = xr.DataArray(arr, dims = dims, attrs = var.atts, name=var.name)
  # Build the final xarray.Dataset.
  out = xr.Dataset(out, attrs=dataset.atts)
  # Re-decode the CF metadata on the xarray side.
  out = xr.conventions.decode_cf(out)
  return out
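
A usage sketch (requires xarray and dask, per the imports above; `ds` and the variable/axis names are hypothetical):

# xds = to_xarray(ds)
# xds['T'].mean(dim='time')   # continue with the xarray/dask API from here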
Example #25
0
def paired_difference(X,
                      Y,
                      axes=None,
                      alpha=0.05,
                      N_fac=None,
                      output='d,p,ci',
                      pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y, assuming that individual elements
  of X and Y can be directly paired. In contrast to :func:`difference`, X and Y must have the same
  shape.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must share all axes over which the means are being computed.

  axes : list, optional
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float
    Confidence level for which to compute confidence interval.

  N_fac : integer
    A factor by which to rescale the estimated number of degrees of freedom of
    X and Y; the effective number will be given by the number estimated from the
    dataset divided by ``N_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  difference

  Notes
  =====
  Following section 6.6.6 of von Storch and Zwiers 1999, a one-sample t test is used to test the
  hypothesis. The number of degrees of freedom is the sample size scaled by N_fac, less one. This
  provides a means of taking into account serial correlation in the data (see sections 6.6.7-9), but
  the appropriate number of effective degrees of freedom are not calculated explicitly by this
  routine. The p-value and confidence interval are computed based on the t-statistic in eq
  (6.21).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from paired_difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Default to reducing over all shared axes if none were specified
    if axes is None:
        axes = [srcaxes[i].name for i in riaxes]

    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    Nx = np.prod([len(X.axes[i]) for i in ixaxes])

    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])

    assert ixaxes == iyaxes and Nx == Ny, 'For the paired difference test, X and Y must have the same size along the reduction axes.'

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    # Construct work arrays
    d = np.full(oview.shape, np.nan, 'd')
    dd = np.full(oview.shape, np.nan, 'd')
    N = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ydata) in loopover([X, Y],
                                          oview,
                                          inaxes=srcaxes,
                                          pbar=pbar):
        ddata = xdata.astype('d') - ydata.astype('d')
        d[outsl] = np.nansum([d[outsl], npnansum(ddata, ixaxes)], 0)
        dd[outsl] = np.nansum([dd[outsl], npnansum(ddata**2, ixaxes)], 0)

        # Count of non-NaN data points
        N[outsl] = np.nansum([N[outsl], npnansum(~np.isnan(ddata), ixaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (N > 1)
    dd[imsk] -= (d * d)[imsk] / N[imsk]
    dd[imsk] /= (N[imsk] - 1)
    d[imsk] /= N[imsk]

    # Ensure variance is non-negative
    dd[dd <= 0.] = 0.

    if N_fac is not None: eN = N // N_fac
    else: eN = N

    emsk = (eN > 1)

    den = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    den[emsk] = np.sqrt(dd[emsk] / (eN[emsk] - 1))
    dmsk = (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], eN[dmsk] - 1))
    ci[dmsk] = tdist.ppf(1. - alpha / 2, eN[dmsk] - 1) * den[dmsk]

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=eN - 1, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for t-test of paired difference (%s - %s)' % (
                xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of paired difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['N_fac'] = N_fac
    ds.atts['description'] = 't-test of paired difference (%s - %s)' % (yn, xn)

    return ds
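
A usage sketch (reusing the hypothetical X and Y from the regress sketch above, which share both axes, as the paired test requires):

# ds = paired_difference(X, Y, axes=['time'], output='d,p,ci')
# ds.d.get()   # per-latitude mean difference, with ds.p and ds.ci alongside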
Example #26
0
def correlate(X, Y, axes=None, output='r2,p', pbar=None):
    # {{{
    r'''Computes correlation between variables X and Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to correlate. Must have at least one axis in common.

  axes : list, optional
    Axes over which to compute correlation; if nothing is specified, the correlation
    is computed over all axes shared by X and Y.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults
    to 'r2,p'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The names of the variables match the output request string (i.e. if ``ds``
    is the returned dataset, the correlation coefficient can be obtained
    through ``ds.r2``).

    * 'r2': The correlation coefficient :math:`\rho_{XY}`
    * 'p':  The p-value; see notes.

  Notes
  =====
  The coefficient :math:`\rho_{XY}` is computed following von Storch and Zwiers
  1999, section 8.2.2. The p-value is the probability of finding a correlation
  coefficient of equal or greater magnitude (two-sided) to the given result
  under the hypothesis that the true correlation coefficient between X and Y is
  zero. It is computed from the t-statistic given in eq (8.7), in section
  8.2.3, and assumes normally distributed quantities.'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['r2', 'p']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from correlation. Possible outputs are %s.'
            % str(ovars))

    # Put all the axes being reduced over at the end
    # so that we can reshape
    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [srcaxes[i] for i in oiaxes]
    inaxes = oaxes + [srcaxes[i] for i in riaxes]
    oview = View(oaxes)
    iview = View(inaxes)
    siaxes = list(range(len(oaxes), len(srcaxes)))

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    xy = np.full(oview.shape, np.nan, 'd')
    Na = np.full(oview.shape, np.nan, 'd')

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    for outsl, (xdata, ydata) in loopover([X, Y], oview, inaxes, pbar=pbar):
        xdata = xdata.astype('d')
        ydata = ydata.astype('d')
        xydata = xdata * ydata

        xbc = [s1 // s2 for s1, s2 in zip(xydata.shape, xdata.shape)]
        ybc = [s1 // s2 for s1, s2 in zip(xydata.shape, ydata.shape)]
        xdata = np.tile(xdata, xbc)
        ydata = np.tile(ydata, ybc)
        xdata[np.isnan(xydata)] = np.nan
        ydata[np.isnan(xydata)] = np.nan

        # It seems np.nansum does not broadcast its arguments automatically
        # so there must be a better way of doing this...
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, siaxes)], 0)
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, siaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, siaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, siaxes)], 0)
        xy[outsl] = np.nansum([xy[outsl], npnansum(xydata, siaxes)], 0)

        # Count of non-NaN data points
        Na[outsl] = np.nansum(
            [Na[outsl], npnansum(~np.isnan(xydata), siaxes)], 0)

    imsk = (Na > 0)

    xx[imsk] -= (x * x)[imsk] / Na[imsk]
    yy[imsk] -= (y * y)[imsk] / Na[imsk]
    xy[imsk] -= (x * y)[imsk] / Na[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    # Compute correlation coefficient, t-statistic, p-value
    den = np.zeros(oview.shape, 'd')
    rho = np.zeros(oview.shape, 'd')

    den[imsk] = np.sqrt((xx * yy)[imsk])
    dmsk = (den > 0.)

    rho[dmsk] = xy[dmsk] / np.sqrt(xx * yy)[dmsk]

    den = 1 - rho**2
    # Saturate the denominator (when correlation is perfect) to avoid div by zero warnings
    den[den < eps] = eps

    t = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')

    t[imsk] = np.abs(rho)[imsk] * np.sqrt((Na[imsk] - 2.) / den[imsk])
    p[imsk] = 2. * (1. - tdist.cdf(t[imsk], Na[imsk] - 2))

    p[~imsk] = np.nan
    rho[~imsk] = np.nan

    p[~dmsk] = np.nan
    rho[~dmsk] = np.nan

    # Construct and return variables
    xn = X.name if X.name != '' else 'X'  # Note: could write:  xn = X.name or 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'r2' in output:
        r2 = Var(oaxes, values=rho, name='r2')
        r2.atts['longname'] = 'Correlation coefficient between %s and %s' % (
            xn, yn)
        rvs.append(r2)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts[
            'longname'] = 'p-value for correlation coefficient between %s and %s' % (
                xn, yn)
        rvs.append(p)

    ds = asdataset(rvs)
    ds.atts['description'] = 'correlation analysis %s against %s' % (yn, xn)

    return ds
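
A usage sketch (reusing the hypothetical X and Y from the regress sketch above; Y was built from X, so a strong correlation is expected):

# ds = correlate(X, Y, output='r2,p')
# ds.r2.get()   # per-latitude correlation coefficient, p-values in ds.p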
Example #27
0
def difference(X,
               Y,
               axes=None,
               alpha=0.05,
               Nx_fac=None,
               Ny_fac=None,
               output='d,p,ci',
               pbar=None):
    # {{{
    r'''Computes the mean value and statistics of X - Y.

  Parameters
  ==========
  X, Y : :class:`Var`
    Variables to difference. Must have at least one axis in common.

  axes : list, optional, defaults to None
    Axes over which to compute means; if nothing is specified, the mean
    is computed over all axes common to X and Y.

  alpha : float, optional; defaults to 0.05
    Confidence level for which to compute confidence interval.

  Nx_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    X; the effective number will be given by the number estimated from the
    dataset divided by ``Nx_fac``.

  Ny_fac : integer, optional; defaults to None
    A factor by which to rescale the estimated number of degrees of freedom of
    Y; the effective number will be given by the number estimated from the
    dataset divided by ``Ny_fac``.

  output : string, optional
    A string determining which parameters are returned; see list of possible outputs
    in the Returns section. The specifications must be separated by a comma. Defaults 
    to 'd,p,ci'.

  pbar : progress bar, optional
    A progress bar object. If nothing is provided, a progress bar will be displayed
    if the calculation takes sufficiently long.

  Returns
  =======
  results : :class:`Dataset` 
    The returned variables are specified by the ``output`` argument. The names
    of the variables match the output request string (i.e. if ``ds`` is the
    returned dataset, the average of the difference can be obtained by
    ``ds.d``). The following four quantities can be computed:

    * 'd': The difference in the means, X - Y
    * 'df': The effective number of degrees of freedom, :math:`df`
    * 'p': The p-value; see notes.
    * 'ci': The confidence interval of the difference at the level specified by ``alpha``

  See Also
  ========
  isnonzero
  paired_difference

  Notes
  =====
  The effective number of degrees of freedom is estimated using eq (6.20) of 
  von Storch and Zwiers 1999, in which :math:`n_X` and :math:`n_Y` are scaled by
  Nx_fac and Ny_fac, respectively. This provides a means of taking into account
  serial correlation in the data (see sections 6.6.7-9), but the number of effective
  degrees of freedom are not calculated explicitly by this routine. The p-value and 
  confidence interval are computed based on the t-statistic in eq (6.19).'''

    from pygeode.tools import loopover, whichaxis, combine_axes, shared_axes, npnansum
    from pygeode.view import View

    # Split output request now
    ovars = ['d', 'df', 'p', 'ci']
    output = [o for o in output.split(',') if o in ovars]
    if len(output) < 1:
        raise ValueError(
            'No valid outputs are requested from difference. Possible outputs are %s.'
            % str(ovars))

    srcaxes = combine_axes([X, Y])
    oiaxes, riaxes = shared_axes(srcaxes, [X.axes, Y.axes])
    if axes is not None:
        ri_new = []
        for a in axes:
            i = whichaxis(srcaxes, a)
            if i not in riaxes:
                raise KeyError('%s axis not shared by X ("%s") and Y ("%s")' %
                               (a, X.name, Y.name))
            ri_new.append(i)
        oiaxes.extend([r for r in riaxes if r not in ri_new])
        riaxes = ri_new

    oaxes = [a for i, a in enumerate(srcaxes) if i not in riaxes]
    oview = View(oaxes)

    # Default to reducing over all shared axes if none were specified
    if axes is None:
        axes = [srcaxes[i].name for i in riaxes]

    ixaxes = [X.whichaxis(n) for n in axes if X.hasaxis(n)]
    iyaxes = [Y.whichaxis(n) for n in axes if Y.hasaxis(n)]

    Nx = np.prod([len(X.axes[i]) for i in ixaxes])
    Ny = np.prod([len(Y.axes[i]) for i in iyaxes])
    assert Nx > 1, '%s has only one element along the reduction axes' % X.name
    assert Ny > 1, '%s has only one element along the reduction axes' % Y.name

    if pbar is None:
        from pygeode.progress import PBar
        pbar = PBar()

    # Construct work arrays
    x = np.full(oview.shape, np.nan, 'd')
    y = np.full(oview.shape, np.nan, 'd')
    xx = np.full(oview.shape, np.nan, 'd')
    yy = np.full(oview.shape, np.nan, 'd')
    Nx = np.full(oview.shape, np.nan, 'd')
    Ny = np.full(oview.shape, np.nan, 'd')

    # Accumulate data
    for outsl, (xdata, ) in loopover([X], oview, pbar=pbar):
        xdata = xdata.astype('d')
        x[outsl] = np.nansum([x[outsl], npnansum(xdata, ixaxes)], 0)
        xx[outsl] = np.nansum([xx[outsl], npnansum(xdata**2, ixaxes)], 0)

        # Count of non-NaN data points
        Nx[outsl] = np.nansum(
            [Nx[outsl], npnansum(~np.isnan(xdata), ixaxes)], 0)

    for outsl, (ydata, ) in loopover([Y], oview, pbar=pbar):
        ydata = ydata.astype('d')
        y[outsl] = np.nansum([y[outsl], npnansum(ydata, iyaxes)], 0)
        yy[outsl] = np.nansum([yy[outsl], npnansum(ydata**2, iyaxes)], 0)

        # Count of non-NaN data points
        Ny[outsl] = np.nansum(
            [Ny[outsl], npnansum(~np.isnan(ydata), iyaxes)], 0)

    # remove the mean (NOTE: numerically unstable if mean >> stdev)
    imsk = (Nx > 1) & (Ny > 1)
    xx[imsk] -= (x * x)[imsk] / Nx[imsk]
    xx[imsk] /= (Nx[imsk] - 1)

    x[imsk] /= Nx[imsk]

    yy[imsk] -= (y * y)[imsk] / Ny[imsk]
    yy[imsk] /= (Ny[imsk] - 1)

    y[imsk] /= Ny[imsk]

    # Ensure variances are non-negative
    xx[xx <= 0.] = 0.
    yy[yy <= 0.] = 0.

    if Nx_fac is not None: eNx = Nx // Nx_fac
    else: eNx = Nx
    if Ny_fac is not None: eNy = Ny // Ny_fac
    else: eNy = Ny

    emsk = (eNx > 1) & (eNy > 1)

    # Compute difference
    d = x - y

    den = np.zeros(oview.shape, 'd')
    df = np.zeros(oview.shape, 'd')
    p = np.zeros(oview.shape, 'd')
    ci = np.zeros(oview.shape, 'd')

    # Convert to variance of the mean of each sample
    xx[emsk] /= eNx[emsk]
    yy[emsk] /= eNy[emsk]

    den[emsk] = xx[emsk]**2 / (eNx[emsk] - 1) + yy[emsk]**2 / (eNy[emsk] - 1)
    dmsk = (den > 0.)

    df[dmsk] = (xx[dmsk] + yy[dmsk])**2 / den[dmsk]

    den[emsk] = np.sqrt(xx[emsk] + yy[emsk])

    dmsk &= (den > 0.)

    p[dmsk] = np.abs(d[dmsk] / den[dmsk])
    p[dmsk] = 2. * (1. - tdist.cdf(p[dmsk], df[dmsk]))

    ci[dmsk] = tdist.ppf(1. - alpha / 2, df[dmsk]) * den[dmsk]

    df[~dmsk] = np.nan
    p[~dmsk] = np.nan
    ci[~dmsk] = np.nan

    # Construct dataset to return
    xn = X.name if X.name != '' else 'X'
    yn = Y.name if Y.name != '' else 'Y'

    from pygeode.var import Var
    from pygeode.dataset import asdataset

    rvs = []

    if 'd' in output:
        d = Var(oaxes, values=d, name='d')
        d.atts['longname'] = 'Difference (%s - %s)' % (xn, yn)
        rvs.append(d)

    if 'df' in output:
        df = Var(oaxes, values=df, name='df')
        df.atts['longname'] = 'Degrees of freedom used for t-test'
        rvs.append(df)

    if 'p' in output:
        p = Var(oaxes, values=p, name='p')
        p.atts['longname'] = 'p-value for t-test of difference (%s - %s)' % (
            xn, yn)
        rvs.append(p)

    if 'ci' in output:
        ci = Var(oaxes, values=ci, name='ci')
        ci.atts[
            'longname'] = 'Confidence Interval (alpha = %.2f) of difference (%s - %s)' % (
                alpha, xn, yn)
        rvs.append(ci)

    ds = asdataset(rvs)
    ds.atts['alpha'] = alpha
    ds.atts['Nx_fac'] = Nx_fac
    ds.atts['Ny_fac'] = Ny_fac
    ds.atts['description'] = 't-test of difference (%s - %s)' % (yn, xn)

    return ds
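
A usage sketch (reusing the hypothetical X and Y from the regress sketch above; the N*_fac arguments illustrate the serial-correlation rescaling described in the docstring):

# ds = difference(X, Y, axes=['time'], Nx_fac=10, Ny_fac=10)
# ds.d.get(), ds.p.get(), ds.ci.get()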
Example #28
0
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates.split(): continue  # Handled above
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', a.name, axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
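
A usage sketch of the round trip (hypothetical `ds`; decode_cf from Example #19 reverses this encoding):

# cfds = encode_cf(ds)
# Specialized axes (Lat, Pres, Time, ...) are now plain NamedAxis instances
# carrying CF attributes, e.g.:
# cfds.time.atts['units']   # 'days since 2000-01-01 00:00:00' or similar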