Example #1
def test_io():

  from pygeode.formats import netcdf as nc
  from pygeode.timeaxis import StandardTime
  from pygeode.axis import Pres
  from pygeode.dataset import Dataset
  import numpy as np

  tm = StandardTime(values=np.arange(365), units='days', startdate={'year':2001})
  p = Pres(np.arange(100.))
  v = (tm * p).rename('v')

  # Save the dataset, then reload it immediately
  before = Dataset([v])
  nc.save('issue004_test.nc', before)
  after = nc.open('issue004_test.nc')

  # Compare all vars/axes/attributes

  for var in before:
    assert var.name in after, "Can't find var '%s'"%var.name
    var2 = getattr(after,var.name)
    assert var2.atts == var.atts, "mismatched metadata.  Input: %s, Output: %s"%(var.atts,var2.atts)

  for axis in before.axes:
    axis2 = [a for a in after.axes if a.name == axis.name]
    assert len(axis2) == 1, "can't find axis '%s'"%axis.name
    axis2 = axis2[0]
#    assert axis2.atts == axis.atts, "mismatched metadata.  Input: %s, Output %s"%(axis.atts,axis2.atts)
    for attname in list(axis.atts.keys()):
      assert attname in axis2.atts, "attribute '%s' not found"%attname
      assert axis.atts[attname] == axis2.atts[attname], "attribute '%s' changed from '%s' to '%s'"%(attname, axis.atts[attname], axis2.atts[attname])
    assert type(axis2) == type(axis), "mismatched axis types.  Input: %s, Output: %s"%(type(axis), type(axis2))
Example #2
def whitelist(dataset, varlist):
    # {{{
    from pygeode.dataset import Dataset
    assert isinstance(varlist, (list, tuple))
    vars = [dataset[v] for v in varlist if v in dataset.vardict]
    dataset = Dataset(vars, atts=dataset.atts)
    return dataset
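
# A minimal usage sketch of whitelist() (hypothetical axis/variable names,
# chosen only for illustration):
from pygeode.axis import Axis
from pygeode.var import Var
from pygeode.dataset import Dataset

x = Axis(values=[0., 1., 2.], name='x')
ds = Dataset([Var(axes=[x], values=[1, 2, 3], name='u'),
              Var(axes=[x], values=[4, 5, 6], name='v')])
subset = whitelist(ds, ['u', 'w'])  # 'w' is not in ds and is silently skipped
assert list(subset.vardict.keys()) == ['u']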
Example #3
def _domain_as_dataset (domain, atts, table, axis_manager):
  from pygeode.dataset import Dataset
  # Build the axes for this domain, then split out the pseudo-axis that holds
  # the variable names.
  axes = domain.make_axes(axis_manager)
  ivarlist = domain.which_axis('varlist')
  assert ivarlist is not None, "Unable to determine variable names"
  varlist = axes[ivarlist]
  axes = axes[:ivarlist] + axes[ivarlist+1:]
  # Construct one data variable per name, sharing the remaining axes.
  return Dataset([_DataVar.construct(name, axes, atts[name], table[name], axis_manager) for name in varlist])
Example #4
def ensemble(*varlists):
    """
  Creates an ensemble out of a set of similar variables.
  The corresponding variable must have the same axes and the same name.
  If a bunch of vars are passed as inputs, then a single ensemble var is returned.
  If a bunch of datasets are passed as inputs, then a single dataset is returned, consisting of an ensemble of the internal vars.  Each input dataset must have matching vars.
  """
    from pygeode.var import Var
    from pygeode.dataset import Dataset, asdataset
    from pygeode.tools import common_dict
    datasets = [asdataset(v) for v in varlists]

    varnames = [v.name for v in datasets[0].vars]

    # Make sure we have the same varnames in each dataset
    for dataset in datasets:
        assert set(dataset.vardict.keys()) == set(
            varnames), "inconsistent variable names between datasets"

    # Make sure the varlists are all in the same order
    for i, dataset in enumerate(datasets):
        varlist = [dataset[varname] for varname in varnames]
        datasets[i] = Dataset(varlist, atts=dataset.atts)

    for varname in varnames:
        var0 = datasets[0][varname]
        for dataset in datasets:
            var = dataset[varname]
            # Make sure the axes are the same between ensemble vars
            assert var.axes == var0.axes, "inconsistent axes for %s" % varname

    # Collect the ensembles together
    ensembles = []
    for varname in varnames:
        ensemble = EnsembleVar([dataset[varname] for dataset in datasets])
        ensembles.append(ensemble)

    # Global attributes
    atts = common_dict(dataset.atts for dataset in datasets)
    if isinstance(varlists[0], Dataset): return Dataset(ensembles, atts=atts)
    if isinstance(varlists[0], Var):
        assert len(ensembles) == 1
        return ensembles[0]

    return ensembles
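
# A minimal usage sketch (hypothetical names), assuming EnsembleVar from this
# module is available: two same-named, same-shaped vars become one ensemble var.
from pygeode.axis import Axis
from pygeode.var import Var

x = Axis(values=[0., 1.], name='x')
member1 = Var(axes=[x], values=[1., 2.], name='T')
member2 = Var(axes=[x], values=[3., 4.], name='T')
ens = ensemble(member1, member2)  # a single Var spanning both members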
Example #5
def open(filename, value_override = {}, dimtypes = {}, namemap = {}, varlist = [], cfmeta = True, **kwargs):
# {{{
  from pygeode.dataset import Dataset
  from pygeode.formats import finalize_open

  file = GribFile(filename)
  vars = [GribVar(file,i) for i in range(lib.get_nvars(file.index))]
  # append level type to vars with the same name
  names = [v.name for v in vars]
  for i, v in enumerate(vars):
    if names.count(v.name) > 1: v.name = v.name + '_' + level_types[v.level_type][1]
  d = Dataset(vars)

  return finalize_open(d, dimtypes, namemap, varlist, cfmeta)
Example #6
def test_issue010():
  from pygeode.var import Var
  from pygeode.axis import Axis
  from pygeode.dataset import Dataset
  from pygeode.formats import netcdf as nc

  # Make some axes
  time_axis = Axis(values=[0], name='time')
  bnds_axis = Axis(values=[0,1], name='bnds')

  # Make some vars (note we don't have a 'bnds' variable corresponding to the 'bnds' dimension)
  time_var = Var(axes=[time_axis], values=[1], name='time')
  time_bnds = Var(axes=[time_axis,bnds_axis], values=[[3,4]], name='time_bnds')

  # Make a dataset to hold the vars
  dataset = Dataset([time_var, time_bnds])

  # Manually apply dims2axes to detect our axes
  dataset = nc.dims2axes(dataset)
Example #7
def from_xarray(dataset):
    """
  Converts an xarray Dataset into a PyGeode Dataset.

  Parameters
  ----------
  dataset : xarray.Dataset
    The dataset to be converted.

  Returns
  -------
  out : pygeode.Dataset
    An object which can be used with the pygeode package.
  """
    import xarray as xr
    from pygeode.dataset import Dataset
    from pygeode.formats.netcdf import dims2axes
    from pygeode.formats.cfmeta import decode_cf
    # Encode the axes/variables with CF metadata.
    out = []
    # Loop over each axis and variable, and wrap as a pygeode.Var object.
    for varname, var in dataset.variables.items():
        # Apply a subset of conventions that are relevant to PyGeode.
        try:
            var = xr.conventions.maybe_encode_datetime(var)
            var = xr.conventions.maybe_encode_timedelta(var)
        except AttributeError:
            var = xr.coding.times.CFDatetimeCoder().encode(var)
            var = xr.coding.times.CFTimedeltaCoder().encode(var)
        try:
            var = xr.conventions.maybe_encode_string_dtype(var)
        except AttributeError:
            pass  # Using an older version of xarray (<0.10.0)?
        out.append(XArray_DataArray(varname, var))
    # Wrap all the Var objects into a pygeode.Dataset object.
    out = Dataset(out, atts=_fix_atts(dataset.attrs))
    # Re-construct the axes as pygeode.axis.NamedAxis objects.
    out = dims2axes(out)
    # Re-decode the CF metadata on the PyGeode end.
    # This will get the appropriate axis types for lat, lon, time, etc.
    out = decode_cf(out)
    return out
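
# A minimal usage sketch (hypothetical names), assuming the helpers in this
# module (XArray_DataArray, _fix_atts) are in scope:
import numpy as np
import xarray as xr

xr_ds = xr.Dataset(
    {'t2m': (('time', 'lat'), np.zeros((2, 3)))},
    coords={'time': np.array(['2001-01-01', '2001-01-02'], dtype='datetime64[ns]'),
            'lat': ('lat', [-45., 0., 45.], {'units': 'degrees_north'})},
)
pyg_ds = from_xarray(xr_ds)  # a pygeode.Dataset with decoded time and lat axes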
Example #8
def decode_cf (dataset, ignore=[]):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Axis, NamedAxis, Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, Station, DummyAxis, NonCoordinateAxis
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode import timeutils
  from warnings import warn
  import re

#  dataset = asdataset(dataset, copy=True)
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts
  del dataset

  # Decode string variables
  for i,var in enumerate(varlist):
    if var.name.endswith("_name") and var.dtype.name in ("string8","bytes8") and var.axes[-1].name.endswith("_strlen"):
      varlist[i] = decode_string_var(var)

  # data for auxiliary arrays
  auxdict = {}
  for name in axisdict.keys(): auxdict[name] = {}

  # fill values / scale / offset (if applicable)
  fillvalues = {}
  scales = {}
  offsets = {}
  for v in varlist:
    name = v.name
    fillvalues[name] = None
    scales[name] = None
    offsets[name] = None

  for name,a in list(axisdict.items()):

    # Skip over this axis?
    if name in ignore: continue

    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # just carry along and pass to new Axis instance

    # Find any auxiliary arrays
    aux = auxdict[name]
    if 'ancillary_variables' in atts:
      _anc = atts.pop('ancillary_variables')
      remove_from_dataset = []  # vars to remove from the dataset
      for auxname in _anc.split(' '):
        assert any(v.name == auxname for v in varlist), "ancillary variable '%s' not found"%auxname
        newname = auxname
        # Remove the axis name prefix, if it was used
        if newname.startswith(name+'_'): newname = newname[len(name)+1:]
        aux[newname] = [v for v in varlist if v.name == auxname].pop().get()
        # Don't need this as a var anymore
        remove_from_dataset.append(auxname)

      # Remove some stuff
      varlist = [v for v in varlist if v.name not in remove_from_dataset]

    # Determine the best Axis subclass to use
#    cls = NamedAxis
    cls = type(a)

    # Generic 'axis' identifiers first
    if 'axis' in atts:
      _axis = atts.pop('axis')
      if _axis == 'X': cls = XAxis
      if _axis == 'Y': cls = YAxis
      if _axis == 'Z': cls = ZAxis
      if _axis == 'T': cls = TAxis

    # Check specific standard names, and also units?
    #TODO: don't *pop* the standard_name, units, etc. until the end of this routine - in case we didn't end up mapping them to an axis
    _ln = atts.get('long_name', a.name).lower()
    _st = atts.get('standard_name',_ln).lower()
    _units = atts.pop('units','')
    if _st == 'latitude' or _units == 'degrees_north': cls = Lat
    if _st == 'longitude' or _units == 'degrees_east': cls = Lon
    if _st == 'air_pressure' or _units in ('hPa','mbar'):
      cls = Pres
      # Don't need this in the metadata anymore (it will be put back in encode_cf)
      atts.pop('positive',None)

    if _st == 'atmosphere_hybrid_sigma_pressure_coordinate':
      #TODO: check formula_terms??
      #TODO: for ccc2nc files, look for long_name == "Model Level", use_AB = <formula>,
      #       A & B embedded as metadata or as data arrays not attached to ancillary_variables
      if 'A' in aux and 'B' in aux:
        cls = Hybrid
      else:
        warn ("Cannot create a proper Hybrid vertical axis, since 'A' and 'B' coefficients aren't found.")

    if _st == 'station':
      cls = Station

    if (_st == 'time' or cls == TAxis or _units.startswith('days since') or _units.startswith('hours since') or _units.startswith('minutes since') or _units.startswith('seconds since')) and ' since ' in _units:
      _calendar = atts.pop('calendar', 'standard')
      if _calendar in ('standard', 'gregorian', 'proleptic_gregorian'): cls = StandardTime
      elif _calendar in ('365_day', 'noleap', '365day'): cls = ModelTime365
      elif _calendar in ('360_day', '360day'): cls = ModelTime360
      elif _calendar in ('none',): cls = Yearless
      else:
        warn ("unknown calendar '%s'"%_calendar)
        continue
      # Extract the time resolution (day, hour, etc), and the reference date
      res, date = re.match(r"([a-z]+)\s+since\s+(.*)", _units).groups()
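      # e.g. _units = "days since 2001-01-01 00:00:00" yields
      # res = "days" and date = "2001-01-01 00:00:00" (illustrative values).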
      # Pluralize the increment (i.e. day->days)?
      if not res.endswith('s'): res += 's'
      # Extract the rest of the date
      date = date.rstrip()
      year, month, day, hour, minute, second = 0,1,1,0,0,0
      if len(date) > 0: year, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: month, date = re.match(r"(\d+)-?(.*)", date).groups()
      if len(date) > 0: day, date = re.match(r"(\d+)\s*(.*)", date).groups()
      if date.startswith('T'): date = date[1:]
      if len(date) > 0: hour, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0: minute, date = re.match(r"(\d+):?(.*)", date).groups()
      if len(date) > 0 and date[0] != ' ': second, date = re.match(r"(\d+)(.*)", date).groups()
      # convert from strings to integers
      #TODO: milliseconds? time zone?
      year, month, day, hour, minute, second = list(map(int, [year, month, day, hour, minute, float(second)]))
      # Create the time axis
      startdate={'year':year, 'month':month, 'day':day, 'hour':hour, 'minute':minute, 'second':second}
      axisdict[name] = cls(a.values, startdate=startdate, units=res, name=name, atts=atts)
      # Special case: start year=0 implies a climatology
      #NOTE: 'climatology' attribute not used, since we don't currently keep
      #      track of the interval that was used for the climatology.
      if year == 0:
        # Don't strip the years from the axis if there's more than one year of data
        if not all(axisdict[name].year == 0):
          warn ("cfmeta: data starts at year 0 (which usually indicates a climatology), but there's more than one year's worth of data!  Keeping it on a regular calendar.", stacklevel=3)
          continue
        axisdict[name] = timeutils.modify(axisdict[name], exclude='year')
      continue  # we've constructed the time axis, so move onto the next axis

    # Check for a match from the custom axes (from add-ons).
    if _st in custom_axes:
      cls = custom_axes[_st]

    # Find any other information that should be put inside this axis.
    # Look for anything that's identified as a coordinate or ancillary
    # variable, and that has this axis as its only dimension.
    dependencies = set()
    for var in varlist:
      if var.hasaxis(a.name):
        dependencies.update(var.atts.get('coordinates','').split())
        dependencies.update(var.atts.get('ancillary_variables','').split())
    # Look up these dependencies.  Only consider 1D information, since we
    # don't yet have a way to associate multidimensional arrays as auxarrays
    # in an axis.
    dependencies = [v for v in varlist if v.name in dependencies and v.naxes == 1 and v.hasaxis(a.name)]

    # If we found any such information, then this is no longer a simple
    # "dummy" axis.
    if issubclass(cls, DummyAxis) and len(dependencies) > 0:
      cls = NonCoordinateAxis

    # Attach the information from these dependent variables as auxiliary arrays.
    aux.update((dep.name,dep.get()) for dep in dependencies)

    # Anything that got attached to this axis should be removed from the
    # list of variables, since it's just extra info specific to the axis.
    varlist = [v for v in varlist if v.name not in aux]


    # put the units back (if we didn't use them)?
    if cls in [Axis, NamedAxis, XAxis, YAxis, ZAxis, TAxis] and _units != '': atts['units'] = _units

    # create new axis instance if need be.
    if cls != type(a):
      axisdict[name] = cls(values=a.values, name=name, atts=atts, **aux)

  # Apply these new axes to the variables
  # Check for fill values, etc.
  # Extract to a list first, then back to a dataset
  # (ensures the dataset axis list is up to date)
  for i,oldvar in enumerate(list(varlist)):
#    name = [n for n,v in six.iteritems(dataset.vardict) if v is oldvar].pop()
    name = oldvar.name
    atts = oldvar.atts.copy()
    plotatts = oldvar.plotatts.copy()
    fillvalue = [atts.pop(f,None) for f in ('FillValue', '_FillValue', 'missing_value')]
    fillvalue = [_f for _f in fillvalue if _f]
    fillvalue = fillvalue[0] if len(fillvalue) > 0 else None
    scale = atts.pop('scale_factor', None)
    offset = atts.pop('add_offset', None)

    varlist[i] = var_newaxes(oldvar, [axisdict[a.name] for a in oldvar.axes],
                    name=name, fillvalue=fillvalue, scale=scale, offset=offset, atts=atts, plotatts=plotatts)

  dataset = Dataset(varlist, atts=global_atts)

  return dataset
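
# A minimal usage sketch (hypothetical names), assuming the rest of this
# module (var_newaxes, custom_axes, etc.) is in scope: an axis carrying CF
# units metadata is upgraded to the matching axis subclass.
from pygeode.axis import NamedAxis, Lat
from pygeode.var import Var
from pygeode.dataset import Dataset

y = NamedAxis(values=[-45., 0., 45.], name='lat', atts={'units': 'degrees_north'})
ds = Dataset([Var(axes=[y], values=[1., 2., 3.], name='T')])
decoded = decode_cf(ds)
assert isinstance(decoded.T.axes[0], Lat)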
Example #9
def encode_cf (dataset):
  from pygeode.dataset import asdataset, Dataset
  from pygeode.axis import Lat, Lon, Pres, Hybrid, XAxis, YAxis, ZAxis, TAxis, NonCoordinateAxis, Station
  from pygeode.timeaxis import Time, ModelTime365, ModelTime360, StandardTime, Yearless
  from pygeode.axis import NamedAxis, DummyAxis
  from pygeode.var import Var
  from pygeode.timeutils import reltime
  from copy import copy
  dataset = asdataset(dataset)
  varlist = list(dataset)
  axisdict = dataset.axisdict.copy()
  global_atts = dataset.atts.copy()
  del dataset

  # Fix the variable names
  for i,v in enumerate(list(varlist)):
    oldname = v.name
    newname = fix_name(oldname)
    if newname != oldname:
      from warnings import warn
      warn ("renaming '%s' to '%s'"%(oldname,newname))
      varlist[i] = v.rename(newname)

  # Fix the axis names
  #TODO

  # Fix the variable metadata
  #TODO

  # Fix the global metadata
  # Specify the conventions we're (supposedly) using
  global_atts['Conventions'] = "CF-1.0"

  for v in varlist: assert v.name not in axisdict, "'%s' refers to both a variable and an axis"%v.name

  # Metadata based on axis classes
  for name,a in list(axisdict.items()):
    atts = a.atts.copy()
    plotatts = a.plotatts.copy() # passed on to Axis constructor
    
    if isinstance(a,Lat):
      atts['standard_name'] = 'latitude'
      atts['units'] = 'degrees_north'
    if isinstance(a,Lon):
      atts['standard_name'] = 'longitude'
      atts['units'] = 'degrees_east'
    if isinstance(a,Pres):
      atts['standard_name'] = 'air_pressure'
      atts['units'] = 'hPa'
      atts['positive'] = 'down'
    if isinstance(a,Hybrid):
      #TODO: formula_terms (how do we specify LNSP instead of P0?????)
      atts['standard_name'] = 'atmosphere_hybrid_sigma_pressure_coordinate'
    if isinstance(a,Time):
      atts['standard_name'] = 'time'
      #TODO: change the unit depending on the time resolution?
      start = a.startdate
      atts['units'] = '%s since %04i-%02i-%02i %02i:%02i:%02i'% (a.units,
        start.get('year',0), start.get('month',1), start.get('day',1),
        start.get('hour',0), start.get('minute',0), start.get('second',0)
      )
    if isinstance(a,StandardTime): atts['calendar'] = 'standard'
    if isinstance(a,ModelTime365): atts['calendar'] = '365_day'
    if isinstance(a,ModelTime360): atts['calendar'] = '360_day'
    if isinstance(a,Yearless): atts['calendar'] = 'none'

    if isinstance(a,XAxis): atts['axis'] = 'X'
    if isinstance(a,YAxis): atts['axis'] = 'Y'
    if isinstance(a,ZAxis): atts['axis'] = 'Z'
    if isinstance(a,TAxis): atts['axis'] = 'T'

    # Change the time axis to be relative to a start date
    #TODO: check 'units' attribute of the time axis, use that in the 'units' of the netcdf metadata
    if isinstance(a, Time):
      #TODO: cast into an integer array if possible
      axisdict[name] = NamedAxis(values=reltime(a), name=name, atts=atts, plotatts=plotatts)
      continue

    # Encode non-coordinate axes, including station (timeseries) data.
    # Loosely follow http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_orthogonal_multidimensional_array_representation_of_time_series
    # Move station lat/lon/name data into separate variables.
    if isinstance(a, NonCoordinateAxis):

      # Keep track of extra variables created from auxarray data.
      extra_vars = []

      # Detect certain arrays that should be treated as "coordinates".
      coordinates = []

      # Encode station latitude.
      if 'lat' in a.auxarrays:
        lat = a.auxasvar('lat')
        lat.atts = dict(standard_name="latitude", long_name=a.name+" latitude", units="degrees_north")
        extra_vars.append(lat)
        coordinates.append('lat')
      # Encode station longitude.
      if 'lon' in a.auxarrays:
        lon = a.auxasvar('lon')
        lon.atts = dict(standard_name="longitude", long_name=a.name+" longitude", units="degrees_east")
        extra_vars.append(lon)
        coordinates.append('lon')

      coordinates = " ".join(coordinates)

      # Encode other auxarrays as generic "ancillary" arrays.
      ancillary_variables = []
      for auxname in list(a.auxarrays.keys()):
        if auxname in coordinates.split(): continue  # Handled above (exact name match, not substring)
        var = a.auxasvar(auxname)
        if var.dtype.name.startswith('str'):
          var = encode_string_var(var)
        # Some extra CF encoding for the station name, to use it as the unique identifier.
        if auxname == 'station':
          var.atts = dict(cf_role = "timeseries_id")
        extra_vars.append(var)
        ancillary_variables.append(auxname)

      ancillary_variables = " ".join(ancillary_variables)

      # Attach these coordinates to all variables that use this axis.
      #TODO: cleaner way of adding this information without having to do a shallow copy.
      for i,var in enumerate(varlist):
        if var.hasaxis(a):
          var = copy(var)
          var.atts = copy(var.atts)
          if len(coordinates) > 0:
            var.atts['coordinates'] = coordinates
          if len(ancillary_variables) > 0:
            var.atts['ancillary_variables'] = ancillary_variables
          varlist[i] = var

      # Add these coordinates / ancillary variables to the output.
      varlist.extend(extra_vars)

      # The values in the axis itself are meaningless, so mark them as such
      axisdict[name] = DummyAxis(len(a),name=name)

      # Special case: Station (timeseries) data.
      if isinstance(a, Station):
        global_atts['featureType'] = "timeSeries"
      # Nothing more to do for this axis type
      continue

    # Encode custom axes from add-ons
    for n,c in list(custom_axes.items()):
      if isinstance(a,c):
        atts['standard_name'] = n

    # Add associated arrays as new variables
    auxarrays = a.auxarrays
    for aux,values in auxarrays.items():
      auxname = name+'_'+aux
      assert not any(v.name == auxname for v in varlist), "already have a variable named %s"%auxname
      varlist.append( Var([a], values=values, name=auxname) )
    if len(auxarrays) > 0:
      atts['ancillary_variables'] = ' '.join(name+'_'+aux for aux in auxarrays.keys())

    # Create new, generic axes with the desired attributes
    # (Replaces the existing entry in the dictionary)
    axisdict[name] = NamedAxis(values=a.values, name=name, atts=atts, plotatts=plotatts)

  # Apply these new axes to the variables
  for i,oldvar in enumerate(list(varlist)):
    name = oldvar.name
    try:
      #TODO: use Var.replace_axes instead?
      varlist[i] = var_newaxes(oldvar, [axisdict.get(a.name,a) for a in oldvar.axes], atts=oldvar.atts, plotatts=oldvar.plotatts)
    except KeyError:
      print('??', [a.name for a in oldvar.axes], axisdict)
      raise

  dataset = Dataset(varlist, atts=global_atts)
  return dataset
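
# A minimal usage sketch (hypothetical names): a Lat axis comes back as a
# generic NamedAxis carrying the equivalent CF attributes.
from pygeode.axis import Lat
from pygeode.var import Var
from pygeode.dataset import Dataset

lat = Lat([-45., 0., 45.])
ds = Dataset([Var(axes=[lat], values=[1., 2., 3.], name='T')])
encoded = encode_cf(ds)
assert encoded.T.axes[0].atts['units'] == 'degrees_north'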
Example #10
# Issue 114
# https://github.com/pygeode/pygeode/issues/114

from pygeode.formats import netcdf4 as nc
from pygeode.axis import Lat
from pygeode.var import Var
from pygeode.dataset import Dataset

lat = Lat([80,70,60])
var = Var(axes=[lat], values=[1,2,3], name='A')
dataset = Dataset([var])

dataset_groups = {'Group 1': dataset, 'Group 2': dataset}

# Save the dataset groups.
nc.save('issue114_test.nc', dataset_groups, cfmeta=True)

# Read in the file again

dataset_groups_read = nc.open('issue114_test.nc',cfmeta=True)

# Check that the variables are the same
assert (dataset_groups['Group 1'].A == dataset_groups_read['Group 1'].A)


Example #11
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs):
# {{{
  ''' Returns a :class:`Dataset` containing variables merged across many files.

  Parameters
  ==========
  files : string, list, or tuple
    Either a single filename or a list of filenames. Wildcards are supported;
    :func:`glob.iglob` is used to expand these into an explicit list of files.

  format : string, optional
    String specifying the format of the file to open. If none is given, the format
    is automatically detected from the first filename (see :func:`autodetectformat`).

  opener : function, optional
    Function to open individual files. If none is provided, uses the
    format-specific version of :func:`open`. The datasets returned by this
    function are then concatenated and returned. See Notes.

  pattern : string, optional
    A regex pattern to extract date stamps from the filename; used by the default
    ``file2date``. Match groups must be named <year>, <month>, <day>, <hour> or
    <minute>. Abbreviations are available for the above: $Y matches a four-digit
    year; $m, $d, $H, and $M match a two-digit month, day, hour and minute,
    respectively.

  file2date : function, optional
    Function which returns a date dictionary given a filename. By default this is produced
    by applying the regex pattern ``pattern`` to the filename.

  sorted : boolean, optional
    If True, the filenames are sorted (by alpha) prior to opening each file, and
    the axes on the returned dataset are sorted by calling :meth:`Dataset.sorted`.

  **kwargs : keyword arguments
    These are passed on to the function ``opener``.

  Returns
  =======
  dataset
    A dataset containing the variables concatenated across all specified files.
    The variable data itself is not loaded into memory. 

  Notes
  =====
  This is intended to provide access to large datasets whose files are
  separated by timestep.  To avoid opening every file individually, the time
  axis is constructed by opening only the first and the last file in the list
  provided. These serve as a template of which variables and which times are
  stored in each file; it is assumed that the number of timesteps (and their
  offsets) is the same across the whole dataset. The time axis is then
  constructed from the filenames themselves, using the function ``file2date``
  to generate a date from each filename. As a result only two files need to be
  opened, which makes this a very efficient way to work with very large
  datasets.

  However, no explicit check is made of the integrity of the files: if there
  are corrupt or missing data within individual files, this will not become
  clear until that data is actually accessed. This can be checked with
  :func:`check_multi`, which attempts to access all of the data and returns a
  list of any problems encountered; this can take a long time, but is a useful
  check (and is more likely to provide helpful error messages).

  The function ``opener`` must take a single positional argument - the filename
  of the file to open - and keyword arguments that are passed through from this
  function. It must return a :class:`Dataset` object with the loaded variables.
  By default the standard :func:`open` is used, but providing a custom opener
  can be useful for any reshaping of the variables that must be done prior to
  concatenating the whole dataset. 

  See Also
  ========
  open
  openall
  '''

  from pygeode.timeaxis import Time, StandardTime
  from pygeode.timeutils import reltime, delta
  from pygeode.dataset import Dataset
  from pygeode.tools import common_dict
  from pygeode.formats import open, autodetectformat
  import numpy as np

  files = expand_file_list(files)
  nfiles = len(files)
  assert nfiles > 0

  if opener is None: 
    if format is None: format = autodetectformat(files[0])

    if not hasattr(format, 'open'): 
      try:
        format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
      except ImportError:
        raise ValueError('Unrecognized format module %s.' % format)

    opener = format.open

  # Apply keyword arguments
  if len(kwargs) > 0:
    old_opener = opener
    opener = lambda f: old_opener (f, **kwargs)


  # Degenerate case: only one file was given
  if nfiles == 1: return opener(files[0])


  # We'll need a function to translate filenames to dates
  # (if we don't have one, use the supplied pattern to make one)
  if file2date is None:
    import re
    assert pattern is not None, "I don't know how to get the dates from the filenames"
    regex = pattern
    regex = regex.replace('$Y', '(?P<year>[0-9]{4})')
    regex = regex.replace('$m', '(?P<month>[0-9]{2})')
    regex = regex.replace('$d', '(?P<day>[0-9]{2})')
    regex = regex.replace('$H', '(?P<hour>[0-9]{2})')
    regex = regex.replace('$M', '(?P<minute>[0-9]{2})')
    regex = re.compile(regex)
    def file2date (f):
      d = regex.search(f)
      assert d is not None, "can't use the pattern on the filenames?"
      d = d.groupdict()
      d = dict([k,int(v)] for k,v in d.items() if v is not None)
      # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
      d = dict({'hour':0, 'minute':0,'second':0}, **d)
      return d


  # Get the starting date of each file
  dates = [file2date(f) for f in files]
  dates = dict((k,[d[k] for d in dates]) for k in list(dates[0].keys()))

  # Open a file to get a time axis
  file = opener(files[0])
  T = None
  for v in file.vars:
    if v.hasaxis(Time):
      T = type(v.getaxis(Time))
      break
  if T is None: T = StandardTime
#  T = [v.getaxis(Time) for v in file.vars if v.hasaxis(Time)]
#  T = type(T[0]) if len(T) > 0 else StandardTime
  del file

  # Generate a lower-resolution time axis (the start of *each* file)
  faxis = T(units='days',**dates)

  # Re-sort the files, if they weren't in order
  S = faxis.argsort()
  faxis = faxis.slice[S]
  files = [files[s] for s in S]
  # Re-init the faxis to force the proper start date
  faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

  # Open the first and last file, so we know what the variables & timesteps are
  first = opener(files[0])
  last  = opener(files[-1])
  names = [v.name for v in first.vars]
  for n in names: assert n in last, "inconsistent vars"
  # Get global attributes
  global_atts = common_dict (first.atts, last.atts)

  #---
  timedict = {None:faxis}
  for v1 in first:
    if not v1.hasaxis(Time): continue
    t1 = v1.getaxis(Time)
    if t1.name in timedict: continue  # already handled this one
    t2 = last[v1.name].getaxis(Time)
    # Construct a full time axis from these pieces

    # One timestep per file? (check for an offset for the var time compared
    #  to the file time)
    if max(len(t1),len(t2)) == 1:
      offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
      taxis = faxis.withnewvalues(faxis.values + offset)
    # At least one of first/last files has multiple timesteps?
    else:
      assert t1.units == t2.units
      dt = max(delta(t1),delta(t2))
      assert dt > 0
      val1 = t1.values[0]
      val2 = reltime(t2, startdate=t1.startdate)[-1]
      nt = (val2-val1)/dt + 1
      assert round(nt) == nt
      nt = int(round(nt))
      assert nt > 0
      taxis = t1.withnewvalues(np.arange(nt)*dt + val1)

    timedict[t1.name] = taxis

  #---

  # Create the multifile version of the vars
  vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]


  return Dataset(vars,atts=global_atts)
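
# A minimal usage sketch (hypothetical filenames): given daily files named
# like model_20010101.nc, the pattern maps each filename onto the time axis.
#
#   ds = open_multi('model_*.nc', format='netcdf', pattern=r'model_$Y$m$d\.nc')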