Ejemplo n.º 1
0
    def __init__(self,
                 var,
                 iaxis,
                 ievents,
                 evlen,
                 evoff=0,
                 saxes=None,
                 sindices=None):
        #{{{
        """Build a view of ``var`` where one axis is split into events and offsets.

        The axis at position ``iaxis`` is replaced by two axes: an event axis
        (one entry per element of ``ievents``, numbered from 1) followed by an
        offset axis spanning ``-max(evoff) .. max(evlen - evoff)``.

        Parameters
        ----------
        var : source variable; its ``axes``, ``dtype``, ``name``, ``atts`` and
            ``plotatts`` are read.
        iaxis : position in ``var.axes`` of the axis being replaced.
        ievents : sequence of indices along that axis, one per event.
        evlen : a single event length, or one length per event.
        evoff : a single event offset, or one offset per event (default 0).
        saxes, sindices : accepted but not used anywhere in this constructor.

        Raises
        ------
        AssertionError
            If per-event offsets/lengths do not have the same shape as
            ``ievents``.
        """
        import numpy as np

        ievents = np.array(ievents)
        nevents = ievents.shape[0]
        orig_axis = var.axes[iaxis]

        # Offsets: a scalar is broadcast to every event, a sequence is taken
        # as one value per event.  'mevoff' is the largest offset in use.
        if not hasattr(evoff, '__len__'):
            mevoff = evoff
            evoff = np.ones(nevents, 'i') * mevoff
        else:
            evoff = np.array(evoff)
            mevoff = evoff.max()
            assert evoff.shape == ievents.shape, "The number of event offsets provided does not match the number of events."

        # Lengths: same convention.  'mevlen' is the extent of the offset axis
        # needed to hold every event once all are aligned on the largest offset.
        if not hasattr(evlen, '__len__'):
            mevlen = evlen
            evlen = np.ones(nevents, 'i') * mevlen
        else:
            evlen = np.array(evlen)
            mevlen = (evlen - evoff + mevoff).max()
            assert evlen.shape == ievents.shape, "The number of event lengths provided does not match the number of events."

        # Construct the two replacement axes.
        from pygeode.timeaxis import Time, Yearless
        from pygeode import timeutils
        event_axis = Event(np.arange(nevents) + 1, indices=ievents)
        offsets = np.arange(-mevoff, mevlen - mevoff)
        if isinstance(orig_axis, Time):
            # Time axes get an offset axis expressed in the original units,
            # scaled by the step reported by timeutils.delta.
            units = orig_axis.units
            step = timeutils.delta(orig_axis, units=units)
            offset_axis = Yearless(values=step * offsets,
                                   units=units,
                                   startdate={'day': 0})
        else:
            offset_axis = Offset(offsets)
        new_axes = var.axes[:iaxis] + (event_axis, offset_axis) + var.axes[iaxis + 1:]

        # Record the source variable and the per-event bookkeeping.
        # Note that self.evlens / self.evoffs are the same array objects as
        # the locals evlen / evoff, and are adjusted in place below.
        self.var = var
        self.iaxis = iaxis
        self.evlens = evlen
        self.mevlen = mevlen
        self.evoffs = evoff
        self.mevoff = mevoff

        # Adjust events whose window runs off either end of the original axis.
        for i, (iev, el, eo) in enumerate(zip(ievents, evlen, evoff)):
            start = iev - eo
            if start < 0:
                # NOTE(review): 'start' is negative here, so the offset shrinks
                # but the length *grows* by the overshoot - confirm intended.
                self.evoffs[i] += start
                self.evlens[i] -= start
            # NOTE(review): this test uses the unadjusted start/length even if
            # the branch above just modified them - confirm intended.
            if start + el >= len(orig_axis):
                self.evlens[i] = len(orig_axis) - start

        Var.__init__(self,
                     new_axes,
                     dtype=var.dtype,
                     name=var.name,
                     atts=var.atts,
                     plotatts=var.plotatts)
Ejemplo n.º 2
0
 def delta (self, units=None):
   """Deprecated: forwards to ``pygeode.timeutils.delta(self, units)``.

   Emits a warning on every call and returns whatever the timeutils
   function returns.
   """
   from pygeode.timeutils import delta as _timeutils_delta
   import warnings
   warnings.warn ("Deprecated.  Use timeutils module.")
   return _timeutils_delta(self, units)
Ejemplo n.º 3
0
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs):
# {{{
  ''' Returns a :class:`Dataset` containing variables merged across many files.

  Parameters
  ==========
  files : string, list, or tuple
    Either a single filename or a list of filenames. Wildcards are supported, :func:`glob.iglob` is
    used to expand these into an explicit list of files.

  format : string, optional
    String specifying format of file to open. If none is given the format will be automatically
    detected from the first filename (see :func:`autodetectformat`)

  opener : function, optional
    Function to open individual files. If none is provided, uses the
    format-specific version of :func:`open`. The datasets returned by this
    function are then concatenated and returned. See Notes.

  pattern : string, optional
    A regex pattern to extract date stamps from the filename; used by default file2date.
    Matching patterns must be named <year>, <month>, <day>, <hour> or <minute>.
    Abbreviations are available for the above; $Y matches a four digit year, $m, $d, $H,
    and $M match a two-digit month, day, hour and minute, respectively.
    Named groups that do not participate in a match are ignored; hour, minute
    and second default to 0 when the pattern does not supply them.

  file2date : function, optional
    Function which returns a date dictionary given a filename. By default this is produced
    by applying the regex pattern ``pattern`` to the filename.

  **kwargs : keyword arguments
    These are passed on to the function ``opener``;

  Returns
  =======
  dataset
    A dataset containing the variables concatenated across all specified files.
    The variable data itself is not loaded into memory.
    If only a single file is matched, the result of ``opener`` on that file is
    returned directly.

  Raises
  ======
  ValueError
    If ``format`` names a module that cannot be imported from ``pygeode.formats``.
  AssertionError
    If no files are found, if neither ``file2date`` nor ``pattern`` is given,
    if ``pattern`` does not match a filename, or if the first and last files
    are inconsistent (different variables, time units, or a non-integer
    number of timesteps).

  Notes
  =====
  This is intended to provide access to large datasets whose files are
  separated by timestep.  To avoid opening every file individually, the time
  axis is constructed by opening the first and the last file in the list of
  files provided. This is done to provide a template of what variables and what
  times are stored in each file - it is assumed that the number of timesteps
  (and their offsets) is the same across the whole dataset. The time axis is
  then constructed from the filenames themselves, using the function
  ``file2date`` to generate a date from each filename. As a result only two files
  need to be opened, which makes this a very efficient way to work with very large
  datasets.

  The files are always re-ordered by the dates extracted from their names
  before the first and last files are chosen.

  However, no explicit check is made of the integrity of the files - if there
  are corrupt or missing data within individual files, this will not become
  clear until that data is actually accessed. This can be done explicitly with
  :func:`check_dataset`, which explicitly attempts to access all the data and
  returns a list of any problems encountered; this can take a long time, but is
  a useful check (and is more likely to provide helpful error messages).

  The function ``opener`` must take a single positional argument - the filename
  of the file to open - and keyword arguments that are passed through from this
  function. It must return a :class:`Dataset` object with the loaded variables.
  By default the standard :func:`open` is used, but providing a custom opener
  can be useful for any reshaping of the variables that must be done prior to
  concatenating the whole dataset.

  See Also
  ========
  open
  openall
  '''

  from pygeode.timeaxis import Time, StandardTime
  from pygeode.timeutils import reltime, delta
  from pygeode.dataset import Dataset
  from pygeode.tools import common_dict
  from pygeode.formats import autodetectformat
  import numpy as np

  files = expand_file_list(files)
  nfiles = len(files)
  assert nfiles > 0, "no input files were found"

  if opener is None:
    if format is None: format = autodetectformat(files[0])

    # 'format' may already be a module-like object (anything with an 'open'
    # attribute); otherwise treat it as the name of a pygeode.formats submodule.
    if not hasattr(format, 'open'):
      try:
        format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
      except ImportError:
        raise ValueError('Unrecognized format module %s.' % format)

    opener = format.open

  # Bind the extra keyword arguments so that 'opener' takes only a filename
  if kwargs:
    old_opener = opener
    opener = lambda f: old_opener (f, **kwargs)


  # Degenerate case: only one file was given
  if nfiles == 1: return opener(files[0])


  # We'll need a function to translate filenames to dates
  # (if we don't have one, use the supplied pattern to make one)
  if file2date is None:
    import re
    assert pattern is not None, "I don't know how to get the dates from the filenames"
    regex = pattern
    # Expand the strftime-like abbreviations into named regex groups
    for abbrev, group in (('$Y', '(?P<year>[0-9]{4})'),
                          ('$m', '(?P<month>[0-9]{2})'),
                          ('$d', '(?P<day>[0-9]{2})'),
                          ('$H', '(?P<hour>[0-9]{2})'),
                          ('$M', '(?P<minute>[0-9]{2})')):
      regex = regex.replace(abbrev, group)
    regex = re.compile(regex)
    def file2date (f):
      m = regex.search(f)
      assert m is not None, "can't use the pattern on the filenames?"
      # Optional groups that did not take part in the match come back as
      # None from groupdict(); skip them rather than failing in int().
      d = {k: int(v) for k, v in m.groupdict().items() if v is not None}
      # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
      d = dict({'hour':0, 'minute':0,'second':0}, **d)
      return d


  # Get the starting date of each file, as one list per date field
  dates = [file2date(f) for f in files]
  dates = {k: [d[k] for d in dates] for k in dates[0]}

  # Open a file to find out which kind of time axis the data uses
  # (fall back to StandardTime if no variable has a time axis)
  probe = opener(files[0])
  T = next((type(v.getaxis(Time)) for v in probe.vars if v.hasaxis(Time)), StandardTime)
  del probe

  # Generate a lower-resolution time axis (the start of *each* file)
  faxis = T(units='days',**dates)

  # Re-sort the files, if they weren't in order
  S = faxis.argsort()
  faxis = faxis.slice[S]
  files = [files[s] for s in S]
  # Re-init the faxis to force the proper start date
  faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

  # Open the first and last file, so we know what the variables & timesteps are
  first = opener(files[0])
  last  = opener(files[-1])
  names = [v.name for v in first.vars]
  for n in names: assert n in last, "inconsistent vars"
  # Get global attributes
  global_atts = common_dict (first.atts, last.atts)

  #---
  # Map each time axis name found in the files to the full (multi-file) axis.
  # The None key holds the per-file axis itself.
  timedict = {None:faxis}
  for v1 in first:
    if not v1.hasaxis(Time): continue
    t1 = v1.getaxis(Time)
    if t1.name in timedict: continue  # already handled this one
    t2 = last[v1.name].getaxis(Time)
    # Construct a full time axis from these pieces

    # One timestep per file? (check for an offset for the var time compared
    #  to the file time)
    if max(len(t1),len(t2)) == 1:
      offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
      taxis = faxis.withnewvalues(faxis.values + offset)
    # At least one of first/last files has multiple timesteps?
    else:
      assert t1.units == t2.units
      dt = max(delta(t1),delta(t2))
      assert dt > 0
      val1 = t1.values[0]
      val2 = reltime(t2, startdate=t1.startdate)[-1]
      # Number of evenly spaced steps from the first value of the first file
      # to the last value of the last file; must come out as a whole number.
      nt = (val2-val1)/dt + 1
      assert round(nt) == nt
      nt = int(round(nt))
      assert nt > 0
      taxis = t1.withnewvalues(np.arange(nt)*dt + val1)

    timedict[t1.name] = taxis

  #---

  # Create the multifile version of the vars
  multifile_vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]


  return Dataset(multifile_vars,atts=global_atts)
Ejemplo n.º 4
0
 def delta (self, units=None):
   """Deprecated: forwards to ``pygeode.timeutils.delta(self, units)``.

   Emits a warning on every call and returns whatever the timeutils
   function returns.
   """
   from pygeode.timeutils import delta as _timeutils_delta
   import warnings
   warnings.warn ("Deprecated.  Use timeutils module.")
   return _timeutils_delta(self, units)
Ejemplo n.º 5
0
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs):
# {{{
  ''' Returns a :class:`Dataset` containing variables merged across many files.

  Parameters
  ==========
  files : string, list, or tuple
    Either a single filename or a list of filenames. Wildcards are supported, :func:`glob.iglob` is
    used to expand these into an explicit list of files.

  format : string, optional
    String specifying format of file to open. If none is given the format will be automatically
    detected from the first filename (see :func:`autodetectformat`)

  opener : function, optional
    Function to open individual files. If none is provided, uses the
    format-specific version of :func:`open`. The datasets returned by this
    function are then concatenated and returned. See Notes.

  pattern : string, optional
    A regex pattern to extract date stamps from the filename; used by default file2date.
    Matching patterns must be named <year>, <month>, <day>, <hour> or <minute>.
    Abbreviations are available for the above; $Y matches a four digit year, $m, $d, $H,
    and $M match a two-digit month, day, hour and minute, respectively.
    Named groups that do not participate in a match are ignored; hour, minute
    and second default to 0 when the pattern does not supply them.

  file2date : function, optional
    Function which returns a date dictionary given a filename. By default this is produced
    by applying the regex pattern ``pattern`` to the filename.

  **kwargs : keyword arguments
    These are passed on to the function ``opener``;

  Returns
  =======
  dataset
    A dataset containing the variables concatenated across all specified files.
    The variable data itself is not loaded into memory.
    If only a single file is matched, the result of ``opener`` on that file is
    returned directly.

  Raises
  ======
  ValueError
    If ``format`` names a module that cannot be imported from ``pygeode.formats``.
  AssertionError
    If no files are found, if neither ``file2date`` nor ``pattern`` is given,
    if ``pattern`` does not match a filename, or if the first and last files
    are inconsistent (different variables, time units, or a non-integer
    number of timesteps).

  Notes
  =====
  This is intended to provide access to large datasets whose files are
  separated by timestep.  To avoid opening every file individually, the time
  axis is constructed by opening the first and the last file in the list of
  files provided. This is done to provide a template of what variables and what
  times are stored in each file - it is assumed that the number of timesteps
  (and their offsets) is the same across the whole dataset. The time axis is
  then constructed from the filenames themselves, using the function
  ``file2date`` to generate a date from each filename. As a result only two files
  need to be opened, which makes this a very efficient way to work with very large
  datasets.

  The files are always re-ordered by the dates extracted from their names
  before the first and last files are chosen.

  However, no explicit check is made of the integrity of the files - if there
  are corrupt or missing data within individual files, this will not become
  clear until that data is actually accessed. This can be done explicitly with
  :func:`check_multi`, which explicitly attempts to access all the data and
  returns a list of any problems encountered; this can take a long time, but is
  a useful check (and is more likely to provide helpful error messages).

  The function ``opener`` must take a single positional argument - the filename
  of the file to open - and keyword arguments that are passed through from this
  function. It must return a :class:`Dataset` object with the loaded variables.
  By default the standard :func:`open` is used, but providing a custom opener
  can be useful for any reshaping of the variables that must be done prior to
  concatenating the whole dataset.

  See Also
  ========
  open
  openall
  '''

  from pygeode.timeaxis import Time, StandardTime
  from pygeode.timeutils import reltime, delta
  from pygeode.dataset import Dataset
  from pygeode.tools import common_dict
  from pygeode.formats import autodetectformat
  import numpy as np

  files = expand_file_list(files)
  nfiles = len(files)
  assert nfiles > 0, "no input files were found"

  if opener is None:
    if format is None: format = autodetectformat(files[0])

    # 'format' may already be a module-like object (anything with an 'open'
    # attribute); otherwise treat it as the name of a pygeode.formats submodule.
    if not hasattr(format, 'open'):
      try:
        format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
      except ImportError:
        raise ValueError('Unrecognized format module %s.' % format)

    opener = format.open

  # Bind the extra keyword arguments so that 'opener' takes only a filename
  if kwargs:
    old_opener = opener
    opener = lambda f: old_opener (f, **kwargs)


  # Degenerate case: only one file was given
  if nfiles == 1: return opener(files[0])


  # We'll need a function to translate filenames to dates
  # (if we don't have one, use the supplied pattern to make one)
  if file2date is None:
    import re
    assert pattern is not None, "I don't know how to get the dates from the filenames"
    regex = pattern
    # Expand the strftime-like abbreviations into named regex groups
    for abbrev, group in (('$Y', '(?P<year>[0-9]{4})'),
                          ('$m', '(?P<month>[0-9]{2})'),
                          ('$d', '(?P<day>[0-9]{2})'),
                          ('$H', '(?P<hour>[0-9]{2})'),
                          ('$M', '(?P<minute>[0-9]{2})')):
      regex = regex.replace(abbrev, group)
    regex = re.compile(regex)
    def file2date (f):
      m = regex.search(f)
      assert m is not None, "can't use the pattern on the filenames?"
      # Optional groups that did not take part in the match come back as
      # None from groupdict(); skip them rather than failing in int().
      d = {k: int(v) for k, v in m.groupdict().items() if v is not None}
      # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
      d = dict({'hour':0, 'minute':0,'second':0}, **d)
      return d


  # Get the starting date of each file, as one list per date field
  dates = [file2date(f) for f in files]
  dates = {k: [d[k] for d in dates] for k in dates[0]}

  # Open a file to find out which kind of time axis the data uses
  # (fall back to StandardTime if no variable has a time axis)
  probe = opener(files[0])
  T = next((type(v.getaxis(Time)) for v in probe.vars if v.hasaxis(Time)), StandardTime)
  del probe

  # Generate a lower-resolution time axis (the start of *each* file)
  faxis = T(units='days',**dates)

  # Re-sort the files, if they weren't in order
  S = faxis.argsort()
  faxis = faxis.slice[S]
  files = [files[s] for s in S]
  # Re-init the faxis to force the proper start date
  faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

  # Open the first and last file, so we know what the variables & timesteps are
  first = opener(files[0])
  last  = opener(files[-1])
  names = [v.name for v in first.vars]
  for n in names: assert n in last, "inconsistent vars"
  # Get global attributes
  global_atts = common_dict (first.atts, last.atts)

  #---
  # Map each time axis name found in the files to the full (multi-file) axis.
  # The None key holds the per-file axis itself.
  timedict = {None:faxis}
  for v1 in first:
    if not v1.hasaxis(Time): continue
    t1 = v1.getaxis(Time)
    if t1.name in timedict: continue  # already handled this one
    t2 = last[v1.name].getaxis(Time)
    # Construct a full time axis from these pieces

    # One timestep per file? (check for an offset for the var time compared
    #  to the file time)
    if max(len(t1),len(t2)) == 1:
      offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
      taxis = faxis.withnewvalues(faxis.values + offset)
    # At least one of first/last files has multiple timesteps?
    else:
      assert t1.units == t2.units
      dt = max(delta(t1),delta(t2))
      assert dt > 0
      val1 = t1.values[0]
      val2 = reltime(t2, startdate=t1.startdate)[-1]
      # Number of evenly spaced steps from the first value of the first file
      # to the last value of the last file; must come out as a whole number.
      nt = (val2-val1)/dt + 1
      assert round(nt) == nt
      nt = int(round(nt))
      assert nt > 0
      taxis = t1.withnewvalues(np.arange(nt)*dt + val1)

    timedict[t1.name] = taxis

  #---

  # Create the multifile version of the vars
  multifile_vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]


  return Dataset(multifile_vars,atts=global_atts)