def __init__(self, var, iaxis, ievents, evlen, evoff=0, saxes=None, sindices=None): #{{{
    ''' Build a view of ``var`` in which axis ``iaxis`` is replaced by an
    event axis followed by an offset axis.

    Parameters
    ==========
    var :
      Source variable. Its ``axes``, ``dtype``, ``name``, ``atts`` and
      ``plotatts`` are read here.
    iaxis : int
      Position in ``var.axes`` of the axis being replaced.
    ievents : sequence of int
      Indices along that axis, one per event.
    evlen : number or sequence
      Window length, either one value for all events or one per event
      (must then have the same shape as ``ievents``).
    evoff : number or sequence, optional
      Window offset, either one value for all events or one per event
      (must then have the same shape as ``ievents``). The window for an
      event at index ``iev`` is treated below as starting at ``iev - evoff``.
    saxes, sindices :
      Accepted but not used by this constructor.

    Raises
    ======
    AssertionError
      If per-event offsets or lengths do not match the shape of ``ievents``.
    '''
    import numpy as np
    from pygeode.timeaxis import Time, Yearless
    from pygeode import timeutils

    ievents = np.array(ievents)
    n = ievents.shape[0]
    caxis = var.axes[iaxis]

    # Offsets: a scalar is broadcast to every event, a sequence is used as is.
    # mevoff is the largest offset and fixes the origin of the offset axis.
    if not hasattr(evoff, '__len__'):
        mevoff = evoff
        evoff = np.ones(n, 'i') * mevoff
    else:
        evoff = np.array(evoff)
        mevoff = evoff.max()
        assert evoff.shape == ievents.shape, "The number of event offsets provided does not match the number of events."

    # Lengths: same convention. With per-event lengths, mevlen is the extent
    # needed once every window is aligned on the largest offset.
    if not hasattr(evlen, '__len__'):
        mevlen = evlen
        evlen = np.ones(n, 'i') * mevlen
    else:
        evlen = np.array(evlen)
        mevlen = (evlen - evoff + mevoff).max()
        assert evlen.shape == ievents.shape, "The number of event lengths provided does not match the number of events."

    # Event axis is numbered from 1 and keeps the original indices alongside.
    event_axis = Event(np.arange(n) + 1, indices=ievents)

    # Offset axis runs from -mevoff up to (but excluding) mevlen - mevoff.
    # For a time axis the integer offsets are scaled by the axis time step.
    if isinstance(caxis, Time):
        units = caxis.units
        step = timeutils.delta(caxis, units=units)
        offset_axis = Yearless(values=step * np.arange(-mevoff, mevlen - mevoff), units=units, startdate={'day': 0})
    else:
        offset_axis = Offset(np.arange(-mevoff, mevlen - mevoff))

    axes = var.axes[:iaxis] + (event_axis, offset_axis) + var.axes[iaxis + 1:]

    # Record the pieces before initializing the base class.
    self.var = var
    self.iaxis = iaxis
    self.evlens = evlen
    self.mevlen = mevlen
    self.evoffs = evoff
    self.mevoff = mevoff

    # Windows that run off either end of the axis are adjusted in place rather
    # than rejected. self.evoffs / self.evlens are the same arrays as
    # evoff / evlen; el and eo are the values read before any adjustment of
    # entry i.
    naxis = len(caxis)
    for i, (iev, el, eo) in enumerate(zip(ievents, evlen, evoff)):
        start = iev - eo
        if start < 0:
            # NOTE(review): start is negative here, so "-= start" makes the
            # stored length larger - confirm this is what the reader of
            # evlens expects.
            self.evoffs[i] += start
            self.evlens[i] -= start
        if start + el >= naxis:
            self.evlens[i] = naxis - start
        #assert iev - eo >= 0 and iev - eo + el < len(caxis), \
            #'Event %d (i: %d) is not fully defined' % (np.where(ievents==iev)[0][0], iev)

    Var.__init__(self, axes, dtype=var.dtype, name=var.name, atts=var.atts, plotatts=var.plotatts)
def delta (self, units=None):
    ''' Deprecated; forwards to ``timeutils.delta(self, units)``.

    Parameters
    ==========
    units : optional
      Passed through unchanged as the second argument of
      ``timeutils.delta``.

    Returns
    =======
    Whatever ``timeutils.delta`` returns for this object.
    '''
    from pygeode import timeutils
    from warnings import warn
    # stacklevel=2 attributes the warning to the code that called this method,
    # so each outdated call site is reported (and reported separately) instead
    # of every call being pinned to this one line.
    warn ("Deprecated. Use timeutils module.", stacklevel=2)
    return timeutils.delta(self, units)
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs): # {{{
    ''' Returns a :class:`Dataset` containing variables merged across many files.

    Parameters
    ==========
    files : string, list, or tuple
      Either a single filename or a list of filenames. Wildcards are supported,
      :func:`glob.iglob` is used to expand these into an explicit list of files.

    format : string, optional
      String specifying format of file to open. If none is given the format
      will be automatically detected from the first filename (see
      :func:`autodetectformat`)

    opener : function, optional
      Function to open individual files. If none is provided, uses the
      format-specific version of :func:`open`. The datasets returned by this
      function are then concatenated and returned. See Notes.

    pattern : string, optional
      A regex pattern to extract date stamps from the filename; used by default
      file2date. Matching patterns must be named <year>, <month>, <day>, <hour>
      or <minute>. Abbreviations are available for the above; $Y matches a four
      digit year, $m, $d, $H, and $M match a two-digit month, day, hour and
      minute, respectively. Named groups that do not take part in a match are
      ignored; hour, minute and second default to 0.

    file2date : function, optional
      Function which returns a date dictionary given a filename. By default
      this is produced by applying the regex pattern ``pattern`` to the
      filename.

    **kwargs : keyword arguments
      These are passed on to the function ``opener``;

    Returns
    =======
    dataset
      A dataset containing the variables concatenated across all specified
      files. The variable data itself is not loaded into memory. If only one
      file is given, the result of ``opener`` on that file is returned
      directly.

    Raises
    ======
    ValueError
      If ``format`` names a module that cannot be imported from
      ``pygeode.formats``.
    AssertionError
      If no files are given, if neither ``file2date`` nor ``pattern`` is
      supplied, if ``pattern`` does not match a filename, or if the first and
      last files are inconsistent with each other.

    Notes
    =====
    This is intended to provide access to large datasets whose files are
    separated by timestep. To avoid opening every file individually, the time
    axis is constructed by opening the first and the last file in the list of
    files provided. This is done to provide a template of what variables and
    what times are stored in each file - it is assumed that the number of
    timesteps (and their offsets) is the same across the whole dataset. The
    time axis is then constructed from the filenames themselves, using the
    function ``file2date`` to generate a date from each filename. As a result
    only two files need to be opened, which makes this a very efficient way to
    work with very large datasets.

    The files are always re-ordered by the dates obtained from ``file2date``
    before the first and last files are opened.

    However, no explicit check is made of the integrity of the files - if there
    are corrupt or missing data within individual files, this will not become
    clear until that data is actually accessed. This can be done explicitly
    with :func:`check_dataset`, which explicitly attempts to access all the
    data and returns a list of any problems encountered; this can take a long
    time, but is a useful check (and is more likely to provide helpful error
    messages).

    The function ``opener`` must take a single positional argument - the
    filename of the file to open - and keyword arguments that are passed
    through from this function. It must return a :class:`Dataset` object with
    the loaded variables. By default the standard :func:`open` is used, but
    providing a custom opener can be useful for any reshaping of the variables
    that must be done prior to concatenating the whole dataset.

    See Also
    ========
    open
    openall
    '''
    from pygeode.timeaxis import Time, StandardTime
    from pygeode.timeutils import reltime, delta
    from pygeode.dataset import Dataset
    from pygeode.tools import common_dict
    from pygeode.formats import open, autodetectformat
    import numpy as np

    files = expand_file_list(files)
    nfiles = len(files)
    assert nfiles > 0

    if opener is None:
        if format is None: format = autodetectformat(files[0])
        # A format given by name is resolved to its module; anything that
        # already provides 'open' is used as is.
        if not hasattr(format, 'open'):
            try:
                format = __import__("pygeode.formats.%s" % format, fromlist=["pygeode.formats"])
            except ImportError:
                raise ValueError('Unrecognized format module %s.' % format)
        opener = format.open

    # Apply keyword arguments
    if len(kwargs) > 0:
        old_opener = opener
        opener = lambda f: old_opener (f, **kwargs)

    # Degenerate case: only one file was given
    if nfiles == 1: return opener(files[0])

    # We'll need a function to translate filenames to dates
    # (if we don't have one, use the supplied pattern to make one)
    if file2date is None:
        import re
        assert pattern is not None, "I don't know how to get the dates from the filenames"
        regex = pattern
        regex = regex.replace('$Y', '(?P<year>[0-9]{4})')
        regex = regex.replace('$m', '(?P<month>[0-9]{2})')
        regex = regex.replace('$d', '(?P<day>[0-9]{2})')
        regex = regex.replace('$H', '(?P<hour>[0-9]{2})')
        regex = regex.replace('$M', '(?P<minute>[0-9]{2})')
        regex = re.compile(regex)
        def file2date (f):
            d = regex.search(f)
            assert d is not None, "can't use the pattern on the filenames?"
            d = d.groupdict()
            # .items() works on Python 2 and 3 (iteritems is Python 2 only).
            # Groups that did not participate in the match come back as None
            # and are dropped so that the defaults below can apply.
            d = dict([k,int(v)] for k,v in d.items() if v is not None)
            # Apply default values (i.e. for minutes, seconds if they're not in the file format?)
            d = dict({'hour':0, 'minute':0,'second':0}, **d)
            return d

    # Get the starting date of each file, then regroup into one list per field
    dates = [file2date(f) for f in files]
    dates = dict((k,[d[k] for d in dates]) for k in list(dates[0].keys()))

    # Open a file to get a time axis type (fall back to StandardTime if no
    # variable in it has a time axis)
    sample = opener(files[0])
    T = None
    for v in sample.vars:
        if v.hasaxis(Time):
            T = type(v.getaxis(Time))
            break
    if T is None: T = StandardTime
    del sample

    # Generate a lower-resolution time axis (the start of *each* file)
    faxis = T(units='days',**dates)

    # Re-sort the files, if they weren't in order
    S = faxis.argsort()
    faxis = faxis.slice[S]
    files = [files[s] for s in S]

    # Re-init the faxis to force the proper start date
    faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

    # Open the first and last file, so we know what the variables & timesteps are
    first = opener(files[0])
    last = opener(files[-1])
    names = [v.name for v in first.vars]
    for n in names: assert n in last, "inconsistent vars"

    # Get global attributes
    global_atts = common_dict (first.atts, last.atts)

    #---
    # Map each time axis name found in the files to a full-length time axis.
    # The None key holds the per-file axis itself.
    timedict = {None:faxis}
    for v1 in first:
        if not v1.hasaxis(Time): continue
        t1 = v1.getaxis(Time)
        if t1.name in timedict: continue  # already handled this one
        t2 = last[v1.name].getaxis(Time)
        # Construct a full time axis from these pieces

        # One timestep per file?  (check for an offset for the var time compared
        #  to the file time)
        if max(len(t1),len(t2)) == 1:
            offset = reltime(t1, startdate=faxis.startdate, units=faxis.units)[0]
            taxis = faxis.withnewvalues(faxis.values + offset)
        # At least one of first/last files has multiple timesteps?
        else:
            assert t1.units == t2.units
            dt = max(delta(t1),delta(t2))
            assert dt > 0
            val1 = t1.values[0]
            val2 = reltime(t2, startdate=t1.startdate)[-1]
            # Number of steps from the first value in the first file to the
            # last value in the last file; it must be a whole number.
            nt = (val2-val1)/dt + 1
            assert round(nt) == nt
            nt = int(round(nt))
            assert nt > 0
            taxis = t1.withnewvalues(np.arange(nt)*dt + val1)

        timedict[t1.name] = taxis
    #---

    # Create the multifile version of the vars
    vars = [Multifile_Var(v1, opener, files, faxis, timedict) for v1 in first]

    return Dataset(vars,atts=global_atts)
def open_multi (files, format=None, opener=None, pattern=None, file2date=None, **kwargs): # {{{
    ''' Returns a :class:`Dataset` whose variables span a whole series of files.

    Parameters
    ==========
    files : string, list, or tuple
      A filename or a list of filenames; wildcards are expanded into an
      explicit list of files (see :func:`glob.iglob`).

    format : string, optional
      Format of the files. When omitted it is detected from the first filename
      (see :func:`autodetectformat`). Only consulted when ``opener`` is not
      given.

    opener : function, optional
      Function used to open each individual file. Defaults to the ``open``
      function of the selected format. See Notes.

    pattern : string, optional
      Regex used by the default ``file2date`` to pull a date out of each
      filename. Named groups must be called <year>, <month>, <day>, <hour> or
      <minute>. As a shorthand, $Y stands for a four-digit year, and $m, $d,
      $H and $M for a two-digit month, day, hour and minute.

    file2date : function, optional
      Function mapping a filename to a date dictionary. When omitted, one is
      built from ``pattern``; it fills in 0 for hour, minute and second when
      they are absent.

    **kwargs : keyword arguments
      Forwarded to ``opener`` on every call.

    Returns
    =======
    dataset
      A dataset with one multi-file variable per variable of the first file.
      With a single input file, the result of ``opener`` on that file is
      returned directly.

    Notes
    =====
    Meant for large datasets split into one file per period of time. Only the
    first and last files (in date order) are opened to learn which variables
    and timesteps each file holds; the layout of every other file is assumed
    to be the same, and the full time axis is derived from the dates returned
    by ``file2date``. The files are re-ordered by those dates.

    Nothing here verifies the files in between: corrupt or missing data only
    shows up when it is actually read. :func:`check_multi` performs that check
    explicitly by accessing all of the data; it can take a long time, but is
    more likely to give helpful error messages.

    ``opener`` takes the filename as its single positional argument, plus the
    keyword arguments given to this function, and must return a
    :class:`Dataset`. A custom opener is the place to reshape variables
    before they are joined across files.

    See Also
    ========
    open
    openall
    '''
    from pygeode.timeaxis import Time, StandardTime
    from pygeode.timeutils import reltime, delta
    from pygeode.dataset import Dataset
    from pygeode.tools import common_dict
    from pygeode.formats import open, autodetectformat
    import numpy as np

    files = expand_file_list(files)
    nfiles = len(files)
    assert nfiles > 0

    # Work out how individual files get opened.
    if opener is None:
        fmt = format
        if fmt is None:
            fmt = autodetectformat(files[0])
        if not hasattr(fmt, 'open'):
            # Given by name: look the module up under pygeode.formats.
            try:
                fmt = __import__("pygeode.formats.%s" % fmt, fromlist=["pygeode.formats"])
            except ImportError:
                raise ValueError('Unrecognized format module %s.' % fmt)
        opener = fmt.open

    # Bake the extra keyword arguments into the opener.
    if kwargs:
        bare_opener = opener
        opener = lambda f: bare_opener (f, **kwargs)

    # Nothing to merge when there is a single file.
    if nfiles == 1:
        return opener(files[0])

    # Without a user-supplied file2date, derive one from the pattern.
    if file2date is None:
        import re
        assert pattern is not None, "I don't know how to get the dates from the filenames"
        shorthands = (
            ('$Y', '(?P<year>[0-9]{4})'),
            ('$m', '(?P<month>[0-9]{2})'),
            ('$d', '(?P<day>[0-9]{2})'),
            ('$H', '(?P<hour>[0-9]{2})'),
            ('$M', '(?P<minute>[0-9]{2})'),
        )
        expanded = pattern
        for short, group in shorthands:
            expanded = expanded.replace(short, group)
        regex = re.compile(expanded)

        def file2date (f):
            match = regex.search(f)
            assert match is not None, "can't use the pattern on the filenames?"
            # Start from the defaults, then overlay whatever the filename
            # provided (groups that did not match are skipped).
            fields = {'hour':0, 'minute':0,'second':0}
            fields.update({k: int(v) for k, v in match.groupdict().items() if v is not None})
            return fields

    # One date dictionary per file, regrouped into one list per date field.
    per_file = [file2date(f) for f in files]
    dates = {k: [d[k] for d in per_file] for k in list(per_file[0].keys())}

    # Peek into a file to learn which kind of time axis is in use; fall back
    # to StandardTime when no variable has one.
    probe = opener(files[0])
    for v in probe.vars:
        if v.hasaxis(Time):
            T = type(v.getaxis(Time))
            break
    else:
        T = StandardTime
    del probe

    # Coarse time axis: one entry per file, at that file's date.
    faxis = T(units='days',**dates)

    # Put both the axis and the file list into date order.
    order = faxis.argsort()
    faxis = faxis.slice[order]
    files = [files[j] for j in order]

    # Rebuild the axis from its auxiliary arrays to force the proper start date.
    faxis = type(faxis)(units=faxis.units, **faxis.auxarrays)

    # The two end files serve as the template for everything in between.
    first = opener(files[0])
    last = opener(files[-1])
    for v in first.vars:
        assert v.name in last, "inconsistent vars"

    # Keep only the global attributes the two files agree on.
    global_atts = common_dict (first.atts, last.atts)

    # Full-length time axes, keyed by axis name (None -> the per-file axis).
    timedict = {None:faxis}
    for var_first in first:
        if not var_first.hasaxis(Time):
            continue
        t_first = var_first.getaxis(Time)
        if t_first.name in timedict:
            continue  # this axis has been dealt with
        t_last = last[var_first.name].getaxis(Time)

        if max(len(t_first),len(t_last)) != 1:
            # Several timesteps in at least one of the end files: span from
            # the first value of the first file to the last value of the last
            # file at a fixed step.
            assert t_first.units == t_last.units
            step = max(delta(t_first),delta(t_last))
            assert step > 0
            start_val = t_first.values[0]
            end_val = reltime(t_last, startdate=t_first.startdate)[-1]
            count = (end_val-start_val)/step + 1
            assert round(count) == count
            count = int(round(count))
            assert count > 0
            full_axis = t_first.withnewvalues(np.arange(count)*step + start_val)
        else:
            # A single timestep per file: reuse the per-file axis, shifted by
            # the variable's time relative to the file date.
            shift = reltime(t_first, startdate=faxis.startdate, units=faxis.units)[0]
            full_axis = faxis.withnewvalues(faxis.values + shift)

        timedict[t_first.name] = full_axis

    # Wrap every variable of the first file in its multi-file counterpart.
    merged = [Multifile_Var(v, opener, files, faxis, timedict) for v in first]
    return Dataset(merged,atts=global_atts)