def __iter__(self):
  """
  Processes the records into multidimensional variables.
  Iterates over (name, atts, axes, array) tuples.
  Note that array may not be a true numpy array (values are not yet loaded
  in memory).  To load the array, pass it to numpy.asarray().

  Deprecated - use .to_xarray() to get a multidimensional structure.
  """
  from warnings import warn
  warn ("Iterating over a Buffer is deprecated. Use .to_xarray() to access the multidimensional data.", stacklevel=2)
  from fstd2nc.mixins import _iter_type, _var_type
  self._makevars()
  for var in self._iter_objects():
    if isinstance(var, _iter_type):
      array = _Array(self, var)
      var = _var_type(var.name, var.atts, var.axes, array)
    yield var
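# A minimal usage sketch of the two access paths described above.  The input
# file name is hypothetical, and .to_xarray() additionally requires the xarray
# and dask packages to be installed.
import fstd2nc

buf = fstd2nc.Buffer('model_output.fst')
# Preferred: build a lazily-loaded xarray.Dataset.
dataset = buf.to_xarray()
# Deprecated path: iterate over the (name, atts, axes, array) structures.
for var in buf:
  print(var.name, var.atts.get('units'))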
def _makevars(self):
  from fstd2nc.mixins import _var_type, _axis_type, _dim_type
  from fstd2nc.mixins.dates import stamp2datetime
  from collections import OrderedDict
  import numpy as np
  from rpnpy.librmn.fstd98 import fstlir

  forecast_axis = None       # To attach the forecast axis.
  station = None             # To attach the station names as coordinates.
  momentum = thermo = None   # To attach the vertical axes.

  super(Series, self)._makevars()

  # Get station and forecast info.
  # Need to read from original records, because this info isn't in the
  # data stream.
  station_header = fstlir(self._meta_funit, nomvar='STNS')
  if station_header is not None:
    array = station_header['d'].transpose()
    # Re-cast array as string.
    # I don't know why I have to subtract 128 - maybe something to do with
    # how the characters are encoded in the file?
    # This isn't always needed.  Have test files for both cases.
    # Need help making this more robust!
    if array.flatten()[0] >= 128:
      array -= 128
    array = array.view('|S1')
    nstations, strlen = array.shape
    array = array.flatten().view('|S%d' % strlen)
    # Strip out trailing whitespace.
    # Python3: convert bytes to str
    array[:] = [str(arr.decode()).rstrip() for arr in array]
    array = array.view('|S1').reshape(nstations, strlen)
    station_id = _dim_type('station_id', nstations)
    station_strlen = _dim_type('station_strlen', strlen)
    # Encode it as 2D character array for netCDF file output.
    station = _var_type('station', {}, [station_id, station_strlen], array)

  # Create forecast axis.
  forecast_header = fstlir(self._meta_funit, nomvar='HH')
  if forecast_header is not None:
    atts = OrderedDict(units='hours')
    # Note: the information in 'HH' is actually the hour of validity.
    # Need to subtract the hour from the date of origin in order to get
    # the leadtime.
    starting_hour = stamp2datetime(forecast_header['dateo']).hour
    array = forecast_header['d'].flatten() - starting_hour
    forecast_timedelta = np.array(array * 3600, 'timedelta64[s]')
    forecast_axis = _axis_type('forecast', atts, array)

  # Extract vertical coordinates.
  for vertvar in ('SH', 'SV'):
    header = fstlir(self._meta_funit, nomvar=vertvar)
    if header is None:
      continue
    array = header['d'].squeeze()
    # Drop the top or bottom levels to match the profile data?
    if self._missing_bottom_profile_level:
      array = array[:-1]
    if array.ndim != 1:
      continue
    atts = OrderedDict(self._get_header_atts(header))
    if vertvar == 'SH':
      thermo = _axis_type('level', atts, array)
    if vertvar == 'SV':
      momentum = _axis_type('level', atts, array)

  # 'Y' data should be handled fine by _XYCoords - just give a more
  # specific name to the ni axis for clarity.
  for var in self._varlist:
    if var.atts.get('typvar') == 'T' and var.atts.get('grtyp') == 'Y':
      dims = var.dims
      iaxis = var.getaxis('i')
      if iaxis is not None and station is not None and len(iaxis) == station.shape[0]:
        var.axes[dims.index('i')] = station.axes[0]

  # Remove degenerate vertical axis for '+' data.
  # (The one coming from IP1, which is not used.)
  for var in self._varlist:
    if var.atts.get('typvar') == 'T' and var.atts.get('grtyp') == '+':
      dims = var.dims
      if 'level' in dims:
        var.record_id = var.record_id.squeeze(axis=dims.index('level'))
        var.axes.pop(dims.index('level'))

  # For '+' data, ni is the vertical level, and nj is the forecast.
  known_levels = dict()
  for var in self._varlist:
    if var.atts.get('typvar') != 'T':
      continue
    if var.atts.get('grtyp') != '+':
      continue
    dims = var.dims
    # The j dimension is actually the forecast time.
    jaxis = var.getaxis('j')
    if jaxis is not None and forecast_axis is not None and len(jaxis) == len(forecast_axis):
      var.axes[dims.index('j')] = forecast_axis
    # The i dimension is actually the vertical coordinate for this type of
    # data.
    iaxis = var.getaxis('i')
    if iaxis is not None:
      # If there's only 1 level (degenerate), then remove that dimension.
      if len(iaxis) == 1:
        var.axes.pop(dims.index('i'))
        continue
      # Try to map to thermodynamic or momentum levels.
      level = iaxis
      level.name = 'level'
      if var.name in self._momentum_vars and momentum is not None:
        if len(level) == len(momentum):
          level = momentum
        else:
          warn(_("Wrong number of momentum levels found in the data."))
      if var.name in self._thermo_vars and thermo is not None:
        if len(level) == len(thermo):
          level = thermo
        else:
          warn(_("Wrong number of thermodynamic levels found in the data."))
      if level is iaxis:
        warn(_("Unable to find the vertical coordinates for %s." % var.name))
        # Attach a generic level dimension.
        nlev = len(level)
        if nlev not in known_levels:
          known_levels[nlev] = _dim_type('level', nlev)
        level = known_levels[nlev]
      else:
        # Found vertical levels, now define the level kind so the VCoords
        # mixin can add more metadata.
        var.atts['kind'] = 5
      var.axes[dims.index('i')] = level

  # Some support for squashing forecasts.
  if getattr(self, '_squash_forecasts', False) is True:
    known_squashed_forecasts = dict()
    known_leadtimes = dict()
    known_reftimes = dict()
    for var in self._varlist:
      # Can only do this for a single date of origin, because the time
      # axis and forecast axis are not adjacent for this type of data.
      time = var.getaxis('time')
      forecast = var.getaxis('forecast')
      if time is None or forecast is None:
        continue
      if len(time) != 1:
        warn(_("Can't use datev for timeseries data with multiple dates of origin. Try re-running with the --dateo option."))
        continue
      var.record_id = var.record_id.squeeze(axis=var.dims.index('time'))
      var.axes.pop(var.dims.index('time'))
      key = (id(time), id(forecast))
      if key not in known_squashed_forecasts:
        time0 = time.array[0]
        # Convert pandas times (if using pandas for processing the headers)
        time0 = np.datetime64(time0, 's')
        # Calculate the date of validity
        forecast_timedelta = np.array(forecast.array * 3600, 'timedelta64[s]')
        squashed_times_array = time0 + forecast_timedelta
        time = _axis_type('time', OrderedDict([
                          ('standard_name', 'time'),
                          ('long_name', 'Validity time'),
                          ('axis', 'T')]), squashed_times_array)
        known_squashed_forecasts[key] = time
        # Include forecast and reftime auxiliary coordinates (emulate
        # what's done in the dates mixin)
        leadtime = _var_type('leadtime', OrderedDict([
                             ('standard_name', 'forecast_period'),
                             ('long_name', 'Lead time (since forecast_reference_time)'),
                             ('units', 'hours')]), [time], forecast.array)
        reftime = _var_type('reftime', OrderedDict([
                            ('standard_name', 'forecast_reference_time')]), {}, np.array(time0))
        known_leadtimes[key] = leadtime
        known_reftimes[key] = reftime
      var.axes[var.dims.index('forecast')] = known_squashed_forecasts[key]
      # Add leadtime and reftime as auxiliary coordinates.
      coords = var.atts.get('coordinates', [])
      coords.extend([known_leadtimes[key], known_reftimes[key]])
      var.atts['coordinates'] = coords

  # Hook in the station names as coordinate information.
  if station is not None:
    for station_id, varlist in self._iter_axes('station_id', varlist=True):
      # Try to use the provided station coordinate, if it has a consistent
      # length.
      if len(station_id) == station.shape[0]:
        for var in varlist:
          var.axes[var.dims.index('station_id')] = station.axes[0]
        station_id = station.axes[0]
        station_coord = station
      # Otherwise, need to construct a new coordinate with the subset of
      # stations used.
      # Assume station_ids start at 1 (not 0).
      else:
        indices = station_id.array - 1
        array = station.array[indices, :]
        # Use _axis_type instead of _dim_type to retain the station_id values.
        station_id = _axis_type('station_id', {}, station_id.array)
        axes = [station_id, station.axes[1]]
        station_coord = _var_type('station', {}, axes, array)
      # Attach the station as a coordinate.
      for var in varlist:
        coords = var.atts.get('coordinates', [])
        coords.append(station_coord)
        var.atts['coordinates'] = coords
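# Standalone sketch (numpy only) of the date arithmetic used in the
# forecast-squashing step above: the validity time is the date of origin plus
# the leadtime in hours, applied as a timedelta64 offset.  The sample values
# are invented for illustration.
import numpy as np

date_of_origin = np.datetime64('2020-01-01T00:00:00', 's')
leadtime_hours = np.array([0, 6, 12, 24])
validity_times = date_of_origin + np.array(leadtime_hours * 3600, 'timedelta64[s]')
# validity_times -> 2020-01-01T00:00, 06:00, 12:00 and 2020-01-02T00:00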
def _makevars(self):
  from fstd2nc.mixins import _iter_type, _var_type, _axis_type, _dim_type
  from collections import OrderedDict
  from rpnpy.librmn.interp import ezqkdef, EzscintError, ezget_nsubgrids, gdll, gdgaxes
  from rpnpy.librmn.all import readGrid, RMNError
  import numpy as np

  # Scan through the data, and look for any use of horizontal coordinates.
  grids = OrderedDict()
  gridmaps = OrderedDict()
  lats = OrderedDict()
  lons = OrderedDict()
  # Only output 1 copy of 1D coords (e.g. could have repetitions with
  # horizontal staggering).
  coords = set()

  super(XYCoords, self)._makevars()

  # Make sure any LA/LO records get processed first, so we can apply them as
  # coordinates to other variables.
  varlist = self._varlist
  varlist = [v for v in varlist if v.name in ('LA', 'LO')] + \
            [v for v in varlist if v.name not in ('LA', 'LO')]

  for var in varlist:
    # Don't touch variables with no horizontal grid.
    if all(a not in var.dims for a in ('i', 'j', 'station_id')):
      continue
    # Get grid parameters.
    ni = int(var.atts['ni'])
    nj = int(var.atts['nj'])
    grtyp = var.atts['grtyp']
    ig1 = int(var.atts['ig1'])
    ig2 = int(var.atts['ig2'])
    ig3 = int(var.atts['ig3'])
    ig4 = int(var.atts['ig4'])
    # Uniquely identify the grid for this variable.
    #
    # Use a looser identifier for timeseries data (ni/nj have different
    # meanings here (not grid-related), and could have multiple grtyp
    # values ('+','Y') that should share the same lat/lon info.
    if var.atts.get('typvar', '').strip() == 'T':
      key = ('T', ig1, ig2)
    else:
      key = (grtyp, ni, nj, ig1, ig2, ig3, ig4)
    if grtyp in ('Y', '+'):
      key = key[1:]

    # Check if we already defined this grid.
    if key not in grids:

      lat = lon = xaxis = yaxis = None

      # Check if GridMap recognizes this grid.
      if grtyp not in self._direct_grids:
        try:
          grd = readGrid(self._meta_funit, var.atts.copy())
          gmap = GridMap.gen_gmap(grd)
          gmapvar = gmap.gen_gmapvar()
          gridmaps[key] = gmapvar
          (xaxis, yaxis, gridaxes, lon, lat) = gmap.gen_xyll()
        except (TypeError, EzscintError, KeyError, RMNError, ValueError):
          pass  # Wasn't supported.

      # Otherwise, need to decode the information here.
      if lat is None or lon is None:

        latatts = OrderedDict()
        latatts['long_name'] = 'latitude'
        latatts['standard_name'] = 'latitude'
        latatts['units'] = 'degrees_north'
        lonatts = OrderedDict()
        lonatts['long_name'] = 'longitude'
        lonatts['standard_name'] = 'longitude'
        lonatts['units'] = 'degrees_east'

        latarray = lonarray = None
        try:
          # First, handle non-ezqkdef grids.
          if grtyp in self._direct_grids:
            latarray = self._find_coord(var, '^^')['d'].squeeze(axis=2)
            lonarray = self._find_coord(var, '>>')['d'].squeeze(axis=2)
          # Handle ezqkdef grids.
          else:
            gdid = ezqkdef(ni, nj, grtyp, ig1, ig2, ig3, ig4, self._meta_funit)
            ll = gdll(gdid)
            latarray = ll['lat']
            lonarray = ll['lon']
            xycoords = gdgaxes(gdid)
            ax = xycoords['ax'].transpose()
            ay = xycoords['ay'].transpose()
            # Convert from degenerate 2D arrays to 1D arrays.
            ax = ax[0, :]
            ay = ay[:, 0]
            xaxis = _axis_type('x', {'axis': 'X'}, ax)
            yaxis = _axis_type('y', {'axis': 'Y'}, ay)
        except (TypeError, EzscintError, KeyError, RMNError, ValueError):
          pass

        # Check for LA/LO variables, and use these as the coordinates if
        # nothing else is available.
        if latarray is None and var.name == 'LA':
          var.name = 'lat'
          var.atts.update(latatts)
          #grids[key] = list(var.axes)
          lats[key] = var
          continue
        if lonarray is None and var.name == 'LO':
          var.name = 'lon'
          var.atts.update(lonatts)
          grids[key] = list(var.axes)
          lons[key] = var
          continue

        if latarray is None or lonarray is None:
          warn(_("Unable to get lat/lon coordinates for '%s'") % var.name)
          continue

        # Construct lat/lon variables from latarray and lonarray.
        latarray = latarray.transpose()  # Switch from Fortran to C order.
        lonarray = lonarray.transpose()  # Switch from Fortran to C order.

        # Case 1: lat/lon can be resolved into 1D Cartesian coordinates.
        # Calculate the mean lat/lon arrays in double precision.
        meanlat = np.mean(np.array(latarray, dtype=float), axis=1, keepdims=True)
        meanlon = np.mean(np.array(lonarray, dtype=float), axis=0, keepdims=True)
        if latarray.shape[1] > 1 and lonarray.shape[1] > 1 and \
           np.allclose(latarray, meanlat) and np.allclose(lonarray, meanlon):
          # Reduce back to single precision for writing out.
          meanlat = np.array(meanlat, dtype=latarray.dtype).squeeze()
          meanlon = np.array(meanlon, dtype=lonarray.dtype).squeeze()
          # Ensure monotonicity of longitude field.
          # (gdll may sometimes wrap the last longitude to zero).
          # Taken from old fstd_core.c code.
          if meanlon[-2] > meanlon[-3] and meanlon[-1] < meanlon[-2]:
            meanlon[-1] += 360.
          latarray = meanlat
          lonarray = meanlon
          lat = _axis_type('lat', latatts, latarray)
          lon = _axis_type('lon', lonatts, lonarray)
          gridaxes = [lat, lon]

        # Case 2: lat/lon are series of points.
        elif latarray.shape[0] == 1 and lonarray.shape[0] == 1 and \
             ('i' in var.dims or 'station_id' in var.dims):
          latarray = latarray[0, :]
          lonarray = lonarray[0, :]
          # Special case for station data.
          station_id = var.getaxis('station_id')
          if station_id is not None:
            gridaxes = [station_id]
            # Subset the lat/lon to the stations that are actually found.
            # Assuming the station id (ip3) starts at 1.
            if isinstance(station_id, _axis_type):
              indices = np.array(station_id.array, dtype=int) - 1
              latarray = latarray[indices]
              lonarray = lonarray[indices]
          else:
            gridaxes = [var.getaxis('i')]
          lat = _var_type('lat', latatts, gridaxes, latarray)
          lon = _var_type('lon', lonatts, gridaxes, lonarray)

        # Case 3: General 2D lat/lon fields on an X/Y coordinate system.
        elif xaxis is not None and yaxis is not None:
          gridaxes = [yaxis, xaxis]
          # Special case: have supergrid data, and the user wants to split it?
          if grtyp == 'U' and self._subgrid_axis:
            ngrids = ezget_nsubgrids(gdid)
            ny = len(yaxis.array) // ngrids
            yaxis.array = yaxis.array[:ny]
            subgrid = _dim_type('subgrid', ngrids)
            gridaxes = [subgrid, yaxis, xaxis]
            latarray = latarray.reshape(ngrids, ny, -1)
            lonarray = lonarray.reshape(ngrids, ny, -1)
          lat = _var_type('lat', latatts, gridaxes, latarray)
          lon = _var_type('lon', lonatts, gridaxes, lonarray)

        # Case 4: General 2D lat/lon fields with no coordinate system.
        elif 'i' in var.dims and 'j' in var.dims:
          gridaxes = [var.getaxis('j'), var.getaxis('i')]
          lat = _var_type('lat', latatts, gridaxes, latarray)
          lon = _var_type('lon', lonatts, gridaxes, lonarray)

        # --- End of lat/lon/xaxis/yaxis decoding.

      if lat is None or lon is None:
        warn(_("Unable to get lat/lon coordinates for '%s'") % var.name)
        continue

      # Sanity check on lat/lon - make sure we have something of the right size.
      if lat.array.shape == lat.shape and lon.array.shape == lon.shape:
        grids[key] = gridaxes
        lats[key] = lat
        lons[key] = lon
      else:
        warn(_("Wrong shape of lat/lon for '%s'") % var.name)
        continue

    # --- End of grid decoding.

    gridaxes = grids[key]
    lat = lats[key]
    lon = lons[key]

    # Update the var's horizontal coordinates.
    newaxes = []
    if len(gridaxes) == 1:
      newaxes = [('i', gridaxes[0])]
    elif len(gridaxes) == 2:
      newaxes = [('j', gridaxes[0]), ('i', gridaxes[1])]
    elif len(gridaxes) == 3:
      newaxes = [('k', gridaxes[0]), ('j', gridaxes[1]), ('i', gridaxes[2])]
    else:
      warn(_("Unusual grid axes for '%s' - ignoring.") % var.name)
    dims = var.dims
    for oldname, newaxis in newaxes:
      if oldname in dims:
        var.axes[dims.index(oldname)] = newaxis

    # For 2D lat/lon, need to reference them as coordinates in order for
    # netCDF viewers to display the field properly.
    if 'lat' not in var.dims or 'lon' not in var.dims:
      coordinates = var.atts.get('coordinates', [])
      coordinates.extend([lon, lat])
      var.atts['coordinates'] = coordinates

    if key in gridmaps:
      var.atts['grid_mapping'] = gridmaps[key]

    # Throw out superfluous LA/LO variables, if lat/lon was already decoded.
    if var.name == 'LA' and ('lat' in var.dims or lat in coordinates):
      var.name = None
    if var.name == 'LO' and ('lon' in var.dims or lon in coordinates):
      var.name = None

  self._varlist = [v for v in varlist if v.name is not None]
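# Standalone sketch of the "Case 1" test above: a 2D lat/lon pair is collapsed
# to 1D axes when lat is constant along each row of the grid and lon is
# constant along each column, which is checked by comparing against the
# row/column means with np.allclose.  The toy arrays are illustrative only.
import numpy as np

lat2d = np.repeat(np.linspace(-90, 90, 4)[:, None], 3, axis=1)   # varies with row index only
lon2d = np.repeat(np.linspace(0, 270, 3)[None, :], 4, axis=0)    # varies with column index only
meanlat = np.mean(lat2d, axis=1, keepdims=True)
meanlon = np.mean(lon2d, axis=0, keepdims=True)
separable = np.allclose(lat2d, meanlat) and np.allclose(lon2d, meanlon)
# separable is True, so lat/lon can be stored as 1D coordinate axes.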
def to_netcdf(self, filename, nc_format='NETCDF4', global_metadata=None,
              zlib=False, compression=4, progress=False):
  """
  Write the records to a netCDF file.
  Requires the netCDF4 package.
  """
  from fstd2nc.mixins import _var_type, _ProgressBar, _FakeBar
  from netCDF4 import Dataset
  import numpy as np
  f = Dataset(filename, "w", format=nc_format)

  # Apply global metadata (from config files and global_metadata argument).
  if 'global' in getattr(self, '_metadata', {}):
    f.setncatts(self._metadata['global'])
  if global_metadata is not None:
    f.setncatts(global_metadata)

  # Collect all the records that will be read/written.
  # List of (key,recshape,ncvar,ncind).
  # Note: derived variables (with values stored in memory) will be written
  # immediately, bypassing this list.
  io = []

  self._makevars()

  # Define the dimensions.
  for axis in self._iter_axes():
    # Special case: make the time dimension unlimited.
    if axis.name == 'time' and self._time_unlimited:
      f.createDimension(axis.name, None)
    else:
      f.createDimension(axis.name, len(axis))

  # Generate the variable structures.
  for var in self._iter_objects():

    # Write the variable.
    # Easy case: already have the data.
    if hasattr(var, 'array'):
      v = f.createVariable(var.name, datatype=var.array.dtype,
                           dimensions=var.dims, zlib=zlib,
                           complevel=compression)
      # Write the metadata.
      v.setncatts(var.atts)
      v[()] = var.array
      continue

    # Hard case: only have the record indices, need to loop over the records.
    # Get the shape of a single record for the variable.
    if not hasattr(var, 'record_id'):
      continue
    record_shape = var.shape[var.record_id.ndim:]
    # Use this as the "chunk size" for the netCDF file, to improve I/O
    # performance.
    chunksizes = (1,) * var.record_id.ndim + record_shape
    v = f.createVariable(var.name, datatype=var.dtype, dimensions=var.dims,
                         zlib=zlib, complevel=compression,
                         chunksizes=chunksizes,
                         fill_value=getattr(self, '_fill_value', None))
    # Turn off auto scaling of variables - want to encode the values as-is.
    # 'scale_factor' and 'add_offset' will only be applied when *reading* the
    # file after it's created.
    v.set_auto_scale(False)
    # Write the metadata.
    v.setncatts(var.atts)
    # Write the data.
    indices = list(np.ndindex(var.record_id.shape))
    # Sort the indices by FSTD key, so we're reading the records in the same
    # order as they're found on disk.
    keys = map(int, var.record_id.flatten())
    for r, ind in zip(keys, indices):
      if r >= 0:
        io.append((r, record_shape, v, ind))

  # Now, do the actual transcribing of the data.
  # Read/write the data in the same order of records in the RPN file(s) to
  # improve performance.
  Bar = _ProgressBar if progress is True else _FakeBar
  bar = Bar(_("Saving netCDF file"), suffix="%(percent)d%% [%(myeta)s]")
  for r, shape, v, ind in bar.iter(sorted(io)):
    try:
      data = self._fstluk(r, dtype=v.dtype)['d'].transpose().reshape(shape)
      v[ind] = data
    except (IndexError, ValueError):
      warn(_("Internal problem with the script - unable to get data for '%s'") % v.name)
      continue

  f.close()
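# Minimal usage sketch for the method above, assuming the fstd2nc package is
# installed.  The input and output file names are hypothetical.
import fstd2nc

buf = fstd2nc.Buffer('model_output.fst')
buf.to_netcdf('model_output.nc', nc_format='NETCDF4', zlib=True, compression=4,
              progress=True)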
def _fix_names(self):

  # List of metadata keys that are internal to the FSTD file.
  internal_meta = self._headers.dtype.names

  # Generate unique axis names.
  axis_table = dict()
  for axis in self._iter_axes():
    if axis.name not in axis_table:
      axis_table[axis.name] = []
    axis_table[axis.name].append(axis)
  for axisname, axis_list in axis_table.items():
    if len(axis_list) == 1:
      continue
    warn(_("Multiple %s axes. Appending integer suffixes to their names.") % axisname)
    for i, axis in enumerate(axis_list):
      axis.name = axis.name + str(i + 1)

  # Generate a string-based variable id.
  # Only works for true variables from the FSTD source
  # (needs metadata like etiket, etc.)
  def get_var_id(var):
    out = []
    for fmt in self._human_var_id:
      out.append(fmt % var.atts)
    return tuple(out)

  # Generate unique variable names.
  var_table = dict()
  for var in self._iter_objects():
    if var.name not in var_table:
      var_table[var.name] = []
    # Identify the variables by their index in the master list.
    var_table[var.name].append(var)
  for varname, var_list in var_table.items():
    # Only need to rename variables that are non-unique.
    if len(var_list) == 1:
      continue
    try:
      var_ids = [get_var_id(v) for v in var_list]
    except KeyError:
      # Some derived axes may not have enough metadata to generate an id,
      # so the best we can do is append an integer suffix.
      var_ids = [(str(r),) for r in range(1, len(var_list) + 1)]
    var_ids = zip(*var_ids)
    # Omit parts of the var_id that are invariant over all the variables.
    var_ids = [var_id for var_id in var_ids if len(set(var_id)) > 1]
    # Starting from the rightmost key, remove as many keys as possible while
    # maintaining uniqueness.
    for j in reversed(range(len(var_ids))):
      test = var_ids[:j] + var_ids[j + 1:]
      if len(set(zip(*test))) == len(var_list):
        var_ids = test
    var_ids = zip(*var_ids)
    var_ids = ['_'.join(var_id) for var_id in var_ids]
    warn(_("Multiple definitions of %s. Adding unique suffixes %s.") % (varname, ', '.join(var_ids)))
    # Apply the name changes.
    for var, var_id in zip(var_list, var_ids):
      var.name = var.name + '_' + var_id

  for var in self._iter_objects():
    # Names must start with a letter or underscore.
    if not var.name[0].isalpha():
      warn(_("Renaming '%s' to '_%s'.") % (var.name, var.name))
      var.name = '_' + var.name

    # Strip out FSTD-specific metadata?
    if not hasattr(var, 'atts'):
      continue
    if self._rpnstd_metadata_list is not None:
      for n in internal_meta:
        if n not in self._rpnstd_metadata_list:
          var.atts.pop(n, None)
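# Standalone sketch of the suffix-pruning idea used above: drop id components
# that are identical across all clashing variables, then drop as many of the
# remaining components as possible (from the right) while the suffixes stay
# unique.  The helper name and example ids are made up for illustration.
def prune_var_ids(var_ids):
  cols = [col for col in zip(*var_ids) if len(set(col)) > 1]
  for j in reversed(range(len(cols))):
    test = cols[:j] + cols[j+1:]
    if len(set(zip(*test))) == len(var_ids):
      cols = test
  return ['_'.join(vid) for vid in zip(*cols)]

# e.g. two 'TT' variables that differ only in their etiket:
# prune_var_ids([('TT','GEMRUN1','80'), ('TT','GEMRUN2','80')])
# -> ['GEMRUN1', 'GEMRUN2']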
def _makevars_pandas (self):
  from collections import OrderedDict
  import numpy as np
  import pandas as pd
  import warnings

  nrecs = len(self._headers)

  # Degenerate case: no data in buffer
  if nrecs == 0: return

  # Convert records to a pandas DataFrame, which is faster to operate on.
  records = pd.DataFrame.from_records(self._headers)
  # Keep track of original dtypes (may need to re-cast later).
  original_dtypes = dict(self._headers_dtype)

  # Ignore deleted / invalidated records.
  records = records[records['dltf']==0]

  # Keep track of any axes that were generated.
  known_axes = dict()

  # Keep track of any auxiliary coordinates that were generated.
  known_coords = dict()

  # Iterate over each variable.
  # Variables are defined by the entries in _var_id.
  self._varlist = []
  for var_id, var_records in records.groupby(list(self._var_id)):
    var_id = OrderedDict(zip(self._var_id, var_id))
    nomvar = var_id['nomvar'].strip()
    nomvar = str(nomvar.decode())  # Python3: convert bytes to str.
    # Ignore meta records.
    if nomvar in self._meta_records: continue

    # Get the attributes, axes, and corresponding indices of each record.
    atts = OrderedDict()
    axes = OrderedDict()
    indices = OrderedDict()
    coordnames = []
    coord_axes = OrderedDict()
    coord_indices = OrderedDict()
    for n in records.columns:
      if n in self._ignore_atts: continue
      # Ignore columns which are masked out.
      # https://stackoverflow.com/questions/29530232/python-pandas-check-if-any-value-is-nan-in-dataframe
      if var_records[n].isnull().values.any(): continue
      # Get the unique values, in order.
      # Coerce back to original dtype, since masked columns get upcasted to
      # float64 in pandas.DataFrame.from_records.
      try:
        column = var_records[n].astype(original_dtypes[n])
      except TypeError:
        # Some types may not be re-castable.
        # For instance, pandas < 0.23 can't convert between datetime64 with
        # different increments ([ns] and [s]).
        column = var_records[n]
      cat = pd.Categorical(column)
      # Is this column an outer axis?
      if n in self._outer_axes:
        values = tuple(cat.categories)
        if (n,values) not in known_axes:
          known_axes[(n,values)] = _axis_type(name = n, atts = OrderedDict(),
                                              array = np.array(values,dtype=column.dtype))
        axes[n] = known_axes[(n,values)]
        indices[n] = cat.codes
        # Is this also an axis for an auxiliary coordinate?
        for coordname,coordaxes in self._outer_coords.items():
          if n in coordaxes:
            coordnames.append(coordname)
            coord_axes.setdefault(coordname,OrderedDict())[n] = axes[n]
            coord_indices.setdefault(coordname,OrderedDict())[n] = cat.codes
      # Otherwise, does it have a consistent value?
      # If so, can add it to the metadata.
      # Ignore outer coords, since the value is already encoded elsewhere.
      elif len(cat.categories) == 1 and n not in self._outer_coords:
        try:
          v = cat[0]
          # Python3: convert bytes to str.
          if isinstance(v,bytes):
            v = str(v.decode())
          # Trim string attributes (remove whitespace padding).
          if isinstance(v,str):
            v = v.strip()
          # Use regular integers for numeric types.
          elif np.can_cast(v,int):
            v = int(v)
          atts[n] = v
        except (TypeError,ValueError):
          pass

    # Recover the proper order for the outer axes.
    # Not necessarily the order of the header table columns.
    axes = OrderedDict((n,axes[n]) for n in self._outer_axes if n in axes)
    indices = tuple([indices[n] for n in self._outer_axes if n in indices])
    for coordname in coord_axes.keys():
      coord_axes[coordname] = OrderedDict((n,coord_axes[coordname][n]) for n in self._outer_axes if n in coord_axes[coordname])
      coord_indices[coordname] = [coord_indices[coordname][n] for n in self._outer_axes if n in coord_indices[coordname]]

    # Construct a multidimensional array to hold the record keys.
    record_id = np.empty(list(map(len,axes.values())), dtype='int32')

    # Assume missing data (nan) unless filled in later.
    record_id[()] = -1

    # Arrange the record keys in the appropriate locations.
    record_id[indices] = var_records.index

    # Get the auxiliary coordinates.
    coords = []
    for n in coordnames:
      # Ignore auxiliary coordinates which are masked out.
      if var_records[n].isnull().values.any(): continue
      # Unique key for this coordinate
      key = (n,tuple(coord_axes[n].items()))
      # Arrange the coordinate values in the appropriate location.
      shape = list(map(len,list(coord_axes[n].values())))
      values = np.zeros(shape,dtype=var_records[n].dtype)
      indices = tuple(coord_indices[n])
      values[indices] = var_records[n]
      if key not in known_coords:
        coord = _var_type (name = n, atts = OrderedDict(),
                           axes = list(coord_axes[n].values()),
                           array = values )
        known_coords[key] = coord
      coords.append(known_coords[key])
    if len(coords) > 0:
      atts['coordinates'] = coords

    # Check if we have full coverage along all axes.
    have_data = [k >= 0 for k in record_id.flatten()]
    if not np.all(have_data):
      warn (_("Missing some records for %s.")%nomvar)

    # Add dummy axes for the ni,nj,nk record dimensions.
    axes['k'] = _dim_type('k',int(var_id['nk']))
    axes['j'] = _dim_type('j',int(var_id['nj']))
    axes['i'] = _dim_type('i',int(var_id['ni']))

    # Determine the optimal data type to use.
    # First, find unique combos of datyp, nbits
    # (want to minimize calls to dtype_fst2numpy).
    x = var_records[['datyp','nbits']].drop_duplicates()
    datyp = map(int,x['datyp'])
    nbits = map(int,x['nbits'])
    dtype_list = map(dtype_fst2numpy, datyp, nbits)
    dtype = np.result_type(*dtype_list)

    var = _iter_type( name = nomvar, atts = atts,
                      axes = list(axes.values()),
                      dtype = dtype,
                      record_id = record_id )
    self._varlist.append(var)
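# Standalone sketch of the record_id construction above: pandas Categorical
# codes give, for each record, its position along each outer axis, and those
# positions are used to scatter the header row numbers into an N-D lookup
# array.  The header values here are invented.
import numpy as np
import pandas as pd

headers = pd.DataFrame({'ip1': [500, 850, 500, 850], 'ip2': [0, 0, 12, 12]})
ip1 = pd.Categorical(headers['ip1'])
ip2 = pd.Categorical(headers['ip2'])
record_id = np.full((len(ip1.categories), len(ip2.categories)), -1, dtype='int32')
record_id[ip1.codes, ip2.codes] = headers.index
# record_id[i, j] is now the header row for (ip1 = categories[i], ip2 = categories[j]),
# or -1 if that combination is missing.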
def _makevars_slow (self):
  from collections import OrderedDict
  import numpy as np
  import warnings

  nrecs = len(self._headers)

  # Degenerate case: no data in buffer
  if nrecs == 0: return

  records = self._headers

  # Ignore deleted / invalidated records.
  deleted = (records['dltf'] == 1)
  if np.any(deleted):
    records = records[~deleted]
  header_indices = np.where(~deleted)[0]

  # Determine the variable identifiers.
  # First, extract the uniquely identifying information from the metadata.
  # Suppress FutureWarning from numpy about doing this.  Probably benign...
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    all_var_ids = records.data[list(self._var_id)]

  # Do a pre-processing step to remove ids that are identical to the one
  # immediately before it.
  # This is purely an optimization thing - the np.unique call later on is
  # O(n log n) so want to prune this array as much as possible beforehand.
  # TODO: Update this once numpy has an unsorted "unique"-like function that
  # can run more efficiently when there are relatively few unique elements.
  # (could get ~O(n) with a hash table).
  var_ids = all_var_ids
  flag = np.concatenate(([True], var_ids[1:] != var_ids[:-1]))
  var_ids = var_ids[flag]

  # Now, find the unique var_ids from this pruned list.
  var_ids = np.unique(var_ids)

  # Keep track of axes that were generated
  known_axes = dict()

  # Keep track of any auxiliary coordinates that were generated.
  known_coords = dict()

  # Loop over each variable and construct the data & metadata.
  self._varlist = []
  for var_id in var_ids:
    selection = (all_var_ids == var_id)
    var_records = records[selection]
    var_record_indices = np.where(selection)[0]
    nomvar = var_id['nomvar'].strip()
    nomvar = str(nomvar.decode())  # Python3: convert bytes to str.
    # Ignore meta records.
    if nomvar in self._meta_records: continue

    # Get the metadata for each record.
    atts = OrderedDict()
    for n in records.dtype.names:
      if n in self._outer_axes or n in self._outer_coords or n in self._ignore_atts:
        continue
      v = var_records[n]
      # Remove missing values before continuing.
      v = np.ma.compressed(v)
      if len(v) == 0: continue
      # Only use attributes that are consistent across all variable records.
      if len(set(v)) > 1: continue
      v = v[0]
      # Use regular integers for numeric types.
      if np.can_cast(v.dtype,int):
        v = int(v)
      # Python3: convert bytes to str.
      if isinstance(v,bytes):
        v = str(v.decode())
      # Trim string attributes (remove whitespace padding).
      if isinstance(v,str):
        v = v.strip()
      atts[n] = v

    # Get the axes.
    axes = OrderedDict()
    for n in self._outer_axes:
      values = var_records[n]
      # Remove missing values before continuing.
      values = np.ma.compressed(values)
      # Ignore axes that have no actual coordinate values.
      if len(values) == 0: continue
      # Get all unique values (sorted).
      values = tuple(sorted(set(values)))
      if (n,values) not in known_axes:
        known_axes[(n,values)] = _axis_type(name = n, atts = OrderedDict(),
                                            array = np.array(values))
      axes[n] = known_axes[(n,values)]

    # Construct a multidimensional array to hold the record keys.
    record_id = np.empty(list(map(len,axes.values())), dtype='int32')

    # Assume missing data (nan) unless filled in later.
    record_id[()] = -1

    # Arrange the record keys in the appropriate locations.
    indices = []
    for n in axes.keys():
      u, ind = np.unique(var_records[n], return_inverse=True)
      indices.append(ind)
    record_id[tuple(indices)] = header_indices[var_record_indices]

    # Get the auxiliary coordinates.
    coords = []
    for n, coordaxes in self._outer_coords.items():
      # Get the axes for this coordinate.
      # Use the same order of columns as was used for the outer axes,
      # so we get the right coordinate order after sorting.
      coordaxes = OrderedDict((k,v) for k,v in axes.items() if k in coordaxes)
      # Sanity check - do we actually have any of the coordinate axes?
      if len(coordaxes) == 0: continue
      # Unique key for this coordinate
      key = (n,tuple(coordaxes.items()))
      # Arrange the coordinate values in the appropriate location.
      shape = list(map(len,list(coordaxes.values())))
      # Extract all values of the coordinate (including duplicates over
      # other axes).  Will determine which values to put in what order later.
      all_coord_values = np.ma.compressed(var_records[n])
      if len(all_coord_values) == 0: continue
      values = np.zeros(shape,dtype=all_coord_values.dtype)
      indices = []
      for k in coordaxes.keys():
        u, ind = np.unique(var_records[k], return_inverse=True)
        indices.append(ind)
      values[tuple(indices)] = all_coord_values
      if key not in known_coords:
        coord = _var_type (name = n, atts = OrderedDict(),
                           axes = list(coordaxes.values()),
                           array = values )
        known_coords[key] = coord
      coords.append(known_coords[key])
    if len(coords) > 0:
      atts['coordinates'] = coords

    # Check if we have full coverage along all axes.
    have_data = [k >= 0 for k in record_id.flatten()]
    if not np.all(have_data):
      warn (_("Missing some records for %s.")%nomvar)

    # Add dummy axes for the ni,nj,nk record dimensions.
    axes['k'] = _dim_type(name='k', length = int(var_id['nk']))
    axes['j'] = _dim_type(name='j', length = int(var_id['nj']))
    axes['i'] = _dim_type(name='i', length = int(var_id['ni']))

    # Determine the optimal data type to use.
    # First, find unique combos of datyp, nbits
    # (want to minimize calls to dtype_fst2numpy).
    datyp, nbits = zip(*np.unique(var_records.data[['datyp','nbits']]))
    datyp = map(int,datyp)
    nbits = map(int,nbits)
    dtype_list = map(dtype_fst2numpy, datyp, nbits)
    dtype = np.result_type(*dtype_list)

    var = _iter_type( name = nomvar, atts = atts,
                      axes = list(axes.values()),
                      dtype = dtype,
                      record_id = record_id )
    self._varlist.append(var)
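# Standalone sketch of the pre-pruning optimization described above: before
# the O(n log n) np.unique call, consecutive duplicate ids are dropped with a
# cheap elementwise comparison, which shrinks the array considerably when the
# records arrive grouped by variable.  The sample array is invented.
import numpy as np

var_ids = np.array(['TT', 'TT', 'TT', 'HU', 'HU', 'TT'])
flag = np.concatenate(([True], var_ids[1:] != var_ids[:-1]))
pruned = var_ids[flag]          # -> ['TT', 'HU', 'TT']
unique_ids = np.unique(pruned)  # -> ['HU', 'TT']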
def dtype_fst2numpy (datyp, nbits=None):
  from rpnpy.librmn.fstd98 import dtype_fst2numpy
  if datyp == 0:
    warn (_("Raw binary records detected. The values may not be properly decoded if you're opening on a different platform."))
    datyp = 5
  return dtype_fst2numpy(datyp,nbits)
def __init__ (self, filename, header_cache=None, progress=False,
              minimal_metadata=None, rpnstd_metadata=None,
              rpnstd_metadata_list=None, ignore_typvar=False,
              ignore_etiket=False, no_quick_scan=False):
  """
  Read raw records from FSTD files into the buffer.
  Multiple files can be read simultaneously.

  Parameters
  ----------
  filename : str or list
      The RPN standard file(s) to convert.
  progress : bool, optional
      Display a progress bar during the conversion, if the "progress"
      module is installed.
  rpnstd_metadata : bool, optional
      Include all RPN record attributes in the output metadata.
  rpnstd_metadata_list : str or list, optional
      Specify a minimal set of RPN record attributes to include in the
      output file.
  ignore_typvar : bool, optional
      Tells the converter to ignore the typvar when deciding if two
      records are part of the same field.  Default is to split the
      variable on different typvars.
  ignore_etiket : bool, optional
      Tells the converter to ignore the etiket when deciding if two
      records are part of the same field.  Default is to split the
      variable on different etikets.
  """
  from rpnpy.librmn.fstd98 import fstnbr, fstinl, fstprm, fstopenall
  from rpnpy.librmn.const import FST_RO
  from fstd2nc.extra import maybeFST as isFST
  from collections import Counter
  import numpy as np
  from glob import glob, has_magic
  import os
  import warnings

  # Set up lock for threading.
  # The same lock is shared for all Buffer objects, to synchronize access to
  # librmn.
  self._lock = _lock

  # Set up a progress bar for scanning the input files.
  Bar = _ProgressBar if progress is True else _FakeBar

  # Set default for minimal_metadata
  if rpnstd_metadata is not None:
    minimal_metadata = not rpnstd_metadata
  if minimal_metadata is None:
    minimal_metadata = True
  # Set default for rpnstd_metadata_list
  if minimal_metadata is True and rpnstd_metadata_list is None:
    rpnstd_metadata_list = ''
  if isinstance(rpnstd_metadata_list,str):
    rpnstd_metadata_list = rpnstd_metadata_list.replace(',',' ')
    rpnstd_metadata_list = rpnstd_metadata_list.split()
  if hasattr(rpnstd_metadata_list,'__len__'):
    rpnstd_metadata_list = tuple(rpnstd_metadata_list)
  self._rpnstd_metadata_list = rpnstd_metadata_list

  if not ignore_typvar:
    # Insert typvar value just after nomvar.
    self._var_id = self._var_id[0:1] + ('typvar',) + self._var_id[1:]
    self._human_var_id = self._human_var_id[0:1] + ('%(typvar)s',) + self._human_var_id[1:]
  if not ignore_etiket:
    # Insert etiket value just after nomvar.
    self._var_id = self._var_id[0:1] + ('etiket',) + self._var_id[1:]
    self._human_var_id = self._human_var_id[0:1] + ('%(etiket)s',) + self._human_var_id[1:]

  if isinstance(filename,str):
    infiles = [filename]
  else:
    infiles = list(filename)

  # Apply wildcard and directory expansion to filenames.
  expanded_infiles = []
  for infile in infiles:
    for f in sorted(glob(infile)) or [infile]:
      if os.path.isdir(f):
        for dirpath, dirnames, filenames in os.walk(f,followlinks=True):
          for filename in filenames:
            expanded_infiles.append((infile,os.path.join(dirpath,filename)))
      else:
        expanded_infiles.append((infile,f))

  # Inspect all input files, and extract the headers from valid RPN files.
  matches = Counter()
  headers = []
  self._files = []
  if header_cache is None: header_cache = {}

  # Show a progress bar when there are multiple input files.
  if len(expanded_infiles) > 1:
    expanded_infiles = Bar(_("Inspecting input files"), suffix='%(percent)d%% (%(index)d/%(max)d)').iter(expanded_infiles)

  for infile, f in expanded_infiles:
    fkey = f
    if fkey.startswith('/'):
      fkey = '__ROOT__'+fkey
    if fkey not in header_cache and (not os.path.exists(f) or not isFST(f)):
      matches[infile] += 0
      continue
    matches[infile] += 1

    # Read the headers from the file(s) and store the info in the table.
    filenum = len(self._files)
    self._files.append(f)
    if fkey not in header_cache:
      funit = self._open(filenum)
      nrecs = fstnbr(funit)
      h = np.zeros(nrecs, dtype=self._headers_dtype)
      if no_quick_scan:
        keys = fstinl(funit)
        params = map(fstprm, keys)
        for i,prm in enumerate(params):
          for n,v in prm.items():
            if n in h.dtype.names:
              h[n][i] = v
      else:
        from fstd2nc.extra import all_params
        params = all_params(funit,out=h)
        keys = params['key']
      # Encode the keys without the file index info.
      h['key'] = keys
      h['key'] >>= 10
      header_cache[fkey] = h
    h = header_cache[fkey]
    # The file info will be an index into a separate file list.
    h['file_id'] = filenum
    headers.append(h)

  # Check if the input entries actually matched anything.
  for infile, count in matches.items():
    if count == 0:
      if os.path.isfile(infile):
        warn(_("'%s' is not an RPN standard file.")%infile)
      elif os.path.isdir(infile):
        warn(_("Directory '%s' does not contain any RPN standard files.")%infile)
      elif has_magic(infile):
        warn(_("No RPN standard files match '%s'.")%infile)
      elif not os.path.exists(infile):
        warn(_("'%s' does not exist.")%infile)
      else:
        warn(_("Problem with input file '%s'")%infile)

  nfiles = len(headers)
  if nfiles == 0:
    error(_("no input files found!"))
  info(_("Found %d RPN input file(s)"%nfiles))

  self._headers = np.ma.concatenate(headers)

  # Find all unique meta (coordinate) records, and link a subset of files
  # that provide all unique metadata records.
  # This will make it easier to look up the meta records later.
  meta_mask = np.zeros(len(self._headers),dtype='bool')
  for meta_name in self._meta_records:
    meta_name = (meta_name+'   ')[:4]
    meta_mask |= (self._headers['nomvar'] == meta_name)
  meta_recids = np.where(meta_mask)[0]
  # Use the same unique parameters as regular variables.
  # Plus, ig1,ig2,ig3,ig4.
  # Suppress FutureWarning from numpy about doing this.  Probably benign...
  with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    meta_keys = self._headers.data[meta_mask][list(self._var_id)+['ip1','ip2','ip3','ig1','ig2','ig3','ig4']]
  meta_keys, ind = np.unique(meta_keys, return_index=True)
  meta_recids = meta_recids[ind]
  # Find the files that give these unique coord records.
  file_ids = sorted(set(self._headers['file_id'][meta_recids]))
  filenames = [self._files[f] for f in file_ids]
  if len(filenames) > 500:
    error(_("Holy crap, how many coordinates do you have???"))
  # If no coordinates found, just open the first file as a dummy file.
  # Less error-prone than checking if _meta_funit is defined every time
  # an FSTD function is called.
  if len(filenames) == 0:
    filenames = self._files[0:1]
  # Open these files and link them together
  self._meta_funit = fstopenall(filenames, FST_RO)
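# Usage sketch for the constructor above, assuming the fstd2nc package is
# installed.  The paths are hypothetical.  Re-using the same header_cache dict
# lets a second Buffer over the same files skip re-scanning their record
# headers.
import fstd2nc

cache = {}
buf1 = fstd2nc.Buffer('/data/forecasts/*.fst', header_cache=cache, progress=True)
buf2 = fstd2nc.Buffer('/data/forecasts/*.fst', header_cache=cache,
                      ignore_etiket=True)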