def _best_coordinates_setup(self):
    """ Set up object when coordinates attribute is found """
    from cis.exceptions import InvalidVariableError

    coordinates_vars = []
    coordinates = self._attributes[0][self.BEST_COORDINATES_NAME.lower()]
    if coordinates is not None:
        coordinates_vars = coordinates.split()  # split on whitespace

    if len(coordinates_vars) != 4:
        raise InvalidVariableError('The coordinate attribute does not have four entries. '
                                   'It should be space separated "longitude latitude altitude time"')

    for coordinates_var in coordinates_vars:
        if coordinates_var not in self._variables:
            raise InvalidVariableError("There is no variable for the co-ordinate '{}'".format(coordinates_var))

    self.longitude_variable_name, \
        self.latitude_variable_name, \
        self.altitude_variable_name, \
        self.time_variable_name \
        = coordinates_vars
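# A minimal standalone sketch of the rule enforced above, using an illustrative
# attribute value (the variable names here are made up, not from a real file):
# the string must split on whitespace into exactly four names, in the order
# longitude, latitude, altitude, time.
coordinates_attr = "LON LAT ALT Time"
lon_name, lat_name, alt_name, time_name = coordinates_attr.split()
assert len(coordinates_attr.split()) == 4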
def _check_has_variables_and_attributes(self):
    """ Check that the netCDF file has variables and attributes """
    from cis.exceptions import InvalidVariableError

    if self._variables is None or len(self._variables) == 0:
        raise InvalidVariableError("No variables in the file so the type of data is unknown")
    if self._attributes[0] is None or len(self._attributes[0]) == 0:
        raise InvalidVariableError("No attributes in the file so the type of data is unknown")
def __init__(self, attributes, variables):
    """
    Initialisation

    :param attributes: dictionary of attributes and their values (or list of dictionaries if multiple files read)
    :param variables: dictionary of variable names and NetCDF Variable objects
        (or list of dictionaries if multiple files read)
    :return: nothing
    """
    from cis.exceptions import InvalidVariableError

    self.station = False
    self.station_latitude = None
    self.latitude_variable_name = None
    self.station_longitude = None
    self.longitude_variable_name = None
    self.altitude = None
    self.altitude_variable_name = None
    self.pressure_variable_name = None
    self.time_stamp_info = None
    self.time_dimensions = None

    self._attributes = [{k.lower(): v for k, v in attrs.items()} for attrs in listify(attributes)]
    if len(variables) == 0:
        raise InvalidVariableError("No variables in the file so the type of data is unknown")
    self._variables = list(variables[0].keys())
    self._variable_dimensions = [{name: var.dimensions for name, var in file_vars.items()}
                                 for file_vars in listify(variables)]
    self._check_has_variables_and_attributes()

    # Carry out these checks using the attributes from the first file as a 'master'
    if self.TIME_COORDINATE_NAME.lower() in self._attributes[0]:
        self.time_variable_name = self._get_coordinate_variable_name(self.TIME_COORDINATE_NAME, "time")

        if self.LATITUDE_COORDINATE_NAME.lower() in self._attributes[0]:
            self._lat_lon_var_specified_setup()
        elif self.STATION_LATITUDE_NAME.lower() in self._attributes[0]:
            self._stationary_setup()
        else:
            raise InvalidVariableError("No attributes indicating latitude, expecting '{}' or '{}'"
                                       .format(self.STATION_LATITUDE_NAME, self.LONGITUDE_COORDINATE_NAME))
    elif self.BEST_COORDINATES_NAME.lower() in self._attributes[0]:
        self._best_coordinates_setup()
    else:
        raise InvalidVariableError("No attributes indicating time variable name, expecting either '{}' "
                                   "or 'Coordinates'".format(self.TIME_COORDINATE_NAME))

    if self.CORRECTED_PRESSURE_VAR_NAME in self._variables:
        self.pressure_variable_name = self.CORRECTED_PRESSURE_VAR_NAME
    elif self.PRESSURE_VAR_NAME in self._variables:
        self.pressure_variable_name = self.PRESSURE_VAR_NAME
    else:
        self.pressure_variable_name = None

    if self.TIME_STAMP_INFO_NAME.lower() in self._attributes[0]:
        # Not all files will have the same timestamp -> retrieve a list of timestamps, one per file
        self.time_stamp_info = [attrs[self.TIME_STAMP_INFO_NAME.lower()] for attrs in self._attributes]

    self.time_dimensions = self._variable_dimensions[0][self.time_variable_name]
def _create_cube(self, filenames, variable):
    """Creates a cube for the specified variable.

    :param filenames: List of filenames to read coordinates from
    :param variable: Optional variable to read while we're reading the coordinates, can be a string or a
        VariableConstraint object
    :return: If variable was specified this will return an UngriddedData object, otherwise a CoordList
    """
    import six
    from cis.exceptions import InvalidVariableError
    from cis.data_io.products.gridded_NetCDF import DisplayConstraint
    from cis.data_io.gridded_data import load_cube
    from iris.exceptions import CoordinateNotFoundError

    # Check that the files given actually exist.
    for filename in filenames:
        with open(filename):
            pass

    variable_constraint = variable
    if isinstance(variable, six.string_types):
        # noinspection PyPep8
        variable_constraint = DisplayConstraint(cube_func=(lambda c: c.var_name == variable or
                                                           c.standard_name == variable or
                                                           c.long_name == variable), display=variable)
    if len(filenames) == 1:
        callback_function = self.load_single_file_callback
    else:
        callback_function = self.load_multiple_files_callback

    try:
        cube = load_cube(filenames, variable_constraint, callback=callback_function)
    except ValueError as e:
        if variable is None:
            message = "File contains more than one cube; the variable name must be specified"
        elif e.args[0] == "No cubes found":
            message = "Variable not found: {} \nTo see a list of variables run: cis info {}" \
                .format(str(variable), filenames[0])
        else:
            message = e.args[0]
        raise InvalidVariableError(message)

    try:
        hybrid_ht = cube.coord(name_or_coord='Hybrid height')
        hybrid_ht.attributes['formula'] = 'z(k,j,i) = a(k) + b(k)*orog(j,i)'
        hybrid_ht.convert_units('m')
    except CoordinateNotFoundError:
        pass

    try:
        cube.coord(long_name='t').standard_name = 'time'
    except CoordinateNotFoundError:
        pass

    self._add_available_aux_coords(cube, filenames)

    return cube
def read(filename, usr_variables):
    """
    Reads one or more variables from a NetCDF file

    :param filename: The name (with path) of the NetCDF file to read.
    :param usr_variables: A variable (dataset) name, or list of names, to read from the file. Each name must
        appear exactly as in the NetCDF file. Variable names may be fully qualified NetCDF4 Hierarchical group
        variables in the form ``<group1>/<group2....>/<variable_name>``, e.g. ``AVHRR/Ch4CentralWavenumber``.
    :return: A dictionary mapping the requested names to netCDF Variable objects read from the input file
    """
    from netCDF4 import Dataset
    from cis.exceptions import InvalidVariableError

    usr_variables = listify(usr_variables)

    try:
        datafile = Dataset(filename)
    except RuntimeError as e:
        raise IOError(str(e))

    data = {}
    for full_variable in usr_variables:
        # Split the fully qualified variable (group/variable) into group and variable
        parts = full_variable.split("/")
        groups = parts[:-1]
        variable = parts[-1]
        current_group = datafile
        for group in groups:
            current_group = current_group.groups[group]
        try:
            data[full_variable] = current_group.variables[variable]
        except KeyError:
            raise InvalidVariableError(full_variable + ' could not be found in ' + filename)

    return data
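# A usage sketch for read(); the filename and variable names are hypothetical,
# and the module path is an assumption based on where this function appears to live:
from cis.data_io.netcdf import read

data = read("example.nc", ["temperature", "AVHRR/Ch4CentralWavenumber"])
temperature = data["temperature"][:]  # netCDF4 Variables support numpy-style slicing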
def read_many_files(filenames, usr_variables, dim=None):
    """
    Reads a single Variable from many NetCDF files. This method uses the netCDF4 MFDataset class and so is NOT
    suitable for NetCDF4 datasets (only 'CLASSIC' netcdf).

    :param filenames: A list of NetCDF filenames to read, or a string with wildcards.
    :param usr_variables: A list of variable (dataset) names to read from the files. The names must appear
        exactly as in the NetCDF file.
    :param dim: The name of the dimension on which to aggregate the data. The default, None, tries to aggregate
        over the unlimited dimension.
    :return: A dictionary of variable names to variable instances constructed from all of the input files
    """
    from netCDF4 import MFDataset
    from cis.exceptions import InvalidVariableError

    usr_variables = listify(usr_variables)

    try:
        datafile = MFDataset(filenames, aggdim=dim)
    except RuntimeError as e:
        raise IOError(str(e))

    data = {}
    for variable in usr_variables:
        # Get data.
        try:
            data[variable] = datafile.variables[variable]
        except KeyError:
            raise InvalidVariableError('Variable {} not found in file {}.'.format(variable, filenames))

    return data
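# A usage sketch for read_many_files(), assuming classic-format files that share
# an aggregable 'time' dimension (filenames and variable name are hypothetical):
data = read_many_files(["day1.nc", "day2.nc"], "temperature", dim="time")
temperature = data["temperature"][:]  # aggregated along 'time' across both files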
def find_auxiliary_coordinate(self, variable):
    """
    Find the variable name of an auxiliary coordinate for the given variable (if there is one).

    :param str variable: The data variable we're checking for any auxiliary coordinates
    :return str or None: The name of the variable holding the auxiliary coordinate, or None
    """
    from cis.exceptions import InvalidVariableError

    aux_coord_name = None
    dim_coord_names = [self.latitude_variable_name, self.longitude_variable_name,
                       self.altitude_variable_name, self.pressure_variable_name] + list(self.time_dimensions)
    # Find the *dimension* which corresponds to the auxiliary coordinate
    aux_coords = [dim for dim in self._variable_dimensions[0][variable] if dim not in dim_coord_names]
    if len(aux_coords) > 1:
        raise InvalidVariableError("CIS currently only supports reading data variables with one auxiliary "
                                   "coordinate")
    elif len(aux_coords) == 1:
        # If there is also a variable named after that dimension then this is the variable we're after
        if aux_coords[0] in self._variable_dimensions[0]:
            aux_coord_name = aux_coords[0]
        # Otherwise we need to look through all the variables and choose the first variable whose only
        # dimension is the auxiliary dimension.
        else:
            for v, dims in self._variable_dimensions[0].items():
                if dims[0] == aux_coords[0]:
                    aux_coord_name = v
                    break
    return aux_coord_name
def create_data_object(self, filenames, variable):
    from cis.exceptions import InvalidVariableError, CISError

    try:
        data_obj = load_multiple_hysplit(filenames, [variable])
    except ValueError:
        raise InvalidVariableError(variable + " does not exist in " + str(filenames))
    except (EOFError, IOError) as e:
        raise CISError(e)

    coords = self._create_coord_list(filenames, data_obj)  # TODO

    # TODO: write a proper standard-name guesser here
    if variable == "PRESSURE":
        variable = "air_pressure"
    elif variable == "RELHUMID":
        variable = "relative_humidity"

    objM = Metadata(name=variable,
                    standard_name=variable,
                    long_name=variable,
                    shape=(len(data_obj[variable]),),
                    missing_value=-99999.0)

    return UngriddedData(data_obj[variable], objM, coords)
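# A minimal sketch of the lookup the if/elif chain above performs; the TODO asks
# for a fuller standard-name guesser, which could start life as a table like this
# (only the two mappings already present in the code are included; anything
# further would be an assumption):
HYSPLIT_STANDARD_NAMES = {"PRESSURE": "air_pressure",
                          "RELHUMID": "relative_humidity"}

def guess_standard_name(variable):
    """Fall back to the raw variable name when no mapping is known."""
    return HYSPLIT_STANDARD_NAMES.get(variable, variable)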
def _get_coordinate_variable_name(self, attribute_name, coordinate_display_name):
    """
    Reads an attribute value for a co-ordinate and returns it. Checks that the value names a variable present
    in the data. Throws InvalidVariableError if the attribute or variable does not exist.

    :param attribute_name: the name of the attribute to read
    :param coordinate_display_name: the display name of the attribute read
    :return: the variable name
    """
    from cis.exceptions import InvalidVariableError

    if attribute_name.lower() in self._attributes[0]:
        variable_name = self._attributes[0][attribute_name.lower()]
        if variable_name not in self._variables:  # Just check the first file
            raise InvalidVariableError("There is no variable for the {} co-ordinate '{}'"
                                       .format(coordinate_display_name, variable_name))
        return variable_name

    raise InvalidVariableError("No attributes indicating {} variable name, expecting '{}'"
                               .format(coordinate_display_name, attribute_name))
def load_aeronet(filename, variables=None):
    """
    Loads an AERONET CSV file.

    :param filename: data file name
    :param variables: A list of variables to return
    :return: A dictionary of variable names and numpy arrays containing the data for that variable
    """
    from cis.exceptions import InvalidVariableError
    from cis.time_util import cis_standard_time_unit
    from numpy.ma import masked_invalid
    from pandas import read_csv, to_datetime

    version = get_aeronet_version(filename)
    ordered_vars = get_aeronet_file_variables(filename, version)
    if len(ordered_vars) == 0:
        return {}

    # Load all available geolocation information and any requested variables
    cols = [var for var in ("date", "time", "latitude", "longitude", "altitude") if var in ordered_vars]
    if variables is not None:
        cols.extend(variables)

    dtypes = {var: 'str' if var in ("date", "time") else 'float' for var in cols}

    try:
        rawd = read_csv(filename, sep=",", header=AERONET_HEADER_LENGTH[version] - 1, names=ordered_vars,
                        index_col=False, usecols=cols, na_values=AERONET_MISSING_VALUE[version], dtype=dtypes,
                        parse_dates={"datetime": ["date", "time"]}, infer_datetime_format=True, dayfirst=True,
                        error_bad_lines=False, warn_bad_lines=True,
                        # low_memory="All_Sites_Times_All_Points" in filename
                        )
    except ValueError:
        raise InvalidVariableError("{} not available in {}".format(variables, filename))

    # Empty file
    if rawd.shape[0] == 0:
        return {"datetime": [], "latitude": [], "longitude": [], "altitude": []}

    # Convert pandas Timestamps into CIS standard numbers
    rawd["datetime"] = [cis_standard_time_unit.date2num(timestamp.to_pydatetime())
                        for timestamp in to_datetime(rawd["datetime"], format='%d:%m:%Y %H:%M:%S')]

    # Add position metadata that isn't listed on every line for some formats
    if version.startswith("MAN"):
        rawd["altitude"] = 0.
    elif version.endswith("2"):
        metadata = get_file_metadata(filename)
        rawd["longitude"] = float(metadata.misc[2][1].split("=")[1])
        rawd["latitude"] = float(metadata.misc[2][2].split("=")[1])
        rawd["altitude"] = float(metadata.misc[2][3].split("=")[1])

    return {var: masked_invalid(arr) for var, arr in rawd.items()}
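# A usage sketch for load_aeronet(); the filename and variable name are
# hypothetical and depend on the AERONET product version detected:
data = load_aeronet("920801_180101_Example_Site.lev20", variables=["AOT_500"])
aot = data["AOT_500"]      # masked array with invalid values masked
times = data["datetime"]   # numbers in the CIS standard time unit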
def __get_data_scale(self, filename, variable):
    from cis.exceptions import InvalidVariableError
    from pyhdf.SD import SD

    try:
        meta = SD(filename).datasets()[variable][0][0]
    except KeyError:
        raise InvalidVariableError("Variable " + variable + " not found")

    for scaling in self.modis_scaling:
        if scaling in meta:
            return scaling
    return None
def __get_data_scale(self, filename, variable):
    # Note this is only here because it doesn't get inherited...
    from cis.exceptions import InvalidVariableError
    from pyhdf.SD import SD

    try:
        meta = SD(filename).datasets()[variable][0][0]
    except KeyError:
        raise InvalidVariableError("Variable " + variable + " not found")

    for scaling in self.modis_scaling:
        if scaling in meta:
            return scaling
    return None
def _stationary_setup(self):
    """ Set up object when latitude and longitude are fixed """
    from cis.exceptions import InvalidVariableError

    if self.STATION_LATITUDE_NAME.lower() not in self._attributes[0]:
        raise InvalidVariableError("No attributes indicating latitude, expecting '{}'"
                                   .format(self.STATION_LATITUDE_NAME))
    # We need a bunch of different latitudes for different files
    self.station_latitude = [self._parse_station_lat_lon(attr[self.STATION_LATITUDE_NAME.lower()])
                             for attr in self._attributes]

    if self.STATION_LONGITUDE_NAME.lower() not in self._attributes[0]:
        raise InvalidVariableError("No attributes indicating longitude, expecting '{}'"
                                   .format(self.STATION_LONGITUDE_NAME))
    self.station_longitude = [self._parse_station_lat_lon(attr[self.STATION_LONGITUDE_NAME.lower()])
                              for attr in self._attributes]
    self.station = True

    if self.STATION_ALTITUDE_NAME.lower() in self._attributes[0]:
        self.altitude = [self._parse_station_altitude(attr[self.STATION_ALTITUDE_NAME.lower()])
                         for attr in self._attributes]
    else:
        self.altitude = [self.DEFAULT_ALTITUDE for attr in self._attributes]
def _parse_station_lat_lon(lat_lon_string):
    """
    Parse a station's latitude or longitude string. Will try to read it directly as a float, otherwise will
    try to read the first whitespace-separated part of the string (e.g. '80 degrees north' -> float(80)).

    :param lat_lon_string: Latitude or longitude string from a station attribute
    :return: The parsed value as a float
    """
    from cis.exceptions import InvalidVariableError

    try:
        return float(lat_lon_string)
    except ValueError:
        try:
            return float(lat_lon_string.split()[0])
        except ValueError:
            raise InvalidVariableError("Couldn't parse station attribute '{}'".format(lat_lon_string))
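# Illustrative inputs exercising both parse paths of the function above (the
# second value triggers the whitespace fallback):
assert _parse_station_lat_lon("51.5") == 51.5
assert _parse_station_lat_lon("80 degrees north") == 80.0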
def _create_cube(self, filenames, variable):
    """Creates a cube for the specified variable.

    :param filenames: List of filenames to read coordinates from
    :param variable: Optional variable to read while we're reading the coordinates, can be a string or a
        VariableConstraint object
    :return: If variable was specified this will return an UngriddedData object, otherwise a CoordList
    """
    from cis.exceptions import InvalidVariableError
    from cis.data_io import gridded_data
    import iris

    # Check that the files given actually exist.
    for filename in filenames:
        with open(filename):
            pass

    variable_constraint = variable
    if isinstance(variable, str):
        variable_constraint = DisplayConstraint(cube_func=(lambda c: c.var_name == variable or
                                                           c.standard_name == variable or
                                                           c.long_name == variable),
                                                display=variable,
                                                coord_values={'hybrid level at layer midpoints':
                                                              (lambda lev: lev == 31)})
    if len(filenames) == 1:
        callback_function = self.load_single_file_callback
    else:
        callback_function = self.load_multiple_files_callback

    try:
        cube = gridded_data.load_cube(filenames, variable_constraint, callback=callback_function)
    except iris.exceptions.ConstraintMismatchError as e:
        if variable is None:
            message = "File contains more than one cube; the variable name must be specified"
        elif e.args[0] == "no cubes found":
            message = "Variable not found: {} \nTo see a list of variables run: cis info {}" \
                .format(str(variable), filenames[0])
        else:
            message = e.args[0]
        raise InvalidVariableError(message)
    except ValueError as e:
        raise IOError(str(e))

    self._add_available_aux_coords(cube, filenames)

    return cube
def _read_hdf4(filename, variables):
    """
    A wrapper method for reading raw data from hdf4 files. This returns a dictionary of io handles for each of
    the VD and SD data types.

    :param filename: A name of a file to read
    :param variables: List of variables to read from the files
    :return: (sds_dict, vds_dict) A tuple of dictionaries, one for sds objects and another for vds
    """
    from cis.exceptions import InvalidVariableError
    from pyhdf.error import HDF4Error

    variables = utils.listify(variables)

    # I'd rather not have to make this check, but for pyhdf 0.9.0 and hdf 4.2.9 on OS X the c-level read
    # routine will at some point call exit(138) when reading valid netcdf files (rather than returning a
    # negative status).
    if not filename.endswith('.hdf'):
        raise IOError("Tried to read non HDF file: {}".format(filename))

    try:
        sds_dict = hdf_sd.read(filename, variables)

        # Remove the variables identified as SD (i.e. the keys in sds_dict): there is no need to try looking
        # for them as VD variables, AND doing so can cause a crash in some versions/implementations of the
        # core HDF4 libraries!

        # First create a copy of the list so that the original list is left intact when elements are removed
        # from it; this enables the original list to be reused when many files are read.
        vdvariables = list(variables)
        for sds_dict_key in sds_dict:
            vdvariables.remove(sds_dict_key)

        vds_dict = hdf_vd.read(filename, vdvariables)
    except HDF4Error as e:
        raise IOError(str(e))

    for variable in variables:
        if variable not in sds_dict and variable not in vds_dict:
            raise InvalidVariableError("Could not find " + variable + " in file: " + filename)

    return sds_dict, vds_dict
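# A usage sketch for _read_hdf4(); the filename and variable name are hypothetical
# MODIS-style examples. SD and VD variables come back in separate dictionaries:
sds, vds = _read_hdf4("MOD04_L2.example.hdf", ["Optical_Depth_Land_And_Ocean"])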
def create_coords(self, filenames, variable=None):
    from cis.data_io.ungridded_data import Metadata
    from numpy import genfromtxt, NaN
    from cis.exceptions import InvalidVariableError
    from cis.time_util import convert_datetime_to_std_time
    import dateutil.parser as du

    array_list = []

    for filename in filenames:
        try:
            array_list.append(genfromtxt(filename, dtype="f8,f8,f8,O,f8",
                                         names=['latitude', 'longitude', 'altitude', 'time', 'value'],
                                         delimiter=',', missing_values='', usemask=True, invalid_raise=True,
                                         converters={"time": du.parse}))
        except (IOError, ValueError):
            raise IOError('Unable to read file ' + filename)

    data_array = utils.concatenate(array_list)
    n_elements = len(data_array['latitude'])

    coords = CoordList()
    coords.append(Coord(data_array["latitude"],
                        Metadata(standard_name="latitude", shape=(n_elements,), units="degrees_north")))
    coords.append(Coord(data_array["longitude"],
                        Metadata(standard_name="longitude", shape=(n_elements,), units="degrees_east")))
    coords.append(Coord(data_array["altitude"],
                        Metadata(standard_name="altitude", shape=(n_elements,), units="meters")))

    time_arr = convert_datetime_to_std_time(data_array["time"])
    time = Coord(time_arr,
                 Metadata(standard_name="time", shape=(n_elements,), units="days since 1600-01-01 00:00:00"))
    coords.append(time)

    if variable:
        try:
            data = UngriddedData(data_array['value'],
                                 Metadata(name="value", shape=(n_elements,), units="unknown",
                                          missing_value=NaN), coords)
        except KeyError:
            raise InvalidVariableError("Value column does not exist in file(s) " + str(filenames))
        return data
    else:
        return UngriddedCoordinates(coords)
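# One made-up line in the comma-separated layout genfromtxt is given above
# (latitude, longitude, altitude, ISO-format time, value):
sample_line = "51.5,-0.1,10.0,2010-01-01T12:00:00,42.0"
lat, lon, alt, time_str, value = sample_line.split(',')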
def load_aeronet(fname, variables=None):
    """
    Loads an AERONET lev 2.0 CSV file. Originally from http://code.google.com/p/metamet/
    License: GNU GPL v3

    :param fname: data file name
    :param variables: A list of variables to return
    :return: A dictionary of variable names and numpy arrays containing the data for that variable
    """
    import numpy as np
    from numpy import ma
    from datetime import datetime, timedelta
    from cis.time_util import cis_standard_time_unit
    from cis.exceptions import InvalidVariableError

    std_day = cis_standard_time_unit.num2date(0)

    ordered_vars = get_aeronet_file_variables(fname)

    def date2daynum(datestr):
        the_day = datetime(int(datestr[-4:]), int(datestr[3:5]), int(datestr[:2]))
        return float((the_day - std_day).days)

    def time2fractionalday(timestr):
        td = timedelta(hours=int(timestr[:2]), minutes=int(timestr[3:5]), seconds=int(timestr[6:8]))
        return td.total_seconds() / (24.0 * 60.0 * 60.0)

    try:
        rawd = np.genfromtxt(fname, skip_header=5, delimiter=',', names=ordered_vars,
                             converters={0: date2daynum, 1: time2fractionalday,
                                         'Last_Processing_Date': date2daynum},
                             dtype=np.float64, missing_values='N/A', usemask=True)
    except (StopIteration, IndexError) as e:
        raise IOError(e)

    lend = len(rawd)
    # The date and time columns are already in days since the CIS standard time, and fractional days
    # respectively, so we can just add them together.
    # Find the columns by number rather than name, as some older versions of numpy mangle the special characters.
    datetimes = rawd[rawd.dtype.names[0]] + rawd[rawd.dtype.names[1]]

    metadata = get_file_metadata(fname)
    lon = np.zeros(lend) + float(metadata.misc[2][1].split("=")[1])
    lat = np.zeros(lend) + float(metadata.misc[2][2].split("=")[1])
    alt = np.zeros(lend) + float(metadata.misc[2][3].split("=")[1])

    data_dict = {}
    if variables is not None:
        for key in variables:
            try:
                # Again, we can't trust the numpy names, so use our pre-read names to index the right column
                data_dict[key] = rawd[rawd.dtype.names[ordered_vars.index(key)]]
            except ValueError:
                raise InvalidVariableError(key + " does not exist in " + fname)

    data_dict["datetime"] = ma.array(datetimes)
    data_dict["longitude"] = ma.array(lon)
    data_dict["latitude"] = ma.array(lat)
    data_dict["altitude"] = ma.array(alt)

    return data_dict