def to_datestring_YYYYMMDD(value):
    """Convert input time to string with format YYYYMMDD

    Parameters
    ----------
    value
        input time, may be string, datetime, numpy.datetime64 or
        pandas.Timestamp

    Returns
    -------
    str
        input formatted to string YYYYMMDD

    Raises
    ------
    ValueError
        if input is not supported
    """
    if isinstance(value, str) and len(value) == 8:
        logger.info('Input is already a string containing 8 chars. Assuming '
                    'it is in the right format and returning unchanged')
        return value
    try:
        return to_pandas_timestamp(value).strftime('%Y%m%d')
    except Exception as e:
        raise ValueError('Invalid input, need str, datetime, numpy.datetime64 '
                         'or pandas.Timestamp. Error: {}'.format(repr(e)))
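# Example (sketch): the converter accepts several time-like inputs; all of
# the calls below are expected to yield the same 'YYYYMMDD' string. Assumes
# to_datestring_YYYYMMDD and its helper to_pandas_timestamp are importable.
from datetime import datetime
import numpy as np
import pandas as pd

assert to_datestring_YYYYMMDD('20180101') == '20180101'  # returned unchanged
assert to_datestring_YYYYMMDD(datetime(2018, 1, 1)) == '20180101'
assert to_datestring_YYYYMMDD(np.datetime64('2018-01-01')) == '20180101'
assert to_datestring_YYYYMMDD(pd.Timestamp('2018-01-01')) == '20180101'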
def find_data_dir(self, name_or_pattern, ignorecase=True):
    """Find match of input name or pattern in Aerocom database

    Parameters
    ----------
    name_or_pattern : str
        name or pattern of data (can be model or obs data)
    ignorecase : bool
        if True, upper / lower case is ignored

    Returns
    -------
    str
        data directory of match

    Raises
    ------
    DataSearchError
        if no match or no unique match can be found
    """
    if name_or_pattern in self:
        logger.info('{} found in instance of AerocomBrowser'
                    .format(name_or_pattern))
        return self[name_or_pattern]
    logger.info('Searching database for {}'.format(name_or_pattern))
    return self._browse(name_or_pattern, ignorecase=ignorecase,
                        return_if_match=True)  # returns data directory (str)
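# Example (sketch): typical lookup via the browser. The dataset ID below is
# illustrative and the call requires access to an Aerocom database
# installation.
browser = AerocomBrowser()
data_dir = browser.find_data_dir('AeronetSunV3Lev2.daily')
print(data_dir)  # e.g. /path/to/obsdata/AeronetSunV3Lev2.0.daily/renamed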
def apply(self, data_obj):
    """Apply filter to data object

    Parameters
    ----------
    data_obj : :obj:`UngriddedData`, :obj:`GriddedData`, :obj:`ColocatedData`
        input data object that is supposed to be filtered

    Returns
    -------
    :obj:`UngriddedData`, :obj:`GriddedData`, :obj:`ColocatedData`
        filtered data object

    Raises
    ------
    IOError
        if input is invalid
    """
    if self.name == self.NO_FILTER_NAME:
        logger.info('NO FILTER flag: {} -> no filtering will be applied '
                    'in {}. Returning unchanged object.'
                    .format(self.NO_FILTER_NAME, type(data_obj)))
        return data_obj
    if isinstance(data_obj, UngriddedData):
        return self._apply_ungridded(data_obj)
    elif isinstance(data_obj, GriddedData):
        return self._apply_gridded(data_obj)
    elif isinstance(data_obj, ColocatedData):
        return self._apply_colocated(data_obj)
    raise IOError('Cannot filter {} obj, need instance of GriddedData, '
                  'UngriddedData or ColocatedData'.format(type(data_obj)))
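# Example (sketch): applying a region filter to loaded data. The Filter
# class name and the 'EUROPE-noMOUNTAINS' filter ID follow the
# <region>-<altitude flag> naming convention seen elsewhere in this code
# (cf. 'WORLD-wMOUNTAINS' in plotscatter below) and are assumptions here;
# `data` is assumed to be a previously loaded GriddedData or UngriddedData.
f = Filter('EUROPE-noMOUNTAINS')
data_filtered = f.apply(data)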
def check_and_regrid_lons_cube(cube):
    """Check and correct longitudes of :attr:`grid` if they are 0 -> 360

    Note
    ----
    This method checks if the maximum of the current longitudes array
    exceeds 180. Thus, it is not recommended to use this function after
    subsetting a cube; rather, it should be applied directly when the file
    is loaded (cf. :func:`load_input`)

    Parameters
    ----------
    cube : iris.cube.Cube
        gridded data loaded as iris.Cube

    Returns
    -------
    iris.cube.Cube
        input cube rolled onto the -180 -> 180 definition if longitudes
        were on 0 -> 360, else the unchanged input cube
    """
    if cube.coord("longitude").points.max() > 180:
        logger.info("Rolling longitudes to -180 -> 180 definition")
        cube = cube.intersection(longitude=(-180, 180))
    return cube
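# Example (sketch): a minimal iris cube on a 0 -> 360 longitude grid that
# the function above would roll to -180 -> 180. Grid values are dummy data;
# marking the longitude coordinate as circular lets iris wrap it.
import numpy as np
from iris.coords import DimCoord
from iris.cube import Cube

lons = DimCoord(np.arange(0, 360, 30.), standard_name='longitude',
                units='degrees', circular=True)
lats = DimCoord(np.arange(-60, 90, 30.), standard_name='latitude',
                units='degrees')
cube = Cube(np.zeros((len(lats.points), len(lons.points))),
            dim_coords_and_dims=[(lats, 0), (lons, 1)])

rolled = check_and_regrid_lons_cube(cube)
print(rolled.coord('longitude').points.min())  # now >= -180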
def __init__(self, var_name="od550aer", init=True, cfg=None, **kwargs): #save orig. input for whatever reasons self._var_name_input = var_name self.is_3d = False self.is_dry = False var_name = var_name.lower() if '3d' in var_name: logger.info('Variable name {} contains 3d. Activating flag is_3d ' 'and removing from var_name string'.format(var_name)) var_name = var_name.replace('3d', '') self.is_3d = True if 'dry' in var_name: self.is_dry = True var_name_alt = var_name.replace('dry', '') else: var_name_alt = var_name self.var_name = var_name self.var_name_alt = var_name_alt #alternative var_name self.standard_name = None self.units = '1' #self.aliases = [] self.wavelength_nm = None self.dry_rh_max = None self.dimensions = None self.minimum = -9e30 self.maximum = 9e30 self.description = None self.comments_and_purpose = None #wavelength tolerance in nm self.obs_wavelength_tol_nm = None self.scat_xlim = None self.scat_ylim = None self.scat_loglog = None self.scat_scale_factor = 1.0 # settings for map plotting self.map_vmin = None self.map_vmax = None self.map_c_under = None self.map_c_over = None self.map_cbar_levels = None self.map_cbar_ticks = None # imports default information and, on top, variable information (if # applicable) if init: self.parse_from_ini(var_name, var_name_alt=self.var_name_alt, cfg=cfg) self.update(**kwargs) if self.obs_wavelength_tol_nm is None: self.obs_wavelength_tol_nm = OBS_WAVELENGTH_TOL_NM
def __init__(self, var_ini=None, var_csv=None):
    self.var_ini = var_ini
    self.var_csv = var_csv

    self._cfg = self._read_ini()
    self.all_vars = list(self._cfg.keys())

    logger.info("Importing variable aliases info")
    self.all_vars.extend(list(_read_alias_ini()))
def __init__(self, var_ini):
    self._all_vars = None
    self._var_ini = None

    self.var_ini = var_ini

    self._cfg_parser = parse_variables_ini(var_ini)
    self._alias_parser = parse_aliases_ini()
    self._idx = -1
    logger.info("Importing variable aliases info")
def check_dimensions(self):
    """Check if data source and time dimensions are at the right index"""
    dims = self.data.dims
    if not 2 < len(dims) < 5:
        logger.info('Invalid number of dimensions. Must be 3 or 4')
        return False
    try:
        return dims.index('data_source') == 0 and dims.index('time') == 1
    except ValueError:
        # one of the required dimensions is missing
        return False
def parse_from_ini(self, var_name=None, cfg=None):
    """Import variable information from variables.ini

    Parameters
    ----------
    var_name : str
        variable name
    cfg : ConfigParser
        open config parser object

    Returns
    -------
    bool
        True, if default could be loaded, False if not

    Raises
    ------
    IOError
        if variables.ini file does not exist
    """
    if cfg is None:
        cfg = self.read_config()
    if var_name not in cfg:
        try:
            var_name = self._check_aliases(var_name)
        except VariableDefinitionError:
            logger.info('Unknown input variable {}'.format(var_name))
            return
    self._var_name_aerocom = var_name

    var_info = cfg[var_name]
    # this variable should import settings from another variable
    if 'use' in var_info:
        use = var_info['use']
        if use not in cfg:
            raise VariableDefinitionError('Input variable {} depends on {} '
                                          'which is not available in '
                                          'variables.ini.'
                                          .format(var_name, use))
        self.parse_from_ini(use, cfg)

    for key, val in var_info.items():
        if key in self.ALT_NAMES:
            key = self.ALT_NAMES[key]
        self._add(key, val)
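# Example (sketch): how the 'use' key above chains variable definitions.
# The ini snippet and variable entries are made up for illustration; real
# entries live in pyaerocom's variables.ini.
import configparser

cfg = configparser.ConfigParser()
cfg.read_string("""
[od550aer]
unit = 1
minimum = 0

[od550dust]
use = od550aer
description = Dust optical depth
""")

# parse_from_ini('od550dust', cfg) would first recurse into 'od550aer'
# (importing unit and minimum) and then overwrite / add the keys defined
# in the 'od550dust' section itself.
print(dict(cfg['od550dust']))  # {'use': 'od550aer', 'description': ...}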
def to_timeseries(self, var_name, freq=None, resample_how='mean'):
    """Get pandas.Series object for one of the data columns

    Parameters
    ----------
    var_name : str
        name of variable (e.g. "od550aer")
    freq : str
        new temporal resolution (can be pandas freq. string, or pyaerocom
        ts_type)
    resample_how : str
        choose from mean or median (only relevant if input parameter freq
        is provided, i.e. if resampling is applied)

    Returns
    -------
    Series
        time series object

    Raises
    ------
    KeyError
        if variable key does not exist in this dictionary
    ValueError
        if length of data array does not equal the length of the time
        array
    """
    if var_name not in self:
        raise KeyError("Variable {} does not exist".format(var_name))
    self.check_dtime()
    data = self[var_name]
    if isinstance(data, pd.Series):
        logger.info('Data is already instance of pandas.Series')
        return data
    if not data.ndim == 1:
        raise NotImplementedError('Multi-dimensional data columns cannot '
                                  'be converted to time-series')
    if not len(data) == len(self.dtime):
        raise ValueError("Mismatch between length of data array for "
                         "variable {} (length: {}) and time array "
                         "(length: {}).".format(var_name, len(data),
                                                len(self.dtime)))
    s = pd.Series(data, index=self.dtime)
    if freq is not None:
        from pyaerocom.helpers import resample_timeseries
        s = resample_timeseries(s, freq, resample_how)
    return s
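# Example (sketch): resampling behaviour equivalent to what to_timeseries
# delegates to pyaerocom.helpers.resample_timeseries. Data values and the
# month-start frequency choice are illustrative only.
import numpy as np
import pandas as pd

dtime = pd.date_range('2018-01-01', periods=90, freq='D')
s = pd.Series(np.random.random(90), index=dtime)

monthly_mean = s.resample('MS').mean()      # resample_how='mean'
monthly_median = s.resample('MS').median()  # resample_how='median'
print(monthly_mean.index)  # 2018-01-01, 2018-02-01, 2018-03-01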
def write(self, data):
    """Write instance of UngriddedData to cache

    Parameters
    ----------
    data : UngriddedData
        object containing the data
    """
    if not self.connection_established:
        # TODO: may be updated in the future
        raise AerocomConnectionError('Cannot write cache file, connection '
                                     'to Aerocom database could not be '
                                     'established (required for checking '
                                     'revision)')
    if not isinstance(data, UngriddedData):
        raise TypeError('Invalid input, need instance of UngriddedData, '
                        'got {}'.format(type(data)))
    logger.info('Writing cache file: {}'.format(self.file_path))
    success = True
    # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
    out_handle = open(self.file_path, 'wb')
    try:
        pickle.dump(self.newest_file_in_read_dir, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.newest_file_date_in_read_dir, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.reader.data_revision, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.reader.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(UngriddedData.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.__version__, out_handle,
                    pickle.HIGHEST_PROTOCOL)
        pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
    except Exception:
        logger.exception('Failed to write cache')
        success = False
    finally:
        out_handle.close()
    if not success:
        os.remove(self.file_path)
    else:
        logger.info('Success!')
def check_and_load(self):
    if not os.path.isfile(self.file_path):
        logger.info('No cache file available for query of dataset '
                    '{}'.format(self.dataset_to_read))
        return False

    delete_existing = False
    # if no connection to the database can be established, the revision
    # cannot be verified, so the cache file is not used (this default
    # also avoids an unbound variable below)
    use_cache_file = False
    in_handle = open(self.file_path, 'rb')
    # read meta information about file
    if self.connection_established:
        try:
            use_cache_file = self._check_pkl_head_vs_database(in_handle)
        except Exception as e:
            use_cache_file = False
            delete_existing = True
            logger.exception('File error in cached data file {}. File will '
                             'be removed and data reloaded. '
                             'Error: {}'.format(self.file_path, repr(e)))
    if not use_cache_file:
        # TODO: Should we delete the cache file if it is outdated ???
        logger.info('Aborting reading cache file {}. Aerocom database '
                    'has changed compared to cached version'
                    .format(self.file_name))
        in_handle.close()
        if delete_existing:  # something was wrong
            os.remove(self.file_path)
        return False
    # skip the cache header entries preceding the data object
    for k in range(self.LEN_CACHE_HEAD):
        logger.debug(pickle.load(in_handle))
    # everything is okay
    data = pickle.load(in_handle)
    if not isinstance(data, UngriddedData):
        raise TypeError('Unexpected data type stored in cache file, need '
                        'instance of UngriddedData, got {}'
                        .format(type(data)))
    self.loaded_data = data
    logger.info('Successfully loaded data for {} from Cache'
                .format(self.dataset_to_read))
    return True
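# Sketch of the cache-file layout used by write() / check_and_load() above:
# a fixed-length header of pickled entries followed by the UngriddedData
# object, appended sequentially to one file. Reading must happen in the
# same order. File name and header values here are illustrative.
import pickle

header = ['newest_file', 'newest_file_date', 'data_revision',
          'reader_version', 'data_version', 'cache_version']
with open('example.pkl', 'wb') as f:
    for entry in header:                      # LEN_CACHE_HEAD entries
        pickle.dump(entry, f, pickle.HIGHEST_PROTOCOL)
    pickle.dump({'the': 'data'}, f, pickle.HIGHEST_PROTOCOL)

with open('example.pkl', 'rb') as f:
    for _ in range(len(header)):              # skip / inspect the header
        pickle.load(f)
    data = pickle.load(f)                     # the actual payload
print(data)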
def plotscatter(model_name, model_data=None, obs_data=None, opts=None,
                verbose=True):
    """Method to plot scatterplots

    Todo
    ----
    Complete docstring, review code
    """
    if verbose:
        change_verbosity(new_level='debug')

    plt_name = 'SCATTERLOG'
    var_to_run = opts['VariablesToRun'][0]
    # global settings (including plot settings) for variable
    VAR_PARAM = const.VAR_PARAM[var_to_run]
    obs_network_name = opts['ObsNetworkName'][0]

    obs_data_as_series = obs_data.to_timeseries(start_date=opts['StartDate'],
                                                end_date=opts['EndDate'],
                                                freq='D')
    obs_lats = [obs_data_as_series[i]['latitude']
                for i in range(len(obs_data_as_series))]
    obs_lons = [obs_data_as_series[i]['longitude']
                for i in range(len(obs_data_as_series))]
    obs_names = [obs_data_as_series[i]['station_name']
                 for i in range(len(obs_data_as_series))]

    model_data_as_series = model_data.to_time_series([("latitude", obs_lats),
                                                      ("longitude", obs_lons)])

    df_time = pd.DataFrame()
    df_points = pd.DataFrame()
    station_no = 0
    for i in range(len(obs_data_as_series)):
        _len = len(obs_data_as_series[i][var_to_run])
        if _len > 0:
            _nansum = np.nansum(obs_data_as_series[i][var_to_run])
            if _nansum > np.float_(0.):
                station_no += 1
            else:
                print('{} removed due to NaNs only'.format(obs_names[i]))
        else:
            continue
        # put obs and model in DataFrame to make them use the same time
        # index
        df_time_temp = pd.DataFrame(obs_data_as_series[i][var_to_run],
                                    columns=[obs_network_name])
        df_points = df_points.append(df_time_temp)
        df_time_temp[model_name] = (model_data_as_series[i][var_to_run] *
                                    VAR_PARAM['scat_scale_factor'])
        # df_time now contains all time steps where either one of the obs
        # or model data have data
        df_time = df_time.append(pd.DataFrame(df_time_temp,
                                              columns=df_time_temp.columns))

    # remove all indices where either one of the data pairs is NaN;
    # mainly done to get the number of days right (df_time.corr() gets it
    # right without)
    df_time = df_time.dropna(axis=0, how='any')
    df_points = df_points.dropna()
    print('# of measurements: {}'.format(len(df_points)))

    filter_name = 'WORLD'
    time_step_name = 'mALLYEARdaily'
    # OD550_AER_an2008_YEARLY_WORLD_SCATTERLOG_AeronetSunV3Lev2.0.daily.ps.png
    years_covered = df_time[model_name].index[:].year.unique().sort_values()
    if len(years_covered) > 1:
        figname = '{}_{}_an{}-{}_{}_{}_{}_{}.png'.format(
            model_name, var_to_run, years_covered[0], years_covered[-1],
            time_step_name, filter_name, plt_name, obs_network_name)
        plotname = "{}-{} {}".format(years_covered[0], years_covered[-1],
                                     'daily')
    else:
        figname = '{}_{}_an{}_{}_{}_{}_{}.png'.format(
            model_name, var_to_run, years_covered[0], time_step_name,
            filter_name, plt_name, obs_network_name)
        plotname = "{} {}".format(years_covered[0], 'daily')
    logger.info(figname)

    mean = df_time.mean()
    correlation_coeff = df_time.corr()
    # statistics as in the IDL implementation:
    # rms = sqrt(total((f_YData - f_Xdata)^2) / n_elements(f_YData))
    # nmb = total(f_YData - f_Xdata) / total(f_Xdata) * 100.
    # f_temp = (f_YData - f_Xdata) / (f_YData + f_Xdata)
    # mnmb = 2. / c * total(f_temp) * 100.
    # fge = 2. / c * total(abs(f_temp)) * 100.
    difference = df_time[model_name] - df_time[obs_network_name]
    num_points = len(df_time)
    rms = np.sqrt(np.nansum(np.power(difference.values, 2)) / num_points)
    nmb = np.sum(difference) / np.sum(df_time[obs_network_name]) * 100.
    tmp = ((df_time[model_name] - df_time[obs_network_name]) /
           (df_time[model_name] + df_time[obs_network_name]))
    mnmb = 2. / num_points * np.sum(tmp) * 100.
    fge = 2. / num_points * np.sum(np.abs(tmp)) * 100.

    df_time.plot.scatter(obs_network_name, model_name,
                         loglog=VAR_PARAM['scat_loglog'], marker='+',
                         color='black')
    # plot the 1 by 1 line
    plt.plot(VAR_PARAM['scat_xlim'], VAR_PARAM['scat_ylim'], '-',
             color='grey')
    plt.axes().set_aspect('equal')
    plt.xlim(VAR_PARAM['scat_xlim'])
    plt.ylim(VAR_PARAM['scat_ylim'])
    xypos_index = 0
    var_str = var_to_run + VAR_PARAM.unit_str
    plt.axes().annotate("{} #: {} # st: {}".format(var_str, len(df_time),
                                                   station_no),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=14, color='red')
    xypos_index += 1
    plt.axes().annotate('Obs: {:.3f}'.format(mean[obs_network_name]),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('Mod: {:.3f}'.format(mean[model_name]),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('NMB: {:.1f}%'.format(nmb),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('MNMB: {:.1f}%'.format(mnmb),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('R: {:.3f}'.format(correlation_coeff.values[0, 1]),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('RMS: {:.3f}'.format(rms),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    xypos_index += 1
    plt.axes().annotate('FGE: {:.3f}'.format(fge),
                        xy=xypos[xypos_index], xycoords='axes fraction',
                        fontsize=10, color='red')
    # right lower part
    plt.axes().annotate('{}'.format(plotname), xy=xypos[-2],
                        xycoords='axes fraction', ha='center', fontsize=10,
                        color='black')
    plt.axes().annotate('{}'.format(filter_name), xy=xypos[-1],
                        xycoords='axes fraction', ha='center', fontsize=10,
                        color='black')
    plt.savefig(figname, dpi=300)
    plt.close()
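# Worked example (sketch) of the skill scores computed in plotscatter above,
# on a tiny made-up model/obs pair; the values are illustrative only.
import numpy as np

obs = np.array([1.0, 2.0, 4.0])
mod = np.array([1.5, 1.5, 5.0])

diff = mod - obs
n = len(obs)
rms = np.sqrt(np.nansum(diff ** 2) / n)    # root mean square
nmb = diff.sum() / obs.sum() * 100.        # normalized mean bias [%]
tmp = diff / (mod + obs)
mnmb = 2. / n * tmp.sum() * 100.           # modified normalized mean bias [%]
fge = 2. / n * np.abs(tmp).sum() * 100.    # fractional gross error [%]
print('RMS={:.3f} NMB={:.1f}% MNMB={:.1f}% FGE={:.1f}%'
      .format(rms, nmb, mnmb, fge))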
def _browse(self, name_or_pattern, ignorecase=True, return_if_match=True):
    """Search all Aerocom data directories that match input name or pattern

    Note
    ----
    Please do not use this function directly but
    :func:`find_data_dir` instead.

    Parameters
    ----------
    name_or_pattern : str
        name or pattern of data (can be model or obs data)
    ignorecase : bool
        if True, upper / lower case is ignored
    return_if_match : bool
        if True, then the data directory is returned as string, if it can
        be found, else, only a list is returned that contains all matches.
        The latter takes longer since the whole database is searched.

    Returns
    -------
    :obj:`str` or :obj:`list`
        Data directory (str, if ``return_if_match`` is True) or list
        containing valid Aerocom names (which can then be used to retrieve
        the paths)

    Raises
    ------
    DataSearchError
        if no match or no unique match can be found
    """
    pattern = fnmatch.translate(name_or_pattern)
    _candidates = []
    _msgs = []
    _warnings = []

    for obs_id, obs_path in const.OBSLOCS_UNGRIDDED.items():
        if ignorecase:
            match = name_or_pattern.lower() == obs_id.lower()
        else:
            match = name_or_pattern == obs_id
        if match:
            logger.info("Found match for search pattern in obs network "
                        "directories {}".format(obs_id))
            path = os.path.normpath(obs_path)
            if os.path.exists(path):
                self[obs_id] = path
                if return_if_match:
                    return path
        else:
            if ignorecase:
                match = bool(re.search(pattern, obs_id, re.IGNORECASE))
            else:
                match = bool(re.search(pattern, obs_id))
            if match:
                path = os.path.normpath(obs_path)
                if os.path.exists(path):
                    self[obs_id] = path
                    _candidates.append(obs_id)
                    if return_if_match:
                        return path

    for search_dir in const.DATA_SEARCH_DIRS:
        # get the directories
        if os.path.isdir(search_dir):
            subdirs = [x for x in os.listdir(search_dir) if
                       os.path.isdir(os.path.join(search_dir, x))]
            for subdir in subdirs:
                if ignorecase:
                    match = bool(re.search(pattern, subdir, re.IGNORECASE))
                else:
                    match = bool(re.search(pattern, subdir))
                if match:
                    _dir = os.path.normpath(os.path.join(search_dir, subdir))
                    _rnsubdir = os.path.join(_dir, "renamed")
                    if os.path.isdir(_rnsubdir):
                        logger.info("{} has subdir renamed. Using that one"
                                    .format(_dir))
                        _dir = _rnsubdir
                    if any([_dir in x for x in self.values()]):
                        # directory was already found before
                        continue
                    # append name of candidate ...
                    _candidates.append(subdir)
                    # ... and the corresponding data directory
                    self[subdir] = _dir

                    # now check if it is actually an exact match, if
                    # applicable
                    if return_if_match:
                        if ignorecase:
                            match = name_or_pattern.lower() == subdir.lower()
                        else:
                            match = name_or_pattern == subdir
                        if match:
                            logger.info("Found match for ID {}"
                                        .format(name_or_pattern))
                            return _dir
        else:
            _msgs.append('directory %s does not exist\n' % search_dir)

    for msg in _msgs:
        logger.info(msg)
    for warning in _warnings:
        logger.warning(warning)

    if len(_candidates) == 0:
        raise DataSearchError('No matches could be found for search pattern '
                              '{}'.format(name_or_pattern))
    if return_if_match:
        if len(_candidates) == 1:
            logger.info("Found exactly one match for search pattern "
                        "{}: {}".format(name_or_pattern, _candidates[0]))
            return self[_candidates[0]]
        raise DataSearchError('Found multiple matches for search pattern {}. '
                              'Please choose from {}'
                              .format(name_or_pattern, _candidates))
    return _candidates
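# Example (sketch): how the shell-style pattern above is matched against
# directory names via fnmatch.translate + re. The names are illustrative.
import fnmatch
import re

pattern = fnmatch.translate('AeronetSun*')  # shell glob -> regex string
print(bool(re.search(pattern, 'AeronetSunV3Lev2.0.daily', re.IGNORECASE)))  # True
print(bool(re.search(pattern, 'EBASMC', re.IGNORECASE)))                    # False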
def _browse(self, name_or_pattern, ignorecase=True, return_if_match=True):
    """Search all Aerocom data directories that match input name or pattern

    Note
    ----
    Please do not use this function directly but
    :func:`find_data_dir` instead.

    Parameters
    ----------
    name_or_pattern : str
        name or pattern of data (can be model or obs data)
    ignorecase : bool
        if True, upper / lower case is ignored
    return_if_match : bool
        if True, then the data directory is returned as string, if it can
        be found, else, only a list is returned that contains all matches.
        The latter takes longer since the whole database is searched.

    Returns
    -------
    :obj:`str` or :obj:`list`
        Data directory (str, if ``return_if_match`` is True) or list
        containing valid Aerocom names (which can then be used to retrieve
        the paths)

    Raises
    ------
    DataSearchError
        if no match or no unique match can be found
    """
    pattern = fnmatch.translate(name_or_pattern)
    _candidates = []
    _msgs = []
    _warnings = []

    for obs_id in const.OBS_IDS:
        if ignorecase:
            match = name_or_pattern.lower() == obs_id.lower()
        else:
            match = name_or_pattern == obs_id
        if match:
            logger.info("Found match for search pattern in obs network "
                        "directories {}".format(obs_id))
            self[obs_id] = const.OBSCONFIG[obs_id]["PATH"]
            if return_if_match:
                return self[obs_id]
        else:
            if ignorecase:
                match = bool(re.search(pattern, obs_id, re.IGNORECASE))
            else:
                match = bool(re.search(pattern, obs_id))
            if match:
                self[obs_id] = const.OBSCONFIG[obs_id]["PATH"]
                _candidates.append(obs_id)

    for search_dir in const.MODELDIRS:
        # get the directories
        if isdir(search_dir):
            subdirs = [x for x in listdir(search_dir) if
                       isdir(join(search_dir, x))]
            for subdir in subdirs:
                if ignorecase:
                    match = bool(re.search(pattern, subdir, re.IGNORECASE))
                else:
                    match = bool(re.search(pattern, subdir))
                if match:
                    _dir = join(search_dir, subdir)
                    _rnsubdir = join(_dir, "renamed")
                    if isdir(_rnsubdir):
                        logger.info("{} has subdir renamed. Using that one"
                                    .format(_dir))
                        _dir = _rnsubdir
                    # append name of candidate ...
                    _candidates.append(subdir)
                    # ... and the corresponding data directory
                    self[subdir] = _dir

                    # now check if it is actually an exact match, if
                    # applicable
                    if return_if_match:
                        if ignorecase:
                            match = name_or_pattern.lower() == subdir.lower()
                        else:
                            match = name_or_pattern == subdir
                        if match:
                            logger.info("Found match for ID {}"
                                        .format(name_or_pattern))
                            return _dir
        else:
            _msgs.append('directory %s does not exist\n' % search_dir)

    for msg in _msgs:
        logger.info(msg)
    for warning in _warnings:
        logger.warning(warning)

    if len(_candidates) == 0:
        raise DataSearchError('No matches could be found for search pattern '
                              '{}'.format(name_or_pattern))
    if return_if_match:
        if len(_candidates) == 1:
            logger.info("Found exactly one match for search pattern "
                        "{}: {}".format(name_or_pattern, _candidates[0]))
            return self[_candidates[0]]
        raise DataSearchError('Found multiple matches for search pattern {}. '
                              'Please choose from {}'
                              .format(name_or_pattern, _candidates))
    return _candidates
def check_and_load(self, var_name):
    """Check if cache file exists and load

    Note
    ----
    If a cache file exists for this database, but cannot be loaded or is
    outdated against pyaerocom updates, then it will be removed (the
    latter only if :attr:`pyaerocom.const.RM_CACHE_OUTDATED` is True).

    Returns
    -------
    bool
        True, if cache file exists and could be successfully loaded, else
        False. Note: if import is successful, the corresponding data
        object (instance of :class:`pyaerocom.UngriddedData`) can be
        accessed via :attr:`loaded_data`

    Raises
    ------
    TypeError
        if cached file is not an instance of
        :class:`pyaerocom.UngriddedData` class (which should not happen)
    """
    try:
        fp = self.file_path(var_name)
    except FileNotFoundError as e:
        logger.warning(repr(e))
        return False

    if not os.path.isfile(fp):
        logger.info('No cache file available for {}, {}'
                    .format(self.dataset_to_read, var_name))
        return False

    delete_existing = const.RM_CACHE_OUTDATED

    in_handle = open(fp, 'rb')
    try:
        ok = self._check_pkl_head_vs_database(in_handle)
    except Exception as e:
        ok = False
        delete_existing = True
        logger.exception('File error in cached data file {}. File will '
                         'be removed and data reloaded. '
                         'Error: {}'.format(fp, repr(e)))
    if not ok:
        # TODO: Should we delete the cache file if it is outdated ???
        logger.info('Aborting reading cache file {}. Aerocom database '
                    'or pyaerocom version has changed compared to '
                    'cached version'.format(self.file_name(var_name)))
        in_handle.close()
        if delete_existing:  # something was wrong
            const.print_log.info('Deleting outdated cache file: {}'
                                 .format(fp))
            os.remove(self.file_path(var_name))
        return False

    # everything is okay
    data = pickle.load(in_handle)
    if not isinstance(data, UngriddedData):
        raise TypeError('Unexpected data type stored in cache file, need '
                        'instance of UngriddedData, got {}'
                        .format(type(data)))
    self.loaded_data[var_name] = data
    logger.info('Successfully loaded data for {} from Cache'
                .format(self.dataset_to_read))
    return True
def parse_from_ini(self, var_name=None, var_name_alt=None, cfg=None):
    """Import variable information from variables.ini

    Parameters
    ----------
    var_name : str
        variable name
    var_name_alt : str
        alternative variable name that is used if variable name is not
        available
    cfg : ConfigParser
        open config parser object

    Returns
    -------
    bool
        True, if default could be loaded, False if not

    Raises
    ------
    IOError
        if variables.ini file does not exist
    """
    if cfg is None:
        cfg = self.read_config()
    var_info = {}
    if var_name is not None and var_name != 'DEFAULT':
        if var_name in cfg:
            logger.info("Found default configuration for variable "
                        "{}".format(var_name))
            var_info = cfg[var_name]
        elif isinstance(var_name_alt, str) and var_name_alt in cfg:
            var_info = cfg[var_name_alt]
        else:
            ap = parse_aliases_ini()
            aliases = _read_alias_ini(ap)
            if var_name in aliases:
                var_name = aliases[var_name]
                var_info = cfg[var_name]
            else:
                try:
                    var_name = _check_alias_family(var_name, ap)
                    var_info = cfg[var_name]
                except VariableDefinitionError:
                    logger.warning("No default configuration available for "
                                   "variable {}. Using DEFAULT settings"
                                   .format(var_name))

    default = cfg['DEFAULT']
    for key in self.keys():
        if key in self.ALT_NAMES:
            if self.ALT_NAMES[key] in var_info:
                self._add(key, var_info[self.ALT_NAMES[key]])
        elif key in var_info:
            self._add(key, var_info[key])
        elif key in default:
            self._add(key, default[key])

    self.var_name = var_name
def write(self, data, var_name=None):
    """Write single-variable instance of UngriddedData to cache

    Parameters
    ----------
    data : UngriddedData
        object containing the data (possibly containing multiple
        variables)
    var_name : str, optional
        name of variable that is supposed to be stored (only required if
        input `data` contains more than one variable)
    """
    meta = self.cache_meta_info()

    if not isinstance(data, UngriddedData):
        raise TypeError('Invalid input, need instance of UngriddedData, '
                        'got {}'.format(type(data)))
    if len(data.contains_datasets) > 1:
        raise CacheWriteError('Input UngriddedData object contains '
                              'datasets: {}. Can only write single '
                              'dataset objects'
                              .format(data.contains_datasets))
    if var_name is None:
        if len(data.contains_vars) > 1:
            raise CacheWriteError('Input UngriddedData object for {} '
                                  'contains more than one variable: {}. '
                                  'Please specify which variable should '
                                  'be cached'
                                  .format(self.reader.data_id,
                                          data.contains_vars))
        var_name = data.contains_vars[0]
    elif var_name not in data.contains_vars:
        raise CacheWriteError('Cannot write cache file: variable {} does '
                              'not exist in input UngriddedData object'
                              .format(var_name))

    if len(data.contains_vars) > 1:
        data = data.extract_var(var_name)

    fp = self.file_path(var_name)
    logger.info('Writing cache file: {}'.format(fp))
    success = True
    # OutHandle = gzip.open(c__cache_file, 'wb') # takes too much time
    out_handle = open(fp, 'wb')
    try:
        # write cache header
        pickle.dump(meta, out_handle, pickle.HIGHEST_PROTOCOL)
        # write data
        pickle.dump(data, out_handle, pickle.HIGHEST_PROTOCOL)
    except Exception as e:
        from pyaerocom import print_log
        print_log.exception('Failed to write cache: {}'.format(repr(e)))
        success = False
    finally:
        out_handle.close()

    if not success:
        os.remove(fp)
    else:
        logger.info('Successfully wrote {} data ({}) to disk!'
                    .format(var_name, self.reader.data_id))
def read_file(self, nasa_ames_file, only_head=False, replace_invalid_nan=True,
              convert_timestamps=True, decode_flags=True,
              quality_check=True):
    """Read NASA Ames file

    Parameters
    ----------
    nasa_ames_file : str
        EBAS NASA Ames file
    only_head : bool
        read only file header
    replace_invalid_nan : bool
        replace all invalid values in the table by NaNs. The invalid
        values for each dependent data column are identified based on the
        information in the file header.
    convert_timestamps : bool
        compute array of numpy datetime64 timestamps from numeric
        timestamps in data
    decode_flags : bool
        if True, all flags in all flag columns are decoded from floating
        point representation to 3 integers, e.g.
        0.111222333 -> 111 222 333
    quality_check : bool
        perform quality check after import (for details see
        :func:`_quality_check`)
    """
    logger.info("Reading NASA Ames file:\n{}".format(nasa_ames_file))
    lc = 0  # line counter
    dc = 0  # data block line counter
    mc = 0  # meta block counter
    END_VAR_DEF = np.nan  # will be set (info stored in header)
    IN_DATA = False
    data = []
    _insert_invalid = None
    for line in open(nasa_ames_file):
        if IN_DATA:
            if dc == 0:
                logger.debug(line)
            try:
                data.append(tuple([float(x.strip())
                                   for x in line.strip().split()]))
            except Exception as e:
                data.append(_insert_invalid)
                logger.warning("Failed to read data row {}. "
                               "Error msg: {}".format(dc, repr(e)))
            dc += 1
        elif lc < self._NUM_FIXLINES:
            try:
                val = self._H_FIXLINES_CONV[lc](line)
                attr = self._H_FIXLINES_YIELD[lc]
                if isinstance(attr, list):
                    for i, attr_id in enumerate(attr):
                        self[attr_id] = val[i]
                else:
                    self[attr] = val
            except Exception as e:
                msg = ("Failed to read header row {}.\n{}\n"
                       "Error msg: {}".format(lc, line, repr(e)))
                if lc in self._HEAD_ROWS_MANDATORY:
                    raise NasaAmesReadError("Fatal: {}".format(msg))
                else:
                    logger.warning(msg)
        else:
            _flagmap_idx = 0
            if mc == 0:
                END_VAR_DEF = self._NUM_FIXLINES + self.num_cols_dependent - 1
                NUM_HEAD_LINES = self.num_head_lines
                try:
                    self.var_defs.append(self._read_vardef_line(line))
                except Exception as e:
                    logger.warning(repr(e))
            elif lc < END_VAR_DEF:
                var = self._read_vardef_line(line)
                # if variable corresponds to flag column, assign this
                # flag column to all previously read variables
                if var.is_flag:
                    for _var in self.var_defs[_flagmap_idx:]:
                        _var.flag_id = var.name
                self.var_defs.append(var)
                _flagmap_idx = len(self.var_defs)
            elif lc == NUM_HEAD_LINES - 1:
                IN_DATA = True
                self._data_header = h = [x.strip() for x in line.split()]
                # append information of first two columns to variable
                # definition array.
                self._var_defs.insert(0, EbasColDef(name=h[0],
                                                    is_flag=False,
                                                    is_var=False,
                                                    unit=self.time_unit))
                self._var_defs.insert(1, EbasColDef(name=h[1],
                                                    is_flag=False,
                                                    is_var=False,
                                                    unit=self.time_unit))
                if only_head:
                    return
                logger.debug("REACHED DATA BLOCK")
                _insert_invalid = tuple([np.nan] * self.col_num)
            elif lc >= END_VAR_DEF + 2:
                try:
                    name, val = line.split(":")
                    key = name.strip().lower().replace(" ", "_")
                    self.meta[key] = val.strip()
                except Exception as e:
                    logger.warning("Failed to read line no. {}.\n{}\n"
                                   "Error msg: {}\n".format(lc, line,
                                                            repr(e)))
            else:
                logger.debug("Ignoring line no. {}: {}".format(lc, line))
            mc += 1
        lc += 1

    data = np.asarray(data)
    data[:, 1:] = data[:, 1:] * np.asarray(self.mul_factors)
    self._data = data
    if replace_invalid_nan:
        dep_dat = data[:, 1:]
        for i, val in enumerate(np.floor(self.vals_invalid)):
            try:
                col = dep_dat[:, i]
                cond = np.floor(col) == val
                col[cond] = np.nan
                dep_dat[:, i] = col
            except Exception:
                logger.warning("Failed to replace invalid values with "
                               "NaNs in column {}"
                               .format(self.col_names[i + 1]))
        data[:, 1:] = dep_dat
    self._data = data
    if convert_timestamps:
        try:
            self.compute_time_stamps()
        except Exception as e:
            logger.warning("Failed to compute time stamps.\n"
                           "Error message: {}".format(repr(e)))
    self.init_flags(decode_flags)
    if quality_check:
        self._quality_check()
def parse_from_ini(self, var_name=None, cfg=None):
    """Import variable information from variables.ini

    Parameters
    ----------
    var_name : str
        string ID of variable (must be specified in `variables.ini
        <https://github.com/metno/pyaerocom/blob/master/pyaerocom/data/
        variables.ini>`__ file)
    cfg : ConfigParser
        open and read config parser object

    Returns
    -------
    bool
        True, if default could be loaded, False if not

    Raises
    ------
    IOError
        if variables.ini file does not exist
    """
    if cfg is None:
        cfg = self.read_config()
    var_info = {}
    if var_name is not None and var_name != 'DEFAULT':
        if var_name in cfg:
            logger.info("Found default configuration for variable "
                        "{}".format(var_name))
            var_info = cfg[var_name]
            self.var_name = var_name
        else:
            aliases = _read_alias_ini()
            if var_name in aliases:
                var_info = cfg[aliases[var_name]]
            else:
                logger.warning("No default configuration available for "
                               "variable {}. Using DEFAULT settings"
                               .format(var_name))

    default = cfg['DEFAULT']
    for key in self.keys():
        ok = True
        if key in var_info:
            val = var_info[key]
        elif key in default:
            val = default[key]
        else:
            ok = False
        if ok:
            if key in self._TYPE_CONV:
                try:
                    val = self._TYPE_CONV[key](val)
                except Exception:
                    pass
            elif key == 'unit':
                if val == 'None' or val == '1':
                    val = 1
            if val == 'None':
                val = None
            self[key] = val
def check_set_countries(self, inplace=True, assign_to_dim=None):
    """Check if country information is available and assign if not

    If no country information is available, countries will be assigned
    for each lat / lon coordinate using
    :func:`pyaerocom.geodesy.get_country_info_coords`.

    Parameters
    ----------
    inplace : bool, optional
        If True, modify and return this object, else a copy.
        The default is True.
    assign_to_dim : str, optional
        name of dimension to which the country coordinate is assigned.
        Default is None, in which case station_name is used.

    Raises
    ------
    DataDimensionError
        If data is 4D (i.e. if latitude and longitude are orthogonal
        dimensions)

    Returns
    -------
    ColocatedData
        data object with countries assigned
    """
    if self.has_latlon_dims:
        raise DataDimensionError('Countries cannot be assigned to 4D '
                                 'ColocatedData with orthogonal lat / lon '
                                 'dimensions. Please consider stacking '
                                 'the latitude and longitude dimensions.')
    if assign_to_dim is None:
        assign_to_dim = 'station_name'

    if assign_to_dim not in self.dims:
        raise DataDimensionError('No such dimension', assign_to_dim)

    coldata = self if inplace else self.copy()

    if 'country' in coldata.data.coords:
        logger.info('Country information is available')
        return coldata

    coords = coldata._get_stat_coords()
    info = get_country_info_coords(coords)

    countries, codes = [], []
    for item in info:
        countries.append(item['country'])
        codes.append(item['country_code'])

    arr = coldata.data
    arr = arr.assign_coords(country=(assign_to_dim, countries),
                            country_code=(assign_to_dim, codes))
    coldata.data = arr
    return coldata
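# Example (sketch): the assign_coords pattern used above, on a minimal
# xarray.DataArray with a station_name dimension. Station and country
# values are made up for illustration.
import numpy as np
import xarray as xr

arr = xr.DataArray(np.random.random((2, 3)),
                   dims=('time', 'station_name'),
                   coords={'station_name': ['st1', 'st2', 'st3']})
arr = arr.assign_coords(
    country=('station_name', ['Norway', 'Germany', 'Italy']),
    country_code=('station_name', ['NO', 'DE', 'IT']))
print(arr.country.values)  # ['Norway' 'Germany' 'Italy']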