def add_file_to_log(filepath, err_msg):
    try:
        dirname = os.path.dirname(filepath)
        spl = dirname.split(os.sep)
        if spl[-1].lower() == 'renamed':
            model_or_obs_id = spl[-2]
        else:
            model_or_obs_id = spl[-1]
    except Exception:
        model_or_obs_id = 'others'
    try:
        logdir = const.LOGFILESDIR
        found = False
        logfile = os.path.join(logdir, model_or_obs_id + '.log')
        if os.path.exists(logfile):
            with open(logfile, 'r') as f:
                for line in f:
                    if filepath == line.strip():
                        found = True
                        break
        if not found:
            with open(logfile, 'a+') as f:
                f.write(filepath + '\n')
            with open(os.path.join(logdir, model_or_obs_id + '_ERR.log'),
                      'a+') as ferr:
                ferr.write('{}\n{}\n\n'.format(filepath, err_msg))
    except Exception as e:
        from pyaerocom import print_log
        const.WRITE_FILEIO_ERR_LOG = False
        print_log.info('Failed to write to file-read error log ({}). '
                       'Deactivating logging'.format(repr(e)))
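# A minimal, self-contained sketch (not part of pyaerocom) of the ID
# inference used above: the log file is named after the parent directory of
# the file, unless that directory is called 'renamed', in which case the
# grandparent directory is used. The paths below are hypothetical examples.
import os

def _infer_log_id(filepath):
    spl = os.path.dirname(filepath).split(os.sep)
    return spl[-2] if spl[-1].lower() == 'renamed' else spl[-1]

# _infer_log_id('/data/MODELX/renamed/f.nc') -> 'MODELX'
# _infer_log_id('/data/OBSNET/f.nc')         -> 'OBSNET'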
def CACHEDIR(self):
    """Cache directory"""
    try:
        return chk_make_subdir(self._cachedir, getpass.getuser())
    except Exception as e:
        from pyaerocom import print_log
        print_log.info('Failed to access CACHEDIR: {}\n'
                       'Deactivating caching'.format(repr(e)))
        self._caching_active = False
def LOGFILESDIR(self):
    """Directory where logfiles are stored"""
    try:
        logdir = chk_make_subdir(self.OUTPUTDIR, '_log')
        return logdir
    except Exception as e:
        from pyaerocom import print_log
        print_log.info('Failed to access LOGFILESDIR: {}\n'
                       'Deactivating file logging'.format(repr(e)))
        self.WRITE_FILEIO_ERR_LOG = False
def CACHEDIR(self):
    """Cache directory for UngriddedData objects"""
    if self._cachedir is None:
        raise IOError('Cache directory is not defined')
    try:
        return chk_make_subdir(self._cachedir, getpass.getuser())
    except Exception as e:
        from pyaerocom import print_log
        print_log.info('Failed to access CACHEDIR: {}\n'
                       'Deactivating caching'.format(repr(e)))
        self._caching_active = False
def _save_coldata(self, coldata, savename, out_dir, model_var, model_data,
                  obs_var):
    """Helper for saving colocated data"""
    if model_var != model_data.var_name:
        coldata.rename_variable(model_data.var_name, model_var,
                                model_data.data_id)
    if (isinstance(self.model_add_vars, dict)
            and obs_var in self.model_add_vars
            and self.model_add_vars[obs_var] == model_var):
        coldata.rename_variable(obs_var, model_var, self.obs_id)
    coldata.to_netcdf(out_dir, savename=savename)
    self.file_status[savename] = 'saved'
    if self._log:
        self._write_log('WRITE: {}\n'.format(savename))
        print_log.info('Writing file {}'.format(savename))
def _read_gridded(self, reader, var_name, start, stop, is_model=True):
    if is_model:
        vert_which = self.obs_vert_type
        if all(x == '' for x in reader.file_info.vert_code.values):
            print_log.info('Deactivating model file search by vertical '
                           'code for {}, since filenames do not include '
                           'information about vertical code (probably '
                           'AeroCom 2 convention)'.format(reader.data_id))
            vert_which = None
        ts_type_read = self.model_ts_type_read
        if self.model_use_climatology:
            start = 9999
            stop = None
    else:
        vert_which = None
        ts_type_read = self.obs_ts_type_read
    msg = ('No data files available for dataset {} ({})'
           .format(reader.data_id, var_name))
    try:
        return reader.read_var(var_name, start=start, stop=stop,
                               ts_type=ts_type_read,
                               flex_ts_type=self.flex_ts_type_gridded,
                               vert_which=vert_which)
    except DataCoverageError:
        vt = None
        if is_model:
            if self.obs_vert_type in self.OBS_VERT_TYPES_ALT:
                vt = self.OBS_VERT_TYPES_ALT[self.obs_vert_type]
            elif self.model_vert_type_alt is not None:
                mva = self.model_vert_type_alt
                if isinstance(mva, str):
                    vt = mva
                elif isinstance(mva, dict) and var_name in mva:
                    vt = mva[var_name]
        if vt is None:
            raise DataCoverageError(msg)
        return reader.read_var(var_name, start=start, stop=stop,
                               ts_type=ts_type_read,
                               flex_ts_type=self.flex_ts_type_gridded,
                               vert_which=vt)
def check_output_dirs(self):
    """Checks if output directories are available and have write access"""
    ok = True
    from pyaerocom import print_log
    if (not self.dir_exists(self._outputdir)
            or not self._write_access(self._outputdir)):
        self._outputdir = chk_make_subdir(self.HOMEDIR, self._outhomename)
    if not self._write_access(self._outputdir):
        print_log.info('Cannot establish write access to output directory '
                       '{}'.format(self._outputdir))
        ok = False
    if (not self.dir_exists(self._cachedir)
            or not self._write_access(self._cachedir)):
        self._cachedir = chk_make_subdir(self._outputdir, '_cache')
    if not self._write_access(self._cachedir):
        print_log.info('Cannot establish write access to cache directory '
                       '{}. Deactivating caching of files'
                       .format(self._cachedir))
        self._caching_active = False
        ok = False
    return ok
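# Minimal sketch of a portable write-access check along the lines of the
# _write_access helper used above (the actual pyaerocom implementation is
# not shown in this section): try to create a temporary file in the
# directory and report whether that succeeds.
import os
import tempfile

def _write_access_sketch(directory):
    if not os.path.isdir(directory):
        return False
    try:
        with tempfile.TemporaryFile(dir=directory):
            pass
        return True
    except OSError:
        return False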
def BASEDIR(self, value):
    if not os.path.exists(value):
        raise IOError('Cannot change database directory. Input directory '
                      'does not exist')
    self._obsbasedir = value
    self._modelbasedir = value
    subdirs = os.listdir(value)
    from pyaerocom import print_log
    if 'aerocom0' in subdirs:
        print_log.info('Initiating directories for lustre')
        self.read_config(self._config_ini, keep_basedirs=True)
    elif 'obsdata' in subdirs:  # test dataset
        print_log.info('Initiating directories for pyaerocom test dataset')
        self.read_config(self._config_ini_testdata, keep_basedirs=True)
        self._cachedir = os.path.join('..', '_cache')
    elif 'AMAP' in subdirs:
        print_log.info('Initiating directories for AEROCOM users database')
        self.read_config(self._config_ini_user_server, keep_basedirs=True)
    else:
        self.reload()
def _run_gridded_gridded(self, var_name=None):
    start, stop = start_stop(self.start, self.stop)
    model_reader = ReadGridded(self.model_id)
    obs_reader = ReadGridded(self.obs_id)
    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        if bool(remaining_filters):
            raise NotImplementedError('Cannot apply filters {} to gridded '
                                      'observation data.'
                                      .format(remaining_filters))
    obs_vars = self.obs_vars
    obs_vars_avail = obs_reader.vars_provided
    for obs_var in obs_vars:
        if not obs_var in obs_vars_avail:
            raise DataCoverageError('Variable {} is not supported by {}'
                                    .format(obs_var, self.obs_id))
    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)
    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)
    all_ts_types = const.GRID_IO.TS_TYPES
    ts_type = self.ts_type
    data_objs = {}
    for model_var, obs_var in var_matches.items():
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start, stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue
        if not model_data.ts_type in all_ts_types:
            raise TemporalResolutionError('Invalid temporal resolution {} '
                                          'in model {}'
                                          .format(model_data.ts_type,
                                                  self.model_id))
        try:
            obs_data = self._read_gridded(reader=obs_reader,
                                          var_name=obs_var,
                                          start=start, stop=stop,
                                          is_model=False)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.obs_id, obs_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue
        if not obs_data.ts_type in all_ts_types:
            raise TemporalResolutionError('Invalid temporal resolution {} '
                                          'in obs {}'
                                          .format(obs_data.ts_type,
                                                  self.obs_id))
        # update colocation ts_type, based on the available resolution in
        # model and obs
        lowest = self.get_lowest_resolution(ts_type, model_data.ts_type,
                                            obs_data.ts_type)
        if lowest != ts_type:
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in {} / {} combination)'
                           .format(ts_type, lowest, self.model_id,
                                   self.obs_id))
            ts_type = lowest
        if self.save_coldata:
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(self.model_id,
                                                     savename)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    continue
                else:
                    os.remove(os.path.join(out_dir, savename))
        try:
            by = None
            if self.model_use_climatology:
                by = to_pandas_timestamp(start).year
            coldata = colocate_gridded_gridded(
                gridded_data=model_data,
                gridded_data_ref=obs_data,
                ts_type=ts_type, start=start, stop=stop,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                apply_time_resampling_constraints=\
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers)
            if self.save_coldata:
                # _save_coldata writes the file and logs the WRITE event
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception as e:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed. Reason: {}'.format(self.model_id, model_var,
                                               self.obs_id, obs_var,
                                               repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data"""
    model_reader = ReadGridded(self.model_id)
    obs_reader = ReadUngridded(self.obs_id)
    obs_vars_supported = obs_reader.get_reader(
        self.obs_id).PROVIDES_VARIABLES
    obs_vars = list(np.intersect1d(self.obs_vars, obs_vars_supported))
    if len(obs_vars) == 0:
        raise DataCoverageError('No observation variable matches found for '
                                '{}'.format(self.obs_id))
    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)
    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}
    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=obs_vars, **ropts)
    if 'obs_filters' in self:
        remaining_filters = self._eval_obs_filters()
        obs_data = obs_data.apply_filters(**remaining_filters)
    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)
    data_objs = {}
    for model_var, obs_var in var_matches.items():
        ts_type = self.ts_type
        start, stop = start_stop(self.start, self.stop)
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start, stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue
        ts_type_src = model_data.ts_type
        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None
        if TsType(ts_type_src) < TsType(ts_type):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'
                           .format(ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src
        if self.save_coldata:
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(model_data.data_id,
                                                     savename)
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    self.file_status[savename] = 'skipped'
                    continue
                else:
                    print_log.info('Deleting and recomputing existing '
                                   'colocated data file {}'
                                   .format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))
        try:
            by = None
            if self.model_use_climatology:
                by = start.year
            coldata = colocate_gridded_ungridded(
                gridded_data=model_data,
                ungridded_data=obs_data,
                ts_type=ts_type, start=start, stop=stop,
                var_ref=obs_var,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                ignore_station_names=ignore_stats,
                apply_time_resampling_constraints=\
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers)
            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception as e:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed. Reason: {}'.format(self.model_id, model_var,
                                               self.obs_id, obs_var,
                                               repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
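# The TsType comparison above orders sampling frequencies (coarser < finer),
# which is what allows the check TsType(ts_type_src) < TsType(ts_type) to
# detect that the model cannot provide the requested resolution. A minimal
# usage sketch, assuming TsType is importable from pyaerocom.tstype:
if __name__ == '__main__':
    from pyaerocom.tstype import TsType
    assert TsType('monthly') < TsType('daily')
    assert not TsType('daily') < TsType('daily')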
def read_dataset(self, dataset_to_read, vars_to_retrieve=None,
                 only_cached=False, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : str or list
        variable or list of variables to be imported
    only_cached : bool
        if True, then nothing is reloaded but only data is loaded that is
        available as cached objects (not recommended to use but may be
        used if working offline without connection to database)
    **kwargs
        additional reading constraints. If any are provided, caching is
        deactivated and the data will be read from disk.

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False
        print_log.info('Received additional reading constraints, '
                       'ignoring caching')
    reader = self.get_reader(dataset_to_read)
    if vars_to_retrieve is not None:
        # Note: self.vars_to_retrieve may be None as well, then
        # default variables of each network are read
        self.vars_to_retrieve = vars_to_retrieve
    if self.vars_to_retrieve is None:
        self.vars_to_retrieve = reader.PROVIDES_VARIABLES
    vars_to_retrieve = varlist_aerocom(self.vars_to_retrieve)
    # data_dir will be None in most cases, but can be specified when
    # creating the instance; by default, data_dir is inferred automatically
    # in the reading class, using the database location
    data_dir = self._get_data_dir(dataset_to_read)
    if data_dir is not None:
        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                'Trying to read {} from specified data_dir {} failed. '
                'Directory does not exist'.format(dataset_to_read,
                                                  data_dir))
        reader._dataset_path = data_dir
        const.print_log.info('Reading {} from specified data location: {}'
                             .format(dataset_to_read, data_dir))
    # Since this interface enables loading of multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [var for var in vars_to_retrieve
                      if reader.var_supported(var)]
    if len(vars_available) == 0:
        raise DataRetrievalError('None of the input variables ({}) is '
                                 'supported by {} interface'
                                 .format(vars_to_retrieve,
                                         dataset_to_read))
    cache = CacheHandlerUngridded(reader)
    if not self.ignore_cache:
        # initiate cache handler
        for var in vars_available:
            try:
                cache.check_and_load(var, force_use_outdated=only_cached)
            except Exception:
                self.logger.exception('Fatal: compatibility error between '
                                      'old cache file for {} and current '
                                      'version of code'.format(var))
    if not only_cached:
        vars_to_read = [v for v in vars_available
                        if not v in cache.loaded_data]
    else:
        vars_to_read = []
    data_read = None
    if len(vars_to_read) > 0:
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data_read = reader.read(vars_to_read, **kwargs)
        print_log.setLevel(_loglevel)
        for var in vars_to_read:
            # write the cache file
            if not self.ignore_cache:
                try:
                    cache.write(data_read, var)
                except Exception as e:
                    _caching = False
                    print_log.warning('Failed to write to cache directory. '
                                      'Error: {}. Deactivating caching in '
                                      'pyaerocom'.format(repr(e)))
    if len(vars_to_read) == len(vars_available):
        data_out = data_read
    else:
        data_out = UngriddedData()
        for var in vars_available:
            if var in cache.loaded_data:
                data_out.append(cache.loaded_data[var])
        if data_read is not None:
            data_out.append(data_read)
    if _caching is not None:
        const.CACHING = _caching
    return data_out
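# Usage sketch for read_dataset (assumes a working pyaerocom installation
# with database access; the dataset and variable names are illustrative):
if __name__ == '__main__':
    from pyaerocom.io import ReadUngridded
    reader = ReadUngridded()
    # cached read (default); any extra reading constraints passed as
    # keyword arguments would disable caching for this call
    data = reader.read_dataset('AeronetSunV3Lev2.daily',
                               vars_to_retrieve='od550aer')
    print(data)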
def read_datasetOLD(self, dataset_to_read, vars_to_retrieve=None, **kwargs):
    """Read single dataset into instance of :class:`ReadUngridded`

    Note
    ----
    This method does not write class attribute :attr:`data` (only
    :func:`read` does)

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : list
        list of variables to be retrieved. If None (default), the default
        variables of each reading routine are imported

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False
        print_log.info('Received additional reading constraints, '
                       'ignoring caching')
    if vars_to_retrieve is None:
        # Note: self.vars_to_retrieve may be None as well, then
        # default variables of each network are read
        vars_to_retrieve = self.vars_to_retrieve
    reader = self.get_reader(dataset_to_read)
    if vars_to_retrieve is None:
        vars_to_retrieve = reader.PROVIDES_VARIABLES
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]
    # Since this interface enables loading of multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [var for var in vars_to_retrieve
                      if var in reader.PROVIDES_VARIABLES]
    # read the datasets
    cache_hit_flag = False
    if not self.ignore_cache:
        # initiate cache handler
        try:
            cache = CacheHandlerUngridded(reader, vars_available, **kwargs)
            if cache.check_and_load():
                all_avail = True
                for var in vars_available:
                    if not var in cache.loaded_data:
                        all_avail = False
                        break
                if all_avail:
                    print_log.info('Found cache match for {}'
                                   .format(dataset_to_read))
                    cache_hit_flag = True
                    data = cache.loaded_data
        except Exception:
            self.logger.exception('Fatal: compatibility error between old '
                                  'cache file and current version of code')
            cache_hit_flag = False
    if not cache_hit_flag:
        print_log.info('No cache match found for {} in {}. '
                       'Reading from files (this may take a while)'
                       .format(dataset_to_read, const.CACHEDIR))
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data = reader.read(vars_available, **kwargs)
        print_log.setLevel(_loglevel)
    self.revision[dataset_to_read] = reader.data_revision
    self.data_version[dataset_to_read] = reader.__version__
    # write the cache file
    if not cache_hit_flag and not self.ignore_cache:
        try:
            cache.write(data)
        except Exception as e:
            _caching = False
            print_log.warning('Failed to write to cache directory:\n{}.\n'
                              'Deactivating caching in pyaerocom'
                              .format(repr(e)))
    if _caching is not None:
        const.CACHING = _caching
    return data
def get_topo_data(lat0, lon0, lat1=None, lon1=None, topo_dataset='srtm',
                  topodata_loc=None, try_etopo1=False):
    """Retrieve topographic altitude for a certain location

    Currently works only if :mod:`geonum` is installed. Supports the
    topography datasets supported by geonum. These are currently
    (20 Feb. 19) srtm (SRTM dataset, default, automatic access if online)
    and etopo1 (ETOPO1 dataset, lower resolution, must be available on
    local machine or server).

    Parameters
    ----------
    lat0 : float
        start latitude for data extraction
    lon0 : float
        start longitude for data extraction
    lat1 : float
        stop latitude for data extraction (default: None). If None, only
        data around lat0, lon0 will be extracted.
    lon1 : float
        stop longitude for data extraction (default: None). If None, only
        data around lat0, lon0 will be extracted.
    topo_dataset : str
        name of topography dataset
    topodata_loc : str
        filepath or directory containing supported topographic datasets
    try_etopo1 : bool
        if True and if access fails via input arg `topo_dataset`, then try
        to access altitude using ETOPO1 dataset.

    Returns
    -------
    geonum.TopoData
        data object containing topography data in specified range

    Raises
    ------
    ValueError
        if altitude data cannot be accessed
    """
    if not GEONUM_AVAILABLE:
        raise ModuleNotFoundError('Feature disabled: geonum library is not '
                                  'installed')
    import geonum
    if topodata_loc is None:
        from pyaerocom import const
        if (topo_dataset in const.SUPPLDIRS
                and os.path.exists(const.SUPPLDIRS[topo_dataset])):
            topodata_loc = const.SUPPLDIRS[topo_dataset]
            print_log.info('Found default location for {} topodata at\n{}'
                           .format(topo_dataset, topodata_loc))
    try:
        access = geonum.TopoDataAccess(topo_dataset,
                                       local_path=topodata_loc)
        return access.get_data(lat0, lon0, lat1, lon1)
    except Exception as e:
        if try_etopo1 and not topo_dataset == 'etopo1':
            print_log.warning('Failed to access topography data for {}. '
                              'Trying ETOPO1.\nError: {}'
                              .format(topo_dataset, repr(e)))
            return get_topo_data(lat0, lon0, lat1, lon1,
                                 topo_dataset='etopo1',
                                 topodata_loc=topodata_loc,
                                 try_etopo1=False)
        raise
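# Usage sketch for get_topo_data (assumes geonum is installed and SRTM
# tiles can be accessed for the requested region; the coordinates are
# illustrative):
if __name__ == '__main__':
    topo = get_topo_data(50.0, 8.0, lat1=51.0, lon1=9.0,
                         topo_dataset='srtm', try_etopo1=True)
    print(topo)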
def _run_gridded_gridded(self):
    start, stop = self.start, self.stop
    model_reader = ReadGridded(self.model_id, start, stop)
    obs_reader = ReadGridded(self.obs_id, start, stop)
    vars_to_analyse = self.vars_to_analyse
    if vars_to_analyse is None:
        vars_to_analyse = model_reader.vars_provided
    var_matches = {}
    for var in vars_to_analyse:
        if var in model_reader.vars_provided:  # candidate
            # first check if the variable pair was defined explicitly
            if var in self.alt_vars:
                if self.alt_vars[var] in obs_reader.vars_provided:
                    var_matches[var] = self.alt_vars[var]
            else:
                if var in obs_reader.vars_provided:
                    var_matches[var] = var
    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between {} and {} for '
                                'input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        self.vars_to_analyse))
    all_ts_types = const.GRID_IO.TS_TYPES
    ts_types_ana = self.ts_types_ana
    if ts_types_ana is None:
        ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['gridded']
    ts_types_read = self.ts_types_read
    if ts_types_read is None:
        ts_types_read = model_reader.ts_types
    vars_model = list(var_matches.keys())
    vars_obs = list(var_matches.values())
    flex_obs = self._setup.options.TS_TYPE_OBS_FLEX
    for ts_type_read in ts_types_read:
        # reads only one year if start time is provided but not stop time
        model_data_vars = model_reader.read(vars_model, start=start,
                                            stop=stop,
                                            ts_type=ts_type_read,
                                            flex_ts_type=False)
        if len(model_data_vars) == 0:
            if self._log:
                self._log.write('No model data available ({}-{}, {})\n'
                                .format(start, stop, ts_type_read))
            continue
        obs_data_vars = obs_reader.read(vars_obs, start=start, stop=stop,
                                        ts_type=ts_type_read,
                                        flex_ts_type=flex_obs)
        if len(obs_data_vars) == 0:
            if self._log:
                self._log.write('No obs data available for variables {} '
                                '({}-{}, {})\n'.format(vars_obs, start,
                                                       stop, ts_type_read))
            continue
        for model_data in model_data_vars:
            var = model_data.var_name
            obs_data = None
            for _obs in obs_data_vars:
                if _obs.var_name == var_matches[var]:
                    obs_data = _obs
                    break
            if obs_data is None:
                if self._log:
                    self._log.write('No obs data available for model var '
                                    '{} ({}-{}, {})\n'
                                    .format(var, start, stop,
                                            ts_type_read))
                continue
            for ts_type_ana in ts_types_ana:
                # model resolution (ts_type) must be equal to or higher
                # than the current analysis setting
                if (all_ts_types.index(ts_type_ana)
                        >= all_ts_types.index(ts_type_read)):
                    out_dir = chk_make_subdir(self.output_dir('colocate'),
                                              self.model_id)
                    savename = self._coldata_save_name(model_data,
                                                       ts_type_ana,
                                                       start, stop)
                    file_exists = self._check_coldata_exists(self.model_id,
                                                             savename)
                    if file_exists:
                        if not self.options.REANALYSE_EXISTING:
                            if self._log:
                                self._log.write('SKIP: {}\n'
                                                .format(savename))
                            print_log.info('Skip {} (file already exists)'
                                           .format(savename))
                            continue
                        else:
                            os.remove(os.path.join(out_dir, savename))
                    data_coll = colocate_gridded_gridded(
                        model_data, obs_data, ts_type=ts_type_ana,
                        start=start, stop=stop,
                        filter_name=self.filter_name)
                    self._last_coldata = data_coll
                    if data_coll.save_name_aerocom + '.nc' != savename:
                        raise Exception('Computed save name {} does not '
                                        'match expected save name {}'
                                        .format(data_coll.save_name_aerocom
                                                + '.nc', savename))
                    data_coll.to_netcdf(out_dir)
                    if self._log:
                        self._log.write('WRITE: {}\n'.format(savename))
                    print_log.info('Writing {}'.format(savename))
def _run_gridded_ungridded(self):
    """Analysis method for gridded vs. ungridded data"""
    start, stop = self.start, self.stop
    model_reader = ReadGridded(self.model_id, start, stop)
    obs_reader = ReadUngridded(self.obs_id)
    obs_vars = obs_reader.get_reader(self.obs_id).PROVIDES_VARIABLES
    vars_to_analyse = self.vars_to_analyse
    if vars_to_analyse is None:
        vars_to_analyse = model_reader.vars_provided
    var_matches = {}
    for var in vars_to_analyse:
        if var in model_reader.vars_provided:  # candidate
            # first check if the variable pair was defined explicitly
            if var in self.alt_vars:
                if self.alt_vars[var] in obs_vars:
                    var_matches[var] = self.alt_vars[var]
            else:
                if var in obs_vars:
                    var_matches[var] = var
    if len(var_matches) == 0:
        raise DataCoverageError('No variable matches between {} and {} for '
                                'input vars: {}'
                                .format(self.model_id, self.obs_id,
                                        self.vars_to_analyse))
    all_ts_types = const.GRID_IO.TS_TYPES
    ts_types_ana = self.ts_types_ana
    if ts_types_ana is None:
        ts_types_ana = self._setup.TS_TYPES_ANA_DEFAULT['ungridded']
    ts_types_read = self.ts_types_read
    if ts_types_read is None:
        ts_types_read = model_reader.ts_types
    vars_model = list(var_matches.keys())
    vars_obs = list(var_matches.values())
    obs_data = obs_reader.read(datasets_to_read=self.obs_id,
                               vars_to_retrieve=vars_obs)
    for ts_type_read in ts_types_read:
        model_data_vars = model_reader.read(vars_model, start=start,
                                            stop=stop,
                                            ts_type=ts_type_read,
                                            flex_ts_type=False)
        if len(model_data_vars) == 0:
            if self._log:
                self._log.write('No model data available ({}-{}, {})\n'
                                .format(start, stop, ts_type_read))
            continue
        for model_data in model_data_vars:
            var = model_data.var_info.var_name
            obs_var = var_matches[var]
            if not obs_var in obs_reader.data:
                if self._log:
                    self._log.write('No obs data available for variable {} '
                                    '({}-{}, {})\n'
                                    .format(obs_var, start, stop,
                                            ts_type_read))
                continue
            for ts_type_ana in ts_types_ana:
                if (all_ts_types.index(ts_type_ana)
                        >= all_ts_types.index(ts_type_read)):
                    out_dir = chk_make_subdir(self.output_dir('colocate'),
                                              self.model_id)
                    savename = self._coldata_save_name(model_data,
                                                       ts_type_ana,
                                                       start, stop)
                    file_exists = self._check_coldata_exists(self.model_id,
                                                             savename)
                    if file_exists:
                        if not self.options.REANALYSE_EXISTING:
                            if self._log:
                                self._log.write('SKIP: {}\n'
                                                .format(savename))
                            print_log.info('Skip {} (file already exists)'
                                           .format(savename))
                            continue
                        else:
                            os.remove(os.path.join(out_dir, savename))
                    data_coll = colocate_gridded_ungridded_2D(
                        model_data, obs_data, ts_type=ts_type_ana,
                        start=start, stop=stop, var_ref=obs_var,
                        filter_name=self.filter_name)
                    self._last_coldata = data_coll
                    data_coll.to_netcdf(out_dir)
                    if self._log:
                        self._log.write('WRITE: {}\n'.format(savename))
                    print_log.info('Writing {}'.format(savename))
    plt.close('all')
def __init__(self, model_base_dir=None, obs_base_dir=None, output_dir=None,
             config_file=None, cache_dir=None, colocateddata_dir=None,
             write_fileio_err_log=True, activate_caching=True):
    # Loggers
    from pyaerocom import print_log, logger
    self.print_log = print_log
    self.logger = logger
    # Directories
    self._modelbasedir = model_base_dir
    self._obsbasedir = obs_base_dir
    self._cachedir = cache_dir
    self._outputdir = output_dir
    self._testdatadir = os.path.join(self.HOMEDIR, 'pyaerocom-testdata')
    self._colocateddatadir = colocateddata_dir
    # Options
    self._caching_active = activate_caching
    #: Settings for reading and writing of gridded data
    self.GRID_IO = GridIO()
    print_log.info('Initiating pyaerocom configuration')
    if not isinstance(config_file, str) or not os.path.exists(config_file):
        from time import time
        print_log.info('Checking database access...')
        t0 = time()
        config_file = self._infer_config_file()
        print_log.info('Elapsed time: {:.3f} s'.format(time() - t0))
    self._var_param = None
    self._coords = None
    # Attributes that are used to store search directories
    self.OBSCONFIG = od()
    self.SUPPLDIRS = od()
    self.MODELDIRS = []
    self.WRITE_FILEIO_ERR_LOG = write_fileio_err_log
    self._ebas_flag_info = None
    if config_file is not None:
        keep_basedirs = False
        if (self.dir_exists(model_base_dir)
                and self.dir_exists(obs_base_dir)):
            keep_basedirs = True
        try:
            self.read_config(config_file, keep_basedirs)
        except Exception as e:
            from traceback import format_exc
            self.init_outputdirs()
            self.print_log.warning(format_exc())
            self.print_log.warning('Failed to init config. Error: %s'
                                   % repr(e))
    else:
        self.init_outputdirs()
def _run_gridded_ungridded(self, var_name=None):
    """Analysis method for gridded vs. ungridded data"""
    print_log.info('PREPARING colocation of {} vs. {}'
                   .format(self.model_id, self.obs_id))
    model_reader = self.instantiate_gridded_reader(what='model')
    obs_reader = ReadUngridded(self.obs_id, data_dir=self.obs_data_dir)
    obs_vars = obs_reader.get_vars_supported(self.obs_id, self.obs_vars)
    if len(obs_vars) == 0:
        raise DataCoverageError('No observation variable matches found for '
                                '{}'.format(self.obs_id))
    var_matches = self._find_var_matches(obs_vars, model_reader, var_name)
    print_log.info('The following variable combinations will be colocated\n'
                   'MODEL-VAR\tOBS-VAR')
    for key, val in var_matches.items():
        print_log.info('{}\t{}'.format(key, val))
    # get list of unique observation variables
    obs_vars = np.unique(list(var_matches.values())).tolist()
    if self.remove_outliers:
        self._update_var_outlier_ranges(var_matches)
    if self.read_opts_ungridded is not None:
        ropts = self.read_opts_ungridded
    else:
        ropts = {}
    data_objs = {}
    if self.start is None:
        self._infer_start_stop(model_reader)
    start, stop = start_stop(self.start, self.stop)
    for model_var, obs_var in var_matches.items():
        # ToDo: consider removing outliers already here
        ts_type = self.ts_type
        print_log.info('Running {} / {} ({}, {})'.format(
            self.model_id, self.obs_id, model_var, obs_var))
        try:
            model_data = self._read_gridded(reader=model_reader,
                                            var_name=model_var,
                                            start=start, stop=stop,
                                            is_model=True)
        except Exception as e:
            msg = ('Failed to load gridded data: {} / {}. Reason: {}'
                   .format(self.model_id, model_var, repr(e)))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
            else:
                continue
        ts_type_src = model_data.ts_type
        rshow = self._eval_resample_how(model_var, obs_var)
        if ts_type is None:
            # if colocation frequency is not specified
            ts_type = ts_type_src
        ignore_stats = None
        if self.ignore_station_names is not None:
            ignore_stats = self.ignore_station_names
            if isinstance(ignore_stats, dict):
                if obs_var in ignore_stats:
                    ignore_stats = ignore_stats[obs_var]
                else:
                    ignore_stats = None
        if TsType(ts_type_src) < TsType(ts_type):
            print_log.info('Updating ts_type from {} to {} (highest '
                           'available in model {})'
                           .format(ts_type, ts_type_src, self.model_id))
            ts_type = ts_type_src
        really_do_reanalysis = True
        if self.save_coldata:
            really_do_reanalysis = False
            savename = self._coldata_savename(model_data, start, stop,
                                              ts_type, var_name=model_var)
            file_exists = self._check_coldata_exists(model_data.data_id,
                                                     savename)
            out_dir = chk_make_subdir(self.basedir_coldata, self.model_id)
            if file_exists:
                if not self.reanalyse_existing:
                    if self._log:
                        self._write_log('SKIP: {}\n'.format(savename))
                    print_log.info('Skip {} (file already exists)'
                                   .format(savename))
                    self.file_status[savename] = 'skipped'
                    continue
                else:
                    really_do_reanalysis = True
                    print_log.info('Deleting and recomputing existing '
                                   'colocated data file {}'
                                   .format(savename))
                    print_log.info('REMOVE: {}\n'.format(savename))
                    os.remove(os.path.join(out_dir, savename))
            else:
                really_do_reanalysis = True
        if really_do_reanalysis:
            # Read obs data only if the colocated data file does not
            # already exist. This part of the method has been changed by
            # @hansbrenna to work better with large observational
            # datasets: only one variable is loaded into the UngriddedData
            # object at a time. Currently the variable is re-read many
            # times, which is a weakness.
            obs_data = obs_reader.read(vars_to_retrieve=obs_var,
                                       only_cached=self._obs_cache_only,
                                       **ropts)
            # ToDo: consider removing outliers already here
            if 'obs_filters' in self:
                remaining_filters = self._eval_obs_filters()
                obs_data = obs_data.apply_filters(**remaining_filters)
        try:
            try:
                by = self.update_baseyear_gridded
                stop = None
            except AttributeError:
                by = None
            if self.model_use_climatology:
                by = start.year
            coldata = colocate_gridded_ungridded(
                gridded_data=model_data,
                ungridded_data=obs_data,
                ts_type=ts_type, start=start, stop=stop,
                var_ref=obs_var,
                filter_name=self.filter_name,
                regrid_res_deg=self.regrid_res_deg,
                remove_outliers=self.remove_outliers,
                vert_scheme=self.vert_scheme,
                harmonise_units=self.harmonise_units,
                var_outlier_ranges=self.var_outlier_ranges,
                var_ref_outlier_ranges=self.var_ref_outlier_ranges,
                update_baseyear_gridded=by,
                ignore_station_names=ignore_stats,
                apply_time_resampling_constraints=\
                    self.apply_time_resampling_constraints,
                min_num_obs=self.min_num_obs,
                colocate_time=self.colocate_time,
                var_keep_outliers=self.model_keep_outliers,
                var_ref_keep_outliers=self.obs_keep_outliers,
                use_climatology_ref=self.obs_use_climatology,
                resample_how=rshow)
            if self.model_to_stp:
                coldata = correct_model_stp_coldata(coldata)
            if self.save_coldata:
                self._save_coldata(coldata, savename, out_dir, model_var,
                                   model_data, obs_var)
            data_objs[model_var] = coldata
        except Exception:
            msg = ('Colocation between model {} / {} and obs {} / {} '
                   'failed.\nTraceback:\n{}'
                   .format(self.model_id, model_var, self.obs_id, obs_var,
                           traceback.format_exc()))
            const.print_log.warning(msg)
            self._write_log(msg + '\n')
            if self.raise_exceptions:
                self._close_log()
                raise Exception(msg)
    return data_objs
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If
        None, all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used
        that is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used

    Returns
    -------
    UngriddedData
        data object
    """
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]
    if files is None:
        if len(self.files) == 0:
            self.get_file_list()
        files = self.files
    if first_file is None:
        first_file = 0
    if last_file is None:
        last_file = len(files)
    files = files[first_file:last_file]
    self.read_failed = []
    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0
    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx
    num_vars = len(vars_to_retrieve)
    num_files = len(files)
    disp_each = int(num_files * 0.1)
    if disp_each < 1:
        disp_each = 1
    for i, _file in enumerate(files):
        if i % disp_each == 0:
            print_log.info('Reading file {} of {} ({})'.format(
                i, num_files, type(self).__name__))
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)
        # Fill the metadata dict. The location in the dataset is time step
        # dependent; use the station coordinates here since we have to
        # choose one location for the time series plot.
        metadata[meta_key] = od()
        metadata[meta_key].update(station_data.get_meta())
        metadata[meta_key].update(station_data.get_station_coords())
        metadata[meta_key]['dataset_name'] = self.DATASET_NAME
        metadata[meta_key]['ts_type'] = self.TS_TYPE
        metadata[meta_key]['variables'] = vars_to_retrieve
        if ('instrument_name' in station_data
                and station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        metadata[meta_key]['instrument_name'] = instr
        # this is a list with indices of this station for each variable;
        # not sure yet if we really need that or if it speeds up things
        meta_idx[meta_key] = od()
        num_times = len(station_data['dtime'])
        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all AeroCom data files are of type timeseries)
        times = np.float64(station_data['dtime'])
        totnum = num_times * num_vars
        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)
        for var_idx, var in enumerate(vars_to_retrieve):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times
            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['stat_lat']
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['stat_lon']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['stat_alt']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key
            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx
            meta_idx[meta_key][var] = np.arange(start, stop)
            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx
        idx += totnum
        meta_key = meta_key + 1.
    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    data_obj.data_revision[self.DATASET_NAME] = self.data_revision
    self.data = data_obj
    return data_obj
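# Usage sketch: read is typically called on a concrete reading class.
# ReadAeronetSunV3 is used as an illustrative example here and assumes
# access to the corresponding database directory.
if __name__ == '__main__':
    from pyaerocom.io import ReadAeronetSunV3
    reader = ReadAeronetSunV3()
    data = reader.read(vars_to_retrieve=['od550aer'], first_file=0,
                       last_file=10)
    print(data)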
def read(self, vars_to_retrieve=None, files=None, first_file=None,
         last_file=None, file_pattern=None, common_meta=None):
    """Method that reads list of files as instance of :class:`UngriddedData`

    Parameters
    ----------
    vars_to_retrieve : :obj:`list` or similar, optional
        list containing variable IDs that are supposed to be read. If
        None, all variables in :attr:`PROVIDES_VARIABLES` are loaded
    files : :obj:`list`, optional
        list of files to be read. If None, then the file list is used
        that is returned on :func:`get_file_list`.
    first_file : :obj:`int`, optional
        index of first file in file list to read. If None, the very first
        file in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    last_file : :obj:`int`, optional
        index of last file in list to read. If None, the very last file
        in the list is used. Note: is ignored if input parameter
        `file_pattern` is specified.
    file_pattern : str, optional
        string pattern for file search (cf :func:`get_file_list`)
    common_meta : dict, optional
        dictionary that contains additional metadata shared for this
        network (assigned to each metadata block of the
        :class:`UngriddedData` object that is returned)

    Returns
    -------
    UngriddedData
        data object
    """
    if common_meta is None:
        common_meta = {}
    if vars_to_retrieve is None:
        vars_to_retrieve = self.DEFAULT_VARS
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]
    vars_to_retrieve = varlist_aerocom(vars_to_retrieve)
    if files is None:
        if len(self.files) == 0:
            self.get_file_list(pattern=file_pattern)
        files = self.files
    if file_pattern is None:
        if first_file is None:
            first_file = 0
        if last_file is None:
            last_file = len(files)
        files = files[first_file:last_file]
    self.read_failed = []
    data_obj = UngriddedData()
    meta_key = 0.0
    idx = 0
    # assign metadata object
    metadata = data_obj.metadata
    meta_idx = data_obj.meta_idx
    num_vars = len(vars_to_retrieve)
    num_files = len(files)
    print_log.info('Reading AERONET data')
    for i in tqdm(range(num_files)):
        _file = files[i]
        station_data = self.read_file(_file,
                                      vars_to_retrieve=vars_to_retrieve)
        # Fill the metadata dict. The location in the dataset is time step
        # dependent; use the station coordinates here since we have to
        # choose one location for the time series plot.
        meta = od()
        meta['var_info'] = od()
        meta.update(station_data.get_meta())
        meta['data_id'] = self.data_id
        meta['ts_type'] = self.TS_TYPE
        if ('instrument_name' in station_data
                and station_data['instrument_name'] is not None):
            instr = station_data['instrument_name']
        else:
            instr = self.INSTRUMENT_NAME
        meta['instrument_name'] = instr
        meta['data_revision'] = self.data_revision
        meta['filename'] = _file
        meta.update(**common_meta)
        # this is a list with indices of this station for each variable;
        # not sure yet if we really need that or if it speeds up things
        meta_idx[meta_key] = od()
        num_times = len(station_data['dtime'])
        # access array containing time stamps
        # TODO: check using index instead (even though not a problem here
        # since all AeroCom data files are of type timeseries)
        times = np.float64(station_data['dtime'])
        totnum = num_times * num_vars
        # check if size of data object needs to be extended
        if (idx + totnum) >= data_obj._ROWNO:
            # if totnum < data_obj._CHUNKSIZE, then the latter is used
            data_obj.add_chunk(totnum)
        for var_idx, var in enumerate(vars_to_retrieve):
            values = station_data[var]
            start = idx + var_idx * num_times
            stop = start + num_times
            # write common meta info for this station (data lon, lat and
            # altitude are set to station locations)
            data_obj._data[start:stop,
                           data_obj._LATINDEX] = station_data['latitude']
            data_obj._data[start:stop,
                           data_obj._LONINDEX] = station_data['longitude']
            data_obj._data[start:stop,
                           data_obj._ALTITUDEINDEX] = station_data['altitude']
            data_obj._data[start:stop,
                           data_obj._METADATAKEYINDEX] = meta_key
            # write data to data object
            data_obj._data[start:stop, data_obj._TIMEINDEX] = times
            data_obj._data[start:stop, data_obj._DATAINDEX] = values
            data_obj._data[start:stop, data_obj._VARINDEX] = var_idx
            meta_idx[meta_key][var] = np.arange(start, stop)
            if var in station_data['var_info']:
                if 'units' in station_data['var_info'][var]:
                    u = station_data['var_info'][var]['units']
                elif 'unit' in station_data['var_info'][var]:
                    from pyaerocom.exceptions import MetaDataError
                    raise MetaDataError('Metadata attr unit is deprecated, '
                                        'please use units')
                else:
                    u = self.DEFAULT_UNIT
            elif var in self.UNITS:
                u = self.UNITS[var]
            else:
                u = self.DEFAULT_UNIT
            meta['var_info'][var] = od(units=u)
            if not var in data_obj.var_idx:
                data_obj.var_idx[var] = var_idx
        idx += totnum
        metadata[meta_key] = meta
        meta_key = meta_key + 1.
    # shorten data_obj._data to the right number of points
    data_obj._data = data_obj._data[:idx]
    self.data = data_obj
    return data_obj
def read_dataset(self, dataset_to_read, vars_to_retrieve=None, **kwargs):
    """Read dataset into an instance of :class:`ReadUngridded`

    Note
    ----
    This method does not assign the loaded data object to the class
    attribute :attr:`data` (only :func:`read` does)

    Parameters
    ----------
    dataset_to_read : str
        name of dataset
    vars_to_retrieve : list
        list of variables to be retrieved. If None (default), the default
        variables of each reading routine are imported

    Returns
    -------
    UngriddedData
        data object
    """
    _caching = None
    if len(kwargs) > 0:
        _caching = const.CACHING
        const.CACHING = False
        print_log.info('Received additional reading constraints, '
                       'ignoring caching')
    if vars_to_retrieve is None:
        # Note: self.vars_to_retrieve may be None as well, then
        # default variables of each network are read
        vars_to_retrieve = self.vars_to_retrieve
    reader = self.get_reader(dataset_to_read)
    if vars_to_retrieve is None:
        vars_to_retrieve = reader.PROVIDES_VARIABLES
    elif isinstance(vars_to_retrieve, str):
        vars_to_retrieve = [vars_to_retrieve]
    # Since this interface enables loading of multiple datasets, each of
    # which supports a number of variables, only the variables that are
    # supported by the dataset are considered here
    vars_available = [var for var in vars_to_retrieve
                      if var in reader.PROVIDES_VARIABLES]
    cache = CacheHandlerUngridded(reader)
    if not self.ignore_cache:
        # initiate cache handler
        for var in vars_available:
            try:
                cache.check_and_load(var_name=var)
            except Exception:
                self.logger.exception('Fatal: compatibility error between '
                                      'old cache file for {} and current '
                                      'version of code'.format(var))
    vars_to_read = [v for v in vars_available
                    if not v in cache.loaded_data]
    data_read = None
    if len(vars_to_read) > 0:
        _loglevel = print_log.level
        print_log.setLevel(logging.INFO)
        data_read = reader.read(vars_to_read, **kwargs)
        print_log.setLevel(_loglevel)
        for var in vars_to_read:
            # write the cache file
            if not self.ignore_cache:
                try:
                    cache.write(data_read, var)
                except Exception as e:
                    _caching = False
                    print_log.warning('Failed to write to cache directory. '
                                      'Error: {}. Deactivating caching in '
                                      'pyaerocom'.format(repr(e)))
    if len(vars_to_read) == len(vars_available):
        data_out = data_read
    else:
        data_out = UngriddedData()
        for var in vars_available:
            if var in cache.loaded_data:
                data_out.append(cache.loaded_data[var])
        if data_read is not None:
            data_out.append(data_read)
    if _caching is not None:
        const.CACHING = _caching
    return data_out