def __init__(self, rundate=None, file_path=None):
    """Initialize the attributes shared by all parsers

    Subclasses are expected to extend this constructor: call `super().__init__(rundate)` first and then set their
    own attributes, at the very least `self.file_key`.

    An explicit `file_path` is mainly meant for running a parser independently of Where, pointing it at a file that
    is not looked up through `files.conf`. Inside the Where program this should be avoided, since some of the
    logging and maintainability is lost.

    Args:
        rundate (date):      The model run date (optional, used to set up date variables).
        file_path (String):  Optional path to the file that will be read.
    """
    super().__init__()

    # Bookkeeping about which file to read and for which date
    self.file_key = "Overwritten by subclasses"
    self.file_path = file_path
    self.rundate = rundate
    self.data_available = True
    self.dependencies = []

    # Start out with empty containers for variables, metadata and data
    self.vars = {}
    self.meta = {}
    self.data = {}

    # This class is superseded by _parser.Parser and its subclasses
    subclass_name = self.__class__.__name__
    log.dev(
        f"parser.Parser is deprecated, let {subclass_name} subclass one of "
        "LineParser, ChainParser or SinexParser instead"
    )
def parse(self):
    """Read and parse the data file, including calculation of secondary data

    The file path is looked up from the file key unless one has been set explicitly. The data are then read with
    `read_data()`, and if data are still marked as available afterwards, `calculate_data()` is run and the
    dependencies of the parser are registered. The whole pipeline is timed.

    Returns:
        Parser:  The parser itself, holding the parsed data.
    """
    log.dev("where.parsers.parser is deprecated. Use where.parsers._parser or one of it's subclasses instead.")

    # Only consult the file configuration when no explicit path has been given
    if self.file_path is None:
        self.file_path = config.files.path(self.file_key, file_vars=self.vars, download_missing=True)

    parser_package, parser_name = self.__module__.rsplit(".", maxsplit=1)
    timer_text = f"Finish {parser_name} ({parser_package}) - {self.file_key} in"
    with Timer(timer_text):
        if self.data_available:
            self.read_data()

        # self.read_data() may have set data_available to False
        if self.data_available:
            self.calculate_data()
            dependencies.add(*self.dependencies, label=self.file_key)
        else:
            # NOTE(review): assumes self.rundate is a date here; a rundate of None would fail on strftime - confirm
            log.warn(
                f"No data found by {self.__class__.__name__} for {self.rundate.strftime(config.FMT_date)} "
                f"(was looking for {self.file_path})"
            )

    return self
def glob_paths(file_key, file_vars=None, is_zipped=None):
    """Find all file paths matching the filename pattern of a file key

    The path for the file key is constructed with `*` substituted for all variables that are not given in
    `file_vars`. The path is then split into a base directory (the leading parts without any wildcard) and a
    pattern (the remaining parts), and the base directory is globbed with the pattern.

    Args:
        file_key (String):     Key that is looked up in the Where file list.
        file_vars (Dict):      Values used to replace variables in file name and path.
        is_zipped (Bool/None): Passed on to `path()`.

    Returns:
        List:  pathlib.Path objects for all paths matching the pattern.
    """
    import sys

    # Report who is calling the deprecated function
    frame = sys._getframe(1)
    code = frame.f_code
    log.dev(
        f"{code.co_filename} ({frame.f_lineno}) {code.co_name}: 'lib.files.glob_paths()' is deprecated. Use 'lib.config.files.glob_paths()' instead"
    )

    # Collapse runs of wildcards to a single wildcard
    raw_path = str(path(file_key, file_vars, default="*", is_zipped=is_zipped))
    parts = pathlib.Path(re.sub(r"\*+", "*", raw_path)).parts

    # The pattern starts at the first part containing a wildcard (or at the last part if there are no wildcards)
    split_at = next((i for i, part in enumerate(parts) if "*" in part), len(parts) - 1)
    base_dir = pathlib.Path(*parts[:split_at])
    pattern = str(pathlib.Path(*parts[split_at:]))

    return list(base_dir.glob(pattern))
def data_handling(dset):
    """Flag observations that should be discarded according to the SLR handling file

    Observations of a station are discarded when they fall inside an interval marked with one of the codes X, N
    or Q for that station.

    Args:
        dset:  A Dataset containing model data.

    Returns:
        Array containing False for observations to throw away
    """
    handling = apriori.get("slr_handling_file", time=dset.time)
    discard = np.zeros(dset.num_obs, dtype=bool)

    for station in dset.unique("station"):
        station_handling = handling.get(station, {})

        # TODO: To be implemented
        if "V" in station_handling:
            log.dev(f"TODO: Station {station}, marked with a V, not sure what that means")

        # X is data to be deleted
        # N is a non reliable station, not to be used for operational analysis
        # Q is a station in quarantine
        for key in ("X", "N", "Q"):
            for interval in station_handling.get(key, []):
                start_x, end_x = interval[0]
                in_interval = (
                    dset.filter(station=station)
                    & (dset.time.datetime >= start_x)
                    & (dset.time.datetime <= end_x)
                )
                if not np.any(in_interval):
                    continue

                log.debug(f"Removed data for station {station} in interval {start_x}-{end_x}, marked with {key}")
                discard |= in_interval

    return ~discard
def __init__(self, text="Elapsed time:", unit=None, logger=log.time):
    """Create a new timer

    The text that is logged together with the timer can be customized. By default the value of the timer is
    appended to the text (e.g. 'Elapsed time: 0.1234 seconds'). To place the value elsewhere, include a '{}' in
    the text: `text='Used {} to run the code'` gives something like 'Used 0.1234 seconds to run the code'.

    Args:
        text (String):      Text used when logging the timer (see above).
        unit (String):      Unit used for logging the timer (Default is seconds).
        logger (Function):  Function used to do the logging.
    """
    super().__init__()
    self._start = None
    self._end = None

    # Make sure the text has a placeholder for the value of the timer
    if "{}" in text:
        self.text = text
    else:
        self.text = (text + " {}").strip()

    # Without an explicit unit the timer reports seconds
    if unit is None:
        self.unit_name = "seconds"
        self.unit_factor = 1
    else:
        self.unit_name = unit
        self.unit_factor = Unit("seconds", unit)

    self.logger = logger

    # Use midgard instead, report who is creating the deprecated timer
    frame = sys._getframe(1)
    code = frame.f_code
    log.dev(
        f"{code.co_filename} ({frame.f_lineno}) {code.co_name}: where.lib.timer is deprecated, use midgard.dev.timer instead"
    )
def process_data(self):
    """Deprecated alias that runs parse()

    Logs a deprecation message and then calls `self.parse()`. The result of `parse()` is not returned.

    Can be removed when all references to process_data() are gone
    """
    cls_name = self.__class__.__name__
    message = f"{cls_name}.process_data is deprecated. Use {cls_name}.parse instead"
    log.dev(message)

    self.parse()
def open(file_key, file_vars=None, create_dirs=False, is_zipped=None, download_missing=True, **kwargs):
    """Open a Where file

    Open a Where file based on file key which is looked up in the Where file list.

    The function automatically handles reading from gzipped files if the filename is specified with the special
    {gz}-ending (including the curly braces) in the file list. In that case, the mode should be specified to be 'rt'
    if the contents of the file should be treated as text. If both a zipped and an unzipped version is available,
    the zipped version is used. This can be overridden by specifying True or False for the is_zipped-parameter.

    This function behaves similar to the built-in open-function, and should typically be used with a context
    manager as follows:

    Example:
        with files.open('eopc04_iau', mode='rt') as fid:
            for line in fid:
                print(line.strip())

    Args:
        file_key:          String that is looked up in the Where file list.
        file_vars:         Dict, used to replace variables in file name and path.
        create_dirs:       True or False, if True missing directories are created.
        is_zipped:         True, False or None, passed on to `path()` when constructing the file path.
        download_missing:  True or False, whether to try to download a missing file. Only has an effect when the
                           mode contains 'r' (the default mode is taken to be 'r').
        kwargs:            All keyword arguments are passed on to open_path. If no encoding is given, the encoding
                           returned by `encoding(file_key)` is used.

    Returns:
        File object representing the file.
    """
    import sys

    # NOTE(review): frame 2 is used instead of 1, presumably to skip the frame of a context manager wrapper that
    # is applied outside this function - confirm
    caller = sys._getframe(2)
    func_name = caller.f_code.co_name
    file_name = caller.f_code.co_filename
    line_num = caller.f_lineno
    log.dev(
        f"{file_name} ({line_num}) {func_name}: 'lib.files.open()' is deprecated. Use 'lib.config.files.open()' instead"
    )

    # Never try to download a file that is not opened for reading
    download_missing = download_missing and "r" in kwargs.get("mode", "r")
    file_path = path(file_key, file_vars, is_zipped=is_zipped, download_missing=download_missing)
    kwargs.setdefault("encoding", encoding(file_key))

    # Exceptions simply propagate to the caller, no try-except that only re-raises is needed
    with open_path(
        file_path, description=file_key, create_dirs=create_dirs, is_zipped=is_path_zipped(file_path), **kwargs
    ) as fid:
        yield fid
def _interpolate_meteorological_data(dset, data, rundate): """Calculate temperature, humidity and pressure at observation epochs Meteorological data are calculated at observation epochs by interpolating in the data given on the observation file for each station. Missing meteorological data are currently not handled. """ rundate = datetime(rundate.year, rundate.month, rundate.day) for field, station in [(f, f[4:]) for f in data.keys() if f.startswith("met_")]: log.debug(f"Meteorological data available for station {station}") met_time = data[field].pop("met_time") flat_list = [item for sublist in met_time for item in sublist] met_time_float = np.array([(flat_list[i] - rundate).total_seconds() for i in range(0, len(flat_list))]) met_time_unique, met_index = np.unique(met_time_float, return_index=True) diff = len(met_time_float) - len(met_time_unique) if diff > 0: log.dev(f"Removed duplicate met data for station {station}") log.dev("Do this for the actual obs data also!") if len(met_time_unique) == 1: for met_type in data[field].keys(): data[field][met_type] = np.repeat(data[field][met_type][0], dset.num_obs) continue # Extrapolation one month before/after # (this is overkill, most of these values will be removed later when taking the diagonal) min_time = min(met_time_unique) - 31 * 86400 max_time = max(met_time_unique) + 31 * 86400 met_time_unique = np.hstack( (np.array(min_time), met_time_unique, np.array(max_time))) for met_type in data[field].keys(): met_data_array = data[field][met_type] flat_list = [ item for sublist in met_data_array for item in sublist ] met_data_array = np.array([flat_list[i] for i in met_index]) met_data_array = np.hstack( (met_data_array[0], met_data_array, met_data_array[-1])) data[field][met_type] = interpolation.interpolate(met_time_unique, met_data_array, dset.obs_time, kind="cubic") return data
def register(func, name=None, sort_value=0):
    """Register a function as a plug-in

    A plug-in is registered under the name of the package (directory) and the name of the module (file) it is
    defined in. Typically all plug-ins of a given type are collected in one package, e.g. models, techniques,
    parsers, etc. The path to the source code file is stored as well, so that the source code can be added as a
    dependency file when the plug-in is called.

    Within a module, the plug-in is stored as a part named by `name`. If `name` is not given, the name of the
    function is used, and the part is added to the list of unnamed parts of the module. The first unnamed part
    that is registered is the default part of the module.

    Args:
        func (Function):      The function that is being registered.
        name (String):        Alternative name of plug-in. Used by `register_named`.
        sort_value (Number):  The value used when sorting plug-ins. Used by `register_ordered`.

    Returns:
        Function: The function that is being registered.
    """
    # Get information from the function being registered
    module_name = func.__module__
    package_name, _, plugin_name = module_name.rpartition(".")
    file_path = pathlib.Path(sys.modules[module_name].__file__)

    # Plug-ins are stored in the _PLUGINS dictionary by package and module
    package_info = _PLUGINS.setdefault(package_name, {})
    plugin_info = package_info.setdefault(plugin_name, {})

    if name is None:
        # Name of function is used as default name, only unnamed parts are added to the list of parts
        name = func.__name__
        plugin_info.setdefault("__parts__", []).append(name)

    plugin = Plugin(f"{plugin_name}.{name}", func, file_path, sort_value)
    plugin_info[name] = plugin
    log.debug(f"Registering {plugin.name} as a {package_name}-plugin from {plugin.file_path}")

    # Add first registered unnamed part as default
    if "__parts__" in plugin_info:
        first_part = plugin_info["__parts__"][0]
        plugin_info["__default__"] = plugin_info[first_part]

    # Use midgard instead
    log.dev(f"{package_name}.{plugin_name}: where.lib.plugins is deprecated, use midgard.dev.plugins instead")

    return func
def parse_matrix_func(self, data, lower_upper, type=""):
    """Parser for {marker} data

    Builds a symmetric matrix from the input data. The row and column numbers of the matrix correspond to the
    index of the estimated parameters in the {size_marker} block.

    Elements that are missing in the input are assumed to be zero (0), so zero elements may have been omitted to
    reduce the size of the block.

    Args:
        data (numpy.array):   Input data, raw data for {marker} block.
        lower_upper (String): Either 'L' or 'U', indicating whether the matrix is given in lower or upper form.
        type (String):        Information about the type of matrix, optional

    Returns:
        Dict:  The symmetric matrix as a numpy array under 'matrix', and the type of matrix under 'type'.
    """
    # The size of the matrix is given by the size block, fall back on the largest row index in the data
    try:
        n = len(self._sinex[size_marker])
    except KeyError:
        n = max(data["row_idx"])
        log.dev(f"{size_marker!r}-block was not parsed. Guessing at size of normal equation matrix (n={n}).")
    matrix = np.zeros((n, n))

    # Each line of the data holds up to three consecutive values of one row. Put them in the correct place in the
    # matrix one line at a time (cannot simply reshape as elements may have been omitted)
    values = np.stack([data[f"value_{num}"] for num in range(3)], axis=1)
    for row, col, line_values in zip(data["row_idx"], data["column_idx"], values):
        present = line_values[~np.isnan(line_values)]
        row_slice = slice(row - 1, row)
        col_slice = slice(col - 1, col - 1 + len(present))
        matrix[row_slice, col_slice] = present

    # Add symmetrical elements, depending on whether the matrix is represented in lower or upper form
    form = lower_upper.upper()
    if form == "L":
        symmetric = np.tril(matrix) + np.tril(matrix, k=-1).T
    elif form == "U":
        symmetric = np.triu(matrix) + np.triu(matrix, k=1).T
    else:
        log.warn(f"'L' or 'U' not specified for {marker}. Trying to create a symmetric matrix anyway.")
        symmetric = matrix + matrix.T - np.diag(np.diag(matrix))

    return {"matrix": symmetric, "type": type}
def data(self):
    """Temporary stand-in for the removed data field

    Logs a deprecation message that points to the code accessing `data`, and returns the object itself.

    Remove this method when all references to time.data are gone.
    """
    import sys
    from where.lib import log

    # Figure out who is still using the deprecated field
    frame = sys._getframe(1)
    code = frame.f_code
    log.dev(
        "'time.data' is deprecated. Use 'time' instead in '{}' ({}:{})",
        code.co_name,
        code.co_filename,
        frame.f_lineno,
    )

    return self
def rotate_z(angle):
    """Rotation matrix around the Z-axis

    The matrix returned is

        [[ cos(angle), sin(angle), 0],
         [-sin(angle), cos(angle), 0],
         [          0,          0, 1]]

    Deprecated, a message pointing to lib.rotation.R3 is logged on every call.

    Args:
        angle (float64):  Rotation angle in [rad]

    Returns:
        numpy.ndarray:  Rotation matrix
    """
    log.dev("lib.mathp.rotate_z is deprecated. Use lib.rotation.R3 instead.")

    cos_angle, sin_angle = np.cos(angle), np.sin(angle)
    return np.array(
        [
            [cos_angle, sin_angle, 0],
            [-sin_angle, cos_angle, 0],
            [0, 0, 1],
        ]
    )
def glob_variable(file_key, variable, pattern, file_vars=None):
    """Find all values of a variable for which files exist

    All existing paths for the file key are found with the variable replaced by a wildcard. Each of these paths is
    then matched against a regular expression built from the path of the file key, where every occurrence of the
    variable is a named group matching `pattern`.

    Args:
        file_key (String):  Key that is looked up in the Where file list.
        variable (String):  Name of the variable to find values for.
        pattern (String):   Regular expression that the values of the variable must match.
        file_vars (Dict):   Values used to replace the other variables in file name and path.

    Returns:
        Set:  The values found for the variable.
    """
    import sys

    # Report who is calling the deprecated function
    frame = sys._getframe(1)
    code = frame.f_code
    log.dev(
        f"{code.co_filename} ({frame.f_lineno}) {code.co_name}: 'lib.files.glob_variable()' is deprecated. Use 'lib.config.files.glob_variable()' instead"
    )

    # Find available paths, work on a copy so the dictionary of the caller is not changed
    file_vars = {} if file_vars is None else dict(file_vars)
    file_vars[variable] = "*"
    search_paths = glob_paths(file_key, file_vars)

    # Set up the regular expression
    re_vars = dict(file_vars)
    re_vars[variable] = f"(?P<{variable}>__pattern__)"
    path_pattern = str(path(file_key, file_vars=re_vars, default=".*")).replace("\\", "\\\\")

    # Group names must be unique, so number each occurrence of the variable
    group_name = f"<{variable}>"
    counter = itertools.count()
    while group_name in path_pattern:
        path_pattern = path_pattern.replace(group_name, f"<{variable}__{next(counter)}>", 1)
    re_pattern = re.compile(path_pattern.replace("__pattern__", pattern))

    # Collect the values from each path that matches
    values = set()
    for search_path in search_paths:
        match = re_pattern.search(str(search_path))
        if match is None:
            continue

        found = set(match.groupdict().values())
        if len(found) > 1:
            log.warn(f"Found multiple values for {variable!r} in {search_path}: {', '.join(found)}")
        values.update(found)

    return values
def get(datasource_name, **kwargs):
    """Read data from the given data source

    The data source is first looked up as a plug-in in this package. Simple data sources that only return data
    directly from a parser do not need such a plug-in: if no plug-in is found, the data source name is used as a
    file key for `parsers.parse_key()`, and if that fails with an AttributeError, as a parser name for
    `parsers.parse()`. If the parser is unknown as well, the original error about the unknown plug-in is raised.

    The import of where.parsers is done locally to avoid circular imports.

    Args:
        datasource_name (String):  Name of apriori data source
        kwargs:                    Input arguments to the data source

    Returns:
        The data from the data source (data type depends on source)
    """
    try:
        return plugins.call_one(package_name=__name__, plugin_name=datasource_name, **kwargs)
    except exceptions.UnknownPluginError as apriori_err:
        from where import parsers

        # No apriori plug-in, try to read the data directly with a parser
        try:
            parsed = parsers.parse_key(file_key=datasource_name, **kwargs).as_dict()
            log.dev(f"Called parsers.parse_key({datasource_name}) in apriori.get()")
            return parsed
        except AttributeError:
            try:
                parsed = parsers.parse(datasource_name, **kwargs)
                log.dev(f"Called parsers.parse({datasource_name}) in apriori.get()")
                return parsed
            except exceptions.UnknownPluginError:
                # Report the missing apriori data source, not the missing parser
                raise apriori_err from None
def data_handling(dset):
    """Edits data based on SLR handling file

    For each station, the intervals in the handling file that contain observations are processed by handling code:

    + E: Mark the observations in `dset.estimate_range` (estimate range bias)
    + R: Add the given range bias to `dset.range_bias` (unit 'mm' or 'ms')
    + U: Mark the observations in `dset.estimate_time` (estimate time bias)
    + T: Add the given time bias, including drift relative to the middle of the interval, to `dset.time_bias`
         (unit 'us')
    + P: Pressure bias, not implemented (logged as fatal)
    + C: Target signature bias, not implemented (logged as fatal)

    Unknown units are logged as fatal.

    Args:
        dset:  A Dataset containing model data. The fields mentioned above are updated in place.

    Returns:
        None, the dataset is updated in place.
    """
    handling = apriori.get("slr_handling_file", time=dset.time)

    for station in dset.unique("station"):
        station_handling = handling.get(station, {})

        # Estimate range bias E
        for start_x, end_x, _, int_idx in _handled_intervals(dset, station_handling, station, "E"):
            log.info(f"ILRS handling: Estimating range bias for station {station} in interval {start_x}-{end_x}")
            log.dev("ILRS Data Handling: What if there is a break in the middle of a pass?")
            dset.estimate_range[:] = np.logical_or(int_idx, dset.estimate_range)

        # Apply range bias R
        for start_x, end_x, info, int_idx in _handled_intervals(dset, station_handling, station, "R"):
            log.info(f"ILRS handling: Applying range bias for station {station} in interval {start_x}-{end_x}")
            RB = info["e_value"]
            if info["unit"] == "mm":
                dset.range_bias[:] += int_idx * RB * Unit.mm2m
            elif info["unit"] == "ms":
                dset.range_bias[:] += int_idx * RB * Unit.millisec2seconds * constant.c
            else:
                log.fatal("Unknown unit on ILRS Data handling file for range bias applied")

        # Estimate time bias U
        for start_x, end_x, _, int_idx in _handled_intervals(dset, station_handling, station, "U"):
            log.warn(f"ILRS handling: Estimating time bias for station {station} in interval {start_x}-{end_x}")
            dset.estimate_time |= int_idx

        # Apply time bias T
        for start_x, end_x, info, int_idx in _handled_intervals(dset, station_handling, station, "T"):
            log.info(f"ILRS handling: Applying time bias for station {station} in interval {start_x}-{end_x}")
            t_midInterval = Time(start_x + 1 / 2 * (end_x - start_x), format="datetime")
            TB = info["e_value"]
            drift = info["e_rate"]
            if info["unit"] == "us":
                # The drift is applied relative to the middle of the interval
                time_drifted = (dset.time - t_midInterval).jd * drift
                dset.time_bias[:] += int_idx * (-np.repeat(TB, dset.num_obs) - time_drifted) * Unit.microsec2sec
            else:
                log.fatal("Unknown unit on ILRS Data handling file for time bias applied")

        # Apply pressure bias P
        for _ in _handled_intervals(dset, station_handling, station, "P"):
            log.fatal("ILRS handling: TODO: Implement pressure bias!")

        # Target signature bias C
        # (this section used to look up "P", so that entries marked with C were never detected)
        for _ in _handled_intervals(dset, station_handling, station, "C"):
            log.fatal("ILRS handling: TODO: Implement target signature bias!")


def _handled_intervals(dset, station_handling, station, key):
    """Yield the handling intervals of one kind that contain observations of a station

    Args:
        dset:              A Dataset containing model data.
        station_handling:  Dict with the handling information for the station.
        station:           Name of the station.
        key:               Handling code to look up, for instance "E" or "R".

    Yields:
        Tuple:  Start and end of the interval, the information attached to the interval, and a boolean index of
                the observations of the station inside the interval.
    """
    for (start_x, end_x), info in station_handling.get(key, []):
        int_idx = dset.filter(station=station) & (dset.time >= start_x) & (dset.time <= end_x)
        if np.any(int_idx):
            yield start_x, end_x, info, int_idx
def path(file_key, file_vars=None, default=None, is_zipped=None, download_missing=False, use_aliases=True):
    """Construct a filepath for a given file with variables

    The directory and file name of the file key are read from the files configuration, with variables replaced by
    the values in `file_vars` (and by `default` for variables that are not in `file_vars`).

    When `use_aliases` is True, the aliases specified in the files configuration represent alternative filenames.
    In particular,

    + if directory / file_name exists it is returned
    + otherwise the first directory / alias that exists is returned
    + if none of these exist, directory / file_name is returned

    When `download_missing` is True and directory / file_name does not exist, an attempt is made to download the
    file. If the download returns a path, that path is returned.

    Args:
        file_key (String):        Key that is looked up in the Where file list.
        file_vars (Dict):         Values used to replace variables in file name and path.
        default (String):         Value to use for variables that are not in file_vars.
        is_zipped (Bool/None):    True, False or None.
        download_missing (Bool):  Whether to try to download missing files.
        use_aliases (Bool):       Fall back on aliases if file does not exist.

    Return:
        Path: Full path with replaced variables in file name and path.
    """
    import sys

    # Report who is calling the deprecated function
    frame = sys._getframe(1)
    code = frame.f_code
    log.dev(
        f"{code.co_filename} ({frame.f_lineno}) {code.co_name}: 'lib.files.path()' is deprecated. Use 'lib.config.files.path()' instead"
    )

    # NOTE(review): is_zipped is not used when constructing the path here - confirm whether _replace_gz should
    # receive it
    if file_vars is None:
        file_vars = {}
    file_config = config.files[file_key]
    directory = file_config.directory.replace(default=default, **file_vars).path
    base_name = file_config.filename.replace(default=default, **file_vars).path
    file_path = _replace_gz(directory / base_name)

    # Check for aliases
    if use_aliases and not path_exists(file_path):
        alias_config = config.files.get("aliases", section=file_key, default="")
        for alias in alias_config.replace(default=default, **file_vars).list:
            aliased_path = _replace_gz(file_path.with_name(alias))
            if path_exists(aliased_path):
                return aliased_path

    # Try to download the file if it is missing
    if download_missing and not path_exists(file_path):
        downloaded_path = download_file(file_key, file_vars)
        if downloaded_path is not None:
            return downloaded_path

    return file_path