def __init__(self, inp, time=None, z=None, lat=None, lon=None, geom=None):
    """
    inp: a numpy array or a dictionary of numpy arrays where the keys are the stream ids
    time: numpy array of date-like objects
    z: numpy array of z values
    lat: numpy array of latitude, this or geom is required if using regional subsets
    lon: numpy array of longitude, this or geom is required if using regional subsets
    geom: numpy array of geometry, this or lat and lon are required if using regional subsets
    """
    self.inp = inp
    self.tinp = pd.DatetimeIndex(mapdates(time))
    self.zinp = z
    self.lat = lat
    self.lon = lon
    self.geom = geom
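# Hedged usage sketch (illustrative, not part of the library): constructing a stream
# from synthetic arrays. This assumes the constructor above is NumpyStream's, since its
# signature matches the NumpyStream(**varkwargs) call later in this module. Assumes
# numpy as np and pandas as pd are imported at module level; values are arbitrary.
def _example_numpy_stream():  # hypothetical helper for illustration
    times = pd.date_range('2020-01-01', periods=5, freq='1h')
    values = np.array([10.0, 10.1, 10.2, 50.0, 10.3])
    # A single unnamed stream; a dict of {stream_id: array} is also accepted for `inp`
    return NumpyStream(inp=values, time=times)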
def rate_of_change_test(inp: Sequence[N],
                        tinp: Sequence[N],
                        threshold: float,
                        ) -> np.ma.core.MaskedArray:
    """Checks the first order difference of a series of values to see if
    there are any values exceeding a threshold defined by the inputs.
    These are then marked as SUSPECT. It is up to the test operator
    to determine an appropriate threshold value for the absolute difference not to exceed.
    The threshold is expressed as a rate in observation units per second.
    Missing and masked data is flagged as MISSING.

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
              This includes numpy datetime64, python datetime objects and pandas Timestamp objects,
              e.g. pd.DatetimeIndex([datetime.utcnow(), np.datetime64('now'), pd.Timestamp.now()]).
              If anything else is passed in, the format is assumed to be seconds since the unix epoch.
        threshold: A float value representing a rate of change over time,
                   in observation units per second.

    Returns:
        A masked array of flag values equal in size to that of the input.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Save original shape
    original_shape = inp.shape
    inp = inp.flatten()

    # Start with everything as passing (1)
    flag_arr = np.ma.ones(inp.size, dtype='uint8')

    # Calculate the rate of change in units/second
    roc = np.ma.zeros(inp.size, dtype='float')
    tinp = mapdates(tinp).flatten()
    roc[1:] = np.abs(np.diff(inp) / np.diff(tinp).astype('timedelta64[s]').astype(float))

    with np.errstate(invalid='ignore'):
        flag_arr[roc > threshold] = QartodFlags.SUSPECT

    # If the value is masked set the flag to MISSING
    flag_arr[inp.mask] = QartodFlags.MISSING

    return flag_arr.reshape(original_shape)
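# Hedged usage sketch (illustrative, not part of the library): the threshold is in
# observation units per second, so a limit of "2 units per hour" becomes 2 / 3600.
# Assumes numpy as np and pandas as pd are imported at module level; values arbitrary.
def _example_rate_of_change():  # hypothetical helper for illustration
    times = pd.date_range('2020-01-01', periods=4, freq='15min')
    data = [10.0, 10.1, 10.2, 12.0]   # last step jumps 1.8 units in 15 minutes
    threshold = 2.0 / 3600.0          # 2 units per hour, expressed per second
    # The final observation changes faster than the threshold and is flagged SUSPECT
    return rate_of_change_test(inp=data, tinp=times, threshold=threshold)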
def climatology_test(config: Union[ClimatologyConfig, Sequence[Dict[str, Tuple]]],
                     inp: Sequence[N],
                     tinp: Sequence[N],
                     zinp: Sequence[N],
                     ) -> np.ma.core.MaskedArray:
    """Checks that values are within reasonable range bounds and flags them as SUSPECT.

    Data for which no ClimatologyConfig member exists is marked as UNKNOWN.

    Args:
        config: A ClimatologyConfig object or a list of dicts containing tuples
                that can be used to create a ClimatologyConfig object.
                See the ClimatologyConfig docs for more info.
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
              This includes numpy datetime64, python datetime objects and pandas Timestamp objects,
              e.g. pd.DatetimeIndex([datetime.utcnow(), np.datetime64('now'), pd.Timestamp.now()]).
              If anything else is passed in, the format is assumed to be seconds since the unix epoch.
        zinp: Z (depth) data, in meters positive down, as a numeric numpy array or a list of numbers.

    Returns:
        A masked array of flag values equal in size to that of the input.
    """
    # Create a ClimatologyConfig object if one was not passed in
    config = ClimatologyConfig.convert(config)

    tinp = mapdates(tinp)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))
        zinp = np.ma.masked_invalid(np.array(zinp).astype(np.float64))

    # Save original shape
    original_shape = inp.shape

    # We compare using a pandas Timestamp for helper functions like
    # 'week' and 'dayofyear'. It is surprisingly hard to pull these out
    # of a plain datetime64 object.
    tinp = pd.DatetimeIndex(tinp.flatten())
    inp = inp.flatten()
    zinp = zinp.flatten()

    flag_arr = config.check(tinp, inp, zinp)
    return flag_arr.reshape(original_shape)
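# Hedged usage sketch (illustrative, not part of the library): passing the config as a
# list of dicts, assuming the dict keys mirror ClimatologyConfig.add's keyword arguments
# (here 'tspan' for the time span and 'vspan' for the acceptable value span).
# Spans, dates, and data values below are arbitrary examples.
def _example_climatology():  # hypothetical helper for illustration
    cfg = [{
        'tspan': (np.datetime64('2021-01-01'), np.datetime64('2021-12-31')),
        'vspan': (10.0, 20.0),   # values outside this span are flagged SUSPECT
    }]
    times = pd.date_range('2021-06-01', periods=3, freq='1D')
    data = [12.0, 25.0, 15.0]    # the middle value falls outside vspan
    depths = [0.0, 0.0, 0.0]
    return climatology_test(config=cfg, inp=data, tinp=times, zinp=depths)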
def run(self, config: Config):
    do_close, ds = self._open()

    stream_ids = []
    for context in config.contexts:
        for stream_id, stream in context.streams.items():
            if stream_id not in ds.variables:
                L.warning(f'{stream_id} is not a variable in the netCDF dataset, skipping')
                continue
            stream_ids.append(stream_id)

    # Find any var specific kwargs to pass onto the run
    varkwargs = {'inp': {}}
    if self.time_var in ds.variables:
        varkwargs['time'] = pd.DatetimeIndex(mapdates(ds.variables[self.time_var].values))
    if self.z_var in ds.variables:
        varkwargs['z'] = ds.variables[self.z_var].values
    if self.lat_var in ds.variables:
        varkwargs['lat'] = ds.variables[self.lat_var].values
    if self.lon_var in ds.variables:
        varkwargs['lon'] = ds.variables[self.lon_var].values

    # Now populate the `inp` dict for each valid data stream
    for s in stream_ids:
        if s in ds.variables:
            varkwargs['inp'][s] = ds.variables[s].values

    if do_close is True:
        ds.close()

    ns = NumpyStream(**varkwargs)
    return ns.run(config)
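# Hedged sketch (illustrative, not part of the library) of what this `run` delegates to.
# Given an already-open dataset `ds` with a 'time' coordinate and one data variable, the
# method effectively builds keyword arguments like these and hands them to NumpyStream.
# The variable names 'time' and 'sea_water_temperature' are assumptions for the example.
def _example_delegation(ds, config):  # hypothetical helper for illustration
    varkwargs = {
        'inp': {'sea_water_temperature': ds.variables['sea_water_temperature'].values},
        'time': pd.DatetimeIndex(mapdates(ds.variables['time'].values)),
    }
    # Same delegation pattern as the method above: wrap the arrays and run the config
    return NumpyStream(**varkwargs).run(config)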
def attenuated_signal_test(inp: Sequence[N],
                           tinp: Sequence[N],
                           suspect_threshold: N,
                           fail_threshold: N,
                           test_period: N = None,
                           min_obs: N = None,
                           min_period: int = None,
                           check_type: str = 'std',
                           *args,
                           **kwargs,
                           ) -> np.ma.MaskedArray:
    """Check for near-flat-line conditions using a range or standard deviation.

    Missing and masked data is flagged as MISSING.

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time input data as a numpy array of dtype `datetime64`.
        suspect_threshold: Any calculated value below this amount will be flagged as SUSPECT.
                           In observation units.
        fail_threshold: Any calculated value below this amount will be flagged as FAIL.
                        In observation units.
        test_period: Length of time to test over, in seconds [optional].
                     Otherwise, tests against the entire `inp`.
        min_obs: Minimum number of observations in a window required to calculate a result [optional].
                 Otherwise, the test starts at the beginning of the time series.
                 Note: you can specify either `min_obs` or `min_period`, but not both.
        min_period: Minimum number of seconds in `test_period` required to calculate a result [optional].
                    Otherwise, the test starts at the beginning of the time series.
                    Note: you can specify either `min_obs` or `min_period`, but not both.
        check_type: Either 'std' (default) or 'range', depending on the type of check you wish to perform.

    Returns:
        A masked array of flag values equal in size to that of the input.
        When `test_period` is not supplied, all input data is evaluated together,
        so the result will typically contain a single unique flag value.
    """
    # window_func: Applied to each window when `test_period` is supplied
    # check_func: Applied to a flattened numpy array when no `test_period` is supplied
    # These are split for performance reasons
    if check_type == 'std':
        window_func = lambda x: x.std()  # noqa
        check_func = np.std
    elif check_type == 'range':
        def window_func(w):
            # When pandas>=1.0 and numba are installed, this is about twice as fast
            try:
                return w.apply(np.ptp, raw=True, engine='numba')
            except (ImportError, TypeError):
                return w.apply(np.ptp, raw=True)
        check_func = np.ptp
    else:
        raise ValueError('Check type "{}" is not one of ["std", "range"]'.format(check_type))

    tinp = mapdates(tinp)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Save original shape
    original_shape = inp.shape

    # Start with everything as UNKNOWN (not evaluated)
    flag_arr = np.full((inp.size,), QartodFlags.UNKNOWN)

    if test_period:
        if min_obs is not None:
            min_periods = min_obs
        elif min_period is not None:
            time_interval = np.median(np.diff(tinp)).astype('timedelta64[s]').astype(float)
            min_periods = (min_period / time_interval).astype(int)
        else:
            min_periods = None

        series = pd.Series(inp.flatten(), index=tinp.flatten())
        windows = series.rolling(f'{test_period}s', min_periods=min_periods)
        check_val = window_func(windows)
    else:
        # Applying np.ptp to a Series causes warnings, this is a workaround
        series = inp.flatten()
        check_val = np.ones_like(flag_arr) * check_func(series)

    flag_arr[check_val >= suspect_threshold] = QartodFlags.GOOD
    flag_arr[check_val < suspect_threshold] = QartodFlags.SUSPECT
    flag_arr[np.isnan(check_val)] = QartodFlags.UNKNOWN
    flag_arr[check_val < fail_threshold] = QartodFlags.FAIL
    flag_arr[inp.mask] = QartodFlags.MISSING

    return flag_arr.reshape(original_shape)
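# Hedged usage sketch (illustrative, not part of the library): flag a signal whose
# standard deviation over a rolling 1-hour window drops below the thresholds once
# the series goes flat. Assumes numpy as np and pandas as pd are imported at module
# level; thresholds and data values are arbitrary.
def _example_attenuated_signal():  # hypothetical helper for illustration
    times = pd.date_range('2020-01-01', periods=12, freq='10min')
    data = np.concatenate([np.linspace(10, 15, 6), np.full(6, 15.0)])  # goes flat
    return attenuated_signal_test(
        inp=data,
        tinp=times,
        suspect_threshold=0.5,   # window std below this is SUSPECT
        fail_threshold=0.1,      # window std below this is FAIL
        test_period=3600,        # evaluate over rolling 1-hour windows (seconds)
        check_type='std',
    )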
def flat_line_test(inp: Sequence[N],
                   tinp: Sequence[N],
                   suspect_threshold: int,
                   fail_threshold: int,
                   tolerance: N = 0,
                   ) -> np.ma.MaskedArray:
    """Check for consecutively repeated values within a tolerance.

    Missing and masked data is flagged as MISSING.
    More information: https://github.com/ioos/ioos_qc/pull/11

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
              This includes numpy datetime64, python datetime objects and pandas Timestamp objects,
              e.g. pd.DatetimeIndex([datetime.utcnow(), np.datetime64('now'), pd.Timestamp.now()]).
              If anything else is passed in, the format is assumed to be seconds since the unix epoch.
        suspect_threshold: The number of seconds within `tolerance` to allow before being flagged as SUSPECT.
        fail_threshold: The number of seconds within `tolerance` to allow before being flagged as FAIL.
        tolerance: The tolerance that should be exceeded between consecutive values.
                   To determine if the current point `n` should be flagged, we use a rolling window,
                   with endpoint at point `n`, and calculate the range of values in the window.
                   If that range is less than `tolerance`, then the point is flagged.

    Returns:
        A masked array of flag values equal in size to that of the input.
    """
    # Cast input as a masked numpy array
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Save original shape
    original_shape = inp.shape
    inp = inp.flatten()

    # Start with everything as passing
    flag_arr = np.full((inp.size,), QartodFlags.GOOD)

    # If we have fewer than 3 points, we can't run the test, so everything passes
    if len(inp) < 3:
        return flag_arr.reshape(original_shape)

    # Determine the median time interval.
    # The thresholds are in seconds, so make sure the interval is also in seconds.
    tinp = mapdates(tinp).flatten()
    time_interval = np.median(np.diff(tinp)).astype('timedelta64[s]').astype(float)

    def rolling_window(a, window):
        """
        https://rigtorp.se/2011/01/01/rolling-statistics-numpy.html
        """
        if len(a) < window:
            return np.ma.MaskedArray(np.empty((0, window + 1)))
        shape = a.shape[:-1] + (a.shape[-1] - window + 1, window + 1)
        strides = a.strides + (a.strides[-1],)
        arr = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
        return np.ma.masked_invalid(arr[:-1, :])

    def run_test(test_threshold, flag_value):
        # Convert the time threshold to a number of observations
        count = (int(test_threshold) / time_interval).astype(int)

        # Calculate actual data ranges for each window
        window = rolling_window(inp, count)
        data_min = np.min(window, 1)
        data_max = np.max(window, 1)
        data_range = np.abs(data_max - data_min)

        # Find data ranges that are within the threshold and flag them
        test_results = np.ma.filled(data_range < tolerance, fill_value=False)
        # Data points before the end of the first window should pass
        n_fill = count if count < len(inp) else len(inp)
        test_results = np.insert(test_results, 0, np.full((n_fill,), False))
        flag_arr[test_results] = flag_value

    run_test(suspect_threshold, QartodFlags.SUSPECT)
    run_test(fail_threshold, QartodFlags.FAIL)

    # If the value is masked, set the flag to MISSING
    flag_arr[inp.mask] = QartodFlags.MISSING

    return flag_arr.reshape(original_shape)
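# Hedged usage sketch (illustrative, not part of the library): flag a sensor that keeps
# repeating the same value. The thresholds are in seconds of near-constant data; with
# the 1-minute sampling below, 3 minutes within tolerance trips SUSPECT and 5 minutes
# trips FAIL. Assumes numpy as np and pandas as pd are imported; values are arbitrary.
def _example_flat_line():  # hypothetical helper for illustration
    times = pd.date_range('2020-01-01', periods=10, freq='1min')
    data = [10.0, 10.2, 10.4, 13.3, 13.3, 13.3, 13.3, 13.3, 13.3, 13.3]
    return flat_line_test(
        inp=data,
        tinp=times,
        suspect_threshold=180,   # 3 minutes within tolerance -> SUSPECT
        fail_threshold=300,      # 5 minutes within tolerance -> FAIL
        tolerance=0.01,          # values within 0.01 of each other count as repeats
    )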