Example #1
0
 def __init__(self, inp, time=None, z=None, lat=None, lon=None, geom=None):
     """Store the observation data together with its coordinate arrays.

     Args:
         inp: a numpy array, or a dictionary of numpy arrays keyed by stream id.
         time: numpy array of date-like objects. It is passed through
             ``mapdates`` and stored as a ``pandas.DatetimeIndex``.
         z: numpy array of z values.
         lat: numpy array of latitude; this or ``geom`` is required if using
             regional subsets.
         lon: numpy array of longitude; this or ``geom`` is required if using
             regional subsets.
         geom: numpy array of geometry; this or ``lat`` and ``lon`` are
             required if using regional subsets.
     """
     self.inp = inp

     # Times are normalised once here so later code can rely on a DatetimeIndex.
     converted_time = mapdates(time)
     self.tinp = pd.DatetimeIndex(converted_time)

     # The remaining coordinates are stored exactly as supplied.
     self.zinp, self.lat, self.lon, self.geom = z, lat, lon, geom
Example #2
0
def rate_of_change_test(inp: Sequence[N], tinp: Sequence[N],
                        threshold: float) -> np.ma.core.MaskedArray:
    """Flag values whose rate of change from the previous value is too high.

    The absolute first-order difference of `inp` is divided by the elapsed
    time between consecutive observations, and any point whose rate exceeds
    `threshold` is marked SUSPECT. It is up to the test operator to choose an
    appropriate threshold. The first observation has no predecessor and
    therefore always keeps its initial passing flag unless it is masked.

    Missing and masked data (NaN / inf or otherwise invalid values) is
    flagged as MISSING.

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
              This includes numpy datetime64, python datetime objects and pandas Timestamp object.
              ie. pd.DatetimeIndex([datetime.utcnow(), np.datetime64(), pd.Timestamp.now()]
              If anything else is passed in the format is assumed to be seconds since the unix epoch.
        threshold: A float value representing a rate of change over time,
                   in observation units per second.

    Returns:
        A masked array of flag values equal in size to that of the input.
    """
    # Casting arbitrary input to float can emit warnings (e.g. for None);
    # those values end up as NaN and are masked, so the warnings are noise.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Save original shape so the flags can be returned in the same layout
    original_shape = inp.shape
    inp = inp.flatten()

    # Start with everything as passing (1)
    flag_arr = np.ma.ones(inp.size, dtype='uint8')

    # Rate of change in units/second; element 0 stays at zero
    roc = np.ma.zeros(inp.size, dtype='float')

    tinp = mapdates(tinp).flatten()
    # Dividing by a one-second timedelta yields *fractional* seconds as
    # floats. Casting to 'timedelta64[s]' instead would drop the sub-second
    # part, turning e.g. a 0.5 s step into 0 s (division by zero) and a 1.5 s
    # step into 1 s (overstated rate).
    elapsed_seconds = np.diff(tinp) / np.timedelta64(1, 's')
    roc[1:] = np.abs(np.diff(inp) / elapsed_seconds)

    # NaN rates compare as False; silence the 'invalid' floating point warning
    with np.errstate(invalid='ignore'):
        flag_arr[roc > threshold] = QartodFlags.SUSPECT

    # If the value is masked set the flag to MISSING
    flag_arr[inp.mask] = QartodFlags.MISSING

    return flag_arr.reshape(original_shape)
Example #3
0
def climatology_test(
    config: Union[ClimatologyConfig, Sequence[Dict[str, Tuple]]],
    inp: Sequence[N],
    tinp: Sequence[N],
    zinp: Sequence[N],
) -> np.ma.core.MaskedArray:
    """Run the climatology check described by `config` against the data.

    The inputs are normalised (times to a pandas DatetimeIndex, values and
    depths to flattened float masked arrays with invalid entries masked) and
    handed to ``config.check``, which produces the flags. Which flag values
    are assigned is decided there and is not visible in this function.

    Args:
        config: A ClimatologyConfig object or a list of dicts containing tuples
            that can be used to create a ClimatologyConfig object. See ClimatologyConfig
            docs for more info.
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
            This includes numpy datetime64, python datetime objects and pandas Timestamp object.
            ie. pd.DatetimeIndex([datetime.utcnow(), np.datetime64(), pd.Timestamp.now()]
            If anything else is passed in the format is assumed to be seconds since the unix epoch.
        zinp: Z (depth) data, in meters positive down, as a numeric numpy array or a list of numbers.

    Returns:
        The flag array returned by ``config.check``, reshaped to the shape of `inp`.
    """
    # Accept either a ready-made ClimatologyConfig or its raw description
    config = ClimatologyConfig.convert(config)

    times = mapdates(tinp)

    # Float conversion of arbitrary input may warn; invalid results are masked anyway
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        values = np.ma.masked_invalid(np.array(inp).astype(np.float64))
        depths = np.ma.masked_invalid(np.array(zinp).astype(np.float64))

    # Remember the caller's layout; the check itself works on 1-D data
    shape = values.shape

    # A pandas DatetimeIndex exposes helpers such as 'week' and 'dayofyear'
    # that are awkward to derive from plain datetime64 values.
    time_index = pd.DatetimeIndex(times.flatten())

    flags = config.check(time_index, values.flatten(), depths.flatten())
    return flags.reshape(shape)
Example #4
0
    def run(self, config: Config):
        """Load the configured streams from the dataset and run them as a NumpyStream.

        The dataset is obtained from ``self._open()``, which also reports
        whether this method is responsible for closing it. Every stream id in
        `config` that is not a variable of the dataset is skipped with a
        warning. The coordinate variables named by ``self.time_var``,
        ``self.z_var``, ``self.lat_var`` and ``self.lon_var`` are forwarded
        when present.

        Args:
            config: Configuration whose contexts list the streams to run.

        Returns:
            Whatever ``NumpyStream.run(config)`` returns for the loaded data.
        """
        do_close, ds = self._open()

        # Everything that touches the dataset lives inside try/finally so the
        # dataset is closed even when reading a variable or converting the
        # time axis raises.
        try:
            stream_ids = []
            for context in config.contexts:
                for stream_id, _ in context.streams.items():
                    if stream_id not in ds.variables:
                        L.warning(
                            f'{stream_id} is not a variable in the netCDF dataset, skipping'
                        )
                        continue
                    stream_ids.append(stream_id)

            # Find any var specific kwargs to pass onto the run
            varkwargs = {'inp': {}}
            if self.time_var in ds.variables:
                varkwargs['time'] = pd.DatetimeIndex(
                    mapdates(ds.variables[self.time_var].values))
            if self.z_var in ds.variables:
                varkwargs['z'] = ds.variables[self.z_var].values
            if self.lat_var in ds.variables:
                varkwargs['lat'] = ds.variables[self.lat_var].values
            if self.lon_var in ds.variables:
                varkwargs['lon'] = ds.variables[self.lon_var].values

            # Populate the `inp` dict; stream_ids only holds ids already
            # confirmed to be dataset variables.
            for s in stream_ids:
                varkwargs['inp'][s] = ds.variables[s].values
        finally:
            if do_close is True:
                ds.close()

        ns = NumpyStream(**varkwargs)
        return ns.run(config)
Example #5
0
def attenuated_signal_test(
    inp: Sequence[N],
    tinp: Sequence[N],
    suspect_threshold: N,
    fail_threshold: N,
    test_period: N = None,
    min_obs: N = None,
    min_period: int = None,
    check_type: str = 'std',
    *args,
    **kwargs,
) -> np.ma.MaskedArray:
    """Check for near-flat-line conditions using a range or standard deviation.

    A statistic (standard deviation or range) is computed either over a
    rolling time window ending at each point (when `test_period` is given) or
    once over the whole input. Flags are then assigned in this order, later
    rules overriding earlier ones:

    * statistic >= `suspect_threshold`: GOOD
    * statistic < `suspect_threshold`: SUSPECT
    * statistic is NaN: UNKNOWN
    * statistic < `fail_threshold`: FAIL
    * input value is masked (invalid): MISSING

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time input data as a numpy array of dtype `datetime64`.
        suspect_threshold: Any calculated value below this amount will be flagged as SUSPECT.
            In observations units.
        fail_threshold: Any calculated values below this amount will be flagged as FAIL.
            In observations units.
        test_period: Length of time to test over in seconds [optional].
            Otherwise, will test against entire `inp`.
        min_obs: Minimum number of observations in window required to calculate a result [optional].
            Otherwise, test will start at beginning of time series.
            Note: you can specify either `min_obs` or `min_period`, but not both;
            if both are given, `min_obs` is the one used.
        min_period: Minimum number of seconds in test_period required to calculate a result [optional].
            It is converted to an observation count using the median time step.
            Otherwise, test will start at beginning of time series.
        check_type: Either 'std' (default) or 'range', depending on the type of check
            you wish to perform.

    Returns:
        An array of flag values equal in size to that of the input. Without
        `test_period` a single statistic is computed for the whole input, so
        every unmasked element receives the same flag.

    Raises:
        ValueError: If `check_type` is neither 'std' nor 'range'.
    """

    # window_func: applied to the rolling windows when `test_period` is supplied
    # check_func: applied to the flattened array when no `test_period` is supplied
    # They are kept separate for performance reasons.
    if check_type == 'std':
        check_func = np.std

        def window_func(rolling):
            return rolling.std()
    elif check_type == 'range':
        check_func = np.ptp

        def window_func(rolling):
            # When pandas>=1.0 and numba are installed, this is about twice as fast
            try:
                return rolling.apply(np.ptp, raw=True, engine='numba')
            except (ImportError, TypeError):
                return rolling.apply(np.ptp, raw=True)
    else:
        raise ValueError(
            'Check type "{}" is not one of ["std", "range"]'.format(
                check_type))

    tinp = mapdates(tinp)
    # Float conversion of arbitrary input may warn; invalid results are masked anyway
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Remember the caller's layout so the flags can be returned in it
    original_shape = inp.shape

    # Every element starts out as not tested
    flags = np.full((inp.size, ), QartodFlags.UNKNOWN)

    if not test_period:
        # Whole-series statistic, broadcast to one value per element.
        # Applying np.ptp to a Series causes warnings, so the flattened
        # array is used directly.
        flat_values = inp.flatten()
        check_val = np.ones_like(flags) * check_func(flat_values)
    else:
        min_periods = None
        if min_obs is not None:
            min_periods = min_obs
        elif min_period is not None:
            # Translate seconds into a number of observations via the median
            # spacing, expressed in whole seconds.
            median_step = np.median(np.diff(tinp))
            step_seconds = median_step.astype('timedelta64[s]').astype(float)
            min_periods = (min_period / step_seconds).astype(int)

        timed_values = pd.Series(inp.flatten(), index=tinp.flatten())
        rolling = timed_values.rolling(f'{test_period}s',
                                       min_periods=min_periods)
        check_val = window_func(rolling)

    # Order matters: each assignment may overwrite the previous ones.
    flags[check_val >= suspect_threshold] = QartodFlags.GOOD
    flags[check_val < suspect_threshold] = QartodFlags.SUSPECT
    flags[np.isnan(check_val)] = QartodFlags.UNKNOWN
    flags[check_val < fail_threshold] = QartodFlags.FAIL
    flags[inp.mask] = QartodFlags.MISSING

    return flags.reshape(original_shape)
Example #6
0
def flat_line_test(inp: Sequence[N],
                   tinp: Sequence[N],
                   suspect_threshold: int,
                   fail_threshold: int,
                   tolerance: N = 0) -> np.ma.MaskedArray:
    """Check for consecutively repeated values within a tolerance.

    Missing and masked data (values that are invalid after conversion to
    float) is flagged as MISSING. Inputs with fewer than 3 points are
    returned as all GOOD without any further checks.
    More information: https://github.com/ioos/ioos_qc/pull/11

    Args:
        inp: Input data as a numeric numpy array or a list of numbers.
        tinp: Time data as a sequence of datetime objects compatible with pandas DatetimeIndex.
              This includes numpy datetime64, python datetime objects and pandas Timestamp object.
              ie. pd.DatetimeIndex([datetime.utcnow(), np.datetime64(), pd.Timestamp.now()]
              If anything else is passed in the format is assumed to be seconds since the unix epoch.
        suspect_threshold: The number of seconds within `tolerance` to
            allow before being flagged as SUSPECT.
        fail_threshold: The number of seconds within `tolerance` to
            allow before being flagged as FAIL.
        tolerance: The tolerance that should be exceeded between consecutive values.
            To determine if the current point `n` should be flagged, we use a rolling window, with endpoint at
            point `n`, and calculate the range of values in the window. If that range is less than `tolerance`,
            then the point is flagged.

    Returns:
        An array of flag values equal in size to that of the input. The FAIL
        pass runs after the SUSPECT pass, so FAIL overrides SUSPECT, and
        MISSING overrides both.
    """

    # Convert the input to a float array with invalid entries masked.
    # Conversion warnings are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        inp = np.ma.masked_invalid(np.array(inp).astype(np.float64))

    # Save original shape so the flags can be returned in the same layout
    original_shape = inp.shape
    inp = inp.flatten()

    # Start with everything as passing
    flag_arr = np.full((inp.size, ), QartodFlags.GOOD)

    # if we have fewer than 3 points, we can't run the test, so everything passes
    # (this return happens before the MISSING assignment at the end)
    if len(inp) < 3:
        return flag_arr.reshape(original_shape)

    # determine median time interval
    tinp = mapdates(tinp).flatten()

    # The thresholds are in seconds, so the interval is converted to seconds too.
    # NOTE(review): the cast to 'timedelta64[s]' looks like it drops sub-second
    # precision, so a median interval below one second would become 0 and the
    # division in run_test would be by zero - confirm.
    time_interval = np.median(
        np.diff(tinp)).astype('timedelta64[s]').astype(float)

    def rolling_window(a, window):
        """Return overlapping windows of `a` as rows of a 2-D masked array.

        Each row holds `window + 1` consecutive elements of `a`. If `a` is
        shorter than `window`, an empty array with `window + 1` columns is
        returned instead.

        https://rigtorp.se/2011/01/01/rolling-statistics-numpy.html
        """
        if len(a) < window:
            return np.ma.MaskedArray(np.empty((0, window + 1)))
        # len(a) - window + 1 rows of window + 1 columns, each row starting one
        # element after the previous one (same stride for rows and columns).
        shape = a.shape[:-1] + (a.shape[-1] - window + 1, window + 1)
        strides = a.strides + (a.strides[-1], )
        arr = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
        # The final row would extend one element past the end of `a`, so it is
        # dropped before any of its values are used.
        return np.ma.masked_invalid(arr[:-1, :])

    def run_test(test_threshold, flag_value):
        # Flags, in the enclosing flag_arr, every point whose preceding window
        # of `test_threshold` seconds has a value range below `tolerance`.

        # convert time thresholds to number of observations
        count = (int(test_threshold) / time_interval).astype(int)

        # calculate actual data ranges for each window
        window = rolling_window(inp, count)
        data_min = np.min(window, 1)
        data_max = np.max(window, 1)
        data_range = np.abs(data_max - data_min)

        # find data ranges that are within threshold and flag them;
        # windows whose range is masked count as not flat
        test_results = np.ma.filled(data_range < tolerance, fill_value=False)
        # data points before end of first window should pass, so the result is
        # left-padded with False to line up with flag_arr
        n_fill = count if count < len(inp) else len(inp)
        test_results = np.insert(test_results, 0, np.full((n_fill, ), False))
        flag_arr[test_results] = flag_value

    # SUSPECT first, then FAIL, so that FAIL wins where both apply
    run_test(suspect_threshold, QartodFlags.SUSPECT)
    run_test(fail_threshold, QartodFlags.FAIL)

    # If the value is masked set the flag to MISSING
    flag_arr[inp.mask] = QartodFlags.MISSING

    return flag_arr.reshape(original_shape)