import numpy as np

# Note: these functions also rely on a module-local `utils` helper, a module
# logger `log`, and the compiled `gradientvalues`/`stuckvalues` routines,
# which are imported elsewhere in the source package.


def dataqc_gradienttest(dat, x, ddatdx, mindx, startdat, toldat,
                        strict_validation=False):
    """
    Description

        Data quality control algorithm testing if changes between successive
        data points fall within a certain range.

        Input data dat are given as a function of coordinate x. The algorithm
        will flag dat values as bad if the change deltaDAT/deltaX between
        successive dat values exceeds thresholds given in ddatdx. Once the
        threshold is exceeded, following dat are considered bad until a dat
        value returns to within toldat of the last known good value.

        It is possible to remove data points that are too close together in x
        coordinates (use mindx).

        By default, the first value of dat is considered good. To change this,
        use startdat and toldat to set as the first good data point the first
        one that comes within toldat of startdat.

    Implemented by:

        2012-07-17: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.

    Usage:

        outdat, outx, outqc = dataqc_gradienttest(dat, x, ddatdx, mindx,
                                                  startdat, toldat);

            where

        outdat = same as dat except that NaNs and values not meeting mindx are
            removed.
        outx = same as x except that NaNs and values not meeting mindx are
            removed.
        outqc = output quality control flags for outdat. 0 means bad data, 1
            means good data.

        dat = input dataset, a numeric real vector.
        x = coordinate (e.g. time, distance) along which dat is given. Must be
            of the same size as dat and strictly increasing.
        ddatdx = two-element vector defining the valid range of ddat/dx
            from one point to the next.
        mindx = scalar. minimum dx for which this test will be applied (data
            that are less than mindx apart will be deleted). defaults to zero
            if NaN/empty.
        startdat = start value (scalar) of dat that is presumed good. defaults
            to first non-NaN value of dat if NaN/empty.
        toldat = tolerance value (scalar) for dat; threshold to within which
            dat must return to be counted as good, after exceeding a ddatdx
            threshold detected bad data.

    References:

        OOI (2012). Data Product Specification for Gradient Test. Document
            Control Number 1341-10010.
            https://alfresco.oceanobservatories.org/ (See: Company Home >> OOI
            >> Controlled >> 1000 System Level >>
            1341-10010_Data_Product_SPEC_GRDTEST_OOI.pdf)
    """

    if strict_validation:
        if not utils.isvector(dat) or not utils.isvector(x):
            raise ValueError('\'dat\' and \'x\' must be vectors')

        if len(dat) != len(x):
            raise ValueError('\'dat\' and \'x\' must be of equal len')

        if not all(np.diff(x) > 0):
            raise ValueError('\'x\' must be monotonically increasing')

    dat = np.asanyarray(dat, dtype=float).flatten()
    x = np.asanyarray(x, dtype=float).flatten()

    # default mindx and startdat to zero when given as None, empty, or NaN
    if not mindx or np.isnan(mindx):
        mindx = 0
    if not startdat or np.isnan(startdat):
        startdat = 0
    
    # No strict_validation branch here: these are scalars and must always be
    # validated before being passed into the C layer.
    if not utils.isscalar(mindx):
        raise ValueError("'mindx' must be scalar, NaN, or empty.")
    if not utils.isscalar(startdat):
        raise ValueError("'startdat' must be scalar, NaN, or empty.")

    # Confirm that there are still data points left, else abort:
    if np.abs(x[0] - x[-1]) < mindx:
        out = np.ones(x.shape)
        log.warning('Too few values to inspect')
        return out

    grad_min = ddatdx[0]
    grad_max = ddatdx[1]
    out = gradientvalues(dat, x, grad_min, grad_max, mindx, startdat, toldat)

    return out
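
# A minimal usage sketch for the gradient test; the sample values below are
# illustrative assumptions, not taken from the DPS. Note that this Python
# port returns a single qc-flag array from gradientvalues(), while the
# docstring's three-output signature mirrors the original MATLAB interface.
def _example_gradienttest():
    dat = np.array([3.0, 3.2, 3.4, 9.0, 3.6, 3.8])
    x = np.arange(6, dtype=float)
    # slopes outside [-1, 1] are flagged: the jump to 9.0 should come back
    # marked bad (0), with the series recovering once it returns to within
    # toldat=0.5 of the last known good value
    return dataqc_gradienttest(dat, x, ddatdx=[-1.0, 1.0], mindx=0,
                               startdat=3.0, toldat=0.5)
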
def dataqc_stuckvaluetest(x, reso, num=10, strict_validation=False):
    """
    Description:

        Data quality control algorithm testing a time series for "stuck
        values", i.e. repeated occurrences of one value. Returns 1 for
        presumably good data and 0 for data presumed bad.

    Implemented by:

        2012-10-29: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.

    Usage:

        qcflag = dataqc_stuckvaluetest(x, reso, num);

            where

        qcflag = Boolean output: 0 where stuck values are found, 1 elsewhere.
        x = Input time series (vector, numeric).
        reso = Resolution; repeat values less than reso apart will be
            considered "stuck values".
        num = Minimum number of successive values within reso of each other
            that will trigger the "stuck value". num is optional and defaults
            to 10 if omitted or empty.

    References:

        OOI (2012). Data Product Specification for Stuck Value Test. Document
            Control Number 1341-10008. https://alfresco.oceanobservatories.org/
            (See: Company Home >> OOI >> Controlled >> 1000 System Level >>
            1341-10008_Data_Product_SPEC_STUCKVL_OOI.pdf)
    """
    dat = np.atleast_1d(x)

    if strict_validation:
        if not utils.isnumeric(dat).all():
            raise ValueError('\'x\' must be numeric')

        if not utils.isvector(dat):
            raise ValueError('\'x\' must be a vector')

        if not utils.isreal(dat).all():
            raise ValueError('\'x\' must be real')

        for k, arg in {'reso': reso, 'num': num}.items():
            if not utils.isnumeric(arg).all():
                raise ValueError('\'{0}\' must be numeric'.format(k))

            if not utils.isscalar(arg):
                raise ValueError('\'{0}\' must be a scalar'.format(k))

            if not utils.isreal(arg).all():
                raise ValueError('\'{0}\' must be real'.format(k))

    num = np.abs(num)
    dat = np.asanyarray(dat, dtype=float)
    ll = dat.size
    if ll < num:
        log.warning('\'num\' is greater than len(x), returning zeros')
        out = np.zeros(dat.size, dtype='int8')
    else:
        out = stuckvalues(dat, reso, num)

    return out
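
# A minimal usage sketch for the stuck value test; the values are
# illustrative assumptions. A run of at least `num` successive readings
# spaced less than `reso` apart should be flagged as stuck (0).
def _example_stuckvaluetest():
    x = np.hstack((np.arange(5, dtype=float), np.full(10, 4.0), [5.0, 6.0]))
    # with reso=0.001 and num=10, the ten repeated 4.0 readings should be
    # flagged 0 and the surrounding samples 1
    return dataqc_stuckvaluetest(x, reso=0.001, num=10)
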
def dataqc_polytrendtest(dat, t, ord_n=1, nstd=3, strict_validation=False):
    """
    Description:

        Data quality control algorithm testing if measurements contain a
        significant portion of a polynomial. Returns 1 if this is not the case,
        else 0.

        The purpose of this test is to check if a significant fraction of the
        variability in a time series can be explained by a drift, possibly
        interpreted as a sensor drift. This drift is assumed to be a polynomial
        of order ORD. Use ORD=1 to consider a linear drift.

        The time series dat is passed to MATLAB's POLYFIT routine (numpy's
        polyfit in this Python port) to obtain a polynomial fit PP to dat, and
        the difference dat-PP is compared to the
        original dat. If the standard deviation of (dat-PP) is less than that
        of dat by a factor of NSTD, the time series is assumed to contain a
        significant trend (output will be 0), else not (output will be 1).

    Implemented by:

        2012-10-29: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.
        2013-05-30: Christopher Mueller. Performance optimizations.

    Usage:

        qcflag = dataqc_polytrendtest(dat, t, ord_n, nstd, strict_validation)

            where

        qcflag = Boolean: 0 where a trend is detected, 1 otherwise.
        dat = Input dataset, a numeric real vector.
        t = time record associated with dat
        ord_n (optional, defaults to 1) = Polynomial order.
        nstd (optional, defaults to 3) = Factor by which the standard
            deviation must be reduced before qcflag switches from 1 to 0.
        strict_validation (optional, defaults to False) = Flag asserting
            testing of inputs.

    References:

        OOI (2012). Data Product Specification for Trend Test. Document
            Control Number 1341-10007. https://alfresco.oceanobservatories.org/
            (See: Company Home >> OOI >> Controlled >> 1000 System Level >>
            1341-10007_Data_Product_SPEC_TRNDTST_OOI.pdf)
    """
    dat = np.atleast_1d(dat)
    t = np.atleast_1d(t)

    if strict_validation:
        for k, arg in {'dat': dat, 't': t,
                       'ord_n': ord_n, 'nstd': nstd}.items():
            if not utils.isnumeric(arg).all():
                raise ValueError('\'{0}\' must be numeric'.format(k))

            if not utils.isreal(arg).all():
                raise ValueError('\'{0}\' must be real'.format(k))

        for k, arg in {'dat': dat, 't': t}.items():
            if not utils.isvector(arg):
                raise ValueError('\'{0}\' must be a vector'.format(k))

        for k, arg in {'ord_n': ord_n, 'nstd': nstd}.items():
            if not utils.isscalar(arg):
                raise ValueError('\'{0}\' must be a scalar'.format(k))

    ord_n = int(round(abs(ord_n)))
    nstd = int(abs(nstd))

    pp = np.polyfit(t, dat, ord_n)
    datpp = np.polyval(pp, t)

    # test for a trend: does the polynomial fit remove a significant
    # fraction of the variance?
    if np.std(dat - datpp) * nstd < np.std(dat):
        trndtst = 0
    else:
        trndtst = 1

    # ensure output size equals input, even though the test yields a single value
    qcflag = np.ones(dat.shape, dtype='int8') * trndtst
    return qcflag
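
# A minimal usage sketch for the trend test; the synthetic series is an
# illustrative assumption. A pronounced linear drift with small noise should
# be detected (all-zero flags), while trend-free noise should return all ones.
def _example_polytrendtest():
    t = np.arange(100, dtype=float)
    dat = 0.05 * t + 0.01 * np.random.randn(100)  # linear drift + noise
    return dataqc_polytrendtest(dat, t, ord_n=1, nstd=3)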