def _gradienttest_default(value, default=0):
    """
    Description:

        Helper for dataqc_gradienttest. Replaces an optional scalar argument
        with a default when it was supplied as None, empty, NaN, or a falsy
        scalar. Every other value is handed back unchanged.

    Usage:

        value = _gradienttest_default(value, default)

            where

        value = the argument as given by the caller.
        default = replacement used when value is missing (defaults to 0).
    """
    if value is None:
        return default

    arr = np.asanyarray(value)
    if arr.size == 0:
        return default
    if arr.size > 1:
        # Not a scalar: return it untouched so that the caller's own scalar
        # check raises its descriptive error instead of numpy's
        # "truth value of an array is ambiguous".
        return value
    # np.isnan is only defined for float/complex input; other kinds cannot
    # hold a NaN, so they skip the test.
    if arr.dtype.kind in 'fc' and np.isnan(arr).all():
        return default
    return value or default


def dataqc_gradienttest(dat, x, ddatdx, mindx, startdat, toldat,
                        strict_validation=False):
    """
    Description

        Data quality control algorithm testing if changes between successive
        data points fall within a certain range.

        Input data dat are given as a function of coordinate x. The algorithm
        will flag dat values as bad if the change deltaDAT/deltaX between
        successive dat values exceeds thresholds given in ddatdx. Once the
        threshold is exceeded, following dat are considered bad until a dat
        value returns to within toldat of the last known good value.

        It is possible to remove data points that are too close together in x
        coordinates (use mindx).

        By default, the first value of dat is considered good. To change this,
        use startdat and toldat to set as the first good data point the first
        one that comes within toldat of startdat.

        This function is a thin wrapper: it normalises the inputs and then
        delegates the actual test to gradientvalues().

    Implemented by:

        2012-07-17: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.

    Usage:

        out = dataqc_gradienttest(dat, x, ddatdx, mindx, startdat, toldat)

            where

        out = whatever gradientvalues() returns for the normalised inputs
            (presumably the quality control flags, 0 = bad, 1 = good; confirm
            against gradientvalues). If x is empty, or the span
            |x[0] - x[-1]| is smaller than mindx, an array of ones with the
            shape of the flattened x is returned instead and a warning is
            logged. The original usage text listed three outputs
            (outdat, outx, outqc); this implementation returns one array.
        dat = input dataset, a numeric real vector.
        x = coordinate (e.g. time, distance) along which dat is given. Must be
            of the same size as dat and strictly increasing.
        ddatdx = two-element vector defining the valid range of ddat/dx from
            one point to the next.
        mindx = scalar. minimum dx for which this test will be applied (data
            that are less than mindx apart will be deleted). defaults to zero
            if None/NaN/empty.
        startdat = start value (scalar) of dat that is presumed good. Replaced
            by zero if None/NaN/empty before being passed to gradientvalues().
            (The DPS text describes the default as the first non-NaN value of
            dat; how gradientvalues() treats a zero is not visible here.)
        toldat = tolerance value (scalar) for dat; threshold to within which
            dat must return to be counted as good, after exceeding a ddatdx
            threshold detected bad data.
        strict_validation (optional, defaults to False) = when True, dat and x
            are checked for being equally long vectors with strictly
            increasing x.

    Raises:

        ValueError if strict validation fails, or if mindx / startdat are not
        scalar after defaulting.

    References:

        OOI (2012). Data Product Specification for Gradient Test. Document
            Control Number 1341-100010.
            https://alfresco.oceanobservatories.org/ (See: Company Home >> OOI
            >> Controlled >> 1000 System Level >>
            1341-10010_Data_Product_SPEC_GRDTEST_OOI.pdf)
    """
    if strict_validation:
        if not utils.isvector(dat) or not utils.isvector(x):
            raise ValueError('\'dat\' and \'x\' must be vectors')

        if len(dat) != len(x):
            raise ValueError('\'dat\' and \'x\' must be of equal len')

        if not all(np.diff(x) > 0):
            raise ValueError('\'x\' must be montonically increasing')

    # The builtin float is used because the np.float alias no longer exists
    # in current numpy releases; the resulting dtype (float64) is the same.
    dat = np.asanyarray(dat, dtype=float).flatten()
    x = np.asanyarray(x, dtype=float).flatten()

    mindx = _gradienttest_default(mindx)
    startdat = _gradienttest_default(startdat)

    # These two are always validated (not only under strict_validation):
    # they must be scalars before going into the C-layer.
    if not utils.isscalar(mindx):
        raise ValueError("'mindx' must be scalar, NaN, or empty.")

    if not utils.isscalar(startdat):
        raise ValueError("'startdat' must be scalar, NaN, or empty.")

    # Confirm that there are still data points left, else abort. An empty x
    # is handled here too, so x[0] is never indexed on an empty array.
    if x.size == 0 or np.abs(x[0] - x[-1]) < mindx:
        # NOTE(review): `log` is defined elsewhere in this module; if it is a
        # stdlib logging.Logger, `warning` is the non-deprecated spelling.
        log.warn('Too few values to inspect')
        return np.ones(x.shape)

    grad_min, grad_max = ddatdx[0], ddatdx[1]
    return gradientvalues(dat, x, grad_min, grad_max, mindx, startdat, toldat)
def dataqc_stuckvaluetest(x, reso, num=10, strict_validation=False):
    """
    Description:

        Data quality control algorithm testing a time series for "stuck
        values", i.e. repeated occurences of one value. Returns 1 for
        presumably good data and 0 for data presumed bad.

    Implemented by:

        2012-10-29: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.

    Usage:

        qcflag = dataqc_stuckvaluetest(x, reso, num)

            where

        qcflag = Boolean output: 0 where stuck values are found, 1 elsewhere.
            When the series holds fewer than num points, an int8 array of
            zeros (one per element of x) is returned without running the
            test; otherwise the result of stuckvalues() is returned.
        x = Input time series (vector, numeric). A bare scalar is accepted
            and treated as a one-element series.
        reso = Resolution; repeat values less than reso apart will be
            considered "stuck values".
        num = Minimum number of successive values within reso of each other
            that will trigger the "stuck value". num is optional and defaults
            to 10 if omitted. Its absolute value is used.
        strict_validation (optional, defaults to False) = when True, x must be
            a numeric real vector and reso / num numeric real scalars.

    Raises:

        ValueError if strict validation is requested and an input fails it.

    References:

        OOI (2012). Data Product Specification for Stuck Value Test. Document
            Control Number 1341-10008.
            https://alfresco.oceanobservatories.org/ (See: Company Home >> OOI
            >> Controlled >> 1000 System Level >>
            1341-10008_Data_Product_SPEC_STUCKVL_OOI.pdf)
    """
    dat = np.atleast_1d(x)

    if strict_validation:
        if not utils.isnumeric(dat).all():
            raise ValueError('\'x\' must be numeric')

        if not utils.isvector(dat):
            raise ValueError('\'x\' must be a vector')

        if not utils.isreal(dat).all():
            raise ValueError('\'x\' must be real')

        # A tuple of pairs gives a fixed checking order and avoids the
        # Python-2-only dict.iteritems().
        for k, arg in (('reso', reso), ('num', num)):
            if not utils.isnumeric(arg).all():
                raise ValueError('\'{0}\' must be numeric'.format(k))

            if not utils.isscalar(arg):
                raise ValueError('\'{0}\' must be a scalar'.format(k))

            if not utils.isreal(arg).all():
                raise ValueError('\'{0}\' must be real'.format(k))

    num = np.abs(num)
    # Builtin float instead of the np.float alias, which current numpy
    # releases no longer provide; the dtype is float64 either way.
    dat = np.asanyarray(dat, dtype=float)

    # Measure the 1-d-promoted array rather than the raw argument so that a
    # scalar x does not fail on len().
    if len(dat) < num:
        # 'num' is greater than the series length: nothing to test,
        # returning zeros.
        return np.zeros(dat.size, dtype='int8')

    return stuckvalues(dat, reso, num)
def dataqc_polytrendtest(dat, t, ord_n=1, nstd=3, strict_validation=False):
    """
    Description:

        Data quality control algorithm testing if measurements contain a
        significant portion of a polynomial. Returns 1 if this is not the
        case, else 0.

        The purpose of this test is to check if a significant fraction of the
        variability in a time series can be explained by a drift, possibly
        interpreted as a sensor drift. This drift is assumed to be a
        polynomial of order ORD. Use ORD=1 to consider a linear drift

        The time series dat is passed to numpy's polyfit routine (POLYFIT in
        the Matlab example code) to obtain a polynomial fit PP to dat, and the
        difference dat-PP is compared to the original dat. If the standard
        deviation of (dat-PP) is less than that of dat by a factor of NSTD,
        the time series is assumed to contain a significant trend (output will
        be 0), else not (output will be 1).

    Implemented by:

        2012-10-29: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.
        2013-05-30: Christopher Mueller. Performance optimizations.

    Usage:

        qcflag = dataqc_polytrendtest(dat, t, ord_n, nstd, strict_validation)

            where

        qcflag = Boolean, 0 a trend is detected, 1 elsewhere. Returned as an
            int8 array with the shape of dat; every element carries the same
            value because the test yields a single verdict.
        dat = Input dataset, a numeric real vector.
        t = time record associated with dat
        ord_n (optional, defaults to 1) = Polynomial order. The absolute
            value, rounded to the nearest integer, is used.
        nstd (optional, defaults to 3) = Factor by how much the standard
            deviation must be reduced before qcflag switches from 1 to 0.
            The absolute value, truncated to an integer, is used.
        strict_validation (optional, defaults to False) = Flag asserting
            testing of inputs.

    Raises:

        ValueError if strict validation is requested and an input fails it.

    References:

        OOI (2012). Data Product Specification for Trend Test. Document
            Control Number 1341-10007.
            https://alfresco.oceanobservatories.org/ (See: Company Home >> OOI
            >> Controlled >> 1000 System Level >>
            1341-10007_Data_Product_SPEC_TRNDTST_OOI.pdf)
    """
    # NOTE: this module defines dataqc_polytrendtest a second time further
    # down; at import time that later definition replaces this one.
    dat = np.atleast_1d(dat)
    t = np.atleast_1d(t)

    if strict_validation:
        # Tuples of pairs give a fixed checking order and avoid the
        # Python-2-only dict.iteritems().
        vector_args = (('dat', dat), ('t', t))
        scalar_args = (('ord_n', ord_n), ('nstd', nstd))

        for k, arg in vector_args + scalar_args:
            if not utils.isnumeric(arg).all():
                raise ValueError('\'{0}\' must be numeric'.format(k))

            if not utils.isreal(arg).all():
                raise ValueError('\'{0}\' must be real'.format(k))

        for k, arg in vector_args:
            if not utils.isvector(arg):
                raise ValueError('\'{0}\' must be a vector'.format(k))

        for k, arg in scalar_args:
            if not utils.isscalar(arg):
                raise ValueError('\'{0}\' must be a scalar'.format(k))

    ord_n = int(round(abs(ord_n)))
    nstd = int(abs(nstd))

    # Fit the polynomial against the supplied time record and evaluate it at
    # the same points.
    pp = np.polyfit(t, dat, ord_n)
    datpp = np.polyval(pp, t)

    # test for a trend: the residual scatter, scaled by nstd, is still
    # smaller than the scatter of the raw data.
    if np.atleast_1d((np.std(dat - datpp) * nstd) < np.std(dat)).all():
        trndtst = 0
    else:
        trndtst = 1

    # insure output size equals input, even though test yields a single value.
    qcflag = np.ones(dat.shape).astype('int8') * trndtst
    return qcflag
def dataqc_polytrendtest(dat, t, ord_n=1, nstd=3, strict_validation=False):
    """
    Description:

        Data quality control algorithm testing if measurements contain a
        significant portion of a polynomial. Returns 1 if this is not the
        case, else 0.

        The purpose of this test is to check if a significant fraction of the
        variability in a time series can be explained by a drift, possibly
        interpreted as a sensor drift. This drift is assumed to be a
        polynomial of order ORD. Use ORD=1 to consider a linear drift

        The time series dat is passed to numpy's polyfit routine (POLYFIT in
        the Matlab example code) to obtain a polynomial fit PP to dat, and the
        difference dat-PP is compared to the original dat. If the standard
        deviation of (dat-PP) is less than that of dat by a factor of NSTD,
        the time series is assumed to contain a significant trend (output will
        be 0), else not (output will be 1).

    Implemented by:

        2012-10-29: DPS authored by Mathias Lankhorst. Example code provided
        for Matlab.
        2013-04-06: Christopher Wingard. Initial python implementation.
        2013-05-30: Christopher Mueller. Performance optimizations.

    Usage:

        qcflag = dataqc_polytrendtest(dat, t, ord_n, nstd, strict_validation)

            where

        qcflag = Boolean, 0 a trend is detected, 1 elsewhere. Returned as an
            int8 array with the shape of dat; every element carries the same
            value because the test yields a single verdict.
        dat = Input dataset, a numeric real vector.
        t = time record associated with dat
        ord_n (optional, defaults to 1) = Polynomial order. The absolute
            value, rounded to the nearest integer, is used.
        nstd (optional, defaults to 3) = Factor by how much the standard
            deviation must be reduced before qcflag switches from 1 to 0.
            The absolute value, truncated to an integer, is used.
        strict_validation (optional, defaults to False) = Flag asserting
            testing of inputs.

    Raises:

        ValueError if strict validation is requested and an input fails it.

    References:

        OOI (2012). Data Product Specification for Trend Test. Document
            Control Number 1341-10007.
            https://alfresco.oceanobservatories.org/ (See: Company Home >> OOI
            >> Controlled >> 1000 System Level >>
            1341-10007_Data_Product_SPEC_TRNDTST_OOI.pdf)
    """
    # NOTE: an earlier definition of dataqc_polytrendtest exists in this
    # module; being the later one, this is the binding callers actually get.
    dat = np.atleast_1d(dat)
    t = np.atleast_1d(t)

    if strict_validation:
        # Fixed-order pairs instead of dict.iteritems(), which only exists
        # on Python 2.
        array_inputs = (('dat', dat), ('t', t))
        scalar_inputs = (('ord_n', ord_n), ('nstd', nstd))

        for k, arg in array_inputs + scalar_inputs:
            if not utils.isnumeric(arg).all():
                raise ValueError('\'{0}\' must be numeric'.format(k))

            if not utils.isreal(arg).all():
                raise ValueError('\'{0}\' must be real'.format(k))

        for k, arg in array_inputs:
            if not utils.isvector(arg):
                raise ValueError('\'{0}\' must be a vector'.format(k))

        for k, arg in scalar_inputs:
            if not utils.isscalar(arg):
                raise ValueError('\'{0}\' must be a scalar'.format(k))

    ord_n = int(round(abs(ord_n)))
    nstd = int(abs(nstd))

    # Polynomial fit over the supplied time record, evaluated at the same
    # sample points.
    pp = np.polyfit(t, dat, ord_n)
    datpp = np.polyval(pp, t)

    # test for a trend: even after scaling by nstd, the residual scatter is
    # smaller than the scatter of the raw data.
    if np.atleast_1d((np.std(dat - datpp) * nstd) < np.std(dat)).all():
        trndtst = 0
    else:
        trndtst = 1

    # insure output size equals input, even though test yields a single value.
    qcflag = np.ones(dat.shape).astype('int8') * trndtst
    return qcflag