def test_load_yaml_stringio(self):
    st = io.StringIO()
    with open(self.yamlfile, 'rt') as f:
        st.write(f.read())
    qc = QcConfig(st)
    st.close()
    assert qc.config == self.expected_dict
def test_rate_of_change_test(self):
    qc = QcConfig({
        'qartod': {
            'rate_of_change_test': {
                'threshold': 2.5,
            }
        }
    })
    self.perf_test(qc)
def test_attenuated_signal_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'threshold': (2.5, 5),
            }
        }
    })
    self.perf_test(qc)
def test_with_values_in_config(self):
    config = deepcopy(self.config)
    config['qartod']['location_test'] = {
        'bbox': [-100, -40, 100, 40],
        'lat': [-41, -40, -39, 0, 39, 40, 41],
        'lon': [-101, -100, -99, 0, 99, 100, 101],
    }
    config['qartod']['gross_range_test']['inp'] = list(range(13))

    qc = QcConfig(config)
    r = qc.run()

    range_expected = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
    npt.assert_array_equal(r['qartod']['gross_range_test'], range_expected)
    location_expected = np.array([4, 1, 1, 1, 1, 1, 4])
    npt.assert_array_equal(r['qartod']['location_test'], location_expected)
def test_location_test(self):
    qc = QcConfig({
        'qartod': {
            'location_test': {
                'lon': self.lon,
                'lat': self.lat,
            }
        }
    })
    self.perf_test(qc)
def test_gross_range(self):
    qc = QcConfig({
        'qartod': {
            'gross_range_test': {
                'suspect_span': [1, 11],
                'fail_span': [0, 12],
            }
        }
    })
    self.perf_test(qc)
def test_spike_test(self):
    qc = QcConfig({
        'qartod': {
            'spike_test': {
                'suspect_threshold': 3,
                'fail_threshold': 6,
            }
        }
    })
    self.perf_test(qc)
def test_attenuated_signal_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'suspect_threshold': 5,
                'fail_threshold': 2.5,
            }
        }
    })
    self.perf_test(qc)
def test_climatology_config_test(self):
    tests = [
        (np.datetime64('2011-01-02 00:00:00'), 11, None),
    ]
    times, values, depths = zip(*tests)

    qc = QcConfig(self.yamlfile)
    results = qc.run(
        tinp=times,
        inp=values,
        zinp=depths
    )
    npt.assert_array_equal(
        results['qartod']['climatology_test'],
        np.ma.array([1])
    )
def test_comparing_nc_and_qc_from_dict(self):
    c = NcQcConfig({
        'data1': {
            'qartod': {
                'gross_range_test': self.config
            }
        }
    })
    ncresults = c.run(self.fp)

    qcr = QcConfig(c.config['data1'])
    result = qcr.run(
        inp=list(range(13))
    )
    npt.assert_array_equal(
        ncresults['data1']['qartod']['gross_range_test'],
        result['qartod']['gross_range_test'],
        self.expected
    )
def test_location_test__with_range_max(self):
    qc = QcConfig({
        'qartod': {
            'location_test': {
                'lon': self.lon,
                'lat': self.lat,
                'range_max': 1,
            }
        }
    })
    self.perf_test(qc)
def test_attenuated_signal_with_time_period_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'suspect_threshold': 5,
                'fail_threshold': 2.5,
                'test_period': 86400
            }
        }
    })
    self.perf_test(qc)
def test_flat_line_test(self):
    qc = QcConfig({
        'qartod': {
            'flat_line_test': {
                'suspect_threshold': 43200,
                'fail_threshold': 86400,
                'tolerance': 1,
            }
        }
    })
    self.perf_test(qc)
def run_qartod(df, config, time="time", depth="depth"):
    # Run QARTOD tests.
    # TODO: QcConfig is deprecated; move to the Stream method in the near future.
    for var in config.keys():
        qc = QcConfig(config[var])
        qc_result = qc.run(
            inp=df[var],
            tinp=df[time],
            zinp=df[depth],
        )
        for module, tests in qc_result.items():
            for test, flag in tests.items():
                flag_name = var + "_" + module + "_" + test
                if type(df) is xr.Dataset:
                    df[flag_name] = (df[var].dims, flag)
                else:
                    # add the flags as a new DataFrame column
                    df[flag_name] = flag
    return df
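A minimal usage sketch for run_qartod, assuming a pandas DataFrame with "time" and "depth" columns and a config keyed by variable name; the "salinity" variable, its values, and the thresholds below are hypothetical:

import numpy as np
import pandas as pd

df = pd.DataFrame({
    "time": pd.to_datetime(["2021-01-01 00:00", "2021-01-01 01:00",
                            "2021-01-01 02:00", "2021-01-01 03:00",
                            "2021-01-01 04:00"]),
    "depth": np.zeros(5),
    "salinity": [33.0, 33.1, 45.0, 33.2, 33.1],  # hypothetical values
})
config = {
    "salinity": {
        "qartod": {
            "gross_range_test": {
                "suspect_span": [30, 36],
                "fail_span": [25, 40],
            }
        }
    }
}
# adds a "salinity_qartod_gross_range_test" column of QARTOD flags
df = run_qartod(df, config)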
def test_speed_test(self):
    qc = QcConfig({
        'argo': {
            'speed_test': {
                'tinp': self.times,
                'lon': self.lon,
                'lat': self.lat,
                'suspect_threshold': 1,
                'fail_threshold': 3,
            }
        }
    })
    self.perf_test(qc)
def test_climatology_test_depths(self):
    tests = [
        (np.datetime64('2012-01-02 00:00:00'), 51, 2),
        (np.datetime64('2012-01-02 00:00:00'), 71, 90),
        (np.datetime64('2012-01-02 00:00:00'), 42, None),
        (np.datetime64('2012-01-02 00:00:00'), 59, 11),
        (np.datetime64('2012-01-02 00:00:00'), 79, 101),
    ]
    times, values, depths = zip(*tests)

    qc = QcConfig(self.yamlfile)
    results = qc.run(
        tinp=times,
        inp=values,
        zinp=depths
    )
    npt.assert_array_equal(
        results['qartod']['climatology_test'],
        np.ma.array([1, 1, 1, 3, 9])
    )
def test_run_with_agg(self):
    qc = QcConfig({
        'qartod': {
            'gross_range_test': {
                'fail_span': [0, 12],
            },
            'spike_test': {
                'suspect_threshold': 3,
                'fail_threshold': 10,
            },
            'aggregate': {}
        }
    })
    inp = [-1, 0, 1, 2, 10, 3]
    expected_gross_range = np.array([4, 1, 1, 1, 1, 1])
    expected_spike = np.array([2, 1, 1, 3, 3, 2])
    expected_agg = np.array([4, 1, 1, 3, 3, 1])

    r = qc.run(inp=inp)
    npt.assert_array_equal(r['qartod']['gross_range_test'], expected_gross_range)
    npt.assert_array_equal(r['qartod']['spike_test'], expected_spike)
    npt.assert_array_equal(r['qartod']['aggregate'], expected_agg)
def test_climatology_test(self):
    qc = QcConfig({
        'qartod': {
            'climatology_test': {
                'config': [
                    {
                        'vspan': (10, 20),
                        'tspan': (0, 1),
                        'period': 'quarter'
                    },
                ]
            }
        }
    })
    self.perf_test(qc)
"time", "z", ] data = e.to_pandas( index_col="time (UTC)", parse_dates=True, ) data["timestamp"] = data.index.astype("int64") // 1e9 data.to_csv(fname) data.head() from ioos_qc.config import QcConfig qc = QcConfig(qc_config) qc_results = qc.run( inp=data["sea_surface_height_above_sea_level_geoid_mhhw (m)"], tinp=data["timestamp"], zinp=data["z (m)"], gen_agg=True ) qc_results The results are returned in a dictionary format, similar to the input configuration, with a mask for each test. While the mask **is** a masked array it should not be applied as such. The results range from 1 to 4 meaning: 1. data passed the QA/QC 2. did not run on this data point 3. flag as suspect
def test_load_json_str(self):
    with open(self.yamlfile) as f:
        js = json.dumps(yaml.load(f.read(), Loader=yaml.Loader))
    qc = QcConfig(js)
    assert qc.config == self.expected_dict
def test_climatology_config_yaml_conversion(self):
    qc = QcConfig(self.yamlfile)
    yaml_climatology_config = ClimatologyConfig.convert(
        qc.config['qartod']['climatology_test']['config'])
    self._assert_cc_configs_equal(self.cc, yaml_climatology_config)
def test_climatology_json_conversion(self):
    qc = QcConfig(self.json_config)
    json_climatology_config = ClimatologyConfig.convert(
        qc.config['qartod']['climatology_test']['config'])
    self._assert_cc_configs_equal(self.cc, json_climatology_config)
def test_run(self):
    qc = QcConfig(self.config)
    r = qc.run(inp=list(range(13)))
    expected = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
    npt.assert_array_equal(r['qartod']['gross_range_test'], expected)
def test_load_file_path(self):
    qc = QcConfig(self.yamlfile)
    assert qc.config == self.expected_dict
def test_load_yaml_str(self):
    with open(self.yamlfile) as f:
        qc = QcConfig(f.read())
    assert qc.config == self.expected_dict
def test_load_yaml_dict_object(self):
    with open(self.yamlfile) as f:
        y = yaml.load(f.read(), Loader=yaml.Loader)
    qc = QcConfig(y)
    assert qc.config == self.expected_dict
def execute_qartod_test(self, qartod_test_record, dataset):
    """
    Run a single QARTOD test against the given dataset and record the results in the dataset.
    :param qartod_test_record: QartodTestRecord indicating a test to execute
    :param dataset: xarray.Dataset holding the science data the QARTOD test evaluates
    :return:
    """
    # Extract configuration details for test inputs referring to dataset variables
    params = qartod_test_record.parameters
    # single quoted strings in parameters (i.e. from the database field) will mess up the json.loads call
    params = params.replace("'", "\"")
    try:
        param_dict = json.loads(params)
    except ValueError:
        log.error('<%s> Failure deserializing QC parameter configuration %r',
                  self.request_id, params)
        return

    parameter_under_test = param_dict['inp']
    # can't run test on data that's not there
    if parameter_under_test not in dataset:
        return

    # Extract configuration details for remaining test inputs
    config = qartod_test_record.qcConfig
    # single quoted strings in qcConfig (i.e. from the database field) will mess up the json.loads call
    config = config.replace("'", "\"")
    try:
        qc_config = QcConfig(json.loads(config))
    except ValueError:
        log.error('<%s> Failure deserializing QC test configuration %r for parameter %r',
                  self.request_id, config, parameter_under_test)
        return

    # replace parameter names with the actual numpy arrays from the dataset for each entry in param_dict
    # cast keys to list instead of iterating dict directly because we may delete keys in this loop
    for input_name in list(param_dict.keys()):
        param_name = param_dict[input_name]
        if param_name:
            param_dict[input_name] = dataset[param_name].values
        else:
            # optional parameter set to None/null - remove it
            del param_dict[input_name]

    # call QARTOD test in a separate process to deal with crashes, e.g. segfaults
    read_fd, write_fd = os.pipe()
    processid = os.fork()
    if processid == 0:
        # child process
        with os.fdopen(write_fd, 'w') as w:
            os.close(read_fd)
            # run the qc function
            try:
                # all arguments except the data under test come from the configuration object
                # results is a nested dictionary
                results = qc_config.run(**param_dict)
                # convert results into a string for sending over pipe
                # NOTE: this converts numpy arrays to lists! Use np.asarray() to restore them.
                results_string = json.dumps(results, cls=NumpyEncoder)
                w.write(results_string)
            except (TypeError, ValueError) as e:
                log.exception('<%s> Failure executing QC with configuration %r %r',
                              self.request_id, config, e)
                w.write(EXCEPTION_MESSAGE)
        # child process is done, don't let it stick around
        os._exit(0)

    # parent process
    os.close(write_fd)
    with os.fdopen(read_fd) as r:
        results_string = r.read()
    # wait for the child process to prevent zombies - second argument of 0 means default behavior of waitpid
    os.waitpid(processid, 0)

    # check for failure to produce results
    if not results_string:
        # an error, e.g. segfault, prevented proper qc execution, proceed with trying the next qc function
        log.error('<%s> Failed to execute QC with configuration %r: QC process failed to return any data',
                  self.request_id, config)
        return

    if results_string == EXCEPTION_MESSAGE:
        # an exception has already been logged, proceed with trying the next qc function
        return

    # load the results dict from the results string
    results = json.loads(results_string)

    # results is a nested dictionary with the outer keys being module names, the inner keys being test
    # names, and the inner values being the results for the given test
    # e.g. {'qartod': {'gross_range_test': [0, 0, 3, 4, 0], 'location_test': [2, 2, 2, 2, 2]}}
    for module, test_set in results.items():
        for test, test_results in test_set.items():
            # test_results was converted from an np.array to a list during serialization, so convert it back
            test_results = np.asarray(test_results)

            # Verify all QC results are valid QARTOD Primary Level Flags
            mask = np.array([item not in QartodFlags.getValidQCFlags()
                             for item in test_results])
            if mask.any():
                log.error('Received QC result with invalid QARTOD Primary Flag from %s. Invalid flags: %r',
                          test, np.unique(test_results[mask]))
                # Use the "high interest" (SUSPECT) flag to draw attention to the failure
                test_results[mask] = QartodFlags.SUSPECT

            # add results to dataset
            QartodQcExecutor.insert_qc_results(parameter_under_test, test, test_results, dataset)
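The NumpyEncoder referenced above (for serializing the results over the pipe) is defined elsewhere in this codebase; a minimal sketch of such an encoder, which may differ from the actual class, is:

import json
import numpy as np

class NumpyEncoder(json.JSONEncoder):
    # convert numpy arrays and scalars into plain Python types json can serialize
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        return super(NumpyEncoder, self).default(obj)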
def test_load_path_object(self):
    qc = QcConfig(Path(self.yamlfile))
    assert qc.config == self.expected_dict
def qc(self, dataset_ids=None, verbose=False, skip_units=False):
    """Light quality check on data.

    This runs one IOOS QARTOD test on the data as a first-order quality
    check. Only returns data that is quality checked.

    Requires pint for unit handling. Requires user-input `criteria` and
    `var_def` to run.

    This is slow if your data is in chunks of both time and space, so
    narrow by both as much as possible first.

    Parameters
    ----------
    dataset_ids: str, list, optional
        Read in data for dataset_ids specifically. If none are provided,
        data will be read in for all `self.keys()`.
    verbose: boolean, optional
        If True, report summary statistics on QC flag distribution in
        datasets.
    skip_units: boolean, optional
        If True, do not interpret or alter units and assume the data is
        already in the units described in var_def.

    Returns
    -------
    Dataset with added variables for each variable in the dataset that was
    checked, with name of [variable]+'_qc'.

    Notes
    -----
    Code has been saved for data in DataFrames, but is changing so that
    data will be in Datasets. This way, we can use cf-xarray functionality
    for custom variable names, and it is easier to have recognizable units
    for variables with netCDF than CSV.
    """
    assertion = (
        "Need to have custom criteria and variable information defined to run QC."
    )
    assert self.criteria and self.var_def, assertion

    if dataset_ids is None:
        data_ids = self.keys()  # Only return already read-in dataset_ids
    else:
        data_ids = dataset_ids
        if not isinstance(data_ids, list):
            data_ids = [data_ids]

    data_out = {}
    for data_id in data_ids:
        # access the Dataset
        dd = self[data_id]
        # which custom variable names are in the dataset:
        # dd_varnames are the variable names in the Dataset dd
        # cf_varnames are the custom names we can use to refer to the
        # variables through cf-xarray
        if isinstance(dd, pd.DataFrame):
            varnames, cf_varnames = [], []
            for var in self.var_def.keys():
                try:
                    varname = dd.cf[var].name
                    varnames.append(varname)
                    cf_varnames.append(var)
                except Exception:
                    pass
        elif isinstance(dd, xr.Dataset):
            varnames = [
                (cf_xarray.accessor._get_custom_criteria(dd, var), var)
                for var in self.var_def.keys()
                if len(cf_xarray.accessor._get_custom_criteria(dd, var)) > 0
            ]
        assert len(varnames) > 0, "no custom names matched in Dataset."

        if isinstance(dd, pd.DataFrame):
            dd_varnames = varnames.copy()
        elif isinstance(dd, xr.Dataset):
            dd_varnames, cf_varnames = zip(*varnames)
            dd_varnames = sum(dd_varnames, [])
            assert len(dd_varnames) == len(cf_varnames), (
                "looks like multiple variables might have been identified "
                "for a custom variable name"
            )

        # subset to just the boem or requested variables for each df or ds
        if isinstance(dd, pd.DataFrame):
            dd2 = dd[list(varnames)]
        elif isinstance(dd, xr.Dataset):
            dd2 = dd.cf[cf_varnames]
            # dd2 = dd[varnames]  # equivalent

        if not skip_units:
            # Preprocess to change salinity units away from 1e-3
            if isinstance(dd, pd.DataFrame):
                # this replaces units of 1e-3 in the 2nd column level with psu
                new_levs = [
                    "psu" if col == "1e-3" else col
                    for col in dd2.columns.levels[1]
                ]
                dd2.columns.set_levels(new_levs, level=1, inplace=True)
            elif isinstance(dd, xr.Dataset):
                for Var in dd2.data_vars:
                    if ("units" in dd2[Var].attrs
                            and dd2[Var].attrs["units"] == "1e-3"):
                        dd2[Var].attrs["units"] = "psu"

            # run pint quantify on each data structure
            dd2 = dd2.pint.quantify()
            # dd2 = dd2.pint.quantify(level=-1)

            # go through each variable by name to make sure it is in the
            # correct units; have to do this in a separate loop so that we
            # can dequantify afterward
            if isinstance(dd, pd.DataFrame):
                print("NOT IMPLEMENTED FOR DATAFRAME YET")
            elif isinstance(dd, xr.Dataset):
                # form of "temp": "degree_Celsius"
                units_dict = {
                    dd_varname: self.var_def[cf_varname]["units"]
                    for (dd_varname, cf_varname) in zip(dd_varnames, cf_varnames)
                }
                # convert to conventional units
                dd2 = dd2.pint.to(units_dict)

            dd2 = dd2.pint.dequantify()

        # now loop to run QARTOD on each variable
        for dd_varname, cf_varname in zip(dd_varnames, cf_varnames):
            # run QARTOD
            qc_config = {
                "qartod": {
                    "gross_range_test": {
                        "fail_span": self.var_def[cf_varname]["fail_span"],
                        "suspect_span": self.var_def[cf_varname]["suspect_span"],
                    },
                }
            }
            qc = QcConfig(qc_config)
            qc_results = qc.run(inp=dd2[dd_varname])
            # qc_results = qc.run(inp=dd2.cf[cf_varname])  # this isn't working for some reason

            # put flags into dataset
            new_qc_var = f"{dd_varname}_qc"
            if isinstance(dd, pd.DataFrame):
                dd2[new_qc_var] = qc_results["qartod"]["gross_range_test"]
            elif isinstance(dd, xr.Dataset):
                new_data = qc_results["qartod"]["gross_range_test"]
                dims = dd2[dd_varname].dims
                dd2[new_qc_var] = (dims, new_data)

        data_out[data_id] = dd2

    if verbose:
        for dataset_id, dd in data_out.items():
            print(dataset_id)
            qckeys = dd[[var for var in dd.data_vars if "_qc" in var]]
            for qckey in qckeys:
                print(qckey)
                for flag, desc in odg.qcdefs.items():
                    print(
                        f"Flag == {flag} ({desc}): {int((dd[qckey] == int(flag)).sum())}"
                    )

    return data_out
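For reference, the var_def lookups above (units, fail_span, suspect_span) imply entries shaped roughly like this hypothetical example; the actual definitions live in user-supplied configuration:

var_def = {
    "temp": {
        "units": "degree_Celsius",  # conventional units to convert to
        "fail_span": [-2, 40],      # hypothetical gross-range fail bounds
        "suspect_span": [0, 35],    # hypothetical gross-range suspect bounds
    },
}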