def test_load_json_stringio(self):
    st = io.StringIO()
    qc = QcConfig(self.yamlfile)
    with open(self.yamlfile, 'rt') as f:
        js = json.dumps(yaml.load(f.read(), Loader=yaml.Loader))
    st.write(js)
    qc = QcConfig(st)
    st.close()
    assert qc.config == self.expected_dict

def test_different_kwargs_run(self):
    config = deepcopy(self.config)
    config['qartod']['location_test'] = {
        'bbox': [-100, -40, 100, 40]
    }
    xs = [-101, -100, -99, 0, 99, 100, 101]
    ys = [-41, -40, -39, 0, 39, 40, 41]

    qc = QcConfig(config)
    r = qc.run(
        inp=list(range(13)),
        lat=ys,
        lon=xs
    )

    range_expected = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
    npt.assert_array_equal(
        r['qartod']['gross_range_test'],
        range_expected
    )
    location_expected = np.array([4, 1, 1, 1, 1, 1, 4])
    npt.assert_array_equal(
        r['qartod']['location_test'],
        location_expected
    )

def test_run(self):
    qc = QcConfig(self.config)
    r = qc.run(inp=list(range(13)))
    expected = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
    npt.assert_array_equal(r['qartod']['gross_range_test'], expected)
    assert 'aggregate' not in r['qartod']

def test_pressure_increasing_test(self):
    qc = QcConfig({
        'argo': {
            'pressure_increasing_test': {}
        }
    })
    self.perf_test(qc)

def test_qartod_compare(self):
    qc = QcConfig({
        'qartod': {
            'gross_range_test': {
                'suspect_span': [1, 11],
                'fail_span': [0, 12],
            },
            'spike_test': {
                'suspect_threshold': 3,
                'fail_threshold': 6,
            },
            'rate_of_change_test': {
                'threshold': 2.5,
            }
        }
    })
    results = qc.run(inp=self.inp, tinp=self.times, zinp=self.zinp)
    all_tests = [
        results['qartod'][test_name]
        for test_name in list(results['qartod'])
    ]

    def run_fn():
        qartod.qartod_compare(all_tests)

    self.perf_test(None, method_name='qartod_compare', run_fn=run_fn)

def test_with_empty_config(self):
    self.config['qartod']['flat_line_test'] = None
    qc = QcConfig(self.config)
    r = qc.run(inp=list(range(13)))
    assert 'gross_range_test' in r['qartod']
    assert 'flat_line_test' not in r['qartod']

def test_load_yaml_stringio(self):
    st = io.StringIO()
    with open(self.yamlfile, 'rt') as f:
        st.write(f.read())
    qc = QcConfig(st)
    st.close()
    assert qc.config == self.expected_dict

def test_attenuated_signal_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'threshold': (2.5, 5),
            }
        }
    })
    self.perf_test(qc)

def test_rate_of_change_test(self):
    qc = QcConfig({
        'qartod': {
            'rate_of_change_test': {
                'threshold': 2.5,
            }
        }
    })
    self.perf_test(qc)

def test_using_config(self):
    config = {'argo': {'pressure_increasing_test': {}}}
    qc = QcConfig(config)
    r = qc.run(
        inp=np.array([0.0, 2.0, 2.0, 1.99, 2.3, 2.4, 2.4, 2.5], dtype='float32')
    )
    expected = np.array([1, 1, 3, 3, 1, 1, 3, 1])
    npt.assert_array_equal(r['argo']['pressure_increasing_test'], expected)

def test_comparing_nc_and_qc_from_nc(self):
    c = NcQcConfig(self.fp)
    ncresults = c.run(self.fp)

    qcr = QcConfig(c.config['data1'])
    result = qcr.run(inp=list(range(13)))

    npt.assert_array_equal(
        ncresults['data1']['qartod']['gross_range_test'],
        result['qartod']['gross_range_test'],
        self.expected
    )

def test_location_test(self):
    qc = QcConfig({
        'qartod': {
            'location_test': {
                'lon': self.lon,
                'lat': self.lat,
            }
        }
    })
    self.perf_test(qc)

def test_gross_range(self):
    qc = QcConfig({
        'qartod': {
            'gross_range_test': {
                'suspect_span': [1, 11],
                'fail_span': [0, 12],
            }
        }
    })
    self.perf_test(qc)

def test_spike_test(self):
    qc = QcConfig({
        'qartod': {
            'spike_test': {
                'suspect_threshold': 3,
                'fail_threshold': 6,
            }
        }
    })
    self.perf_test(qc)

def test_attenuated_signal_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'suspect_threshold': 5,
                'fail_threshold': 2.5,
            }
        }
    })
    self.perf_test(qc)

def test_attenuated_signal_with_time_period_test(self):
    qc = QcConfig({
        'qartod': {
            'attenuated_signal_test': {
                'suspect_threshold': 5,
                'fail_threshold': 2.5,
                'test_period': 86400
            }
        }
    })
    self.perf_test(qc)

def test_flat_line_test(self):
    qc = QcConfig({
        'qartod': {
            'flat_line_test': {
                'suspect_threshold': 43200,
                'fail_threshold': 86400,
                'tolerance': 1,
            }
        }
    })
    self.perf_test(qc)

def test_location_test__with_range_max(self):
    qc = QcConfig({
        'qartod': {
            'location_test': {
                'lon': self.lon,
                'lat': self.lat,
                'range_max': 1,
            }
        }
    })
    self.perf_test(qc)

def test_comparing_nc_and_qc_config(self):
    # Compare results from QcConfig to those from NcQcConfig
    nc_config = NcQcConfig(self.config)
    nc_results = nc_config.run(self.fp)

    qc_config = QcConfig(self.config['data1'])
    qc_results = qc_config.run(inp=self.data)

    npt.assert_array_equal(
        nc_results['data1']['qartod']['gross_range_test'],
        qc_results['qartod']['gross_range_test'],
        self.expected
    )

def test_speed_test(self):
    qc = QcConfig({
        'argo': {
            'speed_test': {
                'tinp': self.times,
                'lon': self.lon,
                'lat': self.lat,
                'suspect_threshold': 1,
                'fail_threshold': 3,
            }
        }
    })
    self.perf_test(qc)

def test_climatology_test(self):
    qc = QcConfig({
        'qartod': {
            'climatology_test': {
                'config': [
                    {
                        'vspan': (10, 20),
                        'tspan': (0, 1),
                        'period': 'quarter'
                    },
                ]
            }
        }
    })
    self.perf_test(qc)

def test_with_values_in_config(self):
    config = deepcopy(self.config)
    config['qartod']['location_test'] = {
        'bbox': [-100, -40, 100, 40],
        'lat': [-41, -40, -39, 0, 39, 40, 41],
        'lon': [-101, -100, -99, 0, 99, 100, 101],
    }
    config['qartod']['gross_range_test']['inp'] = list(range(13))

    qc = QcConfig(config)
    r = qc.run()

    range_expected = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
    npt.assert_array_equal(r['qartod']['gross_range_test'], range_expected)
    location_expected = np.array([4, 1, 1, 1, 1, 1, 4])
    npt.assert_array_equal(r['qartod']['location_test'], location_expected)

def test_climatology_config_test(self):
    tests = [
        (np.datetime64('2011-01-02 00:00:00'), 11, None)
    ]
    times, values, depths = zip(*tests)

    qc = QcConfig(self.yamlfile)
    results = qc.run(
        tinp=times,
        inp=values,
        zinp=depths
    )
    npt.assert_array_equal(
        results['qartod']['climatology_test'],
        np.ma.array([1])
    )

def run_qartod(df, config, time="time", depth="depth"):
    # Run the QARTOD tests defined in `config` for each variable and attach
    # the resulting flags to the data structure.
    # TODO: QcConfig is deprecated; move to the Stream method in the near future.
    for var in config.keys():
        qc = QcConfig(config[var])
        qc_result = qc.run(
            inp=df[var],
            tinp=df[time],
            zinp=df[depth],
        )
        for module, tests in qc_result.items():
            for test, flag in tests.items():
                flag_name = var + "_" + module + "_" + test
                if isinstance(df, xr.Dataset):
                    df[flag_name] = (df[var].dims, flag)
                else:
                    df.loc[flag_name] = flag
    return df

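# A minimal usage sketch for run_qartod(), kept as a comment so it does not
# run on import. The variable name, spans, and file path are hypothetical;
# the per-variable config dict follows the same shape as the ones built in
# the tests above.
#
#     config = {
#         "sea_water_temperature": {            # hypothetical variable name
#             "qartod": {
#                 "gross_range_test": {
#                     "suspect_span": [5, 30],  # hypothetical spans
#                     "fail_span": [0, 40],
#                 },
#             }
#         }
#     }
#     ds = xr.open_dataset("profile.nc")        # hypothetical file
#     ds = run_qartod(ds, config, time="time", depth="depth")
#     # flags land in e.g. ds["sea_water_temperature_qartod_gross_range_test"]
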
def test_climatology_test_depths(self):
    tests = [
        (np.datetime64('2012-01-02 00:00:00'), 51, 2),
        (np.datetime64('2012-01-02 00:00:00'), 71, 90),
        (np.datetime64('2012-01-02 00:00:00'), 42, None),
        (np.datetime64('2012-01-02 00:00:00'), 59, 11),
        (np.datetime64('2012-01-02 00:00:00'), 79, 101),
    ]
    times, values, depths = zip(*tests)

    qc = QcConfig(self.yamlfile)
    results = qc.run(
        tinp=times,
        inp=values,
        zinp=depths
    )
    npt.assert_array_equal(
        results['qartod']['climatology_test'],
        np.ma.array([1, 1, 1, 3, 9])
    )

def test_run_with_agg(self):
    qc = QcConfig({
        'qartod': {
            'gross_range_test': {
                'fail_span': [0, 12],
            },
            'spike_test': {
                'suspect_threshold': 3,
                'fail_threshold': 10,
            },
            'aggregate': {}
        }
    })
    inp = [-1, 0, 1, 2, 10, 3]
    expected_gross_range = np.array([4, 1, 1, 1, 1, 1])
    expected_spike = np.array([2, 1, 1, 3, 3, 2])
    expected_agg = np.array([4, 1, 1, 3, 3, 1])

    r = qc.run(inp=inp)
    npt.assert_array_equal(r['qartod']['gross_range_test'], expected_gross_range)
    npt.assert_array_equal(r['qartod']['spike_test'], expected_spike)
    npt.assert_array_equal(r['qartod']['aggregate'], expected_agg)

def test_load_path_object(self):
    qc = QcConfig(Path(self.yamlfile))
    assert qc.config == self.expected_dict

def test_load_file_path(self):
    qc = QcConfig(self.yamlfile)
    assert qc.config == self.expected_dict

def test_load_yaml_dict_object(self):
    with open(self.yamlfile) as f:
        y = yaml.load(f.read(), Loader=yaml.Loader)
    qc = QcConfig(y)
    assert qc.config == self.expected_dict

def qc(self, dataset_ids=None, verbose=False, skip_units=False):
    """Light quality check on data.

    This runs one IOOS QARTOD test on the data as a first-order quality
    check. Only returns data that is quality checked.

    Requires pint for unit handling. Requires user-input `criteria` and
    `var_def` to run.

    This is slow if the data covers a large chunk of both time and space,
    so narrow the selection by both as much as possible first.

    Parameters
    ----------
    dataset_ids: str, list, optional
        Read in data for dataset_ids specifically. If none are provided,
        data will be read in for all `self.keys()`.
    verbose: boolean, optional
        If True, report summary statistics on QC flag distribution in
        datasets.
    skip_units: boolean, optional
        If True, do not interpret or alter units and assume the data is
        already in the units described in `var_def`.

    Returns
    -------
    Dataset with an added variable for each variable that was checked,
    named [variable]+'_qc'.

    Notes
    -----
    Code has been saved for data in DataFrames, but is changing so that
    data will be in Datasets. This way, cf-xarray functionality can be
    used for custom variable names, and it is easier to have recognizable
    units for variables with netCDF than with csv.
    """
    assertion = (
        "Need to have custom criteria and variable information defined to run QC."
    )
    assert self.criteria and self.var_def, assertion

    if dataset_ids is None:
        # Only return already read-in dataset_ids
        data_ids = self.keys()  # self.dataset_ids
    else:
        data_ids = dataset_ids
        if not isinstance(data_ids, list):
            data_ids = [data_ids]

    data_out = {}
    for data_id in data_ids:
        # access the Dataset
        dd = self[data_id]

        # which custom variable names are in dataset
        # dd_varnames are the variable names in the Dataset dd
        # cf_varnames are the custom names we can use to refer to the
        # variables through cf-xarray
        if isinstance(dd, pd.DataFrame):
            varnames, cf_varnames = [], []
            for var in self.var_def.keys():
                try:
                    varname = dd.cf[var].name
                    varnames.append(varname)
                    cf_varnames.append(var)
                except:
                    pass
        elif isinstance(dd, xr.Dataset):
            varnames = [
                (cf_xarray.accessor._get_custom_criteria(dd, var), var)
                for var in self.var_def.keys()
                if len(cf_xarray.accessor._get_custom_criteria(dd, var)) > 0
            ]
        assert len(varnames) > 0, "no custom names matched in Dataset."
        if isinstance(dd, pd.DataFrame):
            dd_varnames = varnames.copy()
        elif isinstance(dd, xr.Dataset):
            dd_varnames, cf_varnames = zip(*varnames)
            dd_varnames = sum(dd_varnames, [])
            assert len(dd_varnames) == len(
                cf_varnames
            ), "looks like multiple variables might have been identified for a custom variable name"

        # subset to just the BOEM or requested variables for each df or ds
        if isinstance(dd, pd.DataFrame):
            dd2 = dd[list(varnames)]
        elif isinstance(dd, xr.Dataset):
            dd2 = dd.cf[cf_varnames]  # dd2 = dd[varnames] is equivalent

        if not skip_units:
            # Preprocess to change salinity units away from 1e-3
            if isinstance(dd, pd.DataFrame):
                # this replaces units of 1e-3 in the 2nd column level with psu
                new_levs = [
                    "psu" if col == "1e-3" else col for col in dd2.columns.levels[1]
                ]
                dd2.columns.set_levels(new_levs, level=1, inplace=True)
            elif isinstance(dd, xr.Dataset):
                for Var in dd2.data_vars:
                    if "units" in dd2[Var].attrs and dd2[Var].attrs["units"] == "1e-3":
                        dd2[Var].attrs["units"] = "psu"

            # run pint quantify on each data structure
            dd2 = dd2.pint.quantify()
            # dd2 = dd2.pint.quantify(level=-1)

            # go through each variable by name to make sure it is in the
            # correct units; this has to happen in a separate loop so that
            # everything can be dequantified afterward
            if isinstance(dd, pd.DataFrame):
                print("NOT IMPLEMENTED FOR DATAFRAME YET")
            elif isinstance(dd, xr.Dataset):
                # form of "temp": "degree_Celsius"
                units_dict = {
                    dd_varname: self.var_def[cf_varname]["units"]
                    for (dd_varname, cf_varname) in zip(dd_varnames, cf_varnames)
                }
                # convert to conventional units
                dd2 = dd2.pint.to(units_dict)

            dd2 = dd2.pint.dequantify()

        # now loop for QARTOD on each variable
        for dd_varname, cf_varname in zip(dd_varnames, cf_varnames):
            # run QARTOD
            qc_config = {
                "qartod": {
                    "gross_range_test": {
                        "fail_span": self.var_def[cf_varname]["fail_span"],
                        "suspect_span": self.var_def[cf_varname]["suspect_span"],
                    },
                }
            }
            qc = QcConfig(qc_config)
            qc_results = qc.run(inp=dd2[dd_varname])
            # qc_results = qc.run(inp=dd2.cf[cf_varname])  # this isn't working for some reason

            # put flags into dataset
            new_qc_var = f"{dd_varname}_qc"
            if isinstance(dd, pd.DataFrame):
                dd2[new_qc_var] = qc_results["qartod"]["gross_range_test"]
            elif isinstance(dd, xr.Dataset):
                new_data = qc_results["qartod"]["gross_range_test"]
                dims = dd2[dd_varname].dims
                dd2[new_qc_var] = (dims, new_data)

        data_out[data_id] = dd2

    if verbose:
        for dataset_id, dd in data_out.items():
            print(dataset_id)
            qckeys = dd[[var for var in dd.data_vars if "_qc" in var]]
            for qckey in qckeys:
                print(qckey)
                for flag, desc in odg.qcdefs.items():
                    print(
                        f"Flag == {flag} ({desc}): {int((dd[qckey] == int(flag)).sum())}"
                    )

    return data_out
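
# A minimal usage sketch for the qc() method, kept as a comment so it does
# not run on import. The object name `search` and the dataset_id are
# hypothetical; it assumes `criteria` and `var_def` have already been set,
# as the method's own assertion requires.
#
#     data_out = search.qc(dataset_ids="hypothetical_dataset_id", verbose=True)
#     ds = data_out["hypothetical_dataset_id"]
#     # each checked variable gains a companion "<variable>_qc" flag variable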