def test_load_yaml_stringio(self):
    """QcConfig should accept an in-memory text stream containing YAML."""
    buf = io.StringIO()
    with open(self.yamlfile, 'rt') as f:
        buf.write(f.read())
    qc = QcConfig(buf)
    buf.close()
    assert qc.config == self.expected_dict
Example #2
0
 def test_rate_of_change_test(self):
     """Benchmark rate_of_change_test through QcConfig."""
     params = {'threshold': 2.5}
     config = {'qartod': {'rate_of_change_test': params}}
     self.perf_test(QcConfig(config))
Example #3
0
 def test_attenuated_signal_test(self):
     """Benchmark attenuated_signal_test with a (suspect, fail) tuple threshold."""
     params = {'threshold': (2.5, 5)}
     config = {'qartod': {'attenuated_signal_test': params}}
     self.perf_test(QcConfig(config))
Example #4
0
    def test_with_values_in_config(self):
        """run() should honor input arrays embedded directly in the config."""
        config = deepcopy(self.config)
        config['qartod']['location_test'] = {
            'bbox': [-100, -40, 100, 40],
            'lat': [-41, -40, -39, 0, 39, 40, 41],
            'lon': [-101, -100, -99, 0, 99, 100, 101],
        }
        config['qartod']['gross_range_test']['inp'] = list(range(13))

        r = QcConfig(config).run()

        # Expected flags: ends out of range / out of bbox, middle all good.
        expectations = {
            'gross_range_test': np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3]),
            'location_test': np.array([4, 1, 1, 1, 1, 1, 4]),
        }
        for test_name, want in expectations.items():
            npt.assert_array_equal(r['qartod'][test_name], want)
Example #5
0
 def test_location_test(self):
     """Benchmark the basic location_test."""
     params = {
         'lon': self.lon,
         'lat': self.lat,
     }
     self.perf_test(QcConfig({'qartod': {'location_test': params}}))
Example #6
0
 def test_gross_range(self):
     """Benchmark gross_range_test with suspect and fail spans."""
     params = {
         'suspect_span': [1, 11],
         'fail_span': [0, 12],
     }
     self.perf_test(QcConfig({'qartod': {'gross_range_test': params}}))
Example #7
0
 def test_spike_test(self):
     """Benchmark spike_test with suspect/fail thresholds."""
     params = {
         'suspect_threshold': 3,
         'fail_threshold': 6,
     }
     self.perf_test(QcConfig({'qartod': {'spike_test': params}}))
Example #8
0
 def test_attenuated_signal_test(self):
     """Benchmark attenuated_signal_test with explicit suspect/fail thresholds."""
     params = {
         'suspect_threshold': 5,
         'fail_threshold': 2.5,
     }
     self.perf_test(QcConfig({'qartod': {'attenuated_signal_test': params}}))
Example #9
0
 def test_climatology_config_test(self):
     """A single in-span point should come back flagged GOOD (1)."""
     tests = [
         (np.datetime64('2011-01-02 00:00:00'), 11, None),
     ]
     times, values, depths = zip(*tests)
     results = QcConfig(self.yamlfile).run(
         tinp=times,
         inp=values,
         zinp=depths,
     )
     npt.assert_array_equal(
         results['qartod']['climatology_test'],
         np.ma.array([1]),
     )
Example #10
0
    def test_comparing_nc_and_qc_from_dict(self):
        """Running the netCDF-aware NcQcConfig over the file should give the
        same gross_range_test flags as running a plain QcConfig over the same
        per-variable configuration."""
        c = NcQcConfig({
            'data1': {
                'qartod': {
                    'gross_range_test': self.config
                }
            }
        })
        ncresults = c.run(self.fp)

        # Run the identical per-variable config directly, outside the netCDF wrapper.
        qcr = QcConfig(c.config['data1'])
        result = qcr.run(
            inp=list(range(13))
        )

        # NOTE(review): the third positional argument of assert_array_equal is
        # `err_msg`, so `self.expected` only serves as a failure message here —
        # it is NOT compared. Confirm whether a separate comparison against
        # self.expected was intended.
        npt.assert_array_equal(
            ncresults['data1']['qartod']['gross_range_test'],
            result['qartod']['gross_range_test'],
            self.expected
        )
Example #11
0
 def test_location_test__with_range_max(self):
     """Benchmark location_test when a range_max is configured."""
     params = {
         'lon': self.lon,
         'lat': self.lat,
         'range_max': 1,
     }
     self.perf_test(QcConfig({'qartod': {'location_test': params}}))
Example #12
0
 def test_attenuated_signal_with_time_period_test(self):
     """Benchmark attenuated_signal_test with a rolling test_period (1 day)."""
     params = {
         'suspect_threshold': 5,
         'fail_threshold': 2.5,
         'test_period': 86400,
     }
     self.perf_test(QcConfig({'qartod': {'attenuated_signal_test': params}}))
Example #13
0
 def test_flat_line_test(self):
     """Benchmark flat_line_test (12h suspect, 24h fail windows)."""
     params = {
         'suspect_threshold': 43200,
         'fail_threshold': 86400,
         'tolerance': 1,
     }
     self.perf_test(QcConfig({'qartod': {'flat_line_test': params}}))
def run_qartod(df, config, time="time", depth="depth"):
    """Run the QARTOD tests described by ``config`` against ``df``.

    Parameters
    ----------
    df : xarray.Dataset or pandas.DataFrame
        Data to check; must contain each variable named in ``config`` plus
        the ``time`` and ``depth`` columns/variables.
    config : dict
        Mapping of variable name -> per-variable QcConfig-style dict.
    time, depth : str, optional
        Names of the time and depth fields passed as ``tinp``/``zinp``.

    Returns
    -------
    ``df`` with one added flag field per (variable, module, test), named
    ``{var}_{module}_{test}``.
    """
    # TODO: QcConfig is deprecated upstream; migrate to the Stream method
    # in the near future.
    for var in config.keys():
        qc = QcConfig(config[var])
        qc_result = qc.run(
            inp=df[var],
            tinp=df[time],
            zinp=df[depth],
        )
        for module, tests in qc_result.items():
            for test, flag in tests.items():
                flag_name = f"{var}_{module}_{test}"
                # isinstance instead of `type(df) is ...` so Dataset
                # subclasses are handled the same way.
                if isinstance(df, xr.Dataset):
                    df[flag_name] = (df[var].dims, flag)
                else:
                    # NOTE(review): on a DataFrame, .loc[flag_name] assigns a
                    # *row* labelled flag_name — confirm a column assignment
                    # (df[flag_name] = flag) was not intended.
                    df.loc[flag_name] = flag
    return df
Example #15
0
 def test_speed_test(self):
     """Benchmark the argo speed_test with positions, times and thresholds."""
     params = {
         'tinp': self.times,
         'lon': self.lon,
         'lat': self.lat,
         'suspect_threshold': 1,
         'fail_threshold': 3,
     }
     self.perf_test(QcConfig({'argo': {'speed_test': params}}))
Example #16
0
 def test_climatology_test_depths(self):
     """Climatology flags across depths, including out-of-range depths.

     Expected: first three GOOD (1), fourth SUSPECT (3), fifth MISSING (9).
     """
     when = np.datetime64('2012-01-02 00:00:00')
     cases = [
         (when, 51, 2),
         (when, 71, 90),
         (when, 42, None),
         (when, 59, 11),
         (when, 79, 101),
     ]
     times, values, depths = zip(*cases)
     results = QcConfig(self.yamlfile).run(
         tinp=times,
         inp=values,
         zinp=depths,
     )
     npt.assert_array_equal(
         results['qartod']['climatology_test'],
         np.ma.array([1, 1, 1, 3, 9]),
     )
    def test_run_with_agg(self):
        """The aggregate flag should combine the worst flag of each test."""
        config = {
            'qartod': {
                'gross_range_test': {'fail_span': [0, 12]},
                'spike_test': {'suspect_threshold': 3, 'fail_threshold': 10},
                'aggregate': {},
            }
        }
        r = QcConfig(config).run(inp=[-1, 0, 1, 2, 10, 3])

        expectations = {
            'gross_range_test': np.array([4, 1, 1, 1, 1, 1]),
            'spike_test': np.array([2, 1, 1, 3, 3, 2]),
            'aggregate': np.array([4, 1, 1, 3, 3, 1]),
        }
        for name, want in expectations.items():
            npt.assert_array_equal(r['qartod'][name], want)
Example #18
0
 def test_climatology_test(self):
     """Benchmark climatology_test with a single quarterly span."""
     span = {
         'vspan': (10, 20),
         'tspan': (0, 1),
         'period': 'quarter'
     }
     config = {'qartod': {'climatology_test': {'config': [span]}}}
     self.perf_test(QcConfig(config))
        "time",
        "z",
    ]
    data = e.to_pandas(
        index_col="time (UTC)",
        parse_dates=True,
    )
    data["timestamp"] = data.index.astype("int64") // 1e9
    data.to_csv(fname)

# Peek at the downloaded data (notebook-style display of the first rows).
data.head()

from ioos_qc.config import QcConfig


# Build the QC runner from the configuration dict assembled earlier.
# NOTE(review): `qc_config` is defined elsewhere in this document — confirm
# it matches the column names used below.
qc = QcConfig(qc_config)

# Run every configured test; gen_agg=True also produces the aggregate flag.
qc_results =  qc.run(
    inp=data["sea_surface_height_above_sea_level_geoid_mhhw (m)"],
    tinp=data["timestamp"],
    zinp=data["z (m)"],
    gen_agg=True
)

# Notebook-style display of the nested {module: {test: flags}} result dict.
qc_results

The results are returned in a dictionary format, similar to the input configuration, with a mask for each test. While the mask **is** a masked array it should not be applied as such. The results range from 1 to 4, meaning:

1. data passed the QA/QC
2. test did not run on this data point
3. flagged as suspect
4. flagged as fail
 def test_load_json_str(self):
     """QcConfig should accept a JSON string serialization of the config.

     Uses yaml.safe_load: the config contains only plain YAML, and calling
     yaml.load without an explicit Loader is deprecated and unsafe.
     """
     with open(self.yamlfile) as f:
         js = json.dumps(yaml.safe_load(f.read()))
     qc = QcConfig(js)
     assert qc.config == self.expected_dict
Example #21
0
 def test_climatology_config_yaml_conversion(self):
     """YAML-sourced climatology config should convert to an equivalent ClimatologyConfig."""
     qc = QcConfig(self.yamlfile)
     converted = ClimatologyConfig.convert(
         qc.config['qartod']['climatology_test']['config']
     )
     self._assert_cc_configs_equal(self.cc, converted)
Example #22
0
 def test_climatology_json_conversion(self):
     """JSON-sourced climatology config should convert to an equivalent ClimatologyConfig."""
     qc = QcConfig(self.json_config)
     converted = ClimatologyConfig.convert(
         qc.config['qartod']['climatology_test']['config']
     )
     self._assert_cc_configs_equal(self.cc, converted)
Example #23
0
    def test_run(self):
        """Gross range flags over 0..12: only the endpoints are suspect."""
        results = QcConfig(self.config).run(inp=list(range(13)))
        want = np.array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3])
        npt.assert_array_equal(results['qartod']['gross_range_test'], want)
Example #24
0
 def test_load_file_path(self):
     """A plain string file path should load into the expected config dict."""
     loaded = QcConfig(self.yamlfile)
     assert loaded.config == self.expected_dict
 def test_load_yaml_str(self):
     """A raw YAML string should load into the expected config dict."""
     with open(self.yamlfile) as f:
         raw = f.read()
     assert QcConfig(raw).config == self.expected_dict
Example #26
0
 def test_load_yaml_dict_object(self):
     """An already-parsed YAML dict should be accepted directly."""
     with open(self.yamlfile) as f:
         parsed = yaml.load(f.read(), Loader=yaml.Loader)
         qc = QcConfig(parsed)
     assert qc.config == self.expected_dict
Example #27
0
    def execute_qartod_test(self, qartod_test_record, dataset):
        """
        Run a single QARTOD test against the given dataset and record the results in the dataset.

        The QC function itself runs in a forked child process (POSIX-only:
        uses os.fork/os.pipe) so that a native crash such as a segfault cannot
        take down the parent; results come back over the pipe as JSON.

        :param qartod_test_record: QartodTestRecord indicating a test to execute
        :param dataset: xarray.Dataset holding the science data the QARTOD test evaluates
        :return: None — results are inserted into ``dataset`` as a side effect
        """
        # Extract configuration details for test inputs referring to dataset variables
        params = qartod_test_record.parameters
        # single quoted strings in parameters (i.e. from the database field) will mess up the json.loads call
        # NOTE(review): this blanket replace would also corrupt legitimate
        # apostrophes inside string values — acceptable only if the database
        # field never contains them; confirm.
        params = params.replace("'", "\"")
        try:
            param_dict = json.loads(params)
        except ValueError:
            log.error(
                '<%s> Failure deserializing QC parameter configuration %r',
                self.request_id, params)
            return

        # The 'inp' entry names the dataset variable under test.
        parameter_under_test = param_dict['inp']

        # can't run test on data that's not there
        if parameter_under_test not in dataset:
            return

        # Extract configuration details for remaining test inputs
        config = qartod_test_record.qcConfig
        # single quoted strings in qcConfig (i.e. from the database field) will mess up the json.loads call
        config = config.replace("'", "\"")
        try:
            qc_config = QcConfig(json.loads(config))
        except ValueError:
            log.error(
                '<%s> Failure deserializing QC test configuration %r for parameter %r',
                self.request_id, config, parameter_under_test)
            return

        # replace parameter names with the actual numpy arrays from the dataset for each entry in param_dict
        # caste keys to list instead of iterating dict directly because we may delete keys in this loop
        for input_name in list(param_dict.keys()):
            param_name = param_dict[input_name]
            if param_name:
                param_dict[input_name] = dataset[param_name].values
            else:
                # optional parameter set to None/null - remove it
                del param_dict[input_name]

        # call QARTOD test in a separate process to deal with crashes, e.g. segfaults
        read_fd, write_fd = os.pipe()
        processid = os.fork()
        if processid == 0:
            # child process: write JSON results (or EXCEPTION_MESSAGE) to the
            # pipe, then exit without running parent cleanup handlers.
            with os.fdopen(write_fd, 'w') as w:
                os.close(read_fd)
                # run the qc function
                try:
                    # all arguments except the data under test come from the configuration object
                    # results is a nested dictionary
                    results = qc_config.run(**param_dict)
                    # convert results into a string for sending over pipe
                    # NOTE: this converts numpy arrays to lists! Use np.asarray() to restore them.
                    results_string = json.dumps(results, cls=NumpyEncoder)
                    w.write(results_string)
                except (TypeError, ValueError) as e:
                    log.exception(
                        '<%s> Failure executing QC with configuration %r %r',
                        self.request_id, config, e)
                    w.write(EXCEPTION_MESSAGE)
            # child process is done, don't let it stick around
            os._exit(0)

        # parent process: close the unused write end so read() sees EOF when
        # the child finishes, then collect whatever the child produced.
        os.close(write_fd)
        with os.fdopen(read_fd) as r:
            results_string = r.read()
        # wait for the child process to prevent zombies - second argument of 0 means default behavior of waitpid
        os.waitpid(processid, 0)
        # check for failure to produce results
        if not results_string:
            # an error, e.g. segfault, prevented proper qc execution, proceed with trying the next qc function
            log.error(
                '<%s> Failed to execute QC with configuration %r: QC process failed to return any data',
                self.request_id, config)
            return

        if results_string == EXCEPTION_MESSAGE:
            # an exception has already been logged, proceed with trying the next qc function
            return

        # load the results dict from the results string
        results = json.loads(results_string)

        # results is a nested dictionary with the outer keys being module names, the inner keys being test
        # names, and the inner values being the results for the given test
        # e.g. {'qartod': {'gross_range_test': [0, 0, 3, 4, 0], 'location_test': [2, 2, 2, 2, 2]}}
        for module, test_set in results.items():
            for test, test_results in test_set.items():
                # test_results was converted from an np.array to a list during serialization, so convert it back
                test_results = np.asarray(test_results)

                # Verify all QC results are valid QARTOD Primary Level Flags
                mask = np.array([
                    item not in QartodFlags.getValidQCFlags()
                    for item in test_results
                ])

                if mask.any():
                    log.error(
                        'Received QC result with invalid QARTOD Primary Flag from %s. Invalid flags: %r',
                        test, np.unique(test_results[mask]))
                    # Use the "high interest" (SUSPECT) flag to draw attention to the failure
                    test_results[mask] = QartodFlags.SUSPECT

                # add results to dataset
                QartodQcExecutor.insert_qc_results(parameter_under_test, test,
                                                   test_results, dataset)
Example #28
0
 def test_load_path_object(self):
     """A pathlib.Path should be accepted the same as a string path."""
     assert QcConfig(Path(self.yamlfile)).config == self.expected_dict
Example #29
0
    def qc(self, dataset_ids=None, verbose=False, skip_units=False):
        """Light quality check on data.

        This runs one IOOS QARTOD on data as a first order quality check.
        Only returns data that is quality checked.

        Requires pint for unit handling. Requires user-input `criteria` and
        `var_def` to run.

        This is slow if your data is both chunks of time and space, so this
        should first narrow by both as much as possible.

        Parameters
        ----------
        dataset_ids: str, list, optional
            Read in data for dataset_ids specifically. If none are
            provided, data will be read in for all `self.keys()`.
        verbose: boolean, optional
            If True, report summary statistics on QC flag distribution in datasets.
        skip_units: boolean, optional
            If True, do not interpret or alter units and assume the data is in
            the units described in var_def already.

        Returns
        -------
        Dataset with added variables for each variable in dataset that was checked, with name of [variable]+'_qc'.

        Notes
        -----
        Code has been saved for data in DataFrames, but is changing so
        that data will be in Datasets. This way, can use cf-xarray
        functionality for custom variable names and easier to have
        recognizable units for variables with netcdf than csv.
        """

        assertion = (
            "Need to have custom criteria and variable information defined to run QC."
        )
        assert self.criteria and self.var_def, assertion

        # Resolve which datasets to QC: default is everything already read in.
        if dataset_ids is None:
            data_ids = self.keys()  # only already read-in dataset_ids
        else:
            data_ids = dataset_ids
            if not isinstance(data_ids, list):
                data_ids = [data_ids]

        data_out = {}
        for data_id in data_ids:
            # access the Dataset
            dd = self[data_id]
            # which custom variable names are in dataset
            # dd_varnames are the variable names in the Dataset dd
            # cf_varnames are the custom names we can use to refer to the
            # variables through cf-xarray
            if isinstance(dd, pd.DataFrame):
                varnames, cf_varnames = [], []
                for var in self.var_def.keys():
                    try:
                        varname = dd.cf[var].name
                        varnames.append(varname)
                        cf_varnames.append(var)
                    except Exception:
                        # Custom name not present in this DataFrame; skip it.
                        # (Narrowed from a bare `except:` so KeyboardInterrupt
                        # and SystemExit are no longer swallowed.)
                        pass
            elif isinstance(dd, xr.Dataset):
                varnames = [
                    (cf_xarray.accessor._get_custom_criteria(dd, var), var)
                    for var in self.var_def.keys() if
                    len(cf_xarray.accessor._get_custom_criteria(dd, var)) > 0
                ]
            assert len(varnames) > 0, "no custom names matched in Dataset."
            if isinstance(dd, pd.DataFrame):
                dd_varnames = varnames.copy()
            elif isinstance(dd, xr.Dataset):
                dd_varnames, cf_varnames = zip(*varnames)
                dd_varnames = sum(dd_varnames, [])
            assert len(dd_varnames) == len(
                cf_varnames
            ), "looks like multiple variables might have been identified for a custom variable name"

            # subset to just the boem or requested variables for each df or ds
            if isinstance(dd, pd.DataFrame):
                dd2 = dd[list(varnames)]
            elif isinstance(dd, xr.Dataset):
                dd2 = dd.cf[cf_varnames]
                # dd2 = dd[varnames]  # equivalent

            if not skip_units:

                # Preprocess to change salinity units away from 1e-3
                if isinstance(dd, pd.DataFrame):
                    # this replaces units in the 2nd column level of 1e-3 with psu
                    new_levs = [
                        "psu" if col == "1e-3" else col
                        for col in dd2.columns.levels[1]
                    ]
                    # set_levels(..., inplace=True) was deprecated and removed
                    # in pandas 2.0; assign the rebuilt index instead.
                    dd2.columns = dd2.columns.set_levels(new_levs, level=1)
                elif isinstance(dd, xr.Dataset):
                    for Var in dd2.data_vars:
                        if ("units" in dd2[Var].attrs
                                and dd2[Var].attrs["units"] == "1e-3"):
                            dd2[Var].attrs["units"] = "psu"
                # run pint quantify on each data structure
                dd2 = dd2.pint.quantify()
                # dd2 = dd2.pint.quantify(level=-1)

                # go through each variable by name to make sure in correct units
                # have to do this in separate loop so that can dequantify afterward
                if isinstance(dd, pd.DataFrame):
                    print("NOT IMPLEMENTED FOR DATAFRAME YET")
                elif isinstance(dd, xr.Dataset):
                    # form of "temp": "degree_Celsius"
                    units_dict = {
                        dd_varname: self.var_def[cf_varname]["units"]
                        for (dd_varname,
                             cf_varname) in zip(dd_varnames, cf_varnames)
                    }
                    # convert to conventional units
                    dd2 = dd2.pint.to(units_dict)

                dd2 = dd2.pint.dequantify()

            # now loop for QARTOD on each variable
            for dd_varname, cf_varname in zip(dd_varnames, cf_varnames):
                # run QARTOD gross range test using spans from var_def
                qc_config = {
                    "qartod": {
                        "gross_range_test": {
                            "fail_span":
                            self.var_def[cf_varname]["fail_span"],
                            "suspect_span":
                            self.var_def[cf_varname]["suspect_span"],
                        },
                    }
                }
                qc = QcConfig(qc_config)
                qc_results = qc.run(inp=dd2[dd_varname])
                # qc_results = qc.run(inp=dd2.cf[cf_varname])  # this isn't working for some reason

                # put flags into dataset
                new_qc_var = f"{dd_varname}_qc"
                if isinstance(dd, pd.DataFrame):
                    dd2[new_qc_var] = qc_results["qartod"]["gross_range_test"]
                elif isinstance(dd, xr.Dataset):
                    new_data = qc_results["qartod"]["gross_range_test"]
                    dims = dd2[dd_varname].dims
                    dd2[new_qc_var] = (dims, new_data)

            data_out[data_id] = dd2

        if verbose:
            for dataset_id, dd in data_out.items():
                print(dataset_id)
                # bug fix: select from `dd` (the dataset being reported on),
                # not `dd2` (a stale leftover from the processing loop above).
                qckeys = dd[[var for var in dd.data_vars if "_qc" in var]]
                for qckey in qckeys:
                    print(qckey)
                    for flag, desc in odg.qcdefs.items():
                        print(
                            f"Flag == {flag} ({desc}): {int((dd[qckey] == int(flag)).sum())}"
                        )

        return data_out