Beispiel #1
0
    def udf(df: pd.DataFrame) -> pd.Series:
        """Validate a batch of rows and report the outcome to StatsD.

        The batch is wrapped in a ``PandasDataset`` and validated against
        ``expectations`` (a name taken from the enclosing scope, not defined
        in this function) with ``result_format="COMPLETE"``.

        For every check that did not raise an exception:

        * if the result carries a positive ``unexpected_count``, the counter
          ``feast_feature_validation_check_failed`` is incremented by that
          count and the rows listed in ``unexpected_index_list`` are marked
          invalid;
        * otherwise, if the result carries a truthy ``observed_value``, the
          gauge ``feast_feature_validation_observed_value`` is set to the
          value multiplied by 100 (truncated to ``int``) when the check
          failed, or to ``0`` when it succeeded.

        Args:
            df: Batch of rows to validate.

        Returns:
            A boolean Series with one entry per row of ``df`` (default
            integer index); ``False`` marks rows reported as unexpected by
            at least one check.
        """
        import math
        import numbers

        from datadog.dogstatsd import DogStatsd

        # Use the explicitly configured StatsD endpoint only when both host
        # and port are set; otherwise fall back to the client defaults.
        statsd_host = os.getenv("STATSD_HOST")
        statsd_port = os.getenv("STATSD_PORT")
        if statsd_host and statsd_port:
            reporter = DogStatsd(
                host=statsd_host,
                port=int(statsd_port),
                telemetry_min_flush_interval=0,
            )
        else:
            reporter = DogStatsd()

        # These tags do not depend on the individual check, so build them
        # once per batch instead of once per emitted metric.
        base_tags = [
            f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
            f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
        ]

        ds = PandasDataset.from_dataset(df)
        result = ds.validate(expectations, result_format="COMPLETE")
        valid_rows = pd.Series([True] * df.shape[0])

        for check in result.results:
            if check.exception_info["raised_exception"]:
                # TODO: probably we should mark all rows as invalid
                continue

            # Metric name: expectation type followed by its scalar kwargs.
            # "result_format" is skipped by key rather than popped, so the
            # expectation configuration is not mutated as a side effect.
            name_parts = [check.expectation_config.expectation_type]
            name_parts.extend(
                str(value)
                for key, value in check.expectation_config.kwargs.items()
                if key != "result_format"
                and isinstance(value, (str, int, float))
            )
            tags = base_tags + [f"check:{'_'.join(name_parts)}"]

            unexpected_count = check.result.get("unexpected_count")
            observed_value = check.result.get("observed_value")

            if unexpected_count is not None and unexpected_count > 0:
                reporter.increment(
                    "feast_feature_validation_check_failed",
                    value=unexpected_count,
                    tags=tags,
                )

                # NOTE(review): the reported indices are applied positionally,
                # which assumes ``df`` carries the default 0..n-1 index.
                valid_rows.iloc[check.result["unexpected_index_list"]] = False

            elif observed_value:
                if check.success:
                    # nullify everything below threshold
                    gauge_value = 0
                elif (isinstance(observed_value, numbers.Real)
                      and math.isfinite(observed_value)):
                    # storing as decimal with precision 2
                    gauge_value = int(observed_value * 100)
                else:
                    # A list, string or NaN observed value cannot be sent as
                    # a gauge; skip it instead of failing the whole batch.
                    continue

                reporter.gauge(
                    "feast_feature_validation_observed_value",
                    value=gauge_value,
                    tags=tags,
                )

        return valid_rows
Beispiel #2
0
def create_suite():
    """Build an expectation suite from a randomly generated sample frame.

    A 100-row frame is created with two random integer columns: ``num``
    drawn via ``np.random.randint(0, 10, 100)`` and ``num2`` via
    ``np.random.randint(0, 20, 100)``. A between-bounds expectation is
    registered for each column (0..10 and 0..20 respectively).

    Returns:
        Whatever ``PandasDataset.get_expectation_suite()`` returns for the
        dataset after both expectations have been registered.
    """
    # Dict values are evaluated in order, so 'num' is sampled before 'num2'.
    sample = pd.DataFrame({
        'num': np.random.randint(0, 10, 100),
        'num2': np.random.randint(0, 20, 100),
    })
    dataset = PandasDataset.from_dataset(sample)

    for column, upper_bound in (('num', 10), ('num2', 20)):
        dataset.expect_column_values_to_be_between(column, 0, upper_bound)

    return dataset.get_expectation_suite()
Beispiel #3
0
    def validate(df) -> pd.Series:
        """Return a per-row validity mask for ``df``.

        The frame is wrapped in a ``PandasDataset`` and validated against
        ``suite`` (a name taken from the enclosing scope, not defined in
        this function) with ``result_format='COMPLETE'``. Rows listed in the
        ``unexpected_index_list`` of any failed check are marked invalid.

        Args:
            df: Data to validate.

        Returns:
            A boolean Series with one entry per row (default integer index);
            ``False`` marks rows reported as unexpected by a failed check.
        """
        ds = PandasDataset.from_dataset(df)
        result = ds.validate(suite, result_format='COMPLETE')
        valid_rows = pd.Series([True] * ds.shape[0])

        for check in result.results:
            if check.success:
                continue

            # Not every failed check reports row indices (for example when
            # the result has no 'unexpected_index_list' entry at all), so
            # look the key up defensively instead of raising KeyError.
            unexpected_rows = (check.result or {}).get('unexpected_index_list')
            if not unexpected_rows:
                continue

            # NOTE(review): indices are applied positionally, which assumes
            # ``df`` carries the default 0..n-1 index.
            valid_rows.iloc[unexpected_rows] = False
        return valid_rows
Beispiel #4
0
    def udf(df: pd.DataFrame) -> pd.Series:
        """Return a per-row validity mask for a batch of rows.

        The batch is wrapped in a ``PandasDataset`` and validated against
        ``expectations`` (a name taken from the enclosing scope, not defined
        in this function) with ``result_format="COMPLETE"``. Rows listed in
        the ``unexpected_index_list`` of any failed check are marked invalid.
        Checks that succeeded or that raised an exception are skipped.

        Args:
            df: Batch of rows to validate.

        Returns:
            A boolean Series with one entry per row of ``df`` (default
            integer index); ``False`` marks rows reported as unexpected by a
            failed check.
        """
        ds = PandasDataset.from_dataset(df)
        result = ds.validate(expectations, result_format="COMPLETE")
        valid_rows = pd.Series([True] * df.shape[0])

        for check in result.results:
            if check.success:
                continue

            if check.exception_info["raised_exception"]:
                # TODO: probably we should mark all rows as invalid
                continue

            # A failed check does not necessarily report row indices; look
            # the key up defensively so such a check does not abort the
            # whole batch with a KeyError.
            unexpected_rows = (check.result or {}).get("unexpected_index_list")
            if not unexpected_rows:
                continue

            # NOTE(review): indices are applied positionally, which assumes
            # ``df`` carries the default 0..n-1 index.
            valid_rows.iloc[unexpected_rows] = False

        return valid_rows