def udf(df: pd.DataFrame) -> pd.Series:
    """Validate ``df`` against ``expectations`` and report failures to StatsD.

    Each check in the validation result is handled in one of three ways:

    * a check that raised an exception is skipped;
    * a check with a positive ``unexpected_count`` increments the
      ``feast_feature_validation_check_failed`` counter by that count and
      marks the rows listed in ``unexpected_index_list`` as invalid;
    * otherwise, a check with a truthy ``observed_value`` publishes the
      ``feast_feature_validation_observed_value`` gauge (``0`` when the
      check succeeded, ``int(observed_value * 100)`` when it failed).

    Args:
        df: Batch of rows to validate.

    Returns:
        A boolean Series with one entry per row of ``df`` (default integer
        index); ``False`` marks rows flagged by at least one check.
    """
    # Function-scope import: datadog is resolved when the UDF is called,
    # not when the enclosing module is loaded.
    from datadog.dogstatsd import DogStatsd

    statsd_host = os.getenv("STATSD_HOST")
    statsd_port = os.getenv("STATSD_PORT")
    if statsd_host and statsd_port:
        reporter = DogStatsd(
            host=statsd_host,
            port=int(statsd_port),
            telemetry_min_flush_interval=0,
        )
    else:
        # Use the client's defaults unless both settings are present.
        reporter = DogStatsd()

    # These two tags are the same for every metric emitted by this call.
    base_tags = [
        f"feature_table:{os.getenv('FEAST_INGESTION_FEATURE_TABLE', 'unknown')}",
        f"project:{os.getenv('FEAST_INGESTION_PROJECT_NAME', 'default')}",
    ]

    ds = PandasDataset.from_dataset(df)
    result = ds.validate(expectations, result_format="COMPLETE")
    valid_rows = pd.Series([True] * df.shape[0])

    for check in result.results:
        if check.exception_info["raised_exception"]:
            # ToDo: probably we should mark all rows as invalid
            continue

        config = check.expectation_config
        # The metric name is the expectation type followed by its scalar
        # kwargs. "result_format" is filtered out instead of popped, so the
        # validation result object is left unmodified.
        scalar_kwargs = [
            str(value)
            for key, value in config.kwargs.items()
            if key != "result_format" and isinstance(value, (str, int, float))
        ]
        check_name = "_".join([config.expectation_type] + scalar_kwargs)
        tags = base_tags + [f"check:{check_name}"]

        outcome = check.result
        if "unexpected_count" in outcome and outcome["unexpected_count"] > 0:
            reporter.increment(
                "feast_feature_validation_check_failed",
                value=outcome["unexpected_count"],
                tags=tags,
            )
            # NOTE(review): positional assignment assumes the reported
            # indices equal row positions (default RangeIndex on df) - confirm.
            valid_rows.iloc[outcome["unexpected_index_list"]] = False
        elif "observed_value" in outcome and outcome["observed_value"]:
            if check.success:
                # nullify everything below threshold
                gauge_value = 0
            else:
                # storing as decimal with precision 2
                gauge_value = int(outcome["observed_value"] * 100)
            reporter.gauge(
                "feast_feature_validation_observed_value",
                value=gauge_value,
                tags=tags,
            )

    return valid_rows
def create_suite():
    """Build an expectation suite from a small randomly generated frame.

    The frame has 100 rows and two integer columns: ``num`` drawn with
    ``np.random.randint(0, 10, ...)`` and ``num2`` drawn with
    ``np.random.randint(0, 20, ...)``. Each column gets a
    "values between 0 and <upper>" expectation.

    Returns:
        The expectation suite collected by the dataset.
    """
    row_count = 100
    # (column name, upper bound) - the bound is used both for sampling and
    # for the expectation registered on that column.
    column_bounds = (('num', 10), ('num2', 20))

    frame = pd.DataFrame({
        column: np.random.randint(0, upper, row_count)
        for column, upper in column_bounds
    })

    dataset = PandasDataset.from_dataset(frame)
    for column, upper in column_bounds:
        dataset.expect_column_values_to_be_between(column, 0, upper)

    return dataset.get_expectation_suite()
def validate(df) -> pd.Series:
    """Return a per-row validity mask for ``df`` checked against ``suite``.

    Args:
        df: Frame to validate.

    Returns:
        A boolean Series with one entry per row (default integer index).
        A row is ``False`` when a failed check lists it in
        ``unexpected_index_list``.
    """
    ds = PandasDataset.from_dataset(df)
    result = ds.validate(suite, result_format='COMPLETE')
    valid_rows = pd.Series([True] * ds.shape[0])

    for check in result.results:
        if check.success:
            continue

        unexpected = check.result.get('unexpected_index_list')
        if unexpected is None:
            # A failed check without row-level details cannot be mapped to
            # specific rows, so it leaves the mask untouched instead of
            # raising KeyError.
            continue

        valid_rows.iloc[unexpected] = False

    return valid_rows
def udf(df: pd.DataFrame) -> pd.Series:
    """Validate ``df`` against ``expectations`` and flag the failing rows.

    Args:
        df: Batch of rows to validate.

    Returns:
        A boolean Series with one entry per row of ``df`` (default integer
        index); ``False`` marks rows listed in the ``unexpected_index_list``
        of at least one failed check.
    """
    ds = PandasDataset.from_dataset(df)
    result = ds.validate(expectations, result_format="COMPLETE")
    valid_rows = pd.Series([True] * df.shape[0])

    for check in result.results:
        if check.success:
            continue

        if check.exception_info["raised_exception"]:
            # ToDo: probably we should mark all rows as invalid
            continue

        unexpected = check.result.get("unexpected_index_list")
        if unexpected is None:
            # A failed check that reports no per-row indices cannot be
            # mapped to specific rows; skip it rather than raise KeyError.
            continue

        valid_rows.iloc[unexpected] = False

    return valid_rows