def test_validator_progress_bar_config_enabled(mock_tqdm, mock_validation_graph, mock_data_context):
    """With the default progress-bar configuration, tqdm is invoked with disable=False."""
    context = mock_data_context()
    validator = Validator(PandasExecutionEngine(), data_context=context)

    # Give the mocked graph a reported edge count of 3 so the tqdm code path
    # in resolve_validation_graph is reached (the mock otherwise has no length).
    mock_validation_graph.edges.__len__ = lambda _: 3

    validator.resolve_validation_graph(mock_validation_graph, {})

    # tqdm was constructed, and nothing requested that it be disabled.
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is False
def test_validator_progress_bar_config_disabled(mock_tqdm, mock_validation_graph, mock_data_context):
    """With metric_calculations progress bars turned off, tqdm is invoked with disable=True."""
    context = mock_data_context()
    context.progress_bars = ProgressBarsConfig(metric_calculations=False)
    validator = Validator(PandasExecutionEngine(), data_context=context)

    # Give the mocked graph a reported edge count of 3 so the tqdm code path
    # in resolve_validation_graph is reached (the mock otherwise has no length).
    mock_validation_graph.edges.__len__ = lambda _: 3

    validator.resolve_validation_graph(mock_validation_graph, {})

    # tqdm is still constructed, but the disable flag makes it a no-op.
    assert mock_tqdm.called is True
    assert mock_tqdm.call_args[1]["disable"] is True
def test_resolve_validation_graph_with_bad_config_catch_exceptions_true(
    basic_datasource,
):
    """A metric over a nonexistent column aborts (rather than raising) when
    catch_exceptions=True, and the failure details are captured."""
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, None]})

    batch_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="test_runtime_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers={
            "pipeline_stage_name": 0,
            "airflow_run_id": 0,
            "custom_key_0": 0,
        },
    )
    batch = basic_datasource.get_single_batch_from_batch_request(batch_request)

    # "not_in_table" deliberately names a column that is absent from df.
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_max_to_be_between",
        kwargs={"column": "not_in_table", "min_value": 1, "max_value": 29},
    )
    runtime_configuration = {
        "catch_exceptions": True,
        "result_format": {"result_format": "BASIC"},
    }

    execution_engine = PandasExecutionEngine()
    validator = Validator(execution_engine=execution_engine, batches=[batch])

    impl = get_expectation_impl(expectation_configuration.expectation_type)
    metric_dependencies = impl().get_validation_dependencies(
        expectation_configuration, execution_engine, runtime_configuration
    )["metrics"]

    graph = ValidationGraph()
    for metric_configuration in metric_dependencies.values():
        validator.build_metric_dependency_graph(
            graph=graph,
            execution_engine=execution_engine,
            metric_configuration=metric_configuration,
            configuration=expectation_configuration,
            runtime_configuration=runtime_configuration,
        )

    metrics: Dict[Tuple[str, str, str], Any] = {}
    aborted_metrics_info: Dict[
        Tuple[str, str, str],
        Dict[str, Union[MetricConfiguration, Set[ExceptionInfo], int]],
    ] = validator.resolve_validation_graph(
        graph=graph,
        metrics=metrics,
        runtime_configuration=runtime_configuration,
    )

    # Exactly one metric — the one touching the missing column — should abort.
    assert len(aborted_metrics_info) == 1

    aborted_metric_info_item = list(aborted_metrics_info.values())[0]
    assert aborted_metric_info_item["num_failures"] == MAX_METRIC_COMPUTATION_RETRIES
    assert len(aborted_metric_info_item["exception_info"]) == 1

    exception_info = next(iter(aborted_metric_info_item["exception_info"]))
    assert (
        exception_info["exception_message"]
        == 'Error: The column "not_in_table" in BatchData does not exist.'
    )
def get_validation_dependencies(
    self,
    configuration: Optional[ExpectationConfiguration] = None,
    execution_engine: Optional[ExecutionEngine] = None,
    runtime_configuration: Optional[dict] = None,
):
    """Assemble the metric dependencies this expectation needs.

    Starts from the parent class's dependencies and adds histogram/value-count
    metrics depending on whether a ``partition_object`` was supplied in the
    expectation kwargs and whether the data is treated as categorical.

    Args:
        configuration: The expectation configuration; ``kwargs["partition_object"]``
            and (optionally) ``kwargs["bucketize_data"]`` are read from it.
        execution_engine: Engine used to resolve the auto-partition metric.
        runtime_configuration: Passed through to the parent implementation.

    Returns:
        The parent's dependency dict with this expectation's metric
        configurations added under ``all_dependencies["metrics"]``.

    Raises:
        ValueError: If a user-supplied ``partition_object`` is neither a valid
            categorical nor a valid continuous partition object.
    """
    all_dependencies = super().get_validation_dependencies(
        configuration, execution_engine, runtime_configuration
    )
    # Mutated in place below; all_dependencies is returned at the end.
    dependencies = all_dependencies["metrics"]
    partition_object = configuration.kwargs["partition_object"]
    domain_kwargs = configuration.get_domain_kwargs()
    # is_categorical / bins are decided in the first branch (when no
    # partition_object was supplied) and consumed by the second branch below.
    is_categorical = None
    bins = None
    if partition_object is None:
        # No partition supplied by the user: either bucketize the data
        # automatically (continuous case) or fall back to value counts
        # (categorical case), per the bucketize_data kwarg.
        if configuration.kwargs.get(
            "bucketize_data", self.default_kwarg_values["bucketize_data"]
        ):
            is_categorical = False
            # Ask the engine for automatic bin edges ("auto") for this column.
            partition_metric_configuration = MetricConfiguration(
                "column.partition",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={
                    "bins": "auto",
                    "allow_relative_error": False,
                },
            )
            #
            # Note: 20201116 - JPC - the execution engine doesn't provide capability to evaluate
            # dependencies, so we use a validator
            #
            validator = Validator(execution_engine=execution_engine)
            graph = ValidationGraph()
            validator.build_metric_dependency_graph(
                graph=graph,
                child_node=partition_metric_configuration,
                configuration=configuration,
                execution_engine=execution_engine,
            )
            # Eagerly resolve the partition metric so its bin edges can be
            # embedded in the histogram metric's value kwargs.
            bins = validator.resolve_validation_graph(graph, metrics=dict())[
                partition_metric_configuration.id
            ]
            hist_metric_configuration = MetricConfiguration(
                "column.histogram",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"bins": tuple(bins),},
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs=dict(),
            )
            #
            # NOTE 20201117 - JPC - Would prefer not to include partition_metric_configuraiton here,
            # since we have already evaluated it, and its result is in the kwargs for the histogram.
            # However, currently the dependencies' configurations are not passed to the _validate method
            #
            dependencies["column.partition"] = partition_metric_configuration
            dependencies["column.histogram"] = hist_metric_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
        else:
            # bucketize_data is falsy: treat the column as categorical and
            # depend on raw value counts instead of a histogram.
            is_categorical = True
            counts_configuration = MetricConfiguration(
                "column.value_counts",
                metric_domain_kwargs=domain_kwargs,
                metric_value_kwargs={"sort": "value",},
            )
            nonnull_configuration = MetricConfiguration(
                "column_values.nonnull.count",
                metric_domain_kwargs=domain_kwargs,
            )
            dependencies["column.value_counts"] = counts_configuration
            dependencies["column_values.nonnull.count"] = nonnull_configuration
    # Second phase: finalize dependencies based on whether the partition is
    # (or was decided to be) categorical.
    if is_categorical is True or is_valid_categorical_partition_object(
        partition_object
    ):
        # NOTE(review): when is_categorical is True these overwrite the
        # equivalent entries set in the branch above — presumably harmless
        # duplication; confirm before simplifying.
        dependencies["column.value_counts"] = MetricConfiguration(
            "column.value_counts",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"sort": "value"},
        )
        dependencies["column_values.nonnull.count"] = MetricConfiguration(
            "column_values.nonnull.count", domain_kwargs
        )
    else:
        if (
            bins is None
        ):  # bins were not auto-computed above, so the user supplied a
            # partition_object; validate it and take its bin edges
            if not is_valid_partition_object(partition_object):
                raise ValueError("Invalid partition_object provided")
            bins = partition_object["bins"]
        hist_metric_configuration = MetricConfiguration(
            "column.histogram",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"bins": bins,},
        )
        nonnull_configuration = MetricConfiguration(
            "column_values.nonnull.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs=dict(),
        )
        dependencies["column.histogram"] = hist_metric_configuration
        dependencies["column_values.nonnull.count"] = nonnull_configuration
        # Counts of values falling outside the partition's outer edges:
        # strictly below the first edge, and strictly above the last edge.
        below_partition = MetricConfiguration(
            "column_values.between.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"max_value": bins[0]},
        )
        above_partition = MetricConfiguration(
            "column_values.between.count",
            metric_domain_kwargs=domain_kwargs,
            metric_value_kwargs={"min_value": bins[-1], "strict_min": True},
        )
        dependencies["below_partition"] = below_partition
        dependencies["above_partition"] = above_partition
    return all_dependencies