def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Single-batch run of RegexPatternStringParameterBuilder over the "id" column.

    Only the UUID-shaped candidate regex matches every value, so it is the sole
    entry in the resulting "value" list and the only regex with ratio 1.0 in
    the "evaluated_regexes" details.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    candidate_regexes: List[str] = [
        r"^\d{1}$",
        r"^\d{2}$",
        r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
    ]
    metric_domain_kwargs = {"column": "id"}

    regex_pattern_string_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            data_context=data_context,
            batch_request=batch_request,
        )
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )
    assert parameter_container.parameter_nodes is None

    regex_pattern_string_parameter._build_parameters(
        parameter_container=parameter_container, domain=domain
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_regex"
    # BUG FIX: a first, stale `expected_value` dict (with a
    # {"success_ratio": [1.0]} details payload) was assigned here and
    # immediately overwritten by the dict below; the dead assignment has
    # been removed.
    expected_value: dict = {
        "value": [r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$"],
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0,
                r"^\d{2}$": 0,
            },
            "threshold": 1.0,
        },
    }
    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters={domain.id: parameter_container},
        )
        == expected_value
    )
def test_execution_mean_table_columns_set_match_multi_batch_parameter_builder(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Exercise MeanTableColumnsSetMatchMultiBatchParameterBuilder on the three
    monthly taxi batches.

    Every batch shares the same column set, so the expected success ratio is
    1.0 and the computed value is the full set of taxi column names.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    columns_set_builder: ParameterBuilder = (
        MeanTableColumnsSetMatchMultiBatchParameterBuilder(
            name="my_mean_table_columns_set_match_multi_batch_parameter_builder",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            json_serialize=True,
            data_context=data_context,
        )
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        rule_name="my_rule",
    )
    variables: Optional[ParameterContainer] = None
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    expected_columns: set = {
        "VendorID",
        "pickup_datetime",
        "total_amount",
        "congestion_surcharge",
        "dropoff_datetime",
        "mta_tax",
        "store_and_fwd_flag",
        "tip_amount",
        "trip_distance",
        "payment_type",
        "DOLocationID",
        "improvement_surcharge",
        "extra",
        "tolls_amount",
        "RatecodeID",
        "passenger_count",
        "PULocationID",
        "fare_amount",
    }
    expected_parameter_value: dict = {
        "value": expected_columns,
        "details": {
            "success_ratio": 1.0,
        },
    }

    columns_set_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=columns_set_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    value_key: str = FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY
    assert len(parameter_node[value_key]) == len(expected_parameter_value[value_key])
    # Column order is not guaranteed (json_serialize=True yields a list), so
    # normalize to a set before comparing the whole node.
    parameter_node[value_key] = set(parameter_node[value_key])
    assert parameter_node == expected_parameter_value
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """With threshold 0.9, two of three candidate regexes qualify for "VendorID".

    ``^\\d{1}$`` matches everything, ``^[12]{1}$`` matches ~99.4% of values,
    and ``^\\d{4}$`` matches nothing.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: List[str] = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]
    threshold: float = 0.9

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    builder: RegexPatternStringParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        threshold=threshold,
        data_context=data_context,
        batch_request=batch_request,
    )

    # Explicit candidates replace the class-level defaults.
    assert builder.CANDIDATE_REGEX != candidate_regexes
    assert builder.candidate_regexes == candidate_regexes
    assert builder.threshold == 0.9

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )
    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=domain)

    expected: dict = {
        "value": [r"^[12]{1}$", r"^\d{1}$"],
        "details": {
            "evaluated_regexes": {
                r"^\d{1}$": 1.0,
                r"^[12]{1}$": 0.9941111111111111,
                r"^\d{4}$": 0,
            },
            "threshold": 0.9,
        },
    }

    results = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=domain,
        parameters={domain.id: container},
    )
    assert results is not None
    # Ordering of qualifying regexes is not guaranteed; compare sorted.
    assert sorted(results["value"]) == sorted(expected["value"])
    assert results["details"] == expected["details"]
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """No candidate regex matches the "VendorID" column, so "value" is empty.

    NOTE(review): the expected "evaluated_regexes" details list the builder's
    *default* CANDIDATE_REGEX patterns (all at 0), not the single supplied
    candidate -- presumably the builder falls back to its defaults when the
    supplied candidates produce no match; confirm against
    RegexPatternStringParameterBuilder's implementation.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: Set[str] = {
        r"^\d{3}$",  # won't match
    }
    threshold: float = 0.9

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    regex_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            threshold=threshold,
            data_context=data_context,
            batch_request=batch_request,
        )
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )
    assert parameter_container.parameter_nodes is None

    regex_parameter._build_parameters(
        parameter_container=parameter_container, domain=domain
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    # Every default candidate evaluates to a 0 success ratio; none reach the
    # 0.9 threshold, hence the empty "value" list.
    expected_value: dict = {
        "value": [],
        "details": {
            "evaluated_regexes": {
                r"/\d+/": 0,
                r"/-?\d+/": 0,
                r"/-?\d+(\.\d*)?/": 0,
                r"/[A-Za-z0-9\.,;:!?()\"'%\-]+/": 0,
                r"^\s+/": 0,
                r"\s+/$": 0,
                r"/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)/": 0,
                r"/<\/?(?:p|a|b|img)(?: \/)?>/": 0,
                r"/(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}/": 0,
                r"/(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}/": 0,
                r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0,
            },
            "threshold": 0.9,
        },
    }
    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters={domain.id: parameter_container},
        )
        == expected_value
    )
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context,
):
    """Build a "my_min_user_id" parameter separately, then feed its value into a
    DefaultExpectationConfigurationBuilder (no condition) and check the
    resulting expectation kwargs carry the expected minimum (397433)."""
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    metric_domain_kwargs: dict = {"column": "user_id"}

    min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    min_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=domain,
        parameters=parameters,
    )

    condition: Optional[str] = None
    max_user_id: int = 999999999999
    min_user_id = parameter_value.value[0]

    config_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=min_user_id,
        max_value=max_user_id,
    )
    expectation_configuration: Optional[ExpectationConfiguration] = (
        config_builder.build_expectation_configuration(
            domain=domain,
            parameters=parameters,
        )
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
def test_regex_two_candidates(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Evaluate two candidate regexes against column "b" of a single in-memory
    batch: ``^\\d{1}$`` matches every value, ``^\\d{3}$`` matches none."""
    batch: Batch = batch_fixture

    # Route batch resolution and validator construction through the mocked
    # data context so no real datasource is needed.
    mock_data_context.get_batch_list.return_value = [batch]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    )
    data_context: DataContext = mock_data_context

    metric_domain_kwargs: dict = {"column": "b"}
    candidate_regexes: List[str] = [r"^\d{1}$", r"^\d{3}$"]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}
    assert container.parameter_nodes is None

    builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_list=[batch],
    )

    expected_value: str = "^\\d{1}$"
    assert (
        get_parameter_value_and_validate_return_type(
            parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.value",
            domain=domain,
            parameters=parameters,
        )
        == expected_value
    )

    expected_meta: dict = {
        "evaluated_regexes": {"^\\d{1}$": 1.0, "^\\d{3}$": 0.0},
        "success_ratio": 1.0,
    }
    meta: dict = get_parameter_value_and_validate_return_type(
        parameter_reference="$parameter.my_regex_pattern_string_parameter_builder.details",
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )
    assert meta == expected_meta
def test_value_set_multi_batch_parameter_builder_bobby_string(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """ValueSetMultiBatchParameterBuilder aggregates the distinct string values
    of "store_and_fwd_flag" ("N"/"Y") across the three monthly batches."""
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # The builder resolves its column from the domain's kwargs at build time.
    metric_domain_kwargs_for_parameter_builder: str = "$domain.domain_kwargs"
    builder: ValueSetMultiBatchParameterBuilder = ValueSetMultiBatchParameterBuilder(
        name="my_store_and_fwd_flag_value_set",
        metric_domain_kwargs=metric_domain_kwargs_for_parameter_builder,
        data_context=data_context,
    )

    metric_domain_kwargs: dict = {"column": "store_and_fwd_flag"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    assert container.parameter_nodes is None or len(container.parameter_nodes) == 1

    expected_value_set: List[str] = ["N", "Y"]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "metric_name": "column.distinct_values",
                "domain_kwargs": {"column": "store_and_fwd_flag"},
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference="$parameter.my_store_and_fwd_flag_value_set",
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    # Distinct-value ordering is not guaranteed; compare sorted.
    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_numeric_dependencies_evaluated_separately(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Build the total-count and null-count dependency parameters with separate
    builders, then verify MeanUnexpectedMapMetricMultiBatchParameterBuilder
    reports a mean unexpected ratio of ~0.0 for non-null "passenger_count".
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    # Dependency 1: per-batch table row counts, published as "my_total_count".
    my_total_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=False,
        data_context=data_context,
    )
    # Dependency 2: per-batch null counts, published as "my_null_count".
    my_null_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=False,
        data_context=data_context,
    )
    # Builder under test: consumes the two dependency parameters by name.
    mean_unexpected_map_metric_multi_batch_parameter_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name="my_passenger_count_values_not_null_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            json_serialize=False,
            data_context=data_context,
        )
    )
    metric_domain_kwargs: dict = {"column": "passenger_count"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    variables: Optional[ParameterContainer] = None
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }
    # Order matters: both dependencies must be built before the mean builder,
    # which looks them up in `parameters` by their published names.
    my_total_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    my_null_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    mean_unexpected_map_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    expected_parameter_value: float = 0.0
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=mean_unexpected_map_metric_multi_batch_parameter_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    # Loosened absolute tolerance (0.5 * ATOL) around 0.0; relative tolerance
    # is the module-wide RTOL.
    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL
    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {atol + rtol * abs(parameter_node.value)} tolerance.",
    )
def test_get_parameter_values_for_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """Resolve every fully-qualified parameter name (including "$variables")
    into a flat name -> value mapping and compare it against an exhaustive
    expected snapshot of the fixture's parameter tree.
    """
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=parameter_container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )

    # Convert variables argument to ParameterContainer
    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )

    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # Expected snapshot (verbatim from the fixture); keep auto-formatters off
    # so the literal stays reviewable.
    # fmt: off
    expected_parameter_values_for_fully_qualified_parameter_names: Dict[str, ParameterNode] = {
        "$variables": {
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        },
        "$parameter.weekly_taxi_fairs.mean_values": {
            "value": [
                {"sunday": 71.43, "monday": 74.35, "tuesday": 42.3, "wednesday": 42.3, "thursday": 82.2, "friday": 78.78, "saturday": 91.39},
                {"sunday": 81.43, "monday": 84.35, "tuesday": 52.3, "wednesday": 43.3, "thursday": 22.2, "friday": 98.78, "saturday": 81.39},
                {"sunday": 61.43, "monday": 34.35, "tuesday": 82.3, "wednesday": 72.3, "thursday": 22.2, "friday": 38.78, "saturday": 51.39},
                {"sunday": 51.43, "monday": 64.35, "tuesday": 72.3, "wednesday": 82.3, "thursday": 22.2, "friday": 98.78, "saturday": 31.39},
                {"sunday": 72.43, "monday": 77.35, "tuesday": 46.3, "wednesday": 47.3, "thursday": 88.2, "friday": 79.78, "saturday": 93.39},
                {"sunday": 72.43, "monday": 73.35, "tuesday": 41.3, "wednesday": 49.3, "thursday": 80.2, "friday": 78.78, "saturday": 93.39},
                {"sunday": 74.43, "monday": 78.35, "tuesday": 49.3, "wednesday": 43.3, "thursday": 88.2, "friday": 72.78, "saturday": 97.39},
                {"sunday": 73.43, "monday": 72.35, "tuesday": 40.3, "wednesday": 40.3, "thursday": 89.2, "friday": 77.78, "saturday": 90.39},
                {"sunday": 72.43, "monday": 73.35, "tuesday": 45.3, "wednesday": 44.3, "thursday": 89.2, "friday": 77.78, "saturday": 96.39},
                {"sunday": 75.43, "monday": 74.25, "tuesday": 42.33, "wednesday": 42.23, "thursday": 82.21, "friday": 78.76, "saturday": 91.37},
                {"sunday": 71.43, "monday": 74.37, "tuesday": 42.3, "wednesday": 42.32, "thursday": 82.23, "friday": 78.77, "saturday": 91.49},
                {"sunday": 71.63, "monday": 74.37, "tuesday": 42.2, "wednesday": 42.1, "thursday": 82.29, "friday": 78.79, "saturday": 91.39},
                {"sunday": 71.42, "monday": 74.33, "tuesday": 42.33, "wednesday": 42.34, "thursday": 82.25, "friday": 78.77, "saturday": 91.69},
                {"sunday": 71.44, "monday": 72.35, "tuesday": 42.33, "wednesday": 42.31, "thursday": 82.29, "friday": 78.68, "saturday": 91.49},
                {"sunday": 71.44, "monday": 74.32, "tuesday": 42.32, "wednesday": 42.32, "thursday": 82.29, "friday": 78.77, "saturday": 91.49},
                {"sunday": 71.44, "monday": 74.33, "tuesday": 42.21, "wednesday": 42.31, "thursday": 82.27, "friday": 78.74, "saturday": 91.49},
                {"sunday": 71.33, "monday": 74.25, "tuesday": 42.31, "wednesday": 42.03, "thursday": 82.02, "friday": 78.08, "saturday": 91.38},
                {"sunday": 71.41, "monday": 74.31, "tuesday": 42.39, "wednesday": 42.93, "thursday": 82.92, "friday": 78.75, "saturday": 91.49},
                {"sunday": 72.43, "monday": 73.35, "tuesday": 42.3, "wednesday": 32.3, "thursday": 52.2, "friday": 88.78, "saturday": 81.39},
                {"sunday": 71.43, "monday": 74.35, "tuesday": 32.3, "wednesday": 92.3, "thursday": 72.2, "friday": 74.78, "saturday": 51.39},
                {"sunday": 72.43, "monday": 64.35, "tuesday": 52.3, "wednesday": 42.39, "thursday": 82.28, "friday": 78.77, "saturday": 91.36},
                {"sunday": 81.43, "monday": 94.35, "tuesday": 62.3, "wednesday": 52.3, "thursday": 92.2, "friday": 88.78, "saturday": 51.39},
                {"sunday": 21.43, "monday": 34.35, "tuesday": 42.34, "wednesday": 62.3, "thursday": 52.2, "friday": 98.78, "saturday": 81.39},
                {"sunday": 71.33, "monday": 74.25, "tuesday": 42.13, "wednesday": 42.93, "thursday": 82.82, "friday": 78.78, "saturday": 91.39},
                {"sunday": 72.43, "monday": 73.35, "tuesday": 44.3, "wednesday": 45.3, "thursday": 86.2, "friday": 77.78, "saturday": 98.39},
                {"sunday": 79.43, "monday": 78.35, "tuesday": 47.3, "wednesday": 46.3, "thursday": 85.2, "friday": 74.78, "saturday": 93.39},
                {"sunday": 71.42, "monday": 74.31, "tuesday": 42.0, "wednesday": 42.1, "thursday": 82.23, "friday": 65.78, "saturday": 91.26},
                {"sunday": 91.43, "monday": 84.35, "tuesday": 42.37, "wednesday": 42.36, "thursday": 82.25, "friday": 78.74, "saturday": 91.32},
                {"sunday": 71.33, "monday": 74.45, "tuesday": 42.35, "wednesday": 42.36, "thursday": 82.27, "friday": 26.78, "saturday": 71.39},
                {"sunday": 71.53, "monday": 73.35, "tuesday": 43.32, "wednesday": 42.23, "thursday": 82.32, "friday": 78.18, "saturday": 91.49},
                {"sunday": 71.53, "monday": 74.25, "tuesday": 52.3, "wednesday": 52.3, "thursday": 81.23, "friday": 78.78, "saturday": 78.39},
            ],
            "details": {
                "confidence": "high",
            },
        },
        "$parameter.tolerances.mostly": 0.91,
        "$parameter.tolerances.financial.usd": 1.0,
        "$parameter.monthly_taxi_fairs.mean_values": {
            "value": [
                2.3, 9.8, 42.3, 8.1, 38.5, 53.7, 71.43, 16.34, 49.43, 74.35,
                51.98, 46.42, 20.01, 69.44, 65.32, 8.83, 55.79, 82.2, 36.93,
                83.78, 31.13, 76.93, 67.67, 25.12, 58.04, 79.78, 90.91, 15.26,
                61.65, 78.78, 12.99,
            ],
            "details": {
                "confidence": "low",
            },
        },
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format": {
            "value": "%Y-%m-%d %H:%M:%S %Z",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.yyyy_mm_dd_date_format": {
            "value": "%Y-%m-%d",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.tolerances.max_num_conversion_attempts": 5,
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds": 100,
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format": {
            "value": "%m-%Y-%d %H:%M:%S %Z",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.mm_yyyy_dd_date_format": {
            "value": "%m-%Y-%d",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.daily_taxi_fairs.mean_values": {
            "value": {
                "sunday": 71.43,
                "monday": 74.35,
                "tuesday": 42.3,
                "wednesday": 42.3,
                "thursday": 82.2,
                "friday": 78.78,
                "saturday": 91.39,
            },
            "details": {
                "confidence": "medium",
            },
        },
        "$mean": 0.65,
    }
    # fmt: on

    parameter_values_for_fully_qualified_parameter_names: Dict[
        str, ParameterNode
    ] = get_parameter_values_for_fully_qualified_parameter_names(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    assert (
        parameter_values_for_fully_qualified_parameter_names
        == expected_parameter_values_for_fully_qualified_parameter_names
    )
def test_value_set_multi_batch_parameter_builder_alice_single_batch_string(
    alice_columnar_table_single_batch_context,
):
    """Verify ValueSetMultiBatchParameterBuilder handles non-numeric (string)
    columns: the single Alice batch yields exactly one distinct "user_agent"
    value."""
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_agent"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    builder: ValueSetMultiBatchParameterBuilder = ValueSetMultiBatchParameterBuilder(
        name="my_user_agent_value_set",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=data_context,
    )
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    assert container.parameter_nodes is None or len(container.parameter_nodes) == 1

    expected_value_set: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    ]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {"column": "user_agent"},
                "metric_name": "column.distinct_values",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference="$parameter.my_user_agent_value_set",
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    This tests whether a change to bw_method results in a change to the range.

    Two NumericMetricRangeMultiBatchParameterBuilder runs are compared: one
    with the default KDE bandwidth and one with an explicit bw_method=0.5; the
    resulting lower range bounds must differ.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    # Phase 1: KDE estimator with the default bandwidth method.
    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )
    variables: Optional[ParameterContainer] = None
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"
    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
    )
    # Range estimate produced with the default bandwidth.
    default_bw_method_value: np.ndarray = parameter_node.pop("value")

    # Phase 2: same builder configuration, but with an explicit bw_method=0.5.
    # The earlier names are deliberately rebound for the second run.
    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            bw_method=0.5,
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"
    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
    )
    other_bw_method_value: np.ndarray = parameter_node.pop("value")

    # Differing bandwidths must yield differing lower bounds of the range.
    assert default_bw_method_value[0] != other_bw_method_value[0]
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """KDE-estimated range for "table.row_count" across three taxi batches.

    Checks the metric-configuration details exactly, the estimated range
    within a 1% relative tolerance, and the estimation histogram via a
    two-sample Kolmogorov-Smirnov test against a reference histogram.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="row_count_range",
            metric_name="table.row_count",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=1.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )
    variables: Optional[ParameterContainer] = None
    domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {}
    )
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.row_count_range"
    # "value" is checked separately (numerically); the node itself must match
    # this details-only skeleton.
    expected_value_dict: Dict[str, Optional[str]] = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }
    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        )
    )

    # Pull the stochastic parts out of the node before the exact comparison.
    actual_value: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram"
    )

    assert parameter_node == expected_value_dict

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    expected_estimation_histogram: np.ndarray = np.array(
        [
            13.0,
            155.0,
            719.0,
            1546.0,
            2221.0,
            2570.0,
            1946.0,
            683.0,
            137.0,
            9.0,
        ]
    )

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result: tuple = stats.ks_2samp(
        data1=actual_estimation_histogram, data2=expected_estimation_histogram
    )
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_with_evaluation_dependency_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Exercise NumericMetricRangeMultiBatchParameterBuilder with the "oneshot"
    estimator, sourcing its metric values from an evaluation-dependency
    MetricMultiBatchParameterBuilder ("my_column_min").

    Runs the range computation twice — at false_positive_rate 1% and then 5% —
    and verifies both exact range endpoints, that the higher false-positive
    rate yields a narrower range, and that each estimation histogram matches
    the expected one (Kolmogorov-Smirnov test).
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)
    batch_request: Dict[str, str] = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    metric_domain_kwargs: Dict[str, str] = {"column": "fare_amount"}
    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    # Evaluation dependency: computes "column.min" per batch; the range
    # builder below consumes its output by name ("my_column_min").
    my_column_min_metric_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_column_min",
        metric_name="column.min",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    evaluation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            my_column_min_metric_multi_batch_parameter_builder_config,
        ]
    # First range builder: metric_name=None, so values come exclusively from
    # the "my_column_min" dependency; false_positive_rate is 1%.
    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name=None,
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=1.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))

    variables: Optional[ParameterContainer] = None
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # Sanity check: no parameters exist before the builder runs.
    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    # "value" is popped and checked separately; dict comparison covers metadata.
    expected_value_dict: Dict[str, Optional[str]] = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "fare_amount"
                },
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }
    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))
    actual_values_01: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")
    assert parameter_node == expected_value_dict

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]
    expected_value_01_lower: float = -51.7
    expected_value_01_upper: float = -21.0
    assert actual_value_01_lower == expected_value_01_lower
    assert actual_value_01_upper == expected_value_01_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result: tuple = stats.ks_2samp(data1=actual_estimation_histogram,
                                      data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1

    # Second run: same dependency, but false_positive_rate raised to 5%.
    numeric_metric_range_parameter_builder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=5.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))
    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )
    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))
    actual_values_05 = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")
    assert parameter_node == expected_value_dict

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]
    expected_value_05_lower: float = -50.5
    expected_value_05_upper: float = -21.1
    assert actual_value_05_lower == expected_value_05_lower
    assert actual_value_05_upper == expected_value_05_upper

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result = stats.ks_2samp(data1=actual_estimation_histogram,
                               data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1
def test_partition_parameter_builder_alice_continuous_changed_to_categorical(
    alice_columnar_table_single_batch_context,
):
    """
    Datetime-valued "event_ts" column with bucketize_data=True: the builder
    emits a categorical partition (distinct values + weights) backed by the
    "column.value_counts" metric rather than a continuous histogram.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_domain_kwargs: dict = {"column": "event_ts"}
    target_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_domain_kwargs,
        rule_name="my_rule",
    )

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        data_context=data_context,
    )

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {target_domain.id: container}
    # Nothing should be populated before the builder runs.
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    partition_builder.build_parameters(
        domain=target_domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Three distinct timestamps, each with equal weight (one batch).
    one_third: float = 0.3333333333333333
    expected_parameter_value: dict = {
        "value": {
            "values": [
                "2004-10-19 10:23:54",
                "2004-10-19 10:23:55",
                "2004-10-19 11:05:20",
            ],
            "weights": [one_third, one_third, one_third],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.value_counts",
                "domain_kwargs": {"column": "event_ts"},
                "metric_value_kwargs": {"sort": "value"},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    result_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=target_domain,
        parameter_reference=partition_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    assert result_node == expected_parameter_value
def test_value_set_multi_batch_parameter_builder_alice_single_batch_numeric(
    alice_columnar_table_single_batch_context,
):
    """
    Build the distinct-value set for the numeric "event_type" column over a
    single batch and verify both the value set and the metric details.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    metric_domain_kwargs: dict = {"column": "event_type"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }
    value_set_multi_batch_parameter_builder: ValueSetMultiBatchParameterBuilder = (
        ValueSetMultiBatchParameterBuilder(
            name="my_event_type_value_set",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    # Nothing should be populated before the builder runs.
    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    # BUGFIX: the previous assertion ("is None or len(...) == 1") passed
    # vacuously when build_parameters produced nothing; require exactly one
    # parameter node so the test actually verifies the builder ran.
    assert parameter_container.parameter_nodes is not None
    assert len(parameter_container.parameter_nodes) == 1

    expected_value_set: List[int] = [19, 22, 73]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "event_type"
                },
                "metric_name": "column.distinct_values",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    fully_qualified_parameter_name_for_value: str = "$parameter.my_event_type_value_set"
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    # Distinct-value order is not guaranteed; compare sorted.
    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_partition_parameter_builder_alice_continuous(
    alice_columnar_table_single_batch_context,
):
    """
    Numeric "user_id" column with bucketize_data=True: the builder emits a
    continuous partition (histogram bins, weights, tail weights) backed by the
    "column.histogram" metric.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_domain_kwargs: dict = {"column": "user_id"}
    target_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_domain_kwargs,
        rule_name="my_rule",
    )

    partition_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        data_context=data_context,
    )

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {target_domain.id: container}
    # Nothing should be populated before the builder runs.
    assert container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    partition_builder.build_parameters(
        domain=target_domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Same bin edges appear both in the computed value and in the recorded
    # metric_value_kwargs used to compute the histogram.
    histogram_bins: list = [397433.0, 4942918.5, 9488404.0]
    expected_parameter_value: dict = {
        "value": {
            "bins": histogram_bins,
            "weights": [0.6666666666666666, 0.3333333333333333],
            "tail_weights": [0.0, 0.0],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.histogram",
                "domain_kwargs": {"column": "user_id"},
                "metric_value_kwargs": {"bins": histogram_bins},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    result_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=target_domain,
        parameter_reference=partition_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )
    assert result_node == expected_parameter_value
def _build_profiler(self) -> None:
    """
    Builds "RuleBasedProfiler", corresponding to present DataAssistant use case.

    Starts with empty "RuleBasedProfiler" (initialized in constructor) and adds Rule objects.

    Subclasses can add custom "Rule" objects as appropriate for their respective particular DataAssistant use cases.
    """
    variables: dict = {}

    profiler: Optional[BaseRuleBasedProfiler]
    rules: List[Rule]
    rule: Rule
    domain_builder: DomainBuilder
    parameter_builders: List[ParameterBuilder]
    expectation_configuration_builders: List[ExpectationConfigurationBuilder]

    """
    For each Self-Initializing "Expectation" as specified by "DataAssistant.expectation_kwargs_by_expectation_type" interface property, retrieve its "RuleBasedProfiler" configuration, construct "Rule" object based on it, while incorporating metrics "ParameterBuilder" objects for "MetricDomainTypes", emitted by "DomainBuilder" of comprised "Rule", specified by "DataAssistant.metrics_parameter_builders_by_domain" interface property. Append this "Rule" object to overall DataAssistant "RuleBasedProfiler" object; incorporate "variables" as well.
    """
    expectation_type: str
    expectation_kwargs: Dict[str, Any]
    for (
        expectation_type,
        expectation_kwargs,
    ) in self.expectation_kwargs_by_expectation_type.items():
        # Obtain the Expectation's default Rule-Based Profiler, customized with
        # the supplied kwargs.
        profiler = self._validator.build_rule_based_profiler_for_expectation(
            expectation_type=expectation_type
        )(**expectation_kwargs)
        variables.update(convert_variables_to_dict(variables=profiler.variables))
        rules = profiler.rules
        for rule in rules:
            domain_builder = rule.domain_builder
            # BUGFIX: copy the list instead of aliasing it. The previous code
            # ("rule.parameter_builders or []") made "extend()" below mutate the
            # source Rule's own "parameter_builders" list in place, polluting the
            # source profiler's Rule and duplicating builders if this method were
            # ever run again over the same Rule objects.
            parameter_builders = list(rule.parameter_builders or [])
            parameter_builders.extend(
                self.metrics_parameter_builders_by_domain[
                    Domain(
                        domain_builder.domain_type,
                    )
                ]
            )
            expectation_configuration_builders = (
                rule.expectation_configuration_builders or []
            )
            self.profiler.add_rule(
                rule=Rule(
                    name=rule.name,
                    variables=rule.variables,
                    domain_builder=domain_builder,
                    parameter_builders=parameter_builders,
                    expectation_configuration_builders=expectation_configuration_builders,
                )
            )

    # Merge the variables collected from every Expectation's profiler into this
    # DataAssistant's overall profiler.
    self.profiler.variables = self.profiler.reconcile_profiler_variables(
        variables=variables,
        reconciliation_strategy=DEFAULT_RECONCILATION_DIRECTIVES.variables,
    )
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_false(
    alice_columnar_table_single_batch_context,
):
    """
    A parenthesized condition mixing "$variables" and "$parameter" references
    that evaluates to False must make the builder return no
    ExpectationConfiguration (None).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_kwargs: dict = {"column": "user_id"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    variables: ParameterContainer = build_parameter_container_for_variables(
        {"max_user_id": 999999999999, "answer": 42}
    )

    min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=data_context,
    )
    min_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    # Resolve the concrete minimum value produced by the parameter builder.
    resolved_min: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id.value[0]",
        domain=domain,
        parameters=parameters,
    )

    # Both parenthesized variable clauses and the parameter clause are False.
    condition: str = "($variables.max_user_id<0 | $variables.answer!=42) | $parameter.my_min_user_id.value[0]<0"
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=condition,
            min_value=resolved_min,
            max_value="$variables.max_user_id",
        )
    )

    expectation_configuration: Optional[ExpectationConfiguration] = (
        config_builder.build_expectation_configuration(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
    )
    assert expectation_configuration is None
def _get_domains(
    self,
    variables: Optional[ParameterContainer] = None,
) -> List[Domain]:
    """
    Find the semantic column type for each column and return all domains
    matching the specified type or types.
    """
    requested_semantic_types: List[SemanticDomainTypes] = (
        _parse_semantic_domain_type_argument(semantic_types=self.semantic_types)
    )

    batch_id: str = self.get_batch_id(variables=variables)

    # Structured (storage-level) type information for every column.
    column_types_dict_list: List[Dict[str, Any]] = self.get_validator(
        variables=variables
    ).get_metric(
        metric=MetricConfiguration(
            metric_name="table.column_types",
            metric_domain_kwargs={
                "batch_id": batch_id,
            },
            metric_value_kwargs={
                "include_nested": True,
            },
            metric_dependencies=None,
        )
    )
    table_column_names: List[str] = self.get_validator(
        variables=variables
    ).get_metric(
        metric=MetricConfiguration(
            metric_name="table.columns",
            metric_domain_kwargs={
                "batch_id": batch_id,
            },
            metric_value_kwargs=None,
            metric_dependencies=None,
        )
    )

    # A semantic type is distinguished from the structured column type;
    # An example structured column type would be "integer". The inferred semantic type would be "id".
    inferred_semantic_type_by_column: Dict[str, SemanticDomainTypes] = {
        name: self.infer_semantic_domain_type_from_table_column_type(
            column_types_dict_list=column_types_dict_list,
            column_name=name,
        ).semantic_domain_type
        for name in table_column_names
    }

    # Emit one COLUMN domain per column whose inferred semantic type matches,
    # preserving the table's column order.
    return [
        Domain(
            domain_type=self.domain_type,
            domain_kwargs={
                "column": name,
            },
            details={
                "inferred_semantic_domain_type": inferred_semantic_type_by_column[
                    name
                ],
            },
        )
        for name in table_column_names
        if inferred_semantic_type_by_column[name] in requested_semantic_types
    ]
def test_meta_not_dict_exception(
    alice_columnar_table_single_batch_context,
):
    """
    Passing a non-dict "meta" argument to DefaultExpectationConfigurationBuilder
    must raise ProfilerExecutionError with a descriptive type message.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_kwargs: dict = {"column": "user_id"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: container}

    min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=data_context,
    )
    min_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    min_value_node: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=domain,
        parameters=parameters,
    )

    condition = None
    max_user_id: int = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=condition,
            min_value=min_value_node.value[0],
            max_value=max_user_id,
            meta="Strings are not acceptable",
        )

    assert (
        str(e.value)
        == 'Argument "Strings are not acceptable" in "DefaultExpectationConfigurationBuilder" must be of type "dictionary" (value of type "<class \'str\'>" was encountered).\n'
    )
def test_column_values_nonnull_multi_batch_one_column_not_emitted(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    MapMetricColumnDomainBuilder with "column_values.nonnull" over three monthly
    batches should emit a COLUMN domain for each of the 17 compliant columns,
    leaving one column out.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_pandas",
        data_connector_name="monthly",
        data_asset_name="my_reports",
    )
    domain_builder: MapMetricColumnDomainBuilder = MapMetricColumnDomainBuilder(
        map_metric_name="column_values.nonnull",
        max_unexpected_values=0,
        max_unexpected_ratio=None,
        min_max_unexpected_values_proportion=9.75e-1,
        data_context=data_context,
    )
    domains: List[Domain] = domain_builder.get_domains(
        rule_name="my_rule", batch_request=batch_request
    )

    # Unit Tests for "inferred_semantic_domain_type" are provided separately.
    emitted_domain: Domain
    for emitted_domain in domains:
        emitted_domain.details = {}

    def by_column(candidate: Domain) -> str:
        # Sort key: the column each domain targets.
        return candidate.domain_kwargs["column"]

    domains = sorted(domains, key=by_column)

    bobby_compliant_column_names: List[str] = [
        "VendorID",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "RatecodeID",
        "store_and_fwd_flag",
        "PULocationID",
        "DOLocationID",
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "total_amount",
    ]
    bobby_expected_column_domains: List[Domain] = sorted(
        (
            Domain(
                domain_type=MetricDomainTypes.COLUMN,
                domain_kwargs={
                    "column": compliant_column,
                },
                rule_name="my_rule",
            )
            for compliant_column in bobby_compliant_column_names
        ),
        key=by_column,
    )

    assert len(domains) == 17
    assert domains == bobby_expected_column_domains