def test_get_parameter_value_by_fully_qualified_parameter_name_valid_parameter_name(
    column_Age_domain,
    column_Date_domain,
    rule_with_parameters,
    variables_multi_part_name_parameter_container,
    domain_name,
    fully_qualified_parameter_name,
    value,
    value_accessor,
    details,
    use_value_suffix,
    test_details,
):
    """Look up a valid fully-qualified parameter name and compare with the expected result.

    ``domain_name`` ("age" or "date") selects which domain fixture is used; any other
    value raises ``ValueError``.  The name that is resolved is
    ``fully_qualified_parameter_name`` followed by ".value" (only when
    ``use_value_suffix`` is truthy) and by ``value_accessor`` (``None`` counts as an
    empty accessor).  When ``test_details`` is truthy, the ".details" entry is resolved
    as well and compared with ``details``.
    """
    if domain_name not in ("age", "date"):
        raise ValueError('Supported "domain_name" parameter values are "age" and "date".')
    domain = column_Age_domain if domain_name == "age" else column_Date_domain

    accessor_text = "" if value_accessor is None else value_accessor
    value_suffix = ".value" if use_value_suffix else ""
    name_for_value = f"{fully_qualified_parameter_name}{value_suffix}{accessor_text}"

    resolved_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=name_for_value,
        domain=domain,
        variables=variables_multi_part_name_parameter_container,
        parameters=rule_with_parameters.parameters,
    )
    assert resolved_value == value

    if not test_details:
        return

    resolved_details = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=f"{fully_qualified_parameter_name}.details",
        domain=domain,
        variables=variables_multi_part_name_parameter_container,
        parameters=rule_with_parameters.parameters,
    )
    assert resolved_details == details
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Build a date-format parameter for "pickup_datetime" from explicit candidates.

    Checks that the builder stores the supplied candidate strings and threshold, that
    building fills an initially empty parameter container with exactly one node, and
    that the resolved parameter equals the expected value/details dictionary.
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context
    column_kwargs = {"column": "pickup_datetime"}
    formats_to_try = {
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    }

    builder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_strings=formats_to_try,
        threshold=0.9,
        data_context=context,
        batch_request={
            "datasource_name": "taxi_pandas",
            "data_connector_name": "monthly",
            "data_asset_name": "my_reports",
        },
    )

    # The explicit candidates must be kept separately from the class-level set.
    assert builder.CANDIDATE_STRINGS != formats_to_try
    assert builder._candidate_strings == formats_to_try
    assert builder._threshold == 0.9

    container = ParameterContainer(parameter_nodes=None)
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )
    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)
    assert len(container.parameter_nodes) == 1

    resolved = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_simple_date_format_string_parameter_builder",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    assert resolved == {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Build a date-format parameter for "event_ts" without explicit candidates.

    Checks that the builder exposes the default candidate strings, has no
    instance-level candidates and a threshold of 1.0, and that building fills an
    initially empty container with one node resolving to the expected dictionary.
    """
    context = alice_columnar_table_single_batch_context
    single_batch_request = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "event_ts"}

    builder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
        batch_request=single_batch_request,
    )

    assert builder.CANDIDATE_STRINGS == DEFAULT_CANDIDATE_STRINGS
    assert builder.candidate_strings is None
    assert builder._threshold == 1.0

    container = ParameterContainer(parameter_nodes=None)
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
    )
    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    resolved = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_date_format",
        domain=column_domain,
        parameters={column_domain.id: container},
    )
    assert resolved == {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }
def test_get_parameter_value_by_fully_qualified_parameter_name_invalid_parameter_name(
    column_Age_domain,
    variables_multi_part_name_parameter_container,
    rule_with_parameters,
):
    """Resolving the name "mean" (no leading "$") must raise ``ProfilerExecutionError``.

    The error message is additionally required to match the "start with $" pattern.
    """
    expected_failure = pytest.raises(
        ge_exceptions.ProfilerExecutionError,
        match=r".+start with \$.*",
    )
    with expected_failure:
        # Only the raised exception matters; the return value is deliberately discarded.
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="mean",
            domain=column_Age_domain,
            variables=variables_multi_part_name_parameter_container,
            parameters=rule_with_parameters.parameters,
        )
def get_parameter_value(
    domain: Optional[Domain] = None,
    parameter_reference: Optional[Union[Any, str]] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[Any]:
    """Resolve ``parameter_reference`` into a concrete value.

    The reference may be an arbitrary object or a fully-qualified parameter name:

    * a ``dict`` has each of its values resolved recursively; the results are written
      back into the SAME dictionary object, which is then returned (the argument is
      mutated in place);
    * a ``list``, ``set`` or ``tuple`` yields a new container of the same type holding
      the recursively resolved elements;
    * a string in fully-qualified parameter name format is looked up, and the looked-up
      value is itself resolved recursively;
    * anything else is returned unchanged.

    Args:
        domain: Domain forwarded to the fully-qualified name lookup.
        parameter_reference: Object or fully-qualified parameter name to resolve.
        variables: Variables container forwarded to the lookup.
        parameters: Parameter containers forwarded to the lookup.

    Returns:
        The resolved value, or ``parameter_reference`` itself when nothing applies.
    """

    def _resolve(reference: Any) -> Optional[Any]:
        # Recurse with the same lookup context; only the reference changes.
        return get_parameter_value(
            domain=domain,
            parameter_reference=reference,
            variables=variables,
            parameters=parameters,
        )

    if isinstance(parameter_reference, dict):
        # Keys are only re-assigned (never added or removed), so updating the
        # dictionary while iterating over its items is safe.
        for key, nested_reference in parameter_reference.items():
            parameter_reference[key] = _resolve(nested_reference)
        return parameter_reference

    if isinstance(parameter_reference, (list, set, tuple)):
        container_type: type = type(parameter_reference)
        return container_type([_resolve(item) for item in parameter_reference])

    names_a_parameter: bool = isinstance(
        parameter_reference, str
    ) and is_fully_qualified_parameter_name_literal_string_format(
        fully_qualified_parameter_name=parameter_reference
    )
    if not names_a_parameter:
        return parameter_reference

    looked_up_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=parameter_reference,
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    # The looked-up value may itself contain parameter names, so resolve it again.
    return _resolve(looked_up_value)
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Verify that passing a different "bw_method" to the "kde" estimator changes the
    first element of the estimated "column.min" range.
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    three_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    column_kwargs = {"column": "fare_amount"}

    def _estimate_min_range(**bandwidth_options):
        """Build "column_min_range" with the given extra builder options; return its value."""
        range_builder = NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=column_kwargs,
            estimator="kde",
            false_positive_rate=5.0e-2,
            round_decimals=0,
            json_serialize=False,
            data_context=context,
            **bandwidth_options,
        )
        table_domain = Domain(
            rule_name="my_rule",
            domain_type=MetricDomainTypes.TABLE,
        )
        container = ParameterContainer(parameter_nodes=None)
        per_domain_parameters = {table_domain.id: container}
        assert container.parameter_nodes is None

        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=per_domain_parameters,
            batch_request=three_batch_request,
        )
        assert len(container.parameter_nodes or {}) == 1

        node = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.column_min_range",
            domain=table_domain,
            parameters=per_domain_parameters,
        )
        return node.pop("value")

    default_bandwidth_range = _estimate_min_range()
    custom_bandwidth_range = _estimate_min_range(bw_method=0.5)

    assert default_bandwidth_range[0] != custom_bandwidth_range[0]
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Estimate the "table.row_count" range with the "kde" estimator and validate it.

    The resolved node (minus its value and estimation histogram) must equal the
    expected metadata, the value must lie within 1% of the reference range, and the
    estimation histogram must pass a two-sample KS comparison with the reference one.
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    three_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        estimator="kde",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=context,
    )

    table_domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    container = ParameterContainer(parameter_nodes=None)
    per_domain_parameters = {table_domain.id: container}
    assert container.parameter_nodes is None

    range_builder.build_parameters(
        domain=table_domain,
        variables=None,
        parameters=per_domain_parameters,
        batch_request=three_batch_request,
    )
    assert len(container.parameter_nodes or {}) == 1

    node = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.row_count_range",
        domain=table_domain,
        parameters=per_domain_parameters,
    )

    # The numeric parts are checked separately below, so take them out of the node
    # and compare only the remaining metadata exactly.
    observed_range = node.pop("value")
    node["value"] = None
    observed_histogram = node.details.pop("estimation_histogram")
    assert node == {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    reference_range = np.array([6180, 10277])

    # assert_allclose accepts differences up to: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html").
    # A relative tolerance of 1% with no absolute slack is used for the kde result.
    relative_tolerance = 1.0e-2
    absolute_tolerance = 0
    np.testing.assert_allclose(
        actual=observed_range,
        desired=reference_range,
        rtol=relative_tolerance,
        atol=absolute_tolerance,
        err_msg=f"Actual value of {observed_range} differs from expected value of {reference_range} by more than {absolute_tolerance + relative_tolerance * abs(reference_range)} tolerance.",
    )

    reference_histogram = np.array(
        [
            13.0,
            155.0,
            719.0,
            1546.0,
            2221.0,
            2570.0,
            1946.0,
            683.0,
            137.0,
            9.0,
        ]
    )

    # The second element of the KS result is used as the p-value; a high p-value
    # means no significant difference between actual and reference histograms.
    p_value = stats.ks_2samp(data1=observed_histogram, data2=reference_histogram)[1]
    assert p_value > 9.5e-1
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Estimate the "column.min" range of "fare_amount" with the "oneshot" estimator.

    The range is built twice for the same domain and parameters: first with a false
    positive rate of 1%, then (recomputing existing values) with 5%.  Both results are
    compared with exact expected bounds, the 5% range must lie strictly inside the 1%
    range, and each estimation histogram is compared with a reference histogram.
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context
    three_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    column_kwargs = {"column": "fare_amount"}
    range_parameter_name = "$parameter.column_min_range"

    # Expected node content once the value is blanked and the histogram removed.
    expected_node = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {"column": "fare_amount"},
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    def _make_builder(false_positive_rate):
        """Create the "oneshot" range builder for the given false positive rate."""
        return NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=column_kwargs,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=false_positive_rate,
            round_decimals=1,
            json_serialize=False,
            data_context=context,
        )

    def _take_range_and_histogram():
        """Resolve the node, detach its numeric parts and check the remaining metadata."""
        node = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=range_parameter_name,
            domain=column_domain,
            parameters=per_domain_parameters,
        )
        bounds = node.pop("value")
        node["value"] = None
        histogram = node.details.pop("estimation_histogram")
        assert node == expected_node
        return bounds, histogram

    def _assert_histogram_matches_reference(histogram):
        """Require a high KS p-value between ``histogram`` and the reference histogram."""
        reference_histogram = np.array(
            [
                1.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                0.0,
                2.0,
            ]
        )
        p_value = stats.ks_2samp(data1=histogram, data2=reference_histogram)[1]
        assert p_value > 9.5e-1

    # ---- first pass: false positive rate of 1% ----
    range_builder = _make_builder(1.0e-2)

    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container = ParameterContainer(parameter_nodes=None)
    per_domain_parameters = {column_domain.id: container}
    assert container.parameter_nodes is None

    range_builder.build_parameters(
        domain=column_domain,
        variables=None,
        parameters=per_domain_parameters,
        batch_request=three_batch_request,
    )
    assert len(container.parameter_nodes or {}) == 1

    bounds_01, histogram_01 = _take_range_and_histogram()
    lower_01 = bounds_01[0]
    upper_01 = bounds_01[1]
    assert lower_01 == -51.7
    assert upper_01 == -21.0
    _assert_histogram_matches_reference(histogram_01)

    # ---- second pass: false positive rate of 5%, recomputed over the same parameters ----
    range_builder = _make_builder(5.0e-2)
    range_builder.build_parameters(
        domain=column_domain,
        variables=None,
        parameters=per_domain_parameters,
        recompute_existing_parameter_values=True,
        batch_request=three_batch_request,
    )

    bounds_05, histogram_05 = _take_range_and_histogram()
    lower_05 = bounds_05[0]
    upper_05 = bounds_05[1]
    assert lower_05 == -50.5
    assert upper_05 == -21.1

    # if false positive rate is higher, our range should be more narrow
    assert lower_01 < lower_05
    assert upper_01 > upper_05

    _assert_histogram_matches_reference(histogram_05)
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Build a regex parameter for "VendorID" from one candidate that is not expected to match.

    The resolved parameter must equal the expected dictionary below, whose
    "evaluated_regexes" lists a different set of patterns than the single candidate
    supplied here (presumably the builder's built-in candidates -- confirm against
    the builder implementation).
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context
    last_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }
    column_kwargs = {"column": "VendorID"}

    regex_builder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes={
            r"^\d{3}$",  # won't match
        },
        threshold=0.9,
        data_context=context,
    )

    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container = ParameterContainer(parameter_nodes=None)
    per_domain_parameters = {column_domain.id: container}
    assert container.parameter_nodes is None

    regex_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=last_batch_request,
    )

    resolved = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=per_domain_parameters,
    )
    assert resolved == {
        "value": "(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}",
        "details": {
            "evaluated_regexes": {
                r"\d+": 1.0,
                r"-?\d+": 1.0,
                r"-?\d+(\.\d*)?": 1.0,
                r"[A-Za-z0-9\.,;:!?()\"'%\-]+": 1.0,
                r"^\s+": 0.0,
                r"\s+$": 0.0,
                r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)": 0.0,
                r"<\/?(?:p|a|b|img)(?: \/)?>": 0.0,
                r"(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}": 0.0,
                r"(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}": 1.0,
                r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0.0,
            },
            "success_ratio": 1.0,
        },
    }
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Build a regex parameter for "VendorID" from three explicit candidate patterns.

    Checks that the builder keeps the supplied candidates and threshold, and that the
    resolved parameter reports the expected chosen pattern and per-pattern ratios.
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context
    last_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }
    column_kwargs = {"column": "VendorID"}
    patterns = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]

    regex_builder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=patterns,
        threshold=0.9,
        data_context=context,
    )

    assert regex_builder.CANDIDATE_REGEX != patterns
    assert regex_builder.candidate_regexes == patterns
    assert regex_builder.threshold == 0.9

    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container = ParameterContainer(parameter_nodes=None)
    per_domain_parameters = {column_domain.id: container}
    assert container.parameter_nodes is None

    regex_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=last_batch_request,
    )

    expected_pattern = r"^\d{1}$"
    expected_details = {
        "evaluated_regexes": {
            r"^\d{1}$": 1.0,
            r"^[12]{1}$": 0.9941111111111111,
            r"^\d{4}$": 0.0,
        },
        "success_ratio": 1.0,
    }

    resolved = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=per_domain_parameters,
    )
    assert resolved is not None
    # The value is compared after sorting both sides, as in the original assertion.
    assert sorted(resolved["value"]) == sorted(expected_pattern)
    assert resolved["details"] == expected_details
def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Build a regex parameter for the "id" column from three explicit candidates.

    The resolved parameter must name the dash-separated 8-4-4-4-12 pattern as the
    value, with a ratio of 1.0 for it and 0.0 for the two digit-only patterns.
    """
    context = alice_columnar_table_single_batch_context
    single_batch_request = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "id"}

    regex_builder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=column_kwargs,
        candidate_regexes=[
            r"^\d{1}$",
            r"^\d{2}$",
            r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
        ],
        data_context=context,
    )

    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    container = ParameterContainer(parameter_nodes=None)
    per_domain_parameters = {column_domain.id: container}
    assert container.parameter_nodes is None

    regex_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=single_batch_request,
    )

    resolved = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_regex_pattern_string_parameter_builder",
        domain=column_domain,
        parameters=per_domain_parameters,
    )
    assert resolved == {
        "value": r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0.0,
                r"^\d{2}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_true(
    alice_columnar_table_single_batch_context,
):
    """Build an expectation configuration under a parenthesised variable/parameter condition.

    The minimum "user_id" is computed as a parameter and passed as ``min_value``; the
    condition mixes two variables (grouped in parentheses) with that parameter.  The
    built configuration must carry a ``min_value`` of 397433.
    """
    context = alice_columnar_table_single_batch_context
    single_batch_request = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "user_id"}

    min_user_id_builder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    rule_variables = build_parameter_container_for_variables(
        {"max_user_id": 999999999999, "answer": 42}
    )

    container = ParameterContainer(parameter_nodes=None)
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    per_domain_parameters = {column_domain.id: container}

    min_user_id_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=single_batch_request,
    )

    min_user_id = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id.value[0]",
        domain=column_domain,
        parameters=per_domain_parameters,
    )

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition="($variables.max_user_id>0 & $variables.answer==42) | $parameter.my_min_user_id.value[0]<0",
        min_value=min_user_id,
        max_value="$variables.max_user_id",
    )

    built_configuration = configuration_builder.build_expectation_configuration(
        domain=column_domain,
        variables=rule_variables,
        parameters=per_domain_parameters,
    )

    assert built_configuration.kwargs["min_value"] == 397433
def test_condition_not_string_exception(
    alice_columnar_table_single_batch_context,
):
    """Passing a ``dict`` as ``condition`` must raise ``ProfilerExecutionError``.

    The minimum "user_id" parameter is built first so that a real ``min_value`` can be
    supplied; the exact text of the raised error is then verified.
    """
    context = alice_columnar_table_single_batch_context
    single_batch_request = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "user_id"}

    min_user_id_builder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    container = ParameterContainer(parameter_nodes=None)
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    per_domain_parameters = {column_domain.id: container}

    min_user_id_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=single_batch_request,
    )

    min_user_id_node = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=column_domain,
        parameters=per_domain_parameters,
    )

    # Deliberately the wrong type: a dictionary where a string is required.
    invalid_condition = {"condition": "$variables.tolerance<0.8"}
    upper_user_id = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as raised:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=invalid_condition,
            min_value=min_user_id_node.value[0],
            max_value=upper_user_id,
        )

    # Checked after the "with" block; statements following the raising call inside
    # the block would never execute.
    assert (
        str(raised.value)
        == 'Argument "{\'condition\': \'$variables.tolerance<0.8\'}" in "DefaultExpectationConfigurationBuilder" must be of type "string" (value of type "<class \'dict\'>" was encountered).\n'
    )
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context,
):
    """Build an expectation configuration with ``condition=None``.

    The minimum "user_id" parameter is built separately beforehand and its first value
    is passed as ``min_value``; the built configuration must report 397433 for it.
    """
    context = alice_columnar_table_single_batch_context
    single_batch_request = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }
    column_kwargs = {"column": "user_id"}

    min_user_id_builder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    container = ParameterContainer(parameter_nodes=None)
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    per_domain_parameters = {column_domain.id: container}

    min_user_id_builder.build_parameters(
        domain=column_domain,
        parameters=per_domain_parameters,
        batch_request=single_batch_request,
    )

    min_user_id_node = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=column_domain,
        parameters=per_domain_parameters,
    )

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=None,
        min_value=min_user_id_node.value[0],
        max_value=999999999999,
    )

    built_configuration = configuration_builder.build_expectation_configuration(
        domain=column_domain,
        parameters=per_domain_parameters,
    )

    assert built_configuration.kwargs["min_value"] == 397433
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_vs_bootstrap_marginal_info_at_boundary(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Verify that, for the "column.max" range of "fare_amount", the "kde" estimator
    produces a larger second element (upper end of the range) than "bootstrap".
    """
    context = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    three_batch_request = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    column_domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "fare_amount"},
        rule_name="my_rule",
    )

    def _max_range_using(estimator_name):
        """Build "column_max_range" with the named estimator into a fresh container."""
        range_builder = NumericMetricRangeMultiBatchParameterBuilder(
            name="column_max_range",
            metric_name="column.max",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator=estimator_name,
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=context,
        )
        container = ParameterContainer(parameter_nodes=None)
        per_domain_parameters = {column_domain.id: container}
        assert container.parameter_nodes is None

        range_builder.build_parameters(
            domain=column_domain,
            variables=None,
            parameters=per_domain_parameters,
            batch_request=three_batch_request,
        )
        assert len(container.parameter_nodes or {}) == 1

        node = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.column_max_range",
            domain=column_domain,
            parameters=per_domain_parameters,
        )
        return node.pop("value")

    bootstrap_range = _max_range_using("bootstrap")
    kde_range = _max_range_using("kde")

    assert kde_range[1] > bootstrap_range[1]