コード例 #1
0
def test_get_parameter_value_by_fully_qualified_parameter_name_valid_parameter_name(
    column_Age_domain,
    column_Date_domain,
    rule_with_parameters,
    variables_multi_part_name_parameter_container,
    domain_name,
    fully_qualified_parameter_name,
    value,
    value_accessor,
    details,
    use_value_suffix,
    test_details,
):
    """
    Verify that a valid fully-qualified parameter name resolves to the expected value (and, optionally, details).

    The "domain_name" argument selects the domain fixture: "age" or "date"; anything else raises "ValueError".
    The name used for the value lookup is "fully_qualified_parameter_name", followed by ".value" when
    "use_value_suffix" is truthy, followed by "value_accessor" (with "None" treated as an empty string).
    When "test_details" is truthy, the ".details" entry of the same parameter is compared against "details".
    """
    # Reject unsupported domain selectors up front, so that the remainder of the test reads linearly.
    if domain_name not in ("age", "date"):
        raise ValueError(
            f'Supported "domain_name" parameter values are "age" and "date".')

    domain = column_Age_domain if domain_name == "age" else column_Date_domain

    # A missing accessor contributes nothing to the lookup name.
    value_accessor = "" if value_accessor is None else value_accessor

    fully_qualified_name_for_value = (
        f"{fully_qualified_parameter_name}.value{value_accessor}"
        if use_value_suffix else
        f"{fully_qualified_parameter_name}{value_accessor}")

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_name_for_value,
        domain=domain,
        variables=variables_multi_part_name_parameter_container,
        parameters=rule_with_parameters.parameters,
    )
    assert actual_value == value

    if not test_details:
        return

    fully_qualified_name_for_details = (
        f"{fully_qualified_parameter_name}.details")

    actual_details = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_name_for_details,
        domain=domain,
        variables=variables_multi_part_name_parameter_container,
        parameters=rule_with_parameters.parameters,
    )
    assert actual_details == details
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    Exercise "SimpleDateFormatStringParameterBuilder" with caller-supplied candidate strings and threshold.

    The builder is configured for the "pickup_datetime" column; after "_build_parameters()" runs, exactly one
    parameter node must exist, and looking it up by name must yield the expected format string together with
    its "success_ratio" detail.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }
    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    candidate_strings: set[str] = {
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    }
    threshold: float = 0.9

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_strings=candidate_strings,
        threshold=threshold,
        data_context=data_context,
        batch_request=batch_request,
    )

    # The supplied candidates must be stored as given and must differ from the builder's "CANDIDATE_STRINGS".
    assert builder.CANDIDATE_STRINGS != candidate_strings
    assert builder._candidate_strings == candidate_strings
    assert builder._threshold == 0.9

    domain: Domain = Domain(domain_type=MetricDomainTypes.COLUMN,
                            domain_kwargs=metric_domain_kwargs)
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)

    # Nothing has been computed yet.
    assert parameter_container.parameter_nodes is None

    builder._build_parameters(parameter_container=parameter_container,
                              domain=domain)

    assert len(parameter_container.parameter_nodes) == 1

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=
        "$parameter.my_simple_date_format_string_parameter_builder",
        domain=domain,
        parameters={domain.id: parameter_container},
    )

    assert actual_value == {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {
            "success_ratio": 1.0
        },
    }
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context, ):
    """
    Exercise "SimpleDateFormatStringParameterBuilder" without caller-supplied candidate strings or threshold.

    The builder is configured for the "event_ts" column.  The test checks the builder's attributes when no
    overrides are given ("CANDIDATE_STRINGS" equal to "DEFAULT_CANDIDATE_STRINGS", "candidate_strings" of "None",
    "_threshold" of 1.0), and then that "_build_parameters()" yields a single parameter node with the expected
    format string and "success_ratio" detail.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    metric_domain_kwargs = {"column": "event_ts"}
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=data_context,
        batch_request=batch_request,
    )

    assert builder.CANDIDATE_STRINGS == DEFAULT_CANDIDATE_STRINGS
    assert builder.candidate_strings is None
    assert builder._threshold == 1.0

    domain: Domain = Domain(domain_type=MetricDomainTypes.COLUMN,
                            domain_kwargs=metric_domain_kwargs)
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)

    # Nothing has been computed yet.
    assert parameter_container.parameter_nodes is None

    builder._build_parameters(parameter_container=parameter_container,
                              domain=domain)

    # noinspection PyTypeChecker
    assert len(parameter_container.parameter_nodes) == 1

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_date_format",
        domain=domain,
        parameters={domain.id: parameter_container},
    )

    assert actual_value == {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {
            "success_ratio": 1.0
        },
    }
コード例 #4
0
def test_get_parameter_value_by_fully_qualified_parameter_name_invalid_parameter_name(
    column_Age_domain,
    variables_multi_part_name_parameter_container,
    rule_with_parameters,
):
    """
    Looking up the name "mean" must raise "ProfilerExecutionError" whose message mentions that names
    "start with $".
    """
    expected_message_pattern = r".+start with \$.*"

    with pytest.raises(ge_exceptions.ProfilerExecutionError,
                       match=expected_message_pattern):
        # The return value is irrelevant: the call itself is expected to raise.
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="mean",
            domain=column_Age_domain,
            variables=variables_multi_part_name_parameter_container,
            parameters=rule_with_parameters.parameters,
        )
コード例 #5
0
ファイル: util.py プロジェクト: rpatil524/great_expectations
def get_parameter_value(
    domain: Optional[Domain] = None,
    parameter_reference: Optional[Union[Any, str]] = None,
    variables: Optional[ParameterContainer] = None,
    parameters: Optional[Dict[str, ParameterContainer]] = None,
) -> Optional[Any]:
    """
    Resolve "parameter_reference", which may be a literal object or a fully-qualified parameter name.

    Resolution rules (applied recursively):
      - "dict": every value is resolved and written back under its key.  NOTE: the dictionary is updated in place
        and the very same object is returned, so callers that must keep the original intact should pass a copy.
      - "list" / "set" / "tuple" (including named tuples): every element is resolved and a new container of the
        same type is returned.
      - "str" in the fully-qualified parameter name format: the name is looked up (using "domain", "variables" and
        "parameters"), and the looked-up value is itself resolved, so that references nested inside it are
        evaluated as well.
      - anything else (including "None" and ordinary strings): returned unchanged.

    Args:
        domain: Domain passed through to the fully-qualified parameter name lookup.
        parameter_reference: Object or fully-qualified parameter name to resolve.
        variables: Variables container passed through to the lookup.
        parameters: Parameter containers passed through to the lookup.

    Returns:
        The resolved value, as described above.
    """

    def _resolve(reference: Any) -> Any:
        # Recursive step: same lookup context, different reference.
        return get_parameter_value(
            domain=domain,
            parameter_reference=reference,
            variables=variables,
            parameters=parameters,
        )

    if isinstance(parameter_reference, dict):
        # Only existing keys are re-assigned, hence the dictionary size is stable during iteration.
        for key, value in parameter_reference.items():
            parameter_reference[key] = _resolve(value)
        return parameter_reference

    if isinstance(parameter_reference, (list, set, tuple)):
        parameter_reference_type: type = type(parameter_reference)
        resolved_elements: list = [
            _resolve(element) for element in parameter_reference
        ]
        # Named tuple constructors take their fields as separate positional arguments (not one iterable).
        if isinstance(parameter_reference, tuple) and hasattr(
                parameter_reference_type, "_fields"):
            return parameter_reference_type(*resolved_elements)
        return parameter_reference_type(resolved_elements)

    if isinstance(
            parameter_reference,
            str) and is_fully_qualified_parameter_name_literal_string_format(
                fully_qualified_parameter_name=parameter_reference):
        looked_up_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=parameter_reference,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        # The looked-up value may itself contain fully-qualified parameter names.
        return _resolve(looked_up_value)

    return parameter_reference
コード例 #6
0
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    Check that passing "bw_method" to the "kde" estimator changes the estimated range.

    The "column_min_range" parameter is built twice from scratch (once without "bw_method", once with
    "bw_method=0.5"); the first elements of the two resulting "value" entries must differ.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    def _estimate_range(**estimator_overrides):
        # Builds the parameter against a fresh domain/container pair and returns the node's "value" entry.
        builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            estimator="kde",
            false_positive_rate=5.0e-2,
            round_decimals=0,
            json_serialize=False,
            data_context=data_context,
            **estimator_overrides,
        )

        domain: Domain = Domain(
            rule_name="my_rule",
            domain_type=MetricDomainTypes.TABLE,
        )
        parameter_container: ParameterContainer = ParameterContainer(
            parameter_nodes=None)
        parameters: Dict[str, ParameterContainer] = {
            domain.id: parameter_container,
        }

        assert parameter_container.parameter_nodes is None

        builder.build_parameters(
            domain=domain,
            variables=None,
            parameters=parameters,
            batch_request=batch_request,
        )

        parameter_nodes: Optional[Dict[str, ParameterNode]] = (
            parameter_container.parameter_nodes or {})
        assert len(parameter_nodes) == 1

        parameter_node: ParameterNode = (
            get_parameter_value_by_fully_qualified_parameter_name(
                fully_qualified_parameter_name=
                fully_qualified_parameter_name_for_value,
                domain=domain,
                parameters=parameters,
            ))

        return parameter_node.pop("value")

    default_bw_method_value: np.ndarray = _estimate_range()
    other_bw_method_value: np.ndarray = _estimate_range(bw_method=0.5)

    assert default_bw_method_value[0] != other_bw_method_value[0]
コード例 #7
0
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    Estimate the "table.row_count" range with the "kde" estimator and validate the resulting parameter node.

    Checks, in order: a single parameter node is produced; its content other than the range and the estimation
    histogram (metric configuration and number of batches) matches exactly; the estimated range is within 1% of
    the expected one; and a two-sample Kolmogorov-Smirnov test between the actual and expected estimation
    histograms yields a p-value above 0.95.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        estimator="kde",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=data_context,
    )

    domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    range_builder.build_parameters(
        domain=domain,
        variables=None,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.row_count_range",
            domain=domain,
            parameters=parameters,
        ))

    # The numeric parts are verified separately (with tolerances); hence, they are taken out of the node, and
    # the "value" slot is blanked, so that the remainder can be compared for exact equality.
    actual_value: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=
        f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    expected_estimation_histogram: np.ndarray = np.array([
        13.0,
        155.0,
        719.0,
        1546.0,
        2221.0,
        2570.0,
        1946.0,
        683.0,
        137.0,
        9.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    p_value: float = stats.ks_2samp(data1=actual_estimation_histogram,
                                    data2=expected_estimation_histogram)[1]
    assert p_value > 9.5e-1
コード例 #8
0
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    Estimate the "column.min" range of "fare_amount" with the "oneshot" estimator at two false positive rates.

    The parameter is first built with a false positive rate of 1.0e-2, and then rebuilt into the same parameter
    container (with "recompute_existing_parameter_values=True") at 5.0e-2.  Each pass checks the node's metadata,
    the exact range bounds, and the estimation histogram; finally, the range obtained at the higher false positive
    rate must lie strictly inside the range obtained at the lower one.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    def _make_builder(false_positive_rate):
        # The two passes differ only in the false positive rate.
        return NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=false_positive_rate,
            round_decimals=1,
            json_serialize=False,
            data_context=data_context,
        )

    def _pop_range_and_histogram():
        # Takes the numeric parts out of the stored node and checks that the remainder matches the expectation.
        parameter_node: ParameterNode = (
            get_parameter_value_by_fully_qualified_parameter_name(
                fully_qualified_parameter_name=
                fully_qualified_parameter_name_for_value,
                domain=domain,
                parameters=parameters,
            ))

        value_range = parameter_node.pop("value")
        parameter_node["value"] = None

        estimation_histogram = parameter_node.details.pop(
            "estimation_histogram")

        assert parameter_node == expected_value_dict

        return value_range, estimation_histogram

    def _assert_estimation_histogram_as_expected(estimation_histogram):
        expected_estimation_histogram: np.ndarray = np.array([
            1.0,
            0.0,
            0.0,
            0.0,
            0.0,
            0.0,
            0.0,
            0.0,
            0.0,
            2.0,
        ])

        # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
        p_value: float = stats.ks_2samp(data1=estimation_histogram,
                                        data2=expected_estimation_histogram)[1]
        assert p_value > 9.5e-1

    numeric_metric_range_parameter_builder: ParameterBuilder = _make_builder(
        false_positive_rate=1.0e-2)

    variables: Optional[ParameterContainer] = None

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    expected_value_dict: dict = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "fare_amount"
                },
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    # First pass: false positive rate of 1.0e-2.
    actual_values_01, actual_estimation_histogram = _pop_range_and_histogram()

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]

    assert actual_value_01_lower == -51.7
    assert actual_value_01_upper == -21.0

    _assert_estimation_histogram_as_expected(actual_estimation_histogram)

    # Second pass: false positive rate of 5.0e-2, recomputed into the same parameter container.
    numeric_metric_range_parameter_builder = _make_builder(
        false_positive_rate=5.0e-2)

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )

    actual_values_05, actual_estimation_histogram = _pop_range_and_histogram()

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]

    assert actual_value_05_lower == -50.5
    assert actual_value_05_upper == -21.1

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    _assert_estimation_histogram_as_expected(actual_estimation_histogram)
コード例 #9
0
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Build a regex pattern parameter for the "VendorID" column of the last batch ("index": -1), supplying a single
    candidate regex together with a threshold of 0.9, and compare the stored parameter node against the expected
    value, evaluated regexes and success ratio.
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    metric_domain_kwargs: dict = {"column": "VendorID"}
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    candidate_regexes: Set[str] = {
        r"^\d{3}$",  # won't match
    }

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        threshold=0.9,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    # Nothing has been computed yet.
    assert parameter_container.parameter_nodes is None

    builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=(
            "$parameter.my_regex_pattern_string_parameter_builder"
        ),
        domain=domain,
        parameters=parameters,
    )

    expected_value: dict = {
        "value": "(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}",
        "details": {
            "evaluated_regexes": {
                r"\d+": 1.0,
                r"-?\d+": 1.0,
                r"-?\d+(\.\d*)?": 1.0,
                r"[A-Za-z0-9\.,;:!?()\"'%\-]+": 1.0,
                r"^\s+": 0.0,
                r"\s+$": 0.0,
                r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)": 0.0,
                r"<\/?(?:p|a|b|img)(?: \/)?>": 0.0,
                r"(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}": 0.0,
                r"(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}": 1.0,
                r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    assert actual_value == expected_value
コード例 #10
0
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """
    Build a regex pattern parameter for the "VendorID" column of the last batch ("index": -1) from three candidate
    regexes with a threshold of 0.9, then verify the builder's attributes and the stored parameter node (selected
    regex, per-regex evaluation ratios and success ratio).
    """
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    metric_domain_kwargs: dict = {"column": "VendorID"}

    candidate_regexes: List[str] = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]
    threshold: float = 0.9

    regex_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            threshold=threshold,
            data_context=data_context,
        )
    )

    # The supplied candidates must be exposed as given and must differ from the builder's "CANDIDATE_REGEX".
    assert regex_parameter.CANDIDATE_REGEX != candidate_regexes
    assert regex_parameter.candidate_regexes == candidate_regexes
    assert regex_parameter.threshold == 0.9

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    regex_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": r"^\d{1}$",
        "details": {
            "evaluated_regexes": {
                r"^\d{1}$": 1.0,
                r"^[12]{1}$": 0.9941111111111111,
                r"^\d{4}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    results = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )
    assert results is not None
    # The expected value is a single string; compare it directly.  Wrapping both sides in "sorted()" would compare
    # only the sorted characters of the two strings, so any rearrangement of the expected regex would also pass.
    assert results["value"] == expected_value["value"]
    assert results["details"] == expected_value["details"]
コード例 #11
0
def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """
    Build a regex pattern parameter for the "id" column from three candidate regexes (no explicit threshold) and
    compare the stored parameter node against the expected regex, per-regex evaluation ratios and success ratio.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    metric_domain_kwargs = {"column": "id"}
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    candidate_regexes: List[str] = [
        r"^\d{1}$",
        r"^\d{2}$",
        r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
    ]

    builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        data_context=data_context,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    # Nothing has been computed yet.
    assert parameter_container.parameter_nodes is None

    builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    actual_value = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=(
            "$parameter.my_regex_pattern_string_parameter_builder"
        ),
        domain=domain,
        parameters=parameters,
    )

    expected_value: dict = {
        "value": r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0.0,
                r"^\d{2}$": 0.0,
            },
            "success_ratio": 1.0,
        },
    }

    assert actual_value == expected_value
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_true(
    alice_columnar_table_single_batch_context,
):
    """
    Build an "expect_column_values_to_be_between" configuration whose condition combines variables and a
    parameter using parentheses.

    The "column.min" metric of "user_id" is computed into "$parameter.my_min_user_id"; its first value is supplied
    as "min_value", while "max_value" refers to "$variables.max_user_id".  The resulting expectation configuration
    must carry a "min_value" of 397433.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    metric_domain_kwargs: dict = {"column": "user_id"}
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    min_user_id_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=data_context,
    )

    variables: ParameterContainer = build_parameter_container_for_variables(
        {"max_user_id": 999999999999, "answer": 42}
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {domain.id: parameter_container}

    min_user_id_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    # The literal minimum (not a parameter reference) is handed to the expectation configuration builder.
    min_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id.value[0]",
        domain=domain,
        parameters=parameters,
    )

    configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition="($variables.max_user_id>0 & $variables.answer==42) | $parameter.my_min_user_id.value[0]<0",
        min_value=min_value,
        max_value="$variables.max_user_id",
    )

    expectation_configuration: Optional[
        ExpectationConfiguration
    ] = configuration_builder.build_expectation_configuration(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
def test_condition_not_string_exception(
    alice_columnar_table_single_batch_context,
):
    """Constructing the builder with a non-string ``condition`` must fail.

    A "column.min" parameter is first built for the "user_id" column so that
    a parameter-derived ``min_value`` is available; a ``dict`` is then passed
    as ``condition`` and the test checks that
    ``DefaultExpectationConfigurationBuilder`` raises
    ``ProfilerExecutionError`` with the exact expected message.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_kwargs: dict = {"column": "user_id"}

    min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    # The builder writes its result into the container registered under the domain id.
    parameters_by_domain: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    min_builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain,
        batch_request=request,
    )

    min_user_id_node: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=column_domain,
        parameters=parameters_by_domain,
    )

    # Deliberately the wrong type: "condition" is expected to be a string.
    bad_condition: dict = {"condition": "$variables.tolerance<0.8"}
    upper_bound: int = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as exc_info:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=bad_condition,
            min_value=min_user_id_node.value[0],
            max_value=upper_bound,
        )

    expected_message: str = 'Argument "{\'condition\': \'$variables.tolerance<0.8\'}" in "DefaultExpectationConfigurationBuilder" must be of type "string" (value of type "<class \'dict\'>" was encountered).\n'
    assert str(exc_info.value) == expected_message
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context,
):
    """Build an expectation configuration with ``condition=None``.

    The "column.min" parameter for "user_id" is computed separately by a
    ``MetricMultiBatchParameterBuilder``; its first value is then handed to
    ``DefaultExpectationConfigurationBuilder`` as a literal ``min_value``.
    The resulting configuration must carry ``min_value == 397433``.
    """
    context: DataContext = alice_columnar_table_single_batch_context

    request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_kwargs: dict = {"column": "user_id"}

    min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_min_user_id",
        metric_name="column.min",
        metric_domain_kwargs=column_kwargs,
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=column_kwargs,
        rule_name="my_rule",
    )
    # The builder writes its result into the container registered under the domain id.
    parameters_by_domain: Dict[str, ParameterContainer] = {
        column_domain.id: ParameterContainer(parameter_nodes=None),
    }

    min_builder.build_parameters(
        domain=column_domain,
        parameters=parameters_by_domain,
        batch_request=request,
    )

    min_user_id_node: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_min_user_id",
        domain=column_domain,
        parameters=parameters_by_domain,
    )

    # No condition at all: this is the case under test.
    no_condition: Optional[str] = None
    upper_bound: int = 999999999999

    configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=no_condition,
        min_value=min_user_id_node.value[0],
        max_value=upper_bound,
    )

    built_configuration: Optional[
        ExpectationConfiguration
    ] = configuration_builder.build_expectation_configuration(
        domain=column_domain,
        parameters=parameters_by_domain,
    )

    assert built_configuration.kwargs["min_value"] == 397433
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_vs_bootstrap_marginal_info_at_boundary(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    This tests whether kde gives a wider estimate for the max.

    The "column.max" range of "fare_amount" is estimated twice, first with the
    "bootstrap" estimator and then with the "kde" estimator; element [1] of
    the kde value must exceed element [1] of the bootstrap value.
    """
    context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    taxi_batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    no_variables: Optional[ParameterContainer] = None

    fare_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "fare_amount"},
        rule_name="my_rule",
    )

    def _column_max_range(estimator: str) -> "np.ndarray":
        """Build "column_max_range" with ``estimator`` and return its popped "value"."""
        range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
            name="column_max_range",
            metric_name="column.max",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator=estimator,
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=context,
        )

        # Each estimator run gets its own, initially empty, container.
        container: ParameterContainer = ParameterContainer(parameter_nodes=None)
        by_domain: Dict[str, ParameterContainer] = {fare_domain.id: container}

        assert container.parameter_nodes is None

        range_builder.build_parameters(
            domain=fare_domain,
            variables=no_variables,
            parameters=by_domain,
            batch_request=taxi_batch_request,
        )

        built_nodes: Optional[Dict[str, ParameterNode]] = (
            container.parameter_nodes or {}
        )
        assert len(built_nodes) == 1

        node: ParameterNode = get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.column_max_range",
            domain=fare_domain,
            parameters=by_domain,
        )
        return node.pop("value")

    # Same order as before: bootstrap first, then kde.
    bootstrap_value = _column_max_range("bootstrap")
    kde_value = _column_max_range("kde")

    assert kde_value[1] > bootstrap_value[1]