def test_regex_pattern_string_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    candidate_regexes: List[str] = [
        r"^\d{1}$",
        r"^\d{2}$",
        r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$",
    ]
    metric_domain_kwargs = {"column": "id"}

    regex_pattern_string_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            data_context=data_context,
            batch_request=batch_request,
        )
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )
    assert parameter_container.parameter_nodes is None

    regex_pattern_string_parameter._build_parameters(
        parameter_container=parameter_container, domain=domain
    )
    fully_qualified_parameter_name_for_value: str = "$parameter.my_regex"
    expected_value: dict = {
        "value": [r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$"],
        "details": {"success_ratio": [1.0]},
    }
    expected_value: dict = {
        "value": [r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$"],
        "details": {
            "evaluated_regexes": {
                r"^\S{8}-\S{4}-\S{4}-\S{4}-\S{12}$": 1.0,
                r"^\d{1}$": 0,
                r"^\d{2}$": 0,
            },
            "threshold": 1.0,
        },
    }

    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters={domain.id: parameter_container},
        )
        == expected_value
    )
Exemple #2
0
def test_execution_mean_table_columns_set_match_multi_batch_parameter_builder(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    mean_table_columns_set_match_multi_batch_parameter_builder: ParameterBuilder = (
        MeanTableColumnsSetMatchMultiBatchParameterBuilder(
            name=
            "my_mean_table_columns_set_match_multi_batch_parameter_builder",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            json_serialize=True,
            data_context=data_context,
        ))

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        rule_name="my_rule",
    )

    variables: Optional[ParameterContainer] = None

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    expected_parameter_value: dict = {
        "value": {
            "VendorID",
            "pickup_datetime",
            "total_amount",
            "congestion_surcharge",
            "dropoff_datetime",
            "mta_tax",
            "store_and_fwd_flag",
            "tip_amount",
            "trip_distance",
            "payment_type",
            "DOLocationID",
            "improvement_surcharge",
            "extra",
            "tolls_amount",
            "RatecodeID",
            "passenger_count",
            "PULocationID",
            "fare_amount",
        },
        "details": {
            "success_ratio": 1.0,
        },
    }

    mean_table_columns_set_match_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=
        mean_table_columns_set_match_multi_batch_parameter_builder.
        fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert len(
        parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY]) == len(
            expected_parameter_value[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY])

    parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY] = set(
        parameter_node[FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY])
    assert parameter_node == expected_parameter_value
def test_regex_pattern_string_parameter_builder_bobby_multiple_matches(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )
    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: List[str] = [
        r"^\d{1}$",  # will match
        r"^[12]{1}$",  # will match 0.9941111111 of the time
        r"^\d{4}$",  # won't match
    ]
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    regex_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            threshold=threshold,
            data_context=data_context,
            batch_request=batch_request,
        )
    )

    assert regex_parameter.CANDIDATE_REGEX != candidate_regexes
    assert regex_parameter.candidate_regexes == candidate_regexes
    assert regex_parameter.threshold == 0.9

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )

    assert parameter_container.parameter_nodes is None

    regex_parameter._build_parameters(
        parameter_container=parameter_container, domain=domain
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": [r"^[12]{1}$", r"^\d{1}$"],
        "details": {
            "evaluated_regexes": {
                r"^\d{1}$": 1.0,
                r"^[12]{1}$": 0.9941111111111111,
                r"^\d{4}$": 0,
            },
            "threshold": 0.9,
        },
    }

    results = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters={domain.id: parameter_container},
    )

    assert results is not None
    assert sorted(results["value"]) == sorted(expected_value["value"])
    assert results["details"] == expected_value["details"]
def test_regex_pattern_string_parameter_builder_bobby_no_match(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )
    metric_domain_kwargs: dict = {"column": "VendorID"}
    candidate_regexes: Set[str] = {
        r"^\d{3}$",  # won't match
    }
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
        "data_connector_query": {"index": -1},
    }

    regex_parameter: RegexPatternStringParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            threshold=threshold,
            data_context=data_context,
            batch_request=batch_request,
        )
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN, domain_kwargs=metric_domain_kwargs
    )

    assert parameter_container.parameter_nodes is None

    regex_parameter._build_parameters(
        parameter_container=parameter_container, domain=domain
    )

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder"
    )
    expected_value: dict = {
        "value": [],
        "details": {
            "evaluated_regexes": {
                r"/\d+/": 0,
                r"/-?\d+/": 0,
                r"/-?\d+(\.\d*)?/": 0,
                r"/[A-Za-z0-9\.,;:!?()\"'%\-]+/": 0,
                r"^\s+/": 0,
                r"\s+/$": 0,
                r"/https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#()?&//=]*)/": 0,
                r"/<\/?(?:p|a|b|img)(?: \/)?>/": 0,
                r"/(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})(?:.(?:25[0-5]|2[0-4]\d|[01]\d{2}|\d{1,2})){3}/": 0,
                r"/(?:[A-Fa-f0-9]){0,4}(?: ?:? ?(?:[A-Fa-f0-9]){0,4}){0,7}/": 0,
                r"\b[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}-[0-5][0-9a-fA-F]{3}-[089ab][0-9a-fA-F]{3}-\b[0-9a-fA-F]{12}\b ": 0,
            },
            "threshold": 0.9,
        },
    }

    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters={domain.id: parameter_container},
        )
        == expected_value
    )
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context, ):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: Optional[str] = None
    max_user_id: int = 999999999999

    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value.value[0],
        max_value=max_user_id,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration] = default_expectation_configuration_builder.build_expectation_configuration(
            domain=domain,
            parameters=parameters,
        )

    assert expectation_configuration.kwargs["min_value"] == 397433
def test_regex_two_candidates(mock_data_context: mock.MagicMock,
                              batch_fixture: Batch):
    batch: Batch = batch_fixture

    mock_data_context.get_batch_list.return_value = [batch]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch])
    data_context: DataContext = mock_data_context

    metric_domain_kwargs: dict = {"column": "b"}
    candidate_regexes: List[str] = [r"^\d{1}$", r"^\d{3}$"]

    regex_pattern_string_parameter_builder: ParameterBuilder = (
        RegexPatternStringParameterBuilder(
            name="my_regex_pattern_string_parameter_builder",
            metric_domain_kwargs=metric_domain_kwargs,
            candidate_regexes=candidate_regexes,
            data_context=data_context,
        ))

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    regex_pattern_string_parameter_builder.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_list=[batch],
    )
    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_regex_pattern_string_parameter_builder.value")

    expected_value: str = "^\\d{1}$"

    assert (get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    ) == expected_value)
    fully_qualified_parameter_name_for_meta: str = (
        "$parameter.my_regex_pattern_string_parameter_builder.details")
    expected_meta: dict = {
        "evaluated_regexes": {
            "^\\d{1}$": 1.0,
            "^\\d{3}$": 0.0
        },
        "success_ratio": 1.0,
    }
    meta: dict = get_parameter_value_and_validate_return_type(
        parameter_reference=fully_qualified_parameter_name_for_meta,
        expected_return_type=dict,
        domain=domain,
        parameters=parameters,
    )

    assert meta == expected_meta
Exemple #7
0
def test_value_set_multi_batch_parameter_builder_bobby_string(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs_for_parameter_builder: str = "$domain.domain_kwargs"
    value_set_multi_batch_parameter_builder: ValueSetMultiBatchParameterBuilder = (
        ValueSetMultiBatchParameterBuilder(
            name="my_store_and_fwd_flag_value_set",
            metric_domain_kwargs=metric_domain_kwargs_for_parameter_builder,
            data_context=data_context,
        ))

    metric_domain_kwargs: dict = {"column": "store_and_fwd_flag"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    assert (parameter_container.parameter_nodes is None
            or len(parameter_container.parameter_nodes) == 1)

    expected_value_set: List[str] = ["N", "Y"]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "metric_name": "column.distinct_values",
                "domain_kwargs": {
                    "column": "store_and_fwd_flag"
                },
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    fully_qualified_parameter_name_for_value: str = (
        "$parameter.my_store_and_fwd_flag_value_set")
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_numeric_dependencies_evaluated_separately(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    my_total_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=False,
        data_context=data_context,
    )
    my_null_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=False,
        data_context=data_context,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name=
            "my_passenger_count_values_not_null_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            json_serialize=False,
            data_context=data_context,
        ))

    metric_domain_kwargs: dict = {"column": "passenger_count"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    variables: Optional[ParameterContainer] = None

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    my_total_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    my_null_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    expected_parameter_value: float = 0.0

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=
        mean_unexpected_map_metric_multi_batch_parameter_builder.
        fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL
    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=
        f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {atol + rtol * abs(parameter_node.value)} tolerance.",
    )
Exemple #9
0
def test_get_parameter_values_for_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=parameter_container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
        rule_name="my_rule",
    )
    # Convert variables argument to ParameterContainer
    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # fmt: off
    expected_parameter_values_for_fully_qualified_parameter_names: Dict[str, ParameterNode] = {
        "$variables": {
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        },
        "$parameter.weekly_taxi_fairs.mean_values": {
            "value": [
                {
                    "sunday": 71.43,
                    "monday": 74.35,
                    "tuesday": 42.3,
                    "wednesday": 42.3,
                    "thursday": 82.2,
                    "friday": 78.78,
                    "saturday": 91.39,
                },
                {
                    "sunday": 81.43,
                    "monday": 84.35,
                    "tuesday": 52.3,
                    "wednesday": 43.3,
                    "thursday": 22.2,
                    "friday": 98.78,
                    "saturday": 81.39,
                },
                {
                    "sunday": 61.43,
                    "monday": 34.35,
                    "tuesday": 82.3,
                    "wednesday": 72.3,
                    "thursday": 22.2,
                    "friday": 38.78,
                    "saturday": 51.39,
                },
                {
                    "sunday": 51.43,
                    "monday": 64.35,
                    "tuesday": 72.3,
                    "wednesday": 82.3,
                    "thursday": 22.2,
                    "friday": 98.78,
                    "saturday": 31.39,
                },
                {
                    "sunday": 72.43,
                    "monday": 77.35,
                    "tuesday": 46.3,
                    "wednesday": 47.3,
                    "thursday": 88.2,
                    "friday": 79.78,
                    "saturday": 93.39,
                },
                {
                    "sunday": 72.43,
                    "monday": 73.35,
                    "tuesday": 41.3,
                    "wednesday": 49.3,
                    "thursday": 80.2,
                    "friday": 78.78,
                    "saturday": 93.39,
                },
                {
                    "sunday": 74.43,
                    "monday": 78.35,
                    "tuesday": 49.3,
                    "wednesday": 43.3,
                    "thursday": 88.2,
                    "friday": 72.78,
                    "saturday": 97.39,
                },
                {
                    "sunday": 73.43,
                    "monday": 72.35,
                    "tuesday": 40.3,
                    "wednesday": 40.3,
                    "thursday": 89.2,
                    "friday": 77.78,
                    "saturday": 90.39,
                },
                {
                    "sunday": 72.43,
                    "monday": 73.35,
                    "tuesday": 45.3,
                    "wednesday": 44.3,
                    "thursday": 89.2,
                    "friday": 77.78,
                    "saturday": 96.39,
                },
                {
                    "sunday": 75.43,
                    "monday": 74.25,
                    "tuesday": 42.33,
                    "wednesday": 42.23,
                    "thursday": 82.21,
                    "friday": 78.76,
                    "saturday": 91.37,
                },
                {
                    "sunday": 71.43,
                    "monday": 74.37,
                    "tuesday": 42.3,
                    "wednesday": 42.32,
                    "thursday": 82.23,
                    "friday": 78.77,
                    "saturday": 91.49,
                },
                {
                    "sunday": 71.63,
                    "monday": 74.37,
                    "tuesday": 42.2,
                    "wednesday": 42.1,
                    "thursday": 82.29,
                    "friday": 78.79,
                    "saturday": 91.39,
                },
                {
                    "sunday": 71.42,
                    "monday": 74.33,
                    "tuesday": 42.33,
                    "wednesday": 42.34,
                    "thursday": 82.25,
                    "friday": 78.77,
                    "saturday": 91.69,
                },
                {
                    "sunday": 71.44,
                    "monday": 72.35,
                    "tuesday": 42.33,
                    "wednesday": 42.31,
                    "thursday": 82.29,
                    "friday": 78.68,
                    "saturday": 91.49,
                },
                {
                    "sunday": 71.44,
                    "monday": 74.32,
                    "tuesday": 42.32,
                    "wednesday": 42.32,
                    "thursday": 82.29,
                    "friday": 78.77,
                    "saturday": 91.49,
                },
                {
                    "sunday": 71.44,
                    "monday": 74.33,
                    "tuesday": 42.21,
                    "wednesday": 42.31,
                    "thursday": 82.27,
                    "friday": 78.74,
                    "saturday": 91.49,
                },
                {
                    "sunday": 71.33,
                    "monday": 74.25,
                    "tuesday": 42.31,
                    "wednesday": 42.03,
                    "thursday": 82.02,
                    "friday": 78.08,
                    "saturday": 91.38,
                },
                {
                    "sunday": 71.41,
                    "monday": 74.31,
                    "tuesday": 42.39,
                    "wednesday": 42.93,
                    "thursday": 82.92,
                    "friday": 78.75,
                    "saturday": 91.49,
                },
                {
                    "sunday": 72.43,
                    "monday": 73.35,
                    "tuesday": 42.3,
                    "wednesday": 32.3,
                    "thursday": 52.2,
                    "friday": 88.78,
                    "saturday": 81.39,
                },
                {
                    "sunday": 71.43,
                    "monday": 74.35,
                    "tuesday": 32.3,
                    "wednesday": 92.3,
                    "thursday": 72.2,
                    "friday": 74.78,
                    "saturday": 51.39,
                },
                {
                    "sunday": 72.43,
                    "monday": 64.35,
                    "tuesday": 52.3,
                    "wednesday": 42.39,
                    "thursday": 82.28,
                    "friday": 78.77,
                    "saturday": 91.36,
                },
                {
                    "sunday": 81.43,
                    "monday": 94.35,
                    "tuesday": 62.3,
                    "wednesday": 52.3,
                    "thursday": 92.2,
                    "friday": 88.78,
                    "saturday": 51.39,
                },
                {
                    "sunday": 21.43,
                    "monday": 34.35,
                    "tuesday": 42.34,
                    "wednesday": 62.3,
                    "thursday": 52.2,
                    "friday": 98.78,
                    "saturday": 81.39,
                },
                {
                    "sunday": 71.33,
                    "monday": 74.25,
                    "tuesday": 42.13,
                    "wednesday": 42.93,
                    "thursday": 82.82,
                    "friday": 78.78,
                    "saturday": 91.39,
                },
                {
                    "sunday": 72.43,
                    "monday": 73.35,
                    "tuesday": 44.3,
                    "wednesday": 45.3,
                    "thursday": 86.2,
                    "friday": 77.78,
                    "saturday": 98.39,
                },
                {
                    "sunday": 79.43,
                    "monday": 78.35,
                    "tuesday": 47.3,
                    "wednesday": 46.3,
                    "thursday": 85.2,
                    "friday": 74.78,
                    "saturday": 93.39,
                },
                {
                    "sunday": 71.42,
                    "monday": 74.31,
                    "tuesday": 42.0,
                    "wednesday": 42.1,
                    "thursday": 82.23,
                    "friday": 65.78,
                    "saturday": 91.26,
                },
                {
                    "sunday": 91.43,
                    "monday": 84.35,
                    "tuesday": 42.37,
                    "wednesday": 42.36,
                    "thursday": 82.25,
                    "friday": 78.74,
                    "saturday": 91.32,
                },
                {
                    "sunday": 71.33,
                    "monday": 74.45,
                    "tuesday": 42.35,
                    "wednesday": 42.36,
                    "thursday": 82.27,
                    "friday": 26.78,
                    "saturday": 71.39,
                },
                {
                    "sunday": 71.53,
                    "monday": 73.35,
                    "tuesday": 43.32,
                    "wednesday": 42.23,
                    "thursday": 82.32,
                    "friday": 78.18,
                    "saturday": 91.49,
                },
                {
                    "sunday": 71.53,
                    "monday": 74.25,
                    "tuesday": 52.3,
                    "wednesday": 52.3,
                    "thursday": 81.23,
                    "friday": 78.78,
                    "saturday": 78.39,
                },
            ],
            "details": {
                "confidence": "high",
            },
        },
        "$parameter.tolerances.mostly": 0.91,
        "$parameter.tolerances.financial.usd": 1.0,
        "$parameter.monthly_taxi_fairs.mean_values": {
            "value": [
                2.3,
                9.8,
                42.3,
                8.1,
                38.5,
                53.7,
                71.43,
                16.34,
                49.43,
                74.35,
                51.98,
                46.42,
                20.01,
                69.44,
                65.32,
                8.83,
                55.79,
                82.2,
                36.93,
                83.78,
                31.13,
                76.93,
                67.67,
                25.12,
                58.04,
                79.78,
                90.91,
                15.26,
                61.65,
                78.78,
                12.99,
            ],
            "details": {
                "confidence": "low",
            },
        },
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format": {
            "value": "%Y-%m-%d %H:%M:%S %Z",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.yyyy_mm_dd_date_format": {
            "value": "%Y-%m-%d",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.tolerances.max_num_conversion_attempts": 5,
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds": 100,
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format": {
            "value": "%m-%Y-%d %H:%M:%S %Z",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.date_strings.mm_yyyy_dd_date_format": {
            "value": "%m-%Y-%d",
            "details": {
                "confidence": 0.78,
            },
        },
        "$parameter.daily_taxi_fairs.mean_values": {
            "value": {
                "sunday": 71.43,
                "monday": 74.35,
                "tuesday": 42.3,
                "wednesday": 42.3,
                "thursday": 82.2,
                "friday": 78.78,
                "saturday": 91.39,
            },
            "details": {
                "confidence": "medium",
            },
        },
        "$mean": 0.65,
    }
    # fmt: on

    parameter_values_for_fully_qualified_parameter_names: Dict[
        str, ParameterNode
    ] = get_parameter_values_for_fully_qualified_parameter_names(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )
    assert (
        parameter_values_for_fully_qualified_parameter_names
        == expected_parameter_values_for_fully_qualified_parameter_names
    )
Exemple #10
0
def test_value_set_multi_batch_parameter_builder_alice_single_batch_string(
    alice_columnar_table_single_batch_context, ):
    """
    What does this test and why?
    This tests that non-numeric columns are handled appropriately,
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_agent"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    value_set_multi_batch_parameter_builder: ValueSetMultiBatchParameterBuilder = (
        ValueSetMultiBatchParameterBuilder(
            name="my_user_agent_value_set",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    assert (parameter_container.parameter_nodes is None
            or len(parameter_container.parameter_nodes) == 1)

    expected_value_set: List[str] = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    ]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "user_agent"
                },
                "metric_name": "column.distinct_values",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    fully_qualified_parameter_name_for_value: str = "$parameter.my_user_agent_value_set"
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    This tests whether a change to bw_method results in a change to the range
    """

    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        ))

    variables: Optional[ParameterContainer] = None

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    default_bw_method_value: np.ndarray = parameter_node.pop("value")

    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            bw_method=0.5,
            false_positive_rate=5.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        ))

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    other_bw_method_value: np.ndarray = parameter_node.pop("value")

    assert default_bw_method_value[0] != other_bw_method_value[0]
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="row_count_range",
            metric_name="table.row_count",
            metric_multi_batch_parameter_builder_name=None,
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="kde",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=1.0e-2,
            round_decimals=0,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        ))

    variables: Optional[ParameterContainer] = None

    domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    fully_qualified_parameter_name_for_value: str = "$parameter.row_count_range"
    expected_value_dict: Dict[str, Optional[str]] = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    actual_value: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None

    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == expected_value_dict

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=
        f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    expected_estimation_histogram: np.ndarray = np.array([
        13.0,
        155.0,
        719.0,
        1546.0,
        2221.0,
        2570.0,
        1946.0,
        683.0,
        137.0,
        9.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result: tuple = stats.ks_2samp(data1=actual_estimation_histogram,
                                      data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1
def test_oneshot_numeric_metric_range_multi_batch_parameter_builder_with_evaluation_dependency_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: Dict[str, str] = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: Dict[str, str] = {"column": "fare_amount"}

    fully_qualified_parameter_name_for_value: str = "$parameter.column_min_range"

    my_column_min_metric_multi_batch_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="MetricMultiBatchParameterBuilder",
        name="my_column_min",
        metric_name="column.min",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
    )
    evaluation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            my_column_min_metric_multi_batch_parameter_builder_config,
        ]
    numeric_metric_range_parameter_builder: ParameterBuilder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name=None,
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=1.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))

    variables: Optional[ParameterContainer] = None

    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    parameter_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(parameter_nodes) == 1

    expected_value_dict: Dict[str, Optional[str]] = {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "fare_amount"
                },
                "metric_name": "column.min",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    actual_values_01: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None

    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == expected_value_dict

    actual_value_01_lower: float = actual_values_01[0]
    actual_value_01_upper: float = actual_values_01[1]
    expected_value_01_lower: float = -51.7
    expected_value_01_upper: float = -21.0

    assert actual_value_01_lower == expected_value_01_lower
    assert actual_value_01_upper == expected_value_01_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result: tuple = stats.ks_2samp(data1=actual_estimation_histogram,
                                      data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1

    numeric_metric_range_parameter_builder = (
        NumericMetricRangeMultiBatchParameterBuilder(
            name="column_min_range",
            metric_name="column.min",
            metric_multi_batch_parameter_builder_name="my_column_min",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            estimator="oneshot",
            include_estimator_samples_histogram_in_details=True,
            false_positive_rate=5.0e-2,
            round_decimals=1,
            evaluation_parameter_builder_configs=
            evaluation_parameter_builder_configs,
            data_context=data_context,
        ))

    numeric_metric_range_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        recompute_existing_parameter_values=True,
        batch_request=batch_request,
    )

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=
            fully_qualified_parameter_name_for_value,
            domain=domain,
            parameters=parameters,
        ))

    actual_values_05 = parameter_node.pop("value")
    parameter_node["value"] = None

    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == expected_value_dict

    actual_value_05_lower: float = actual_values_05[0]
    actual_value_05_upper: float = actual_values_05[1]
    expected_value_05_lower: float = -50.5
    expected_value_05_upper: float = -21.1

    assert actual_value_05_lower == expected_value_05_lower
    assert actual_value_05_upper == expected_value_05_upper

    # if false positive rate is higher, our range should be more narrow
    assert actual_value_01_lower < actual_value_05_lower
    assert actual_value_01_upper > actual_value_05_upper

    expected_estimation_histogram: np.ndarray = np.array([
        1.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        0.0,
        2.0,
    ])

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    ks_result = stats.ks_2samp(data1=actual_estimation_histogram,
                               data2=expected_estimation_histogram)
    p_value: float = ks_result[1]
    assert p_value > 9.5e-1
def test_partition_parameter_builder_alice_continuous_changed_to_categorical(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    parameter_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        data_context=data_context,
    )

    metric_domain_kwargs: dict = {"column": "event_ts"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    expected_parameter_value: dict = {
        "value": {
            "values": [
                "2004-10-19 10:23:54",
                "2004-10-19 10:23:55",
                "2004-10-19 11:05:20",
            ],
            "weights": [0.3333333333333333, 0.3333333333333333, 0.3333333333333333],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.value_counts",
                "domain_kwargs": {"column": "event_ts"},
                "metric_value_kwargs": {"sort": "value"},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=parameter_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert parameter_node == expected_parameter_value
Exemple #15
0
def test_value_set_multi_batch_parameter_builder_alice_single_batch_numeric(
    alice_columnar_table_single_batch_context, ):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "event_type"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    value_set_multi_batch_parameter_builder: ValueSetMultiBatchParameterBuilder = (
        ValueSetMultiBatchParameterBuilder(
            name="my_event_type_value_set",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    value_set_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    assert (parameter_container.parameter_nodes is None
            or len(parameter_container.parameter_nodes) == 1)

    expected_value_set: List[int] = [19, 22, 73]
    expected_parameter_value: dict = {
        "value": expected_value_set,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {
                    "column": "event_type"
                },
                "metric_name": "column.distinct_values",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    fully_qualified_parameter_name_for_value: str = "$parameter.my_event_type_value_set"
    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=fully_qualified_parameter_name_for_value,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert sorted(parameter_node.value) == expected_parameter_value["value"]
    assert parameter_node.details == expected_parameter_value["details"]
def test_partition_parameter_builder_alice_continuous(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    parameter_builder: ParameterBuilder = PartitionParameterBuilder(
        name="my_name",
        bucketize_data=True,
        data_context=data_context,
    )

    metric_domain_kwargs: dict = {"column": "user_id"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    variables: Optional[ParameterContainer] = None
    parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    expected_parameter_value: dict = {
        "value": {
            "bins": [397433.0, 4942918.5, 9488404.0],
            "weights": [0.6666666666666666, 0.3333333333333333],
            "tail_weights": [0.0, 0.0],
        },
        "details": {
            "metric_configuration": {
                "metric_name": "column.histogram",
                "domain_kwargs": {"column": "user_id"},
                "metric_value_kwargs": {"bins": [397433.0, 4942918.5, 9488404.0]},
                "metric_dependencies": None,
            },
            "num_batches": 1,
        },
    }

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=parameter_builder.fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    assert parameter_node == expected_parameter_value
Exemple #17
0
    def _build_profiler(self) -> None:
        """
        Builds "RuleBasedProfiler", corresponding to present DataAssistant use case.

        Starts with empty "RuleBasedProfiler" (initialized in constructor) and adds Rule objects.

        Subclasses can add custom "Rule" objects as appropriate for their respective particular DataAssistant use cases.
        """
        variables: dict = {}

        profiler: Optional[BaseRuleBasedProfiler]
        rules: List[Rule]
        rule: Rule
        domain_builder: DomainBuilder
        parameter_builders: List[ParameterBuilder]
        expectation_configuration_builders: List[ExpectationConfigurationBuilder]

        """
        For each Self-Initializing "Expectation" as specified by "DataAssistant.expectation_kwargs_by_expectation_type"
        interface property, retrieve its "RuleBasedProfiler" configuration, construct "Rule" object based on it, while
        incorporating metrics "ParameterBuilder" objects for "MetricDomainTypes", emitted by "DomainBuilder"
        of comprised "Rule", specified by "DataAssistant.metrics_parameter_builders_by_domain" interface property.
        Append this "Rule" object to overall DataAssistant "RuleBasedProfiler" object; incorporate "variables" as well.
        """
        expectation_type: str
        expectation_kwargs: Dict[str, Any]
        for (
            expectation_type,
            expectation_kwargs,
        ) in self.expectation_kwargs_by_expectation_type.items():
            profiler = self._validator.build_rule_based_profiler_for_expectation(
                expectation_type=expectation_type
            )(**expectation_kwargs)
            variables.update(convert_variables_to_dict(variables=profiler.variables))
            rules = profiler.rules
            for rule in rules:
                domain_builder = rule.domain_builder
                parameter_builders = rule.parameter_builders or []
                parameter_builders.extend(
                    self.metrics_parameter_builders_by_domain[
                        Domain(
                            domain_builder.domain_type,
                        )
                    ]
                )
                expectation_configuration_builders = (
                    rule.expectation_configuration_builders or []
                )
                self.profiler.add_rule(
                    rule=Rule(
                        name=rule.name,
                        variables=rule.variables,
                        domain_builder=domain_builder,
                        parameter_builders=parameter_builders,
                        expectation_configuration_builders=expectation_configuration_builders,
                    )
                )

        self.profiler.variables = self.profiler.reconcile_profiler_variables(
            variables=variables,
            reconciliation_strategy=DEFAULT_RECONCILATION_DIRECTIVES.variables,
        )
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_false(
    alice_columnar_table_single_batch_context, ):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    variables: ParameterContainer = build_parameter_container_for_variables({
        "max_user_id":
        999999999999,
        "answer":
        42
    })
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id.value[0]"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: str = "($variables.max_user_id<0 | $variables.answer!=42) | $parameter.my_min_user_id.value[0]<0"
    max_value: str = "$variables.max_user_id"

    default_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value,
        max_value=max_value,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration] = default_expectation_configuration_builder.build_expectation_configuration(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

    assert expectation_configuration is None
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Find the semantic column type for each column and return all domains matching the specified type or types.
        """
        semantic_types: List[
            SemanticDomainTypes] = _parse_semantic_domain_type_argument(
                semantic_types=self.semantic_types)

        batch_id: str = self.get_batch_id(variables=variables)
        column_types_dict_list: List[Dict[str, Any]] = self.get_validator(
            variables=variables).get_metric(metric=MetricConfiguration(
                metric_name="table.column_types",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs={
                    "include_nested": True,
                },
                metric_dependencies=None,
            ))

        table_column_names: List[str] = self.get_validator(
            variables=variables).get_metric(metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            ))

        column_name: str

        # A semantic type is distinguished from the structured column type;
        # An example structured column type would be "integer".  The inferred semantic type would be "id".
        table_column_name_to_inferred_semantic_domain_type_mapping: Dict[
            str, SemanticDomainTypes] = {
                column_name:
                self.infer_semantic_domain_type_from_table_column_type(
                    column_types_dict_list=column_types_dict_list,
                    column_name=column_name,
                ).semantic_domain_type
                for column_name in table_column_names
            }
        candidate_column_names: List[str] = list(
            filter(
                lambda candidate_column_name:
                table_column_name_to_inferred_semantic_domain_type_mapping[
                    candidate_column_name] in semantic_types,
                table_column_names,
            ))

        domains: List[Domain] = [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={
                    "column": column_name,
                },
                details={
                    "inferred_semantic_domain_type":
                    table_column_name_to_inferred_semantic_domain_type_mapping[
                        column_name],
                },
            ) for column_name in candidate_column_names
        ]

        return domains
def test_meta_not_dict_exception(alice_columnar_table_single_batch_context, ):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        ))

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition = None
    max_user_id: int = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=condition,
            min_value=parameter_value.value[0],
            max_value=max_user_id,
            meta="Strings are not acceptable",
        )

    assert (
        str(e.value) ==
        'Argument "Strings are not acceptable" in "DefaultExpectationConfigurationBuilder" must be of type "dictionary" (value of type "<class \'str\'>" was encountered).\n'
    )
Exemple #21
0
def test_column_values_nonnull_multi_batch_one_column_not_emitted(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_pandas",
        data_connector_name="monthly",
        data_asset_name="my_reports",
    )

    domain_builder: MapMetricColumnDomainBuilder = MapMetricColumnDomainBuilder(
        map_metric_name="column_values.nonnull",
        max_unexpected_values=0,
        max_unexpected_ratio=None,
        min_max_unexpected_values_proportion=9.75e-1,
        data_context=data_context,
    )
    domains: List[Domain] = domain_builder.get_domains(
        rule_name="my_rule", batch_request=batch_request)

    # Unit Tests for "inferred_semantic_domain_type" are provided separately.
    domain: Domain
    for domain in domains:
        domain.details = {}

    domains = sorted(domains, key=lambda x: x.domain_kwargs["column"])

    bobby_compliant_column_names: List[str] = [
        "VendorID",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "RatecodeID",
        "store_and_fwd_flag",
        "PULocationID",
        "DOLocationID",
        "payment_type",
        "fare_amount",
        "extra",
        "mta_tax",
        "tip_amount",
        "tolls_amount",
        "improvement_surcharge",
        "total_amount",
    ]

    column_name: str
    bobby_expected_column_domains: List[Domain] = [
        Domain(
            domain_type=MetricDomainTypes.COLUMN,
            domain_kwargs={
                "column": column_name,
            },
            rule_name="my_rule",
        ) for column_name in bobby_compliant_column_names
    ]
    bobby_expected_column_domains = sorted(
        bobby_expected_column_domains, key=lambda x: x.domain_kwargs["column"])

    assert len(domains) == 17
    assert domains == bobby_expected_column_domains