Example #1
def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?

    This test now adds a simple ParameterBuilder to our Rule. More specifically,
    we use a MetricMultiBatchParameterBuilder to supply the "value" kwarg for
    expect_column_values_to_be_greater_than.
    """
    context: DataContext = data_context_with_taxi_data
    batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )
    simple_rule: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    my_rbp.add_rule(rule=simple_rule)
    result: RuleBasedProfilerResult = my_rbp.run(batch_request=batch_request)
    expectation_configurations: List[ExpectationConfiguration] = (
        result.expectation_configurations
    )
    assert len(expectation_configurations) == 4
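As a quick, hypothetical follow-up to the test above (not part of the original example): every emitted ExpectationConfiguration should target one of the "_amount" columns selected by the ColumnDomainBuilder and carry the most recent Batch's column.min, since "$parameter.my_column_min.value[-1]" indexes the last element of the multi-Batch metric values.

    # Illustrative sketch only; continues the test body above.
    for expectation_configuration in expectation_configurations:
        assert (
            expectation_configuration.expectation_type
            == "expect_column_values_to_be_greater_than"
        )
        assert expectation_configuration.kwargs["column"].endswith("_amount")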
Example #2
def get_table_columns_metric_multi_batch_parameter_builder() -> ParameterBuilder:
    """
    This method instantiates one commonly used "MetricMultiBatchParameterBuilder" with specified directives.
    """
    metric_name: str = "table.columns"
    name: str = sanitize_parameter_name(name=metric_name)
    return MetricMultiBatchParameterBuilder(
        name=name,
        metric_name=metric_name,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=None,
    )
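A brief usage sketch (hypothetical, not from the original source): the factory's return value can be dropped into a Rule's parameter_builders list exactly like the builders in Example #1, and its name is the sanitized form of the metric name, which is what "$parameter.<name>" expressions later refer to.

# Hypothetical usage of the factory above.
table_columns_parameter_builder: ParameterBuilder = (
    get_table_columns_metric_multi_batch_parameter_builder()
)
assert table_columns_parameter_builder.name == sanitize_parameter_name(
    name="table.columns"
)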
Example #3
def build_numeric_metric_multi_batch_parameter_builder(
    metric_name: str,
    metric_domain_kwargs: Optional[
        Union[str, dict]
    ] = DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
    metric_value_kwargs: Optional[Union[str, dict]] = None,
) -> MetricMultiBatchParameterBuilder:
    """
    This method instantiates the "MetricMultiBatchParameterBuilder" class with specific arguments for a given purpose.
    """
    name: str = sanitize_parameter_name(name=metric_name)
    return MetricMultiBatchParameterBuilder(
        name=name,
        metric_name=metric_name,
        metric_domain_kwargs=metric_domain_kwargs,
        metric_value_kwargs=metric_value_kwargs,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=None,
    )
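A hypothetical caller following the same conventions: build a "column.max" parameter and reference its most recent value from a DefaultExpectationConfigurationBuilder, as in Example #1. The sanitized parameter name "column_max" is an assumption about what sanitize_parameter_name() returns for "column.max".

# Sketch only; "column_max" is the assumed sanitized name for metric "column.max".
column_max_parameter_builder: MetricMultiBatchParameterBuilder = (
    build_numeric_metric_multi_batch_parameter_builder(metric_name="column.max")
)
upper_bound_config_builder: DefaultExpectationConfigurationBuilder = (
    DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        max_value="$parameter.column_max.value[-1]",
        column="$domain.domain_kwargs.column",
    )
)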
Example #4
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?

    This test checks that context.save_profiler() can be invoked to update a Profiler that lives in the
    Profiler Store, and that any changes we make to the Profiler, such as adding a rule, are persisted.

    It also verifies that context.save_profiler() and context.get_profiler() return the expected RuleBasedProfiler.
    """
    context: DataContext = data_context_with_taxi_data
    domain_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=context,
    )
    # parameter_builder
    numeric_range_parameter_builder: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            data_context=context,
            metric_name="column.min",
            metric_domain_kwargs="$domain.domain_kwargs",
            name="my_column_min",
        )
    )
    config_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_greater_than",
            value="$parameter.my_column_min.value[-1]",
            column="$domain.domain_kwargs.column",
        )
    )
    simple_variables_rule: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=domain_builder,
        parameter_builders=[numeric_range_parameter_builder],
        expectation_configuration_builders=[config_builder],
    )
    my_rbp = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=context,
    )
    res: dict = my_rbp.config.to_json_dict()
    assert res == {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "rules": None,
        "variables": {},
    }
    my_rbp.add_rule(rule=simple_variables_rule)
    context.save_profiler(name="my_rbp", profiler=my_rbp)

    # load profiler from store
    my_loaded_profiler: RuleBasedProfiler = context.get_profiler(name="my_rbp")

    res = my_loaded_profiler.config.to_json_dict()
    assert res == {
        "module_name": "great_expectations.rule_based_profiler",
        "class_name": "RuleBasedProfiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
        "rules": {
            "rule_with_no_variables_no_parameters": {
                "domain_builder": {
                    "module_name":
                    "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
                    "class_name": "ColumnDomainBuilder",
                    "include_column_name_suffixes": [
                        "_amount",
                    ],
                },
                "variables": {},
                "parameter_builders": [
                    {
                        "module_name":
                        "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
                        "class_name": "MetricMultiBatchParameterBuilder",
                        "name": "my_column_min",
                        "metric_name": "column.min",
                        "metric_domain_kwargs": "$domain.domain_kwargs",
                        "enforce_numeric_metric": False,
                        "replace_nan_with_zero": False,
                        "reduce_scalar_metric": True,
                        "evaluation_parameter_builder_configs": None,
                    },
                ],
                "expectation_configuration_builders": [
                    {
                        "module_name":
                        "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "expectation_type":
                        "expect_column_values_to_be_greater_than",
                        "meta": {},
                        "column": "$domain.domain_kwargs.column",
                        "validation_parameter_builder_configs": None,
                        "value": "$parameter.my_column_min.value[-1]",
                    },
                ],
            },
        },
    }
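As a hypothetical continuation of the test above (reusing the BatchRequest from Example #1, since both tests use the data_context_with_taxi_data fixture): the loaded profiler can be run just like the in-memory one, and with the same domain and configuration builders it would be expected to yield the same four "_amount" expectation configurations.

    # Sketch only; continues the test body above.
    loaded_result: RuleBasedProfilerResult = my_loaded_profiler.run(
        batch_request=BatchRequest(
            datasource_name="taxi_multibatch_datasource_other_possibility",
            data_connector_name="default_inferred_data_connector_name",
            data_asset_name="yellow_tripdata_sample_2018",
            data_connector_query={"index": -1},
        )
    )
    assert len(loaded_result.expectation_configurations) == 4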
Example #5
def test_mean_unexpected_map_metric_multi_batch_parameter_builder_bobby_numeric_dependencies_evaluated_separately(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context
    )

    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    my_total_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_total_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )
    my_null_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="my_null_count",
        metric_name="column_values.nonnull.unexpected_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=False,
        replace_nan_with_zero=False,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        data_context=data_context,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder: ParameterBuilder = (
        MeanUnexpectedMapMetricMultiBatchParameterBuilder(
            name="my_passenger_count_values_not_null_mean_unexpected_map_metric",
            map_metric_name="column_values.nonnull",
            total_count_parameter_builder_name="my_total_count",
            null_count_parameter_builder_name="my_null_count",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
            data_context=data_context,
        )
    )

    metric_domain_kwargs: dict = {"column": "passenger_count"}
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )

    variables: Optional[ParameterContainer] = None

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    my_total_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )
    my_null_count_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    mean_unexpected_map_metric_multi_batch_parameter_builder.build_parameters(
        domain=domain,
        variables=variables,
        parameters=parameters,
        batch_request=batch_request,
    )

    expected_parameter_value: float = 0.0

    parameter_node: ParameterNode = get_parameter_value_and_validate_return_type(
        domain=domain,
        parameter_reference=mean_unexpected_map_metric_multi_batch_parameter_builder.json_serialized_fully_qualified_parameter_name,
        expected_return_type=None,
        variables=variables,
        parameters=parameters,
    )

    rtol: float = RTOL
    atol: float = 5.0e-1 * ATOL
    np.testing.assert_allclose(
        actual=parameter_node.value,
        desired=expected_parameter_value,
        rtol=rtol,
        atol=atol,
        err_msg=f"Actual value of {parameter_node.value} differs from expected value of {expected_parameter_value} by more than {atol + rtol * abs(parameter_node.value)} tolerance.",
    )
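A hypothetical alternative lookup of the same result, mirroring the get_parameter_value_by_fully_qualified_parameter_name() pattern used in the later examples; the ".value" suffix is assumed to address the computed scalar directly.

    # Sketch only; continues the test body above.
    mean_unexpected_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name="$parameter.my_passenger_count_values_not_null_mean_unexpected_map_metric.value",
        domain=domain,
        parameters=parameters,
    )
    np.testing.assert_allclose(
        actual=mean_unexpected_value,
        desired=parameter_node.value,
    )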
Example #6
def test_default_expectation_configuration_builder_alice_parentheses_parameter_variable_condition_true(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    variables: ParameterContainer = build_parameter_container_for_variables(
        {"max_user_id": 999999999999, "answer": 42}
    )
    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id.value[0]"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: str = "($variables.max_user_id>0 & $variables.answer==42) | $parameter.my_min_user_id.value[0]<0"
    max_value: str = "$variables.max_user_id"

    default_expectation_configuration_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value,
        max_value=max_value,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration
    ] = default_expectation_configuration_builder.build_expectation_configuration(
        domain=domain,
        variables=variables,
        parameters=parameters,
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
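A hypothetical counterpart, resting on an assumption about the builder's behavior that the original test does not assert: when the condition evaluates to False, build_expectation_configuration() is expected to return None rather than a configuration.

    # Sketch only; continues the test body above ("$variables.answer==0" is False here).
    suppressed_builder: DefaultExpectationConfigurationBuilder = (
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition="$variables.answer==0",
            min_value=parameter_value,
            max_value=max_value,
        )
    )
    assert (
        suppressed_builder.build_expectation_configuration(
            domain=domain,
            variables=variables,
            parameters=parameters,
        )
        is None
    )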
Example #7
def test_condition_not_string_exception(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: dict = {"condition": "$variables.tolerance<0.8"}
    max_user_id: int = 999999999999

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        # noinspection PyTypeChecker
        DefaultExpectationConfigurationBuilder(
            expectation_type="expect_column_values_to_be_between",
            condition=condition,
            min_value=parameter_value.value[0],
            max_value=max_user_id,
        )

    assert (
        str(e.value)
        == 'Argument "{\'condition\': \'$variables.tolerance<0.8\'}" in "DefaultExpectationConfigurationBuilder" must be of type "string" (value of type "<class \'dict\'>" was encountered).\n'
    )
Example #8
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_separate(
    alice_columnar_table_single_batch_context,
):
    data_context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    min_user_id_parameter: MetricMultiBatchParameterBuilder = (
        MetricMultiBatchParameterBuilder(
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
            data_context=data_context,
        )
    )

    parameter_container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    min_user_id_parameter.build_parameters(
        domain=domain,
        parameters=parameters,
        batch_request=batch_request,
    )

    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id"
    parameter_value: Any = get_parameter_value_by_fully_qualified_parameter_name(
        fully_qualified_parameter_name=fully_qualified_parameter_name_for_value,
        domain=domain,
        parameters=parameters,
    )

    condition: Optional[str] = None
    max_user_id: int = 999999999999

    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=parameter_value.value[0],
        max_value=max_user_id,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration
    ] = default_expectation_configuration_builder.build_expectation_configuration(
        domain=domain,
        parameters=parameters,
    )

    assert expectation_configuration.kwargs["min_value"] == 397433
Example #9
def metrics_parameter_builders_by_domain(
    self,
) -> Dict[Domain, List[ParameterBuilder]]:
    table_row_count_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="table_row_count",
        metric_name="table.row_count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
        data_context=None,
    )
    column_distinct_values_metric_multi_batch_parameter_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        name="column_distinct_values.count",
        metric_name="column.distinct_values.count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
        data_context=None,
    )
    return {
        Domain(domain_type=MetricDomainTypes.TABLE): [
            table_row_count_metric_multi_batch_parameter_builder,
        ],
        Domain(domain_type=MetricDomainTypes.COLUMN): [
            column_distinct_values_metric_multi_batch_parameter_builder,
        ],
    }