Example #1
0
def rule_without_parameters(empty_data_context):
    """Return a Rule made of a column domain builder and one expectation builder.

    The Rule is created without ``variables`` or ``parameter_builders``
    arguments.

    Args:
        empty_data_context: Data context handed to ``ColumnDomainBuilder``.

    Returns:
        The constructed ``Rule`` named "rule_with_no_variables_no_parameters".
    """
    # Called first, before any builder is constructed.
    skip_if_python_below_minimum_version()

    column_domain_builder = ColumnDomainBuilder(
        data_context=empty_data_context)
    validation_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_my_validation")

    return Rule(
        name="rule_with_no_variables_no_parameters",
        domain_builder=column_domain_builder,
        expectation_configuration_builders=[validation_builder],
    )
Example #2
0
def test_profiler_parameter_builder_added(data_context_with_taxi_data):
    """
    What does this test and why?

    A Rule is extended with a single ParameterBuilder: a
    MetricMultiBatchParameterBuilder computing "column.min" for each domain. Its
    result is referenced by the ExpectationConfigurationBuilder as the ``value``
    argument of expect_column_values_to_be_greater_than.

    The profiler is run against the "_amount" columns and the test asserts that
    4 ExpectationConfigurations come back.
    """
    taxi_context: DataContext = data_context_with_taxi_data

    taxi_batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )

    # Domain: every column whose name ends in "_amount".
    amount_columns_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=taxi_context,
    )

    # Parameter: the column minimum, stored under the name "my_column_min".
    column_min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        data_context=taxi_context,
        metric_name="column.min",
        metric_domain_kwargs="$domain.domain_kwargs",
        name="my_column_min",
    )

    # Expectation: refers back to the parameter and the domain column by name.
    greater_than_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_greater_than",
        value="$parameter.my_column_min.value[-1]",
        column="$domain.domain_kwargs.column",
    )

    rule_under_test: Rule = Rule(
        name="rule_with_variables_and_parameters",
        variables=None,
        domain_builder=amount_columns_builder,
        parameter_builders=[column_min_builder],
        expectation_configuration_builders=[greater_than_builder],
    )

    profiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=taxi_context,
    )
    profiler.add_rule(rule=rule_under_test)

    profiler_result: RuleBasedProfilerResult = profiler.run(
        batch_request=taxi_batch_request)
    produced_configurations: List[ExpectationConfiguration] = (
        profiler_result.expectation_configurations)

    assert len(produced_configurations) == 4
Example #3
0
def test_add_rule_and_run_profiler(data_context_with_taxi_data):
    """
    What does this test and why?

    A Rule is built in memory, attached to a RuleBasedProfiler through
    add_rule(), and the profiler is then run. The Rule combines a
    ColumnDomainBuilder restricted to "_amount" columns with an
    ExpectationConfigurationBuilder for expect_column_values_to_not_be_null,
    which is given only the domain column.

    The test asserts that the profiler returns 4 ExpectationConfigurations.
    """
    taxi_context: DataContext = data_context_with_taxi_data

    taxi_batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )

    amount_columns_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=taxi_context,
    )

    not_null_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_not_be_null",
        column="$domain.domain_kwargs.column",
    )

    rule_under_test: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=amount_columns_builder,
        expectation_configuration_builders=[not_null_builder],
    )

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        name="my_simple_rbp",
        config_version=1.0,
        data_context=taxi_context,
    )
    profiler.add_rule(rule=rule_under_test)

    profiler_result: RuleBasedProfilerResult = profiler.run(
        batch_request=taxi_batch_request)
    produced_configurations: List[ExpectationConfiguration] = (
        profiler_result.expectation_configurations)

    assert len(produced_configurations) == 4
Example #4
0
def rule_without_variables(
    empty_data_context,
    column_Age_domain,
    column_Date_domain,
    variables_multi_part_name_parameter_container,
    single_part_name_parameter_container,
    multi_part_name_parameter_container,
):
    """Return a Rule created with ``variables=None``.

    The Rule pairs a ``ColumnDomainBuilder`` with a single
    ``DefaultExpectationConfigurationBuilder`` whose ``column`` argument is the
    domain-kwargs parameter name followed by the separator character and
    "column".

    Args:
        empty_data_context: Data context handed to ``ColumnDomainBuilder``.
        column_Age_domain: Not referenced in the body.
        column_Date_domain: Not referenced in the body.
        variables_multi_part_name_parameter_container: Not referenced in the
            body.
        single_part_name_parameter_container: Not referenced in the body.
        multi_part_name_parameter_container: Not referenced in the body.

    Returns:
        The constructed ``Rule`` named "rule_without_variables".
    """
    # NOTE(review): the unused arguments are presumably pytest fixtures that
    # are requested for their setup effects -- confirm before removing any.
    column_domain_builder = ColumnDomainBuilder(
        data_context=empty_data_context)

    column_reference = f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column"
    validation_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_my_validation",
        column=column_reference,
    )

    return Rule(
        name="rule_without_variables",
        variables=None,
        domain_builder=column_domain_builder,
        expectation_configuration_builders=[validation_builder],
    )
Example #5
0
def rule_with_parameters(
    empty_data_context,
    column_Age_domain,
    column_Date_domain,
    variables_multi_part_name_parameter_container,
    single_part_name_parameter_container,
    multi_part_name_parameter_container,
):
    """Return a Rule whose ``_parameters`` attribute is pre-populated.

    After construction, ``_parameters`` is overwritten with a mapping from
    domain id to parameter container: the Age domain maps to the single-part
    container and the Date domain to the multi-part container.

    Args:
        empty_data_context: Data context handed to ``ColumnDomainBuilder``.
        column_Age_domain: Domain whose ``id`` keys the single-part container.
        column_Date_domain: Domain whose ``id`` keys the multi-part container.
        variables_multi_part_name_parameter_container: Not referenced in the
            body.
        single_part_name_parameter_container: Container stored for the Age
            domain.
        multi_part_name_parameter_container: Container stored for the Date
            domain.

    Returns:
        The constructed ``Rule`` named "rule_with_parameters".
    """
    # Called first, before any builder is constructed.
    skip_if_python_below_minimum_version()

    column_domain_builder = ColumnDomainBuilder(
        data_context=empty_data_context)
    validation_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_my_validation")

    built_rule: Rule = Rule(
        name="rule_with_parameters",
        domain_builder=column_domain_builder,
        expectation_configuration_builders=[validation_builder],
    )

    # The private attribute is assigned directly rather than through any
    # public API of Rule.
    containers_by_domain_id = {
        column_Age_domain.id: single_part_name_parameter_container,
        column_Date_domain.id: multi_part_name_parameter_container,
    }
    built_rule._parameters = containers_by_domain_id

    return built_rule
Example #6
0
def test_domain_builder(data_context_with_taxi_data):
    """
    What does this test and why?

    DomainBuilder is one of the first components exercised when building a
    RuleBasedProfiler: it returns the domains (here, columns of the data) that
    the profiler will operate on. This test runs ColumnDomainBuilder with the
    column-name suffix "_amount" and asserts that exactly 4 column domains are
    returned, each with a numeric inferred semantic type.
    """
    taxi_context: DataContext = data_context_with_taxi_data

    taxi_batch_request: BatchRequest = BatchRequest(
        datasource_name="taxi_multibatch_datasource_other_possibility",
        data_connector_name="default_inferred_data_connector_name",
        data_asset_name="yellow_tripdata_sample_2018",
        data_connector_query={"index": -1},
    )

    amount_columns_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=taxi_context,
    )
    domains: list = amount_columns_builder.get_domains(
        rule_name="my_rule",
        batch_request=taxi_batch_request,
    )

    # The expected domains differ only in the column name, so they are
    # generated from the ordered list of expected columns.
    expected_columns = [
        "fare_amount",
        "tip_amount",
        "tolls_amount",
        "total_amount",
    ]
    expected_domains = [
        {
            "rule_name": "my_rule",
            "domain_type": MetricDomainTypes.COLUMN.value,
            "domain_kwargs": {
                "column": column_name,
            },
            "details": {
                INFERRED_SEMANTIC_TYPE_KEY: {
                    column_name: SemanticDomainTypes.NUMERIC.value,
                },
            },
        }
        for column_name in expected_columns
    ]

    assert len(domains) == 4
    assert domains == expected_domains
Example #7
0
def test_profiler_save_and_load(data_context_with_taxi_data):
    """
    What does this test and why?

    Verifies the round trip of a RuleBasedProfiler through the context: a rule
    is added to the profiler, the profiler is stored with
    context.save_profiler(), and it is read back with context.get_profiler().

    The profiler config is checked twice via to_json_dict(): once before the
    rule is added (no rules) and once on the loaded profiler (rule present with
    its domain, parameter and expectation-configuration builders).
    """
    taxi_context: DataContext = data_context_with_taxi_data

    amount_columns_builder: DomainBuilder = ColumnDomainBuilder(
        include_column_name_suffixes=["_amount"],
        data_context=taxi_context,
    )

    column_min_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
        data_context=taxi_context,
        metric_name="column.min",
        metric_domain_kwargs="$domain.domain_kwargs",
        name="my_column_min",
    )

    greater_than_builder: DefaultExpectationConfigurationBuilder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_greater_than",
        value="$parameter.my_column_min.value[-1]",
        column="$domain.domain_kwargs.column",
    )

    rule_to_persist: Rule = Rule(
        name="rule_with_no_variables_no_parameters",
        variables=None,
        domain_builder=amount_columns_builder,
        parameter_builders=[column_min_builder],
        expectation_configuration_builders=[greater_than_builder],
    )

    profiler = RuleBasedProfiler(
        name="my_rbp",
        config_version=1.0,
        data_context=taxi_context,
    )

    # Top-level keys that are the same before and after the rule is added.
    expected_common_fields = {
        "class_name": "RuleBasedProfiler",
        "module_name": "great_expectations.rule_based_profiler",
        "name": "my_rbp",
        "config_version": 1.0,
        "variables": {},
    }

    # Before add_rule(): the serialized config carries no rules.
    config_before_rule: dict = profiler.config.to_json_dict()
    assert config_before_rule == {**expected_common_fields, "rules": None}

    profiler.add_rule(rule=rule_to_persist)
    taxi_context.save_profiler(name="my_rbp", profiler=profiler)

    # load profiler from store
    reloaded_profiler: RuleBasedProfiler = taxi_context.get_profiler(
        name="my_rbp")

    expected_domain_builder_config = {
        "module_name":
        "great_expectations.rule_based_profiler.domain_builder.column_domain_builder",
        "class_name": "ColumnDomainBuilder",
        "include_column_name_suffixes": [
            "_amount",
        ],
    }
    expected_parameter_builder_config = {
        "module_name":
        "great_expectations.rule_based_profiler.parameter_builder.metric_multi_batch_parameter_builder",
        "class_name": "MetricMultiBatchParameterBuilder",
        "name": "my_column_min",
        "metric_name": "column.min",
        "metric_domain_kwargs": "$domain.domain_kwargs",
        "enforce_numeric_metric": False,
        "replace_nan_with_zero": False,
        "reduce_scalar_metric": True,
        "evaluation_parameter_builder_configs": None,
    }
    expected_expectation_builder_config = {
        "module_name":
        "great_expectations.rule_based_profiler.expectation_configuration_builder.default_expectation_configuration_builder",
        "class_name": "DefaultExpectationConfigurationBuilder",
        "expectation_type": "expect_column_values_to_be_greater_than",
        "meta": {},
        "column": "$domain.domain_kwargs.column",
        "validation_parameter_builder_configs": None,
        "value": "$parameter.my_column_min.value[-1]",
    }
    expected_rules = {
        "rule_with_no_variables_no_parameters": {
            "domain_builder": expected_domain_builder_config,
            "variables": {},
            "parameter_builders": [expected_parameter_builder_config],
            "expectation_configuration_builders": [
                expected_expectation_builder_config,
            ],
        },
    }

    # After the round trip: same top-level fields, plus the persisted rule.
    config_after_reload: dict = reloaded_profiler.config.to_json_dict()
    assert config_after_reload == {
        **expected_common_fields,
        "rules": expected_rules,
    }
def test_builder_executed_with_runtime_batch_request_does_not_raise_error(
    data_context_with_datasource_pandas_engine,
    alice_columnar_table_single_batch,
):
    """Run ColumnDomainBuilder on a batch request carrying an in-memory DataFrame.

    The batch request is a plain dict with "runtime_parameters" holding a
    one-column pandas DataFrame of date-like strings and "batch_identifiers".
    The test asserts that get_domains() returns exactly one column domain, for
    column "a", with a text inferred semantic type.
    """
    data_context: DataContext = data_context_with_datasource_pandas_engine

    profiler_config: str = alice_columnar_table_single_batch["profiler_config"]

    # NOTE(review): `yaml` is defined outside this block; if it is PyYAML,
    # calling load() without an explicit Loader is unsafe -- confirm.
    parsed_profiler_config: dict = yaml.load(profiler_config)

    # A missing "variables" section is treated as an empty mapping.
    raw_variables = parsed_profiler_config.get("variables")
    variables_configs: dict = {} if raw_variables is None else raw_variables

    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs=variables_configs)

    date_strings = [
        "2021-01-01",
        "2021-01-31",
        "2021-02-28",
        "2021-03-20",
        "2021-02-21",
        "2021-05-01",
        "2021-06-18",
    ]
    runtime_frame: pd.DataFrame = pd.DataFrame({"a": date_strings})

    runtime_batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "batch_data": runtime_frame,
        },
        "batch_identifiers": {
            "default_identifier_name": "my_identifier",
        },
    }

    column_domain_builder: DomainBuilder = ColumnDomainBuilder(
        data_context=data_context)
    domains: List[Domain] = column_domain_builder.get_domains(
        rule_name="my_rule",
        variables=variables,
        batch_request=runtime_batch_request,
    )

    expected_domain = {
        "rule_name": "my_rule",
        "domain_type": MetricDomainTypes.COLUMN.value,
        "domain_kwargs": {
            "column": "a",
        },
        "details": {
            INFERRED_SEMANTIC_TYPE_KEY: {
                "a": SemanticDomainTypes.TEXT.value,
            },
        },
    }

    assert len(domains) == 1
    assert domains == [expected_domain]
def test_column_domain_builder_with_simple_semantic_type_included(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    """Run ColumnDomainBuilder with ``include_semantic_types=["numeric"]``.

    The test asserts that exactly two column domains are returned, for
    "event_type" and "user_id" in that order, each with a numeric inferred
    semantic type.
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    profiler_config: str = alice_columnar_table_single_batch["profiler_config"]

    # NOTE(review): `yaml` is defined outside this block; if it is PyYAML,
    # calling load() without an explicit Loader is unsafe -- confirm.
    parsed_profiler_config: dict = yaml.load(profiler_config)

    # A missing "variables" section is treated as an empty mapping.
    raw_variables = parsed_profiler_config.get("variables")
    variables_configs: dict = {} if raw_variables is None else raw_variables

    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs=variables_configs)

    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    numeric_columns_builder: DomainBuilder = ColumnDomainBuilder(
        include_semantic_types=[
            "numeric",
        ],
        data_context=data_context,
    )
    domains: List[Domain] = numeric_columns_builder.get_domains(
        rule_name="my_rule",
        variables=variables,
        batch_request=alice_batch_request,
    )

    # Both expected domains share one shape; only the column name varies.
    expected_domains = [
        {
            "rule_name": "my_rule",
            "domain_type": "column",
            "domain_kwargs": {
                "column": column_name,
            },
            "details": {
                INFERRED_SEMANTIC_TYPE_KEY: {
                    column_name: SemanticDomainTypes.NUMERIC.value,
                },
            },
        }
        for column_name in ["event_type", "user_id"]
    ]

    assert len(domains) == 2
    # Assert Domain object equivalence.
    assert domains == expected_domains
def test_column_domain_builder(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    """Run ColumnDomainBuilder with no column filters.

    The test asserts that seven column domains are returned in a fixed order,
    each tagged with rule name "my_rule" and with the expected inferred
    semantic type (numeric for "event_type" and "user_id", text otherwise).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    profiler_config: str = alice_columnar_table_single_batch["profiler_config"]

    # NOTE(review): `yaml` is defined outside this block; if it is PyYAML,
    # calling load() without an explicit Loader is unsafe -- confirm.
    parsed_profiler_config: dict = yaml.load(profiler_config)

    # A missing "variables" section is treated as an empty mapping.
    raw_variables = parsed_profiler_config.get("variables")
    variables_configs: dict = {} if raw_variables is None else raw_variables

    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs=variables_configs)

    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_domain_builder: DomainBuilder = ColumnDomainBuilder(
        data_context=data_context)
    domains: List[Domain] = column_domain_builder.get_domains(
        rule_name="my_rule",
        variables=variables,
        batch_request=alice_batch_request,
    )

    # (column name, expected inferred semantic type), in the expected order.
    text_type = SemanticDomainTypes.TEXT.value
    numeric_type = SemanticDomainTypes.NUMERIC.value
    expected_column_types = [
        ("id", text_type),
        ("event_type", numeric_type),
        ("user_id", numeric_type),
        ("event_ts", text_type),
        ("server_ts", text_type),
        ("device_ts", text_type),
        ("user_agent", text_type),
    ]
    expected_domains = [
        {
            "rule_name": "my_rule",
            "domain_type": MetricDomainTypes.COLUMN.value,
            "domain_kwargs": {
                "column": column_name,
            },
            "details": {
                INFERRED_SEMANTIC_TYPE_KEY: {
                    column_name: semantic_type,
                },
            },
        }
        for column_name, semantic_type in expected_column_types
    ]

    assert len(domains) == 7
    assert domains == expected_domains
Example #11
0
def test_column_domain_builder(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
    column_Age_domain,
    column_Date_domain,
    column_Description_domain,
):
    """Run ColumnDomainBuilder with the batch request given at construction.

    Here the batch request is passed to the ColumnDomainBuilder constructor and
    get_domains() receives only ``variables``. The test asserts that seven
    column domains are returned in a fixed order, each with empty "details".

    The ``column_*_domain`` arguments are not referenced in the body.
    """
    # NOTE(review): the unused arguments are presumably pytest fixtures that
    # are requested for their setup effects -- confirm before removing any.
    data_context: DataContext = alice_columnar_table_single_batch_context

    profiler_config: str = alice_columnar_table_single_batch["profiler_config"]

    # NOTE(review): `yaml` is defined outside this block; if it is PyYAML,
    # calling load() without an explicit Loader is unsafe -- confirm.
    parsed_profiler_config: dict = yaml.load(profiler_config)
    variables_configs: dict = parsed_profiler_config.get("variables")

    # A parameter container is built only for a non-empty "variables" section;
    # otherwise None is passed through.
    variables: Optional[ParameterContainer] = (
        build_parameter_container_for_variables(
            variables_configs=variables_configs)
        if variables_configs else None)

    alice_batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    column_domain_builder: DomainBuilder = ColumnDomainBuilder(
        data_context=data_context,
        batch_request=alice_batch_request,
    )
    domains: List[Domain] = column_domain_builder.get_domains(
        variables=variables)

    # Every expected domain has the same shape; only the column name varies.
    expected_columns = [
        "id",
        "event_type",
        "user_id",
        "event_ts",
        "server_ts",
        "device_ts",
        "user_agent",
    ]
    expected_domains = [
        {
            "domain_type": "column",
            "domain_kwargs": {
                "column": column_name,
            },
            "details": {},
        }
        for column_name in expected_columns
    ]

    assert len(domains) == 7
    assert domains == expected_domains