def test_profile_excludes_citations(
    alice_columnar_table_single_batch_context,
    alice_columnar_table_single_batch,
):
    # Load data context
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = alice_columnar_table_single_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=alice_columnar_table_single_batch[
            "expected_expectation_suite_name"
        ],
        include_citation=False,
    )

    assert expectation_suite.meta.get("citations") is None
Esempio n. 2
0
def test_resolve_config_using_acceptable_arguments(
    profiler_with_placeholder_args: RuleBasedProfiler, ) -> None:
    old_config: RuleBasedProfilerConfig = profiler_with_placeholder_args.config

    # Roundtrip through schema validation to add/or restore any missing fields.
    old_deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        old_config.to_json_dict())
    old_deserialized_config.pop("class_name")
    old_deserialized_config.pop("module_name")

    old_config = RuleBasedProfilerConfig(**old_deserialized_config)

    # Brand new config is created but existing attributes are unchanged
    new_config: RuleBasedProfilerConfig = (
        RuleBasedProfilerConfig.resolve_config_using_acceptable_arguments(
            profiler=profiler_with_placeholder_args, ))

    # Roundtrip through schema validation to add/or restore any missing fields.
    # new_deserialized_config: dict = ruleBasedProfilerConfigSchema.load(new_config.to_json_dict())
    new_deserialized_config: dict = new_config.to_json_dict()
    new_deserialized_config.pop("class_name")
    new_deserialized_config.pop("module_name")

    new_config = RuleBasedProfilerConfig(**new_deserialized_config)

    assert id(old_config) != id(new_config)
    assert all(old_config[attr] == new_config[attr]
               for attr in ("config_version", "name"))
Esempio n. 3
0
def test_bobster_profiler_user_workflow_multi_batch_row_count_range_rule_bootstrap_sampling_method(
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context,
    bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000,
):
    # Load data context
    data_context: DataContext = (
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000_data_context
    )

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "profiler_config"]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(
        deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=
        bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
            "test_configuration_bootstrap_sampling_method"]
        ["expectation_suite_name"], )
    expect_table_row_count_to_be_between_expectation_configuration_kwargs: dict = (
        expectation_suite.to_json_dict()["expectations"][0]["kwargs"])
    min_value: int = (
        expect_table_row_count_to_be_between_expectation_configuration_kwargs[
            "min_value"])
    max_value: int = (
        expect_table_row_count_to_be_between_expectation_configuration_kwargs[
            "max_value"])

    assert (bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "test_configuration_bootstrap_sampling_method"]
            ["expect_table_row_count_to_be_between_min_value_mean_value"] <
            min_value <
            bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
                "test_configuration_bootstrap_sampling_method"]
            ["expect_table_row_count_to_be_between_mean_value"])
    assert (bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
        "test_configuration_bootstrap_sampling_method"]
            ["expect_table_row_count_to_be_between_mean_value"] < max_value <
            bobster_columnar_table_multi_batch_normal_mean_5000_stdev_1000[
                "test_configuration_bootstrap_sampling_method"]
            ["expect_table_row_count_to_be_between_max_value_mean_value"])
Esempio n. 4
0
def test_bobby_profiler_user_workflow_multi_batch_row_count_range_rule_and_column_ranges_rule_oneshot_sampling_method(
    bobby_columnar_table_multi_batch_deterministic_data_context,
    bobby_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(
        deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=bobby_columnar_table_multi_batch[
            "test_configuration_oneshot_sampling_method"]
        ["expectation_suite_name"],
        include_citation=True,
    )

    assert (expectation_suite == bobby_columnar_table_multi_batch[
        "test_configuration_oneshot_sampling_method"]
            ["expected_expectation_suite"])
def test_notebook_execution_rule_based_profiler_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
    bobby_columnar_table_multi_batch,
):
    """
    To set this test up we:

    - create a suite using Rule-Based Profiler
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like
    CellExecutionError if any cell in the notebook fails
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = bobby_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: dict = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(
        deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=context,
    )

    profiler_name: str = "bobby_user_workflow"

    context.save_profiler(
        profiler=profiler,
        name=profiler_name,
    )

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=profiler_name,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.
        UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 13,
        "successful_expectations": 13,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    expected_expectation_configurations: List[ExpectationConfiguration] = [
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {},
                            "metric_dependencies": None,
                            "metric_name": "table.row_count",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "expectation_type": "expect_table_row_count_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Unnamed: 0"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Unnamed: 0"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Unnamed: 0",
                    "max_value": 1313,
                    "min_value": 1313,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Age"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 0.17,
                    "min_value": 0.17,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Age"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Age",
                    "max_value": 71.0,
                    "min_value": 71.0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Survived"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "Survived"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "Survived",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "SexCode"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.min",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 0,
                    "min_value": 0,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_min_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "domain_kwargs": {
                                "column": "SexCode"
                            },
                            "metric_dependencies": None,
                            "metric_name": "column.max",
                            "metric_value_kwargs": None,
                        },
                        "num_batches": 1,
                    }
                },
                "kwargs": {
                    "column": "SexCode",
                    "max_value": 1,
                    "min_value": 1,
                    "mostly": 1.0,
                },
                "expectation_type": "expect_column_max_to_be_between",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "PClass",
                    "value_set": [
                        "*",
                        "1st",
                        "2nd",
                        "3rd",
                    ],
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "Sex",
                    "value_set": ["female", "male"]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "Survived",
                    "value_set": [0, 1]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
        ExpectationConfiguration(
            **{
                "meta": {},
                "kwargs": {
                    "column": "SexCode",
                    "value_set": [0, 1]
                },
                "expectation_type": "expect_column_values_to_be_in_set",
            }),
    ]

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)

    expectation_configurations: List[ExpectationConfiguration] = []
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in suite.expectations:
        kwargs: dict = expectation_configuration.kwargs
        key: str
        value: Any
        kwargs = {
            key: sorted(value) if isinstance(value,
                                             (list, set, tuple)) else value
            for key, value in kwargs.items()
        }
        expectation_configuration.kwargs = kwargs
        expectation_configurations.append(expectation_configuration)

    assert expectation_configurations == expected_expectation_configurations
Esempio n. 6
0
def test_quentin_profiler_user_workflow_multi_batch_quantiles_value_ranges_rule(
    quentin_columnar_table_multi_batch_data_context,
    quentin_columnar_table_multi_batch,
):
    # Load data context
    data_context: DataContext = quentin_columnar_table_multi_batch_data_context

    # Load profiler configs & loop (run tests for each one)
    yaml_config: str = quentin_columnar_table_multi_batch["profiler_config"]

    # Instantiate Profiler
    profiler_config: CommentedMap = yaml.load(yaml_config)

    # Roundtrip through schema validation to remove any illegal fields add/or restore any missing fields.
    deserialized_config: dict = ruleBasedProfilerConfigSchema.load(
        profiler_config)
    serialized_config: dict = ruleBasedProfilerConfigSchema.dump(
        deserialized_config)

    # `class_name`/`module_name` are generally consumed through `instantiate_class_from_config`
    # so we need to manually remove those values if we wish to use the **kwargs instantiation pattern
    serialized_config.pop("class_name")
    serialized_config.pop("module_name")

    profiler: RuleBasedProfiler = RuleBasedProfiler(
        **serialized_config,
        data_context=data_context,
    )

    expectation_suite: ExpectationSuite = profiler.run(
        expectation_suite_name=quentin_columnar_table_multi_batch[
            "test_configuration"]["expectation_suite_name"], )

    expectation_configuration_dict: dict
    column_name: str
    expectation_kwargs: dict
    expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict: Dict[
        str, dict] = {
            expectation_configuration_dict["kwargs"]["column"]:
            expectation_configuration_dict["kwargs"]
            for expectation_configuration_dict in
            expectation_suite.to_json_dict()["expectations"]
        }
    expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column: Dict[
        str, List[List[Number]]] = {
            column_name: expectation_kwargs["value_ranges"]
            for column_name, expectation_kwargs in
            expect_column_quantile_values_to_be_between_expectation_configurations_kwargs_dict
            .items()
        }

    assert (
        expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column[
            "tolls_amount"] ==
        quentin_columnar_table_multi_batch["test_configuration"]
        ["expect_column_quantile_values_to_be_between_quantile_ranges_by_column"]
        ["tolls_amount"])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-7
    atol: float = 5.0e-2

    value_range: List[Number]
    paired_quantiles: zip
    column_quantiles: List[List[Number]]
    idx: int
    for (
            column_name,
            column_quantiles,
    ) in (expect_column_quantile_values_to_be_between_expectation_configurations_value_ranges_by_column
          .items()):
        paired_quantiles = zip(
            column_quantiles,
            quentin_columnar_table_multi_batch["test_configuration"]
            ["expect_column_quantile_values_to_be_between_quantile_ranges_by_column"]
            [column_name],
        )
        for value_range in list(paired_quantiles):
            for idx in range(2):
                np.testing.assert_allclose(
                    actual=value_range[0][idx],
                    desired=value_range[1][idx],
                    rtol=rtol,
                    atol=atol,
                    err_msg=
                    f"Actual value of {value_range[0][idx]} differs from expected value of {value_range[1][idx]} by more than {atol + rtol * abs(value_range[1][idx])} tolerance.",
                )