def test_ValidationResultsTableContentBlockRenderer_get_observed_value(evr_success):
    evr_no_result_key = ExpectationValidationResult(
        success=True,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_table_row_count_to_be_between",
            kwargs={"min_value": 0, "max_value": None, "result_format": "SUMMARY"},
        ),
    )

    evr_expect_column_values_to_not_be_null = ExpectationValidationResult(
        success=True,
        result={
            "element_count": 1313,
            "unexpected_count": 1050,
            "unexpected_percent": 79.96953541508,
            "partial_unexpected_list": [],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_be_null",
            kwargs={"column": "Unnamed: 0", "mostly": 0.5, "result_format": "SUMMARY"},
        ),
    )

    evr_expect_column_values_to_be_null = ExpectationValidationResult(
        success=True,
        result={
            "element_count": 1313,
            "unexpected_count": 0,
            "unexpected_percent": 0.0,
            "partial_unexpected_list": [],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_null",
            kwargs={"column": "Unnamed: 0", "mostly": 0.5, "result_format": "SUMMARY"},
        ),
    )

    # test _get_observed_value when evr.result["observed_value"] exists
    output_1 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_success
    )
    assert output_1 == "1,313"
    # test _get_observed_value when evr.result does not exist
    output_2 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_no_result_key
    )
    assert output_2 == "--"
    # test _get_observed_value for expect_column_values_to_not_be_null expectation type
    output_3 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_expect_column_values_to_not_be_null
    )
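    # Rendered as the complement of unexpected_percent: 100% - 79.9695...% ≈ 20.03% not null.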
    assert output_3 == "≈20.03% not null"
    # test _get_observed_value for expect_column_values_to_be_null expectation type
    output_4 = ValidationResultsTableContentBlockRenderer._get_observed_value(
        evr_expect_column_values_to_be_null
    )
    assert output_4 == "100% null"
def test_ge_pandas_automatic_failure_removal():
    df = ge.dataset.PandasDataset(
        {
            "A": [1, 2, 3, 4],
            "B": [5, 6, 7, 8],
            "C": ["a", "b", "c", "d"],
            "D": ["e", "f", "g", "h"],
        }
    )

    # Put some simple expectations on the data frame
    df.profile(ge.profile.ColumnsExistProfiler)
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ["w", "x", "y", "z"])
    df.expect_column_values_to_be_in_set("D", ["e", "f", "g", "h"])

    # First check that failing expectations are NOT automatically
    # dropped when sampling.
    # For this data frame, the expectation on column "C" above fails.
    exp1 = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "C", "value_set": ["w", "x", "y", "z"]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    samp1 = df.sample(n=2)
    assert samp1.find_expectations() == exp1

    # Now check subsetting to verify that failing expectations are NOT
    # automatically dropped when subsetting.
    sub1 = df[["A", "D"]]
    assert sub1.find_expectations() == exp1

    # Set property/attribute so that failing expectations are
    # automatically removed when sampling or subsetting.
    df.discard_subset_failing_expectations = True

    ###
    # Note: Order matters in this test, and a validation operator may change the order
    ###

    exp_samp = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "B"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "C"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "B", "value_set": [5, 6, 7, 8]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]

    samp2 = df.sample(n=2)
    assert samp2.find_expectations() == exp_samp

    # Now check subsetting. In addition to the failure on column "C",
    # the expectations on column "B" now fail since column "B" doesn't
    # exist in the subset.
    sub2 = df[["A", "D"]]
    exp_sub = [
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "A"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "A", "value_set": [1, 2, 3, 4]},
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "D"}
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={"column": "D", "value_set": ["e", "f", "g", "h"]},
        ),
    ]
    assert sub2.find_expectations() == exp_sub
    def _profile(cls, dataset, configuration=None):
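        # Rough flow: handle the "demo" configuration shortcut, resolve the
        # included/excluded expectations and columns from the configuration,
        # build table-level expectations, create column expectations based on
        # cardinality and type, then prune anything excluded or not included.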
        logger.debug(f"Running profiler with configuration: {configuration}")
        if configuration == "demo":
            return cls._demo_profile(dataset)

        existing_columns = dataset.get_table_columns()
        selected_columns = existing_columns
        included_expectations = []
        excluded_expectations = []

        if configuration:
            if (
                "included_expectations" in configuration
                and "excluded_expectations" in configuration
            ):
                raise ProfilerError(
                    "Please specify either `included_expectations` or `excluded_expectations`, but not both."
                )
            if "included_expectations" in configuration:
                included_expectations = configuration["included_expectations"]
                if included_expectations in [False, None, []]:
                    included_expectations = None
                _check_that_expectations_are_available(dataset, included_expectations)
            if "excluded_expectations" in configuration:
                excluded_expectations = configuration["excluded_expectations"]
                if excluded_expectations in [False, None, []]:
                    excluded_expectations = None
                _check_that_expectations_are_available(dataset, excluded_expectations)

            if (
                "included_columns" in configuration
                and "excluded_columns" in configuration
            ):
                raise ProfilerError(
                    "Please specify either `excluded_columns` or `included_columns`, but not both."
                )
            elif "included_columns" in configuration:
                selected_columns = configuration["included_columns"]
                if selected_columns in [False, None, []]:
                    selected_columns = []
            elif "excluded_columns" in configuration:
                excluded_columns = configuration["excluded_columns"]
                if excluded_columns in [False, None, []]:
                    excluded_columns = []
                selected_columns = set(existing_columns) - set(excluded_columns)

        _check_that_columns_exist(dataset, selected_columns)
        if included_expectations is None:
            suite = cls._build_column_description_metadata(dataset)
            # remove column exist expectations
            suite.expectations = []
            return suite

        dataset.set_default_expectation_argument("catch_exceptions", False)
        dataset = cls._build_table_row_count_expectation(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )
        dataset.set_config_value("interactive_evaluation", True)
        dataset = cls._build_table_column_expectations(
            dataset,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        column_cache = {}
        if selected_columns:
            for column in selected_columns:
                cardinality = cls._get_column_cardinality_with_caching(
                    dataset, column, column_cache
                )
                column_type = cls._get_column_type_with_caching(
                    dataset, column, column_cache
                )

                if cardinality in [
                    ProfilerCardinality.TWO,
                    ProfilerCardinality.VERY_FEW,
                    ProfilerCardinality.FEW,
                ]:
                    cls._create_expectations_for_low_card_column(
                        dataset, column, column_cache
                    )
                elif cardinality in [
                    ProfilerCardinality.MANY,
                    ProfilerCardinality.VERY_MANY,
                    ProfilerCardinality.UNIQUE,
                ]:
                    # TODO we will want to finesse the number and types of
                    #  expectations created here. The simple version is deny/allow list
                    #  and the more complex version is desired per column type and
                    #  cardinality. This deserves more thought on configuration.
                    dataset.expect_column_values_to_be_unique(column)

                    if column_type in [ProfilerDataType.INT, ProfilerDataType.FLOAT]:
                        cls._create_expectations_for_numeric_column(dataset, column)
                    elif column_type in [ProfilerDataType.DATETIME]:
                        cls._create_expectations_for_datetime_column(
                            dataset,
                            column,
                            excluded_expectations=excluded_expectations,
                            included_expectations=included_expectations,
                        )
                    elif column_type in [ProfilerDataType.STRING]:
                        cls._create_expectations_for_string_column(
                            dataset,
                            column,
                            excluded_expectations=excluded_expectations,
                            included_expectations=included_expectations,
                        )
                    elif column_type in [ProfilerDataType.UNKNOWN]:
                        logger.debug(
                            f"Skipping expectation creation for column {column} of unknown type: {column_type}"
                        )

        if excluded_expectations:
            # NOTE: we reach into a private member here because of an expected future
            # refactor that will make the suite directly accessible
            dataset._expectation_suite.remove_all_expectations_of_type(
                excluded_expectations
            )
        if included_expectations:
            for expectation in dataset.get_expectation_suite(
                discard_failed_expectations=False, suppress_logging=True,
            ).expectations:
                if expectation.expectation_type not in included_expectations:
                    try:
                        dataset.remove_expectation(
                            ExpectationConfiguration(
                                expectation_type=expectation.expectation_type,
                                kwargs=expectation.kwargs,
                            ),
                            match_type="domain",
                            remove_multiple_matches=True,
                        )
                    except ValueError:
                        logger.debug(
                            f"Attempted to remove {expectation}, which was not found."
                        )

        expectation_suite = cls._build_column_description_metadata(dataset)

        return expectation_suite
Example #4
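# Fragment of a DataContext configuration: an action entry registers the OpenLineage
# validation action (presumably inside a validation operator's action_list), and
# anonymous usage statistics are disabled. The opening of the config is not included
# in this snippet.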
                "action": {
                    "class_name": "OpenLineageValidationAction",
                    "module_name": "openlineage.common.provider.great_expectations.action"
                }
            }]
        }
    },
    anonymous_usage_statistics={'enabled': False}
)

TABLE_NAME = "test_data"

# Common validation results
table_result = ExpectationValidationResult(success=True,
                                           expectation_config=ExpectationConfiguration(
                                               expectation_type='expect_table_row_count_to_equal',
                                               kwargs={'value': 10}),
                                           result={"observed_value": 10})
column_result = ExpectationValidationResult(success=True,
                                            expectation_config=ExpectationConfiguration(
                                                expectation_type='expect_column_sum_to_be_between',
                                                kwargs={'column': 'size', 'min_value': 0,
                                                        'max_value': 100}
                                            ),
                                            result={'observed_value': 60})
result_suite = ExpectationSuiteValidationResult(success=True, meta={'batch_kwargs': {}},
                                                results=[table_result, column_result])


@pytest.fixture(scope='session')
def test_db_file():
    ...  # fixture body not included in this snippet

def test_find_expectations():
    my_df = ge.dataset.PandasDataset(
        {
            "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "y": [1, 2, None, 4, None, 6, 7, 8, 9, None],
            "z": [
                "cello",
                "hello",
                "jello",
                "bellow",
                "fellow",
                "mellow",
                "wellow",
                "xello",
                "yellow",
                "zello",
            ],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )
    my_df.expect_column_values_to_be_of_type("x", "int")
    my_df.expect_column_values_to_be_of_type("y", "int")
    my_df.expect_column_values_to_be_of_type("z", "int")
    my_df.expect_column_values_to_be_increasing("x")
    my_df.expect_column_values_to_match_regex("z", "ello")

    assert my_df.find_expectations("expect_column_to_exist", "w") == []

    assert my_df.find_expectations(
        "expect_column_to_exist", "x", expectation_kwargs={}) == [
            ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                     kwargs={"column": "x"})
        ]

    assert my_df.find_expectations(
        "expect_column_to_exist", expectation_kwargs={"column": "y"}) == [
            ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                     kwargs={"column": "y"})
        ]

    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "x"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "y"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "z"}),
    ]

    assert my_df.find_expectations("expect_column_to_exist") == exp1

    with pytest.raises(ValueError) as exc:
        my_df.find_expectations("expect_column_to_exist", "x", {"column": "y"})

    assert "Conflicting column names in find_expectation_indexes:" in str(
        exc.value)

    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "x"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_of_type",
            kwargs={
                "column": "x",
                "type_": "int"
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_increasing",
            kwargs={"column": "x"},
        ),
    ]

    assert my_df.find_expectations(column="x") == exp1
def test_discard_failing_expectations():
    df = ge.dataset.PandasDataset(
        {
            "A": [1, 2, 3, 4],
            "B": [5, 6, 7, 8],
            "C": ["a", "b", "c", "d"],
            "D": ["e", "f", "g", "h"],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )

    # Put some simple expectations on the data frame
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ["a", "b", "c", "d"])
    df.expect_column_values_to_be_in_set("D", ["e", "f", "g", "h"])

    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "A"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "B"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "C"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "D"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "A",
                "value_set": [1, 2, 3, 4]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "B",
                "value_set": [5, 6, 7, 8]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "C",
                "value_set": ["a", "b", "c", "d"]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "D",
                "value_set": ["e", "f", "g", "h"]
            },
        ),
    ]

    sub1 = df[:3]

    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[1:2]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[:-1]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[-1:]
    sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[["A", "D"]]
    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "A"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "D"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "A",
                "value_set": [1, 2, 3, 4]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "D",
                "value_set": ["e", "f", "g", "h"]
            },
        ),
    ]
    with pytest.warns(UserWarning,
                      match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df[["A"]]
    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "A"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "A",
                "value_set": [1, 2, 3, 4]
            },
        ),
    ]
    with pytest.warns(UserWarning,
                      match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df.iloc[:3, 1:4]
    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "B"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "C"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "D"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "B",
                "value_set": [5, 6, 7, 8]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "C",
                "value_set": ["a", "b", "c", "d"]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "D",
                "value_set": ["e", "f", "g", "h"]
            },
        ),
    ]
    with pytest.warns(UserWarning,
                      match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1

    sub1 = df.loc[0:, "A":"B"]
    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "A"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "B"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "A",
                "value_set": [1, 2, 3, 4]
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "B",
                "value_set": [5, 6, 7, 8]
            },
        ),
    ]
    with pytest.warns(UserWarning,
                      match=r"Removed \d expectations that were 'False'"):
        sub1.discard_failing_expectations()
    assert sub1.find_expectations() == exp1
def table_exp1():
    return ExpectationConfiguration(
        expectation_type="expect_table_columns_to_match_ordered_list",
        kwargs={"value": ["a", "b", "c"]},
    )
Example #8
def test_validate():

    with open(
            file_relative_path(__file__,
                               "./test_sets/titanic_expectations.json")) as f:
        my_expectation_suite = expectationSuiteSchema.loads(f.read())

    with mock.patch("uuid.uuid1") as uuid:
        uuid.return_value = "1234"
        my_df = ge.read_csv(
            file_relative_path(__file__, "./test_sets/Titanic.csv"),
            expectation_suite=my_expectation_suite,
        )
    my_df.set_default_expectation_argument("result_format", "COMPLETE")

    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        results = my_df.validate(catch_exceptions=False)

    with open(
            file_relative_path(
                __file__,
                "./test_sets/titanic_expected_data_asset_validate_results.json"
            )) as f:
        expected_results = expectationSuiteValidationResultSchema.loads(
            f.read())

    del results.meta["great_expectations.__version__"]

    assert expected_results == results

    # Now, change the results and ensure they are no longer equal
    results.results[0] = ExpectationValidationResult()
    assert expected_results != results

    # Finally, confirm that only_return_failures works
    # and does not affect the "statistics" field.
    with mock.patch("datetime.datetime") as mock_datetime:
        mock_datetime.utcnow.return_value = datetime(1955, 11, 5)
        validation_results = my_df.validate(only_return_failures=True)
        del validation_results.meta["great_expectations.__version__"]

    expected_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "titanic",
            "run_id": "19551105T000000.000000Z",
            "batch_kwargs": {
                "ge_batch_id": "1234"
            },
            "batch_markers": {},
            "batch_parameters": {},
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_in_set",
                    kwargs={
                        "column": "PClass",
                        "value_set": ["1st", "2nd", "3rd"]
                    },
                ),
                success=False,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False,
                },
                result={
                    "partial_unexpected_index_list": [456],
                    "unexpected_count": 1,
                    "unexpected_list": ["*"],
                    "unexpected_percent": 0.07616146230007616,
                    "element_count": 1313,
                    "missing_percent": 0.0,
                    "partial_unexpected_counts": [{
                        "count": 1,
                        "value": "*"
                    }],
                    "partial_unexpected_list": ["*"],
                    "unexpected_percent_nonmissing": 0.07616146230007616,
                    "missing_count": 0,
                    "unexpected_index_list": [456],
                },
            )
        ],
        success=expected_results.success,  # unaffected
        statistics=expected_results["statistics"],  # unaffected
    )
    assert expected_results == validation_results
Example #9
def test_find_expectations(baseline_suite, exp1, exp2):
    # Note: most of the logic in this method is based on
    # find_expectation_indexes and _copy_and_clean_up_expectations_from_indexes
    # These tests do not thoroughly cover that logic.
    # Instead, they focus on the behavior of the discard_* methods

    assert (baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_be_between",
    ) == [])

    result = baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_be_in_set",
    )
    assert len(result) == 1
    assert result[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={
            "column": "a",
            "value_set": [1, 2, 3],
            # "result_format": "BASIC"
        },
        meta={"notes": "This is an expectation."},
    )

    exp_with_all_the_params = ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "a",
            "result_format": "BASIC",
            "include_config": True,
            "catch_exceptions": True,
        },
        meta={},
    )
    baseline_suite.append_expectation(exp_with_all_the_params)

    assert baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_not_be_null",
    )[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "a",
        },
        meta={},
    )

    assert (baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_not_be_null",
        discard_result_format_kwargs=False,
        discard_include_config_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )[0] == exp_with_all_the_params)

    assert baseline_suite.find_expectations(
        column="a",
        expectation_type="expect_column_values_to_not_be_null",
        discard_result_format_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )[0] == ExpectationConfiguration(
        expectation_type="expect_column_values_to_not_be_null",
        kwargs={
            "column": "a",
            "result_format": "BASIC",
            "catch_exceptions": True,
        },
        meta={},
    )
def test_evaluation_parameter_store_methods(data_context):
    run_id = "20191125T000000.000000Z"
    source_patient_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_patient_data.default",
            "run_id": run_id
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_equal",
                    kwargs={
                        "value": 1024,
                    }),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False
                },
                result={
                    "observed_value": 1024,
                    "element_count": 1024,
                    "missing_percent": 0.0,
                    "missing_count": 0
                })
        ],
        success=True)

    data_context.store_evaluation_parameters(source_patient_data_results)

    bound_parameters = data_context.evaluation_parameter_store.get_bind_params(
        run_id)
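    # Bound parameter keys follow the pattern
    # urn:great_expectations:validations:<suite_name>:<expectation_type>.result.observed_value,
    # with ":column=<column>" appended for column-scoped expectations.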
    assert bound_parameters == {
        'urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result'
        '.observed_value':
        1024
    }
    source_diabetes_data_results = ExpectationSuiteValidationResult(
        meta={
            "expectation_suite_name": "source_diabetes_data.default",
            "run_id": run_id
        },
        results=[
            ExpectationValidationResult(
                expectation_config=ExpectationConfiguration(
                    expectation_type=
                    "expect_column_unique_value_count_to_be_between",
                    kwargs={
                        "column": "patient_nbr",
                        "min": 2048,
                        "max": 2048
                    }),
                success=True,
                exception_info={
                    "exception_message": None,
                    "exception_traceback": None,
                    "raised_exception": False
                },
                result={
                    "observed_value": 2048,
                    "element_count": 5000,
                    "missing_percent": 0.0,
                    "missing_count": 0
                })
        ],
        success=True)

    data_context.store_evaluation_parameters(source_diabetes_data_results)
    bound_parameters = data_context.evaluation_parameter_store.get_bind_params(
        run_id)
    assert bound_parameters == {
        'urn:great_expectations:validations:source_patient_data.default:expect_table_row_count_to_equal.result'
        '.observed_value':
        1024,
        'urn:great_expectations:validations:source_diabetes_data.default'
        ':expect_column_unique_value_count_to_be_between.result.observed_value:column=patient_nbr':
        2048
    }
Example #11
def test_notebook_execution_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:

    - create a suite using profiling
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (note that this will raise errors such as
    CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern":
                            "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name":
                            "Asset",
                            "base_directory":
                            f"{root_dir}/../data/titanic",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name":
                            "Asset",
                            "module_name":
                            "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.+)\\.csv",
                        "group_names": ["name"]
                    },
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "batch_identifiers":
                    ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name":
                    "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(
        expectation_suite_name="warning") == {}

    # Create notebook
    # We do not want to actually send the usage message, since this function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.
        UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir,
                                           "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced=
        "context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning"))
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }

    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name)
    suite["meta"].pop("citations", None)
    assert suite.expectations == [
        ExpectationConfiguration(
            **{
                "expectation_type":
                "expect_table_columns_to_match_ordered_list",
                "kwargs": {
                    "column_list": [
                        "Unnamed: 0",
                        "Name",
                        "PClass",
                        "Age",
                        "Sex",
                        "Survived",
                        "SexCode",
                    ]
                },
                "meta": {},
            }),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_row_count_to_be_between",
                "kwargs": {
                    "max_value": 1313,
                    "min_value": 1313
                },
                "meta": {},
            }),
    ]

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
def test_ValidationResultsTableContentBlockRenderer_get_unexpected_table(evr_success):
    evr_failed_no_result = ExpectationValidationResult(
        success=False,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )

    evr_failed_no_unexpected_list_or_counts = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )

    evr_failed_partial_unexpected_list = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
            "partial_unexpected_list": [
                1,
                2,
                3,
                4,
                5,
                6,
                7,
                8,
                9,
                10,
                11,
                12,
                13,
                14,
                15,
                16,
                17,
                18,
                19,
                20,
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )

    evr_failed_partial_unexpected_counts = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_count": 1313,
            "unexpected_percent": 100.0,
            "unexpected_percent_nonmissing": 100.0,
            "partial_unexpected_list": [
                1,
                2,
                3,
                4,
                5,
                6,
                7,
                8,
                9,
                10,
                11,
                12,
                13,
                14,
                15,
                16,
                17,
                18,
                19,
                20,
            ],
            "partial_unexpected_index_list": [
                0,
                1,
                2,
                3,
                4,
                5,
                6,
                7,
                8,
                9,
                10,
                11,
                12,
                13,
                14,
                15,
                16,
                17,
                18,
                19,
            ],
            "partial_unexpected_counts": [
                {"value": 1, "count": 1},
                {"value": 2, "count": 1},
                {"value": 3, "count": 1},
                {"value": 4, "count": 1},
                {"value": 5, "count": 1},
                {"value": 6, "count": 1},
                {"value": 7, "count": 1},
                {"value": 8, "count": 1},
                {"value": 9, "count": 1},
                {"value": 10, "count": 1},
                {"value": 11, "count": 1},
                {"value": 12, "count": 1},
                {"value": 13, "count": 1},
                {"value": 14, "count": 1},
                {"value": 15, "count": 1},
                {"value": 16, "count": 1},
                {"value": 17, "count": 1},
                {"value": 18, "count": 1},
                {"value": 19, "count": 1},
                {"value": 20, "count": 1},
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_set",
            kwargs={
                "column": "Unnamed: 0",
                "value_set": [],
                "result_format": "SUMMARY",
            },
        ),
    )

    # test for succeeded evr
    output_1 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_success
    )
    assert output_1 is None

    # test for failed evr with no "result" key
    output_2 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_no_result
    )
    assert output_2 is None

    # test for failed evr with no unexpected list or unexpected counts
    output_3 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_no_unexpected_list_or_counts
    )
    assert output_3 is None

    # test for failed evr with partial unexpected list
    output_4 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_partial_unexpected_list
    )
    assert output_4.to_json_dict() == {
        "content_block_type": "table",
        "table": [
            [1],
            [2],
            [3],
            [4],
            [5],
            [6],
            [7],
            [8],
            [9],
            [10],
            [11],
            [12],
            [13],
            [14],
            [15],
            [16],
            [17],
            [18],
            [19],
            [20],
        ],
        "header_row": ["Sampled Unexpected Values"],
        "styling": {"body": {"classes": ["table-bordered", "table-sm", "mt-3"]}},
    }

    # test for failed evr with partial unexpected counts
    output_5 = ValidationResultsTableContentBlockRenderer._get_unexpected_table(
        evr_failed_partial_unexpected_counts
    )
    assert output_5.to_json_dict() == {
        "content_block_type": "table",
        "table": [
            [1],
            [2],
            [3],
            [4],
            [5],
            [6],
            [7],
            [8],
            [9],
            [10],
            [11],
            [12],
            [13],
            [14],
            [15],
            [16],
            [17],
            [18],
            [19],
            [20],
        ],
        "header_row": ["Sampled Unexpected Values"],
        "styling": {"body": {"classes": ["table-bordered", "table-sm", "mt-3"]}},
    }
def test_ValidationResultsTableContentBlockRenderer_get_unexpected_statement(
    evr_success, evr_failed
):
    evr_no_result = ExpectationValidationResult(
        success=True,
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_table_row_count_to_be_between",
            kwargs={"min_value": 0, "max_value": None, "result_format": "SUMMARY"},
        ),
    )
    evr_failed_no_unexpected_count = ExpectationValidationResult(
        success=False,
        result={
            "element_count": 1313,
            "missing_count": 0,
            "missing_percent": 0.0,
            "unexpected_percent": 0.2284843869002285,
            "unexpected_percent_nonmissing": 0.2284843869002285,
            "partial_unexpected_list": [
                "Daly, Mr Peter Denis ",
                "Barber, Ms ",
                "Geiger, Miss Emily ",
            ],
            "partial_unexpected_index_list": [77, 289, 303],
            "partial_unexpected_counts": [
                {"value": "Barber, Ms ", "count": 1},
                {"value": "Daly, Mr Peter Denis ", "count": 1},
                {"value": "Geiger, Miss Emily ", "count": 1},
            ],
        },
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_match_regex",
            kwargs={
                "column": "Name",
                "regex": "^\\s+|\\s+$",
                "result_format": "SUMMARY",
            },
        ),
    )

    # test for succeeded evr
    output_1 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_success
    )
    assert output_1 == []

    # test for failed evr
    output_2 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed
    )
    assert output_2 == [
        RenderedStringTemplateContent(
            **{
                "content_block_type": "string_template",
                "string_template": {
                    "template": "\n\n$unexpected_count unexpected values found. $unexpected_percent of $element_count total rows.",
                    "params": {
                        "unexpected_count": "3",
                        "unexpected_percent": "≈0.2285%",
                        "element_count": "1,313",
                    },
                    "tag": "strong",
                    "styling": {"classes": ["text-danger"]},
                },
            }
        )
    ]

    # test for evr with no "result" key
    output_3 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_no_result
    )
    print(json.dumps(output_3, indent=2))
    assert output_3 == []

    # test for evr with no unexpected count
    output_4 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed_no_unexpected_count
    )
    print(output_4)
    assert output_4 == []

    # test for evr with exception
    evr_failed_exception = ExpectationValidationResult(
        success=False,
        exception_info={
            "raised_exception": True,
            "exception_message": "Unrecognized column: not_a_real_column",
            "exception_traceback": "Traceback (most recent call last):\n...more_traceback...",
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_not_match_regex",
            kwargs={
                "column": "Name",
                "regex": "^\\s+|\\s+$",
                "result_format": "SUMMARY",
            },
        ),
    )

    output_5 = ValidationResultsTableContentBlockRenderer._get_unexpected_statement(
        evr_failed_exception
    )
    output_5 = [content.to_json_dict() for content in output_5]
    expected_output_5 = [
        {
            "content_block_type": "string_template",
            "string_template": {
                "template": "\n\n$expectation_type raised an exception:\n$exception_message",
                "params": {
                    "expectation_type": "expect_column_values_to_not_match_regex",
                    "exception_message": "Unrecognized column: not_a_real_column",
                },
                "tag": "strong",
                "styling": {
                    "classes": ["text-danger"],
                    "params": {
                        "exception_message": {"tag": "code"},
                        "expectation_type": {
                            "classes": ["badge", "badge-danger", "mb-2"]
                        },
                    },
                },
            },
        },
        {
            "content_block_type": "collapse",
            "collapse_toggle_link": "Show exception traceback...",
            "collapse": [
                {
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": "Traceback (most recent call last):\n...more_traceback...",
                        "tag": "code",
                    },
                }
            ],
            "inline_link": False,
        },
    ]
    assert output_5 == expected_output_5
Example #14
def test_expectation_summary_in_ExpectationSuitePageRenderer_render_expectation_suite_notes(
):
    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(expectation_suite_name="test",
                         meta={},
                         expectations=None))
    # print(RenderedContent.rendered_content_list_to_json(result.text))
    assert RenderedContent.rendered_content_list_to_json(result.text) == [
        "This Expectation suite currently contains 0 total Expectations across 0 columns."
    ]

    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(
            expectation_suite_name="test",
            meta={"notes": {
                "format": "markdown",
                "content": ["hi"]
            }},
        ))
    # print(RenderedContent.rendered_content_list_to_json(result.text))

    try:
        mistune.markdown("*test*")
        assert RenderedContent.rendered_content_list_to_json(result.text) == [
            "This Expectation suite currently contains 0 total Expectations across 0 columns.",
            {
                "content_block_type": "markdown",
                "styling": {
                    "parent": {}
                },
                "markdown": "hi",
            },
        ]
    except OSError:
        assert RenderedContent.rendered_content_list_to_json(result.text) == [
            "This Expectation suite currently contains 0 total Expectations across 0 columns.",
            "hi",
        ]

    result = ExpectationSuitePageRenderer._render_expectation_suite_notes(
        ExpectationSuite(
            expectation_suite_name="test",
            meta={},
            expectations=[
                ExpectationConfiguration(
                    expectation_type="expect_table_row_count_to_be_between",
                    kwargs={
                        "min_value": 0,
                        "max_value": None
                    },
                ),
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist",
                    kwargs={"column": "x"}),
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist",
                    kwargs={"column": "y"}),
            ],
        ))
    # print(RenderedContent.rendered_content_list_to_json(result.text)[0])
    assert (
        RenderedContent.rendered_content_list_to_json(result.text)[0] ==
        "This Expectation suite currently contains 3 total Expectations across 2 columns."
    )
def table_exp3():
    return ExpectationConfiguration(
        expectation_type="expect_table_row_count_to_equal",
        kwargs={"value": 1})
def bobby_columnar_table_multi_batch():
    """
    # TODO: <Alex>ALEX -- Add DocString</Alex>
    """

    verbose_profiler_config_file_path: str = file_relative_path(
        __file__, "bobby_user_workflow_verbose_profiler_config.yml"
    )
    verbose_profiler_config: str
    with open(verbose_profiler_config_file_path) as f:
        verbose_profiler_config = f.read()

    my_row_count_range_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "kwargs": {"min_value": 7505, "max_value": 8495},
                "expectation_type": "expect_table_row_count_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "table.row_count",
                            "domain_kwargs": {},
                        },
                        "num_batches": 2,
                    },
                },
            },
        ),
    ]

    my_column_ranges_rule_expectation_configurations_oneshot_sampling_method: List[
        ExpectationConfiguration
    ] = [
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "VendorID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "VendorID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "VendorID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "VendorID",
                    "min_value": 4,
                    "max_value": 4,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "passenger_count"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "passenger_count",
                    "min_value": 0,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "passenger_count"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "passenger_count",
                    "min_value": 6,
                    "max_value": 6,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "trip_distance"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "trip_distance",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "trip_distance"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "trip_distance",
                    "min_value": 37.62,
                    "max_value": 57.85,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "RatecodeID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "RatecodeID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "RatecodeID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "RatecodeID",
                    "min_value": 5,
                    "max_value": 6,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "PULocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "PULocationID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "PULocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "PULocationID",
                    "min_value": 265,
                    "max_value": 265,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "DOLocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "DOLocationID",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "DOLocationID"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "DOLocationID",
                    "min_value": 265,
                    "max_value": 265,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "payment_type"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "payment_type",
                    "min_value": 1,
                    "max_value": 1,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "payment_type"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "payment_type",
                    "min_value": 4,
                    "max_value": 4,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "fare_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "fare_amount",
                    "min_value": -51.84,
                    "max_value": -21.16,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "fare_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "fare_amount",
                    "min_value": 228.94,
                    "max_value": 2990.05,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "extra"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "extra",
                    "min_value": -36.53,
                    "max_value": -1.18,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "extra"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "extra",
                    "min_value": 4.51,
                    "max_value": 6.99,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "mta_tax"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "mta_tax",
                    "min_value": -0.5,
                    "max_value": -0.5,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "mta_tax"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "mta_tax",
                    "min_value": 0.69,
                    "max_value": 37.32,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "tip_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tip_amount",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "tip_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tip_amount",
                    "min_value": 46.84,
                    "max_value": 74.86,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "tolls_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tolls_amount",
                    "min_value": 0.0,
                    "max_value": 0.0,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "tolls_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "tolls_amount",
                    "min_value": 26.4,
                    "max_value": 497.67,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "improvement_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "improvement_surcharge",
                    "min_value": -0.3,
                    "max_value": -0.3,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "improvement_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "improvement_surcharge",
                    "min_value": 0.3,
                    "max_value": 0.3,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "total_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "total_amount",
                    "min_value": -52.66,
                    "max_value": -24.44,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "total_amount"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "total_amount",
                    "min_value": 550.18,
                    "max_value": 2992.47,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_min_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.min",
                            "domain_kwargs": {"column": "congestion_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "congestion_surcharge",
                    "min_value": -2.49,
                    "max_value": -0.01,
                    "mostly": 1.0,
                },
            },
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_column_max_to_be_between",
                "meta": {
                    "profiler_details": {
                        "metric_configuration": {
                            "metric_name": "column.max",
                            "domain_kwargs": {"column": "congestion_surcharge"},
                        },
                        "num_batches": 2,
                    }
                },
                "kwargs": {
                    "column": "congestion_surcharge",
                    "min_value": 0.01,
                    "max_value": 2.49,
                    "mostly": 1.0,
                },
            },
        ),
    ]

    expectation_configurations: List[ExpectationConfiguration] = []

    expectation_configurations.extend(
        my_row_count_range_rule_expectation_configurations_oneshot_sampling_method
    )
    expectation_configurations.extend(
        my_column_ranges_rule_expectation_configurations_oneshot_sampling_method
    )

    expectation_suite_name_oneshot_sampling_method: str = (
        "bobby_columnar_table_multi_batch_oneshot_sampling_method"
    )
    expected_expectation_suite_oneshot_sampling_method: ExpectationSuite = (
        ExpectationSuite(
            expectation_suite_name=expectation_suite_name_oneshot_sampling_method
        )
    )
    expectation_configuration: ExpectationConfiguration
    for expectation_configuration in expectation_configurations:
        expected_expectation_suite_oneshot_sampling_method.add_expectation(
            expectation_configuration
        )

    yaml = YAML()
    profiler_config: dict = yaml.load(verbose_profiler_config)
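    # Attach the profiler configuration to the expected suite as a citation.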
    expected_expectation_suite_oneshot_sampling_method.add_citation(
        comment="Suite created by Rule-Based Profiler with the configuration included.",
        profiler_config=profiler_config,
    )

    return {
        "profiler_config": verbose_profiler_config,
        "test_configuration_oneshot_sampling_method": {
            "expectation_suite_name": expectation_suite_name_oneshot_sampling_method,
            "expected_expectation_suite": expected_expectation_suite_oneshot_sampling_method,
        },
    }
def test_ValidationResultsTableContentBlockRenderer_generate_expectation_row_happy_path(
):
    evr = ExpectationValidationResult(
        success=True,
        result={
            'observed_value': True,
            'element_count': 162,
            'missing_count': 153,
            'missing_percent': 94.44444444444444
        },
        exception_info={
            'raised_exception': False,
            'exception_message': None,
            'exception_traceback': None
        },
        expectation_config=ExpectationConfiguration(
            expectation_type='expect_column_min_to_be_between',
            kwargs={
                'column': 'live',
                'min_value': None,
                'max_value': None,
                'result_format': 'SUMMARY'
            },
            meta={'BasicDatasetProfiler': {
                'confidence': 'very low'
            }}))
    result = ValidationResultsTableContentBlockRenderer.render(
        [evr]).to_json_dict()
    print(result)

    # Note: A better approach to testing would separate out styling into a separate test.
    assert result == {
        'content_block_type': 'table',
        'styling': {
            'body': {
                'classes': ['table']
            },
            'classes': [
                'ml-2', 'mr-2', 'mt-0', 'mb-0', 'table-responsive',
                'hide-succeeded-validations-column-section-target-child'
            ]
        },
        'table': [[{
            'content_block_type': 'string_template',
            'styling': {
                'parent': {
                    'classes': ['hide-succeeded-validation-target-child']
                }
            },
            'string_template': {
                'template': '$icon',
                'params': {
                    'icon': ''
                },
                'styling': {
                    'params': {
                        'icon': {
                            'classes':
                            ['fas', 'fa-check-circle', 'text-success'],
                            'tag': 'i'
                        }
                    }
                }
            }
        }, {
            'content_block_type': 'string_template',
            'string_template': {
                'template': '$column minimum value may have any numerical value.',
                'params': {
                    "column": "live",
                    "min_value": None,
                    "max_value": None,
                    "result_format": "SUMMARY",
                    "parse_strings_as_datetimes": None
                },
                'styling': {
                    'default': {
                        'classes': ['badge', 'badge-secondary']
                    },
                    'params': {
                        'column': {
                            'classes': ['badge', 'badge-primary']
                        }
                    }
                }
            }
        }, 'True']],
        'header_row': ['Status', 'Expectation', 'Observed Value']
    }
def test_get_and_save_expectation_suite(tmp_path_factory):
    directory_name = str(
        tmp_path_factory.mktemp("test_get_and_save_expectation_config"))
    df = ge.dataset.PandasDataset({
        "x": [1, 2, 4],
        "y": [1, 2, 5],
        "z": ["hello", "jello", "mello"],
    })

    df.expect_column_values_to_be_in_set("x", [1, 2, 4])
    df.expect_column_values_to_be_in_set("y", [1, 2, 4],
                                         catch_exceptions=True,
                                         include_config=True)
    df.expect_column_values_to_match_regex("z", "ello")

    ### First test set ###

    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={
                    "column": "x",
                    "value_set": [1, 2, 4]
                },
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={
                    "column": "z",
                    "regex": "ello"
                },
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )

    assert output_config == df.get_expectation_suite()

    df.save_expectation_suite(directory_name + "/temp1.json")
    with open(directory_name + "/temp1.json") as infile:
        loaded_config = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_config

    ### Second test set ###

    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={
                    "column": "x",
                    "value_set": [1, 2, 4]
                },
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={
                    "column": "y",
                    "value_set": [1, 2, 4]
                },
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={
                    "column": "z",
                    "regex": "ello"
                },
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )

    assert output_config == df.get_expectation_suite(
        discard_failed_expectations=False)
    df.save_expectation_suite(directory_name + "/temp2.json",
                              discard_failed_expectations=False)
    with open(directory_name + "/temp2.json") as infile:
        loaded_suite = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_suite

    ### Third test set ###

    output_config = ExpectationSuite(
        expectations=[
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_set",
                kwargs={
                    "column": "x",
                    "value_set": [1, 2, 4],
                    "result_format": "BASIC",
                },
            ),
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_match_regex",
                kwargs={
                    "column": "z",
                    "regex": "ello",
                    "result_format": "BASIC"
                },
            ),
        ],
        expectation_suite_name="default",
        data_asset_type="Dataset",
        meta={"great_expectations.__version__": ge.__version__},
    )
    assert output_config == df.get_expectation_suite(
        discard_result_format_kwargs=False,
        discard_include_config_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )

    df.save_expectation_suite(
        directory_name + "/temp3.json",
        discard_result_format_kwargs=False,
        discard_include_config_kwargs=False,
        discard_catch_exceptions_kwargs=False,
    )
    with open(directory_name + "/temp3.json") as infile:
        loaded_suite = expectationSuiteSchema.loads(infile.read())
    assert output_config == loaded_suite
def test_ProfilingResultsColumnSectionRenderer_render_header_with_unescaped_dollar_sign(
        titanic_profiled_name_column_evrs):
    evr_with_unescaped_dollar_sign = ExpectationValidationResult(
        success=True,
        result={"observed_value": "float64"},
        exception_info={
            "raised_exception": False,
            "exception_message": None,
            "exception_traceback": None,
        },
        expectation_config=ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_in_type_list",
            kwargs={
                "column":
                "Car Insurance Premiums ($)",
                "type_list": [
                    "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4",
                    "FLOAT8", "FloatType", "NUMERIC", "float"
                ],
                "result_format":
                "SUMMARY"
            },
            meta={"BasicDatasetProfiler": {
                "confidence": "very low"
            }}))

    content_block = ProfilingResultsColumnSectionRenderer._render_header(
        [evr_with_unescaped_dollar_sign],
        column_type=[],
    ).to_json_dict()
    print(content_block)
    assert content_block == {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12', 'p-0'],
            'header': {
                'classes': ['alert', 'alert-secondary']
            }
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Car Insurance Premiums ($$)',
                'tooltip': {
                    'content': 'expect_column_to_exist',
                    'placement': 'top'
                },
                'tag': 'h5',
                'styling': {
                    'classes': ['m-0', 'p-0']
                }
            }
        },
        'subheader': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Type: []',
                'tooltip': {
                    'content':
                    'expect_column_values_to_be_of_type <br>expect_column_values_to_be_in_type_list'
                },
                'tag': 'h6',
                'styling': {
                    'classes': ['mt-1', 'mb-0']
                }
            }
        }
    }
def test_remove_expectation():
    my_df = ge.dataset.PandasDataset(
        {
            "x": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
            "y": [1, 2, None, 4, None, 6, 7, 8, 9, None],
            "z": [
                "cello",
                "hello",
                "jello",
                "bellow",
                "fellow",
                "mellow",
                "wellow",
                "xello",
                "yellow",
                "zello",
            ],
        },
        profiler=ge.profile.ColumnsExistProfiler,
    )
    my_df.expect_column_values_to_be_of_type("x", "int")
    my_df.expect_column_values_to_be_of_type("y", "int")
    my_df.expect_column_values_to_be_of_type("z",
                                             "int",
                                             include_config=True,
                                             catch_exceptions=True)
    my_df.expect_column_values_to_be_increasing("x")
    my_df.expect_column_values_to_match_regex("z", "ello")

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation("expect_column_to_exist", "w", dry_run=True)

    assert "No matching expectation found." in str(exc.value)

    assert my_df.remove_expectation(
        "expect_column_to_exist", "x", expectation_kwargs={},
        dry_run=True) == ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "x"})

    assert my_df.remove_expectation(
        "expect_column_to_exist",
        expectation_kwargs={"column": "y"},
        dry_run=True) == ExpectationConfiguration(
            expectation_type="expect_column_to_exist", kwargs={"column": "y"})

    assert my_df.remove_expectation(
        "expect_column_to_exist",
        expectation_kwargs={"column": "y"},
        remove_multiple_matches=True,
        dry_run=True,
    ) == [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "y"})
    ]

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation("expect_column_to_exist", dry_run=True)

    assert "Multiple expectations matched arguments. No expectations removed." in str(
        exc.value)

    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "x"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "y"}),
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "z"}),
    ]

    assert (my_df.remove_expectation("expect_column_to_exist",
                                     remove_multiple_matches=True,
                                     dry_run=True) == exp1)

    with pytest.raises(ValueError) as exc:
        my_df.remove_expectation("expect_column_to_exist",
                                 "x", {"column": "y"},
                                 dry_run=True)

    assert "Conflicting column names in find_expectation_indexes" in str(
        exc.value)

    exp1 = [
        ExpectationConfiguration(expectation_type="expect_column_to_exist",
                                 kwargs={"column": "x"}),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_of_type",
            kwargs={
                "column": "x",
                "type_": "int"
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_column_values_to_be_increasing",
            kwargs={"column": "x"},
        ),
    ]

    assert (my_df.remove_expectation(column="x",
                                     remove_multiple_matches=True,
                                     dry_run=True) == exp1)

    assert len(my_df._expectation_suite.expectations) == 8

    assert my_df.remove_expectation("expect_column_to_exist", "x") is None
    assert len(my_df._expectation_suite.expectations) == 7
    assert my_df.remove_expectation(column="x",
                                    remove_multiple_matches=True) is None
    assert len(my_df._expectation_suite.expectations) == 5

    my_df.remove_expectation(column="z", remove_multiple_matches=True)
    assert len(my_df._expectation_suite.expectations) == 2

    assert my_df.get_expectation_suite(
        discard_failed_expectations=False) == ExpectationSuite(
            expectations=[
                ExpectationConfiguration(
                    expectation_type="expect_column_to_exist",
                    kwargs={"column": "y"}),
                ExpectationConfiguration(
                    expectation_type="expect_column_values_to_be_of_type",
                    kwargs={
                        "column": "y",
                        "type_": "int"
                    },
                ),
            ],
            expectation_suite_name="default",
            data_asset_type="Dataset",
            meta={"great_expectations.__version__": ge.__version__},
        )
def test_ExpectationSuiteColumnSectionRenderer_render_header(
        titanic_profiled_name_column_expectations):
    remaining_expectations, content_blocks = ExpectationSuiteColumnSectionRenderer._render_header(
        titanic_profiled_name_column_expectations, )

    expected = {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12'],
            'header': {
                'classes': ['alert', 'alert-secondary']
            }
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Name',
                'tag': 'h5',
                'styling': {
                    'classes': ['m-0']
                }
            }
        }
    }

    print(content_blocks.to_json_dict())

    assert content_blocks.to_json_dict() == expected

    expectation_with_unescaped_dollar_sign = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column":
            "Car Insurance Premiums ($)",
            "type_list": [
                "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4", "FLOAT8",
                "FloatType", "NUMERIC", "float"
            ],
            "result_format":
            "SUMMARY"
        },
        meta={"BasicDatasetProfiler": {
            "confidence": "very low"
        }})
    remaining_expectations, content_blocks = ExpectationSuiteColumnSectionRenderer._render_header(
        [expectation_with_unescaped_dollar_sign], )

    print(content_blocks.to_json_dict())
    expected = {
        'content_block_type': 'header',
        'styling': {
            'classes': ['col-12'],
            'header': {
                'classes': ['alert', 'alert-secondary']
            }
        },
        'header': {
            'content_block_type': 'string_template',
            'string_template': {
                'template': 'Car Insurance Premiums ($$)',
                'tag': 'h5',
                'styling': {
                    'classes': ['m-0']
                }
            }
        }
    }
    assert content_blocks.to_json_dict() == expected
    def _create_expectations_for_datetime_column(
        cls, dataset, column, excluded_expectations=None, included_expectations=None
    ):
        cls._create_non_nullity_expectations(
            dataset,
            column,
            excluded_expectations=excluded_expectations,
            included_expectations=included_expectations,
        )

        if (
            not excluded_expectations
            or "expect_column_min_to_be_between" not in excluded_expectations
        ) and (
            not included_expectations
            or "expect_column_min_to_be_between" in included_expectations
        ):
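            # Profile the observed minimum, drop the temporary min expectation,
            # and widen the lower bound by one year (string values are parsed
            # as datetimes before widening).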
            min_value = dataset.expect_column_min_to_be_between(
                column, min_value=None, max_value=None, result_format="SUMMARY"
            ).result["observed_value"]

            if min_value is not None:
                dataset.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_min_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                try:
                    min_value = min_value + datetime.timedelta(days=-365)
                except OverflowError:
                    min_value = datetime.datetime.min
                except TypeError:
                    min_value = parse(min_value) + datetime.timedelta(days=-365)

        if (
            not excluded_expectations
            or "expect_column_max_to_be_between" not in excluded_expectations
        ) and (
            not included_expectations
            or "expect_column_max_to_be_between" in included_expectations
        ):
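            # Same for the maximum: profile it, drop the temporary expectation,
            # and widen the upper bound by one year.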
            max_value = dataset.expect_column_max_to_be_between(
                column, min_value=None, max_value=None, result_format="SUMMARY"
            ).result["observed_value"]
            if max_value is not None:
                dataset.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_max_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                try:
                    max_value = max_value + datetime.timedelta(days=365)
                except OverflowError:
                    max_value = datetime.datetime.max
                except TypeError:
                    max_value = parse(max_value) + datetime.timedelta(days=365)

        if (
            not excluded_expectations
            or "expect_column_min_to_be_between" not in excluded_expectations
        ) and (
            not included_expectations
            or "expect_column_min_to_be_between" in included_expectations
        ):
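            # With the widened bounds, add a between expectation on the column
            # values themselves.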
            if min_value is not None or max_value is not None:
                dataset.expect_column_values_to_be_between(
                    column, min_value, max_value, parse_strings_as_datetimes=True
                )
def test_ExpectationSuiteColumnSectionRenderer_expectation_with_string_list_meta_notes(
):
    expectation_with_string_list_note = ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_type_list",
        kwargs={
            "column":
            "Car Insurance Premiums ($)",
            "type_list": [
                "DOUBLE_PRECISION", "DoubleType", "FLOAT", "FLOAT4", "FLOAT8",
                "FloatType", "NUMERIC", "float"
            ],
            "result_format":
            "SUMMARY"
        },
        meta={
            "BasicDatasetProfiler": {
                "confidence": "very low"
            },
            "notes":
            ["This is a list", "of strings", "assigned to the notes", "key."]
        })
    expectations = [expectation_with_string_list_note]
    expected_result_json = {
        'content_blocks': [{
            'content_block_type': 'header',
            'styling': {
                'classes': ['col-12'],
                'header': {
                    'classes': ['alert', 'alert-secondary']
                }
            },
            'header': {
                'content_block_type': 'string_template',
                'string_template': {
                    'template': 'Car Insurance Premiums ($$)',
                    'tag': 'h5',
                    'styling': {
                        'classes': ['m-0']
                    }
                }
            }
        }, {
            'content_block_type': 'bullet_list',
            'styling': {
                'classes': ['col-12']
            },
            'bullet_list': [[{
                'content_block_type': 'string_template',
                'string_template': {
                    'template':
                    'value types must belong to this set: $v__0 $v__1 '
                    '$v__2 $v__3 $v__4 $v__5 $v__6 $v__7.',
                    'params': {
                        "column":
                        "Car Insurance Premiums ($)",
                        "type_list": [
                            "DOUBLE_PRECISION", "DoubleType", "FLOAT",
                            "FLOAT4", "FLOAT8", "FloatType", "NUMERIC", "float"
                        ],
                        "result_format":
                        "SUMMARY",
                        "mostly":
                        None,
                        "v__0":
                        "DOUBLE_PRECISION",
                        "v__1":
                        "DoubleType",
                        "v__2":
                        "FLOAT",
                        "v__3":
                        "FLOAT4",
                        "v__4":
                        "FLOAT8",
                        "v__5":
                        "FloatType",
                        "v__6":
                        "NUMERIC",
                        "v__7":
                        "float"
                    },
                    'styling': {
                        'default': {
                            'classes': ['badge', 'badge-secondary']
                        },
                        'params': {
                            'column': {
                                'classes': ['badge', 'badge-primary']
                            }
                        }
                    }
                }
            }, {
                'content_block_type': 'collapse',
                'styling': {
                    'body': {
                        'classes': ['card', 'card-body', 'p-1']
                    },
                    'parent': {
                        'styles': {
                            'list-style-type': 'none'
                        }
                    }
                },
                'collapse_toggle_link': {
                    'content_block_type': 'string_template',
                    'string_template': {
                        'template': '$icon',
                        'params': {
                            'icon': ''
                        },
                        'styling': {
                            'params': {
                                'icon': {
                                    'classes':
                                    ['fas', 'fa-comment', 'text-info'],
                                    'tag': 'i'
                                }
                            }
                        }
                    }
                },
                'collapse': [{
                    'content_block_type': 'text',
                    'styling': {
                        'classes': ['col-12', 'mt-2', 'mb-2'],
                        'parent': {
                            'styles': {
                                'list-style-type': 'none'
                            }
                        }
                    },
                    'subheader': 'Notes:',
                    'text': [
                        'This is a list', 'of strings',
                        'assigned to the notes', 'key.'
                    ]
                }],
                'inline_link': True
            }], {
                'content_block_type': 'string_template',
                'styling': {
                    'parent': {
                        'styles': {
                            'list-style-type': 'none'
                        }
                    }
                },
                'string_template': {
                    'template': '',
                    'tag': 'hr',
                    'styling': {
                        'classes': ['mt-1', 'mb-1']
                    }
                }
            }]
        }],
        'section_name': 'Car Insurance Premiums ($)'
    }

    result_json = ExpectationSuiteColumnSectionRenderer().render(
        expectations).to_json_dict()
    print(result_json)
    assert result_json == expected_result_json
def test_expectation_suite_filedata_asset():
    # Load in data files
    file_path = file_relative_path(__file__,
                                   "../test_sets/toy_data_complete.csv")

    # Create FileDataAsset objects
    f_dat = ge.data_asset.FileDataAsset(file_path)

    # Set up expectations
    f_dat.expect_file_line_regex_match_count_to_equal(
        regex=r",\S",
        expected_count=3,
        skip=1,
        result_format="BASIC",
        catch_exceptions=True,
    )

    f_dat.expect_file_line_regex_match_count_to_be_between(
        regex=r",\S",
        expected_max_count=2,
        skip=1,
        result_format="SUMMARY",
        include_config=True,
    )

    # Test basic config output
    complete_config = f_dat.get_expectation_suite()
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs=ExpectationKwargs(expected_count=3, regex=",\\S", skip=1),
        )
    ] == complete_config.expectations

    # Include result format kwargs
    complete_config2 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=False)
    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        ),
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_be_between",
            kwargs={
                "expected_max_count": 2,
                "regex": ",\\S",
                "result_format": "SUMMARY",
                "skip": 1,
            },
        ),
    ] == complete_config2.expectations

    # Discard Failing Expectations
    complete_config3 = f_dat.get_expectation_suite(
        discard_result_format_kwargs=False, discard_failed_expectations=True)

    assert [
        ExpectationConfiguration(
            expectation_type="expect_file_line_regex_match_count_to_equal",
            kwargs={
                "expected_count": 3,
                "regex": ",\\S",
                "result_format": "BASIC",
                "skip": 1,
            },
        )
    ] == complete_config3.expectations
def test_column_map_expectation_decorator():

    # Create a new CustomPandasDataset to
    # (1) demonstrate that custom subclassing works, and
    # (2) Test expectation business logic without dependencies on any other functions.
    class CustomPandasDataset(PandasDataset):
        @MetaPandasDataset.column_map_expectation
        def expect_column_values_to_be_odd(self, column):
            return column.map(lambda x: x % 2)

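        # (x - 6) / 0 raises ZeroDivisionError for every value, giving a way to
        # test how expectations behave when their business logic raises.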
        @MetaPandasDataset.column_map_expectation
        def expectation_that_crashes_on_sixes(self, column):
            return column.map(lambda x: (x - 6) / 0 != "duck")

    df = CustomPandasDataset({
        "all_odd": [1, 3, 5, 5, 5, 7, 9, 9, 9, 11],
        "mostly_odd": [1, 3, 5, 7, 9, 2, 4, 1, 3, 5],
        "all_even": [2, 4, 4, 6, 6, 6, 8, 8, 8, 8],
        "odd_missing": [1, 3, 5, None, None, None, None, 1, 3, None],
        "mixed_missing": [1, 3, 5, None, None, 2, 4, 1, 3, None],
        "all_missing":
        [None, None, None, None, None, None, None, None, None, None],
    })
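    # COMPLETE result format returns the full unexpected index and value lists
    # asserted against below.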
    df.set_default_expectation_argument("result_format", "COMPLETE")
    df.set_default_expectation_argument("include_config", False)

    assert df.expect_column_values_to_be_odd(
        "all_odd") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": 0.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "all_missing") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 10,
                "missing_percent": 100.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": None,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "odd_missing") == ExpectationValidationResult(
            result={
                "element_count": 10,
                "missing_count": 5,
                "missing_percent": 50.0,
                "partial_unexpected_counts": [],
                "partial_unexpected_index_list": [],
                "partial_unexpected_list": [],
                "unexpected_count": 0,
                "unexpected_index_list": [],
                "unexpected_list": [],
                "unexpected_percent": 0.0,
                "unexpected_percent_nonmissing": 0.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "mixed_missing") == ExpectationValidationResult(
            result={
                "element_count":
                10,
                "missing_count":
                3,
                "missing_percent":
                30.0,
                "partial_unexpected_counts": [
                    {
                        "value": 2.0,
                        "count": 1
                    },
                    {
                        "value": 4.0,
                        "count": 1
                    },
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count":
                2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent":
                20.0,
                "unexpected_percent_nonmissing": (2 / 7 * 100),
            },
            success=False,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd") == ExpectationValidationResult(
            result={
                "element_count":
                10,
                "missing_count":
                0,
                "missing_percent":
                0,
                "partial_unexpected_counts": [
                    {
                        "value": 2.0,
                        "count": 1
                    },
                    {
                        "value": 4.0,
                        "count": 1
                    },
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count":
                2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent":
                20.0,
                "unexpected_percent_nonmissing":
                20.0,
            },
            success=False,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd", mostly=0.6) == ExpectationValidationResult(
            result={
                "element_count":
                10,
                "missing_count":
                0,
                "missing_percent":
                0,
                "partial_unexpected_counts": [
                    {
                        "value": 2.0,
                        "count": 1
                    },
                    {
                        "value": 4.0,
                        "count": 1
                    },
                ],
                "partial_unexpected_index_list": [5, 6],
                "partial_unexpected_list": [2.0, 4.0],
                "unexpected_count":
                2,
                "unexpected_index_list": [5, 6],
                "unexpected_list": [2, 4],
                "unexpected_percent":
                20.0,
                "unexpected_percent_nonmissing":
                20.0,
            },
            success=True,
        )

    assert df.expect_column_values_to_be_odd(
        "mostly_odd",
        result_format="BOOLEAN_ONLY") == ExpectationValidationResult(
            success=False)

    df.default_expectation_args["result_format"] = "BOOLEAN_ONLY"

    assert df.expect_column_values_to_be_odd(
        "mostly_odd") == ExpectationValidationResult(success=False)

    df.default_expectation_args["result_format"] = "BASIC"

    assert df.expect_column_values_to_be_odd(
        "mostly_odd", include_config=True) == ExpectationValidationResult(
            expectation_config=ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_odd",
                kwargs={
                    "column": "mostly_odd",
                    "result_format": "BASIC"
                },
            ),
            result={
                "element_count": 10,
                "missing_count": 0,
                "missing_percent": 0,
                "partial_unexpected_list": [2, 4],
                "unexpected_count": 2,
                "unexpected_percent": 20.0,
                "unexpected_percent_nonmissing": 20.0,
            },
            success=False,
        )
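
The decorator exercised above, MetaPandasDataset.column_map_expectation, wraps a per-value boolean mapping into a full expectation result (element/missing/unexpected counts, the "mostly" threshold, and result_format handling). Below is a minimal sketch of the same pattern outside a test; the dataset class, column name, and expectation name are illustrative, and it assumes the legacy great_expectations.dataset API used in these examples.

from great_expectations.dataset import MetaPandasDataset, PandasDataset


class MyDataset(PandasDataset):
    # The decorated function only states the per-value rule; the decorator
    # supplies counting, "mostly", and result_format handling around it.
    @MetaPandasDataset.column_map_expectation
    def expect_column_values_to_be_positive(self, column):
        return column.map(lambda x: x > 0)


df = MyDataset({"x": [1, 2, -3, 4]})
result = df.expect_column_values_to_be_positive("x", mostly=0.75)
print(result.success)  # True: 3 of 4 values pass, which meets mostly=0.75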
def config3():
    return ExpectationConfiguration(
        expectation_type="expect_column_values_to_be_in_set",
        kwargs={"column": "a", "value_set": [1, 2, 3], "result_format": "BASIC"},
        meta={"notes": "This is another expectation."},
    )
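
config3 reads like a small pytest-style helper returning a reusable ExpectationConfiguration. A minimal hedged sketch of consuming it, touching only the fields constructed above:

config = config3()
assert config.expectation_type == "expect_column_values_to_be_in_set"
assert config.kwargs["value_set"] == [1, 2, 3]
assert config.meta["notes"] == "This is another expectation."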
Example #27
def test_ge_pandas_automatic_failure_removal():
    df = ge.dataset.PandasDataset({
        'A': [1, 2, 3, 4],
        'B': [5, 6, 7, 8],
        'C': ['a', 'b', 'c', 'd'],
        'D': ['e', 'f', 'g', 'h']
    })

    # Put some simple expectations on the data frame
    df.profile(ge.profile.ColumnsExistProfiler)
    df.expect_column_values_to_be_in_set("A", [1, 2, 3, 4])
    df.expect_column_values_to_be_in_set("B", [5, 6, 7, 8])
    df.expect_column_values_to_be_in_set("C", ['w', 'x', 'y', 'z'])
    df.expect_column_values_to_be_in_set("D", ['e', 'f', 'g', 'h'])

    # First check that failing expectations are NOT automatically
    # dropped when sampling.
    # For this data frame, the expectation on column "C" above fails.
    exp1 = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'B'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'C'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'B', 'value_set': [5, 6, 7, 8]}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'C', 'value_set': ['w', 'x', 'y', 'z']}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]
    samp1 = df.sample(n=2)
    assert samp1.find_expectations() == exp1

    # Now check subsetting to verify that failing expectations are NOT
    # automatically dropped when subsetting.
    sub1 = df[['A', 'D']]
    assert sub1.find_expectations() == exp1

    # Set property/attribute so that failing expectations are
    # automatically removed when sampling or subsetting.
    df.discard_subset_failing_expectations = True

    ###
    # Note: order matters in this test, and a ValidationOperator may change the order
    ###

    exp_samp = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'B'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'C'}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'B', 'value_set': [5, 6, 7, 8]}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]

    samp2 = df.sample(n=2)
    assert samp2.find_expectations() == exp_samp

    # Now check subsetting. In addition to the failure on column "C",
    # the expectations on column "B" now fail since column "B" doesn't
    # exist in the subset.
    sub2 = df[['A', 'D']]
    exp_sub = [
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'A'}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'A', 'value_set': [1, 2, 3, 4]}),
        ExpectationConfiguration(expectation_type='expect_column_to_exist',
                                 kwargs={'column': 'D'}),
        ExpectationConfiguration(expectation_type='expect_column_values_to_be_in_set',
                                 kwargs={'column': 'D', 'value_set': ['e', 'f', 'g', 'h']})
    ]
    assert sub2.find_expectations() == exp_sub
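
As a follow-up sketch (assuming the same df as above, with discard_subset_failing_expectations still enabled), taking a column subset keeps only the expectations that still hold on that subset, mirroring exp_sub:

sub = df[["A", "D"]]
for config in sub.find_expectations():
    print(config.expectation_type, config.kwargs)
# Per exp_sub above, only the expectations on columns "A" and "D" remain;
# those referencing the dropped columns "B" and "C" are discarded.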