def _create_expectations_for_low_card_column(
    cls,
    dataset,
    column,
    column_cache,
    excluded_expectations=None,
    included_expectations=None,
):
    """Add the expectations this profiler uses for a low-cardinality column.

    Steps, in order:

    1. Delegate to ``cls._create_non_nullity_expectations``, forwarding the
       exclusion/inclusion filters.
    2. If ``expect_column_distinct_values_to_be_in_set`` is enabled, run it
       once with ``value_set=None`` to read the observed values, then run it
       again with those observed values as the value set.
    3. If ``expect_column_kl_divergence_to_be_less_than`` is enabled and the
       (cached) column cardinality is ``ProfilerCardinality.TWO`` or
       ``ProfilerCardinality.VERY_FEW``, build a categorical partition object
       for the column and run the KL-divergence expectation against it with
       a threshold of 0.6.

    Args:
        cls: Object providing ``_create_non_nullity_expectations`` and
            ``_get_column_cardinality_with_caching``.
        dataset: Dataset whose ``expect_*`` methods are invoked.
        column: Name of the column being profiled.
        column_cache: Cache handed to ``_get_column_cardinality_with_caching``.
        excluded_expectations: Optional collection of expectation names to
            skip. A falsy value disables exclusion filtering.
        included_expectations: Optional collection of expectation names to
            allow. A falsy value disables inclusion filtering.
    """

    def _is_enabled(expectation_name):
        # An expectation runs when it is not excluded and, if an inclusion
        # filter was supplied, it appears in that filter.
        not_excluded = (
            not excluded_expectations
            or expectation_name not in excluded_expectations
        )
        is_included = (
            not included_expectations
            or expectation_name in included_expectations
        )
        return not_excluded and is_included

    cls._create_non_nullity_expectations(
        dataset,
        column,
        excluded_expectations=excluded_expectations,
        included_expectations=included_expectations,
    )

    if _is_enabled("expect_column_distinct_values_to_be_in_set"):
        # First pass has no value set: it is only used to read back what the
        # column currently contains.
        observation = dataset.expect_column_distinct_values_to_be_in_set(
            column, value_set=None, result_format="SUMMARY"
        )
        observed_values = observation.result["observed_value"]
        dataset.expect_column_distinct_values_to_be_in_set(
            column, value_set=observed_values, result_format="SUMMARY"
        )

    if not _is_enabled("expect_column_kl_divergence_to_be_less_than"):
        return

    cardinality = cls._get_column_cardinality_with_caching(
        dataset, column, column_cache
    )
    if cardinality not in (ProfilerCardinality.TWO, ProfilerCardinality.VERY_FEW):
        return

    categorical_partition = build_categorical_partition_object(dataset, column)
    dataset.expect_column_kl_divergence_to_be_less_than(
        column,
        partition_object=categorical_partition,
        threshold=0.6,
        catch_exceptions=True,
    )
def _create_expectations_for_low_card_column(cls, dataset, column, column_cache):
    """Add the expectations this profiler uses for a low-cardinality column.

    Always delegates to ``cls._create_non_nullity_expectations`` and then
    runs ``expect_column_distinct_values_to_be_in_set`` twice: once with
    ``value_set=None`` to read the observed values, and once with those
    observed values as the value set. When the (cached) cardinality of the
    column is ``"two"`` or ``"very few"``, a categorical partition object is
    built and the KL-divergence expectation is run against it with a
    threshold of 0.6.

    Args:
        cls: Object providing ``_create_non_nullity_expectations`` and
            ``_get_column_cardinality_with_caching``.
        dataset: Dataset whose ``expect_*`` methods are invoked.
        column: Name of the column being profiled.
        column_cache: Cache handed to ``_get_column_cardinality_with_caching``.
    """
    cls._create_non_nullity_expectations(dataset, column)

    # First pass has no value set: it is only used to read back what the
    # column currently contains.
    observation = dataset.expect_column_distinct_values_to_be_in_set(
        column, value_set=None, result_format="SUMMARY"
    )
    observed_values = observation.result["observed_value"]
    dataset.expect_column_distinct_values_to_be_in_set(
        column, value_set=observed_values, result_format="SUMMARY"
    )

    cardinality = cls._get_column_cardinality_with_caching(
        dataset, column, column_cache
    )
    if cardinality not in ("two", "very few"):
        return

    categorical_partition = build_categorical_partition_object(dataset, column)
    dataset.expect_column_kl_divergence_to_be_less_than(
        column,
        partition_object=categorical_partition,
        threshold=0.6,
        catch_exceptions=True,
    )
def test_build_categorical_partition(non_numeric_high_card_dataset):
    """Check the categorical partition built for ``medcardnonnum``.

    The partition is requested once sorted by count and once sorted by
    value. Ordering is delegated to the underlying system: sorting by weight
    is unambiguous, but sorting by value can depend on locale, so two
    value-sorted orderings are accepted.
    """
    expected_by_count = {
        "values": [
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
        ],
        "weights": [0.18, 0.17, 0.16, 0.145, 0.125, 0.11, 0.085, 0.02, 0.005],
    }
    # Ordering produced by Python's native string sort.
    expected_by_value_native = {
        "values": [
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
        ],
        "weights": [0.16, 0.02, 0.125, 0.17, 0.085, 0.18, 0.145, 0.005, 0.11],
    }
    # Ordering produced by Postgres, whose lexicographical sort differs from
    # Python's native one.
    expected_by_value_postgres = {
        "values": [
            "2K8njWnvuq1u6tkzreNhxTEyO8PTeWer",
            "ajcLVizD2vwZlmmGKyXYki03SWn7fnt3",
            "hW0kFZ6ijfciJWN4vvgcFa6MWv8cTeVk",
            "k8B9KCXhaQb6Q82zFbAzOESAtDxK174J",
            "mS2AVcLFp6i36sX7yAUrdfM0g0RB2X4D",
            "NfX4KfEompMbbKloFq8NQpdXtk5PjaPe",
            "NhTsracusfp5V6zVeWqLZnychDl7jjO4",
            "oRnY5jDWFw2KZRYLh6ihFd021ggy4UxJ",
            "T7EUE54HUhyJ9Hnxv1pKY0Bmg42qiggP",
        ],
        "weights": [0.16, 0.085, 0.18, 0.145, 0.005, 0.02, 0.125, 0.11, 0.17],
    }

    by_count = build_categorical_partition_object(
        non_numeric_high_card_dataset, "medcardnonnum", sort="count"
    )
    assert by_count == expected_by_count

    by_value = build_categorical_partition_object(
        non_numeric_high_card_dataset, "medcardnonnum", sort="value"
    )
    try:
        assert by_value == expected_by_value_native
    except AssertionError:
        # We *want* to preserve the underlying system's ability to do the
        # compute (the user can override if desired), so the backend-specific
        # ordering is explicitly allowed as a fallback.
        assert by_value == expected_by_value_postgres