Python RuleBasedProfilerConfig Exemples, great_expectations.rule_based_profiler.config.base.RuleBasedProfilerConfig Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : conftest.py Projet : rpatil524/great_expectations

def profiler_config_with_placeholder_args_multiple_rules_custom_values(
    profiler_config_with_placeholder_args_custom_values:
    RuleBasedProfilerConfig,
) -> RuleBasedProfilerConfig:
    rules: dict = profiler_config_with_placeholder_args_custom_values.rules
    rule: dict = {
        "domain_builder": {
            "class_name": "MyAdditionalCustomDomainBuilder"
        },
        "parameter_builders": [{
            "class_name": "MyAdditionalCustomParameterBuilder",
            "metric_name": "yet_another_metric",
            "name": "yet_another_parameter",
        }],
        "expectation_configuration_builders": [{
            "class_name": "MyAdditionalCustomExpectationConfigurationBuilder",
            "expectation_type": "expect_additional_custom_expectation",
            "meta": {
                "details": {
                    "note": "Here's another rule"
                }
            },
        }],
    }
    rules["rule_2"] = rule
    return RuleBasedProfilerConfig(
        name=profiler_config_with_placeholder_args_custom_values.name,
        config_version=profiler_config_with_placeholder_args_custom_values.
        config_version,
        rules=rules,
        variables=profiler_config_with_placeholder_args_custom_values.
        variables,
    )

Exemple #2

0

Afficher le fichier

Fichier : conftest.py Projet : rpatil524/great_expectations

def profiler_config_with_placeholder_args_multiple_rules(
    profiler_config_with_placeholder_args: RuleBasedProfilerConfig,
) -> RuleBasedProfilerConfig:
    rules: dict = profiler_config_with_placeholder_args.rules
    rule: dict = {
        "domain_builder": {
            "class_name": "TableDomainBuilder"
        },
        "parameter_builders": [{
            "class_name": "MetricMultiBatchParameterBuilder",
            "metric_name": "my_other_metric",
            "name": "my_additional_parameter",
        }],
        "expectation_configuration_builders": [{
            "class_name": "DefaultExpectationConfigurationBuilder",
            "expectation_type": "expect_column_values_to_be_between",
            "meta": {
                "details": {
                    "note": "Here's another rule"
                }
            },
        }],
    }
    rules["rule_2"] = rule
    return RuleBasedProfilerConfig(
        name=profiler_config_with_placeholder_args.name,
        config_version=profiler_config_with_placeholder_args.config_version,
        rules=rules,
        variables=profiler_config_with_placeholder_args.variables,
    )

Exemple #3

0

Afficher le fichier

Fichier : conftest.py Projet : rpatil524/great_expectations

def profiler_config_with_placeholder_args_custom_values(
) -> RuleBasedProfilerConfig:
    config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="my_profiler",
        config_version=1.0,
        rules={
            "rule_1": {
                "domain_builder": {
                    "class_name": "MyCustomDomainBuilder"
                },
                "expectation_configuration_builders": [{
                    "class_name": "MyCustomExpectationConfigurationBuilder",
                    "expectation_type": "expect_custom_expectation",
                    "meta": {
                        "details": {
                            "note": "My custom config"
                        }
                    },
                }],
                "parameter_builders": [{
                    "class_name": "MyCustomParameterBuilder",
                    "metric_name": "my_metric",
                    "name": "my_parameter",
                }],
            }
        },
        variables={"my_variable": "my_value"},
    )
    return config

Exemple #4

0

Afficher le fichier

Fichier : rule_based_profiler.py Projet : alfredo-f/great_expectations

    def add_profiler(
        config: RuleBasedProfilerConfig,
        data_context: "DataContext",  # noqa: F821
        profiler_store: ProfilerStore,
        ge_cloud_id: Optional[str] = None,
    ) -> "RuleBasedProfiler":
        if not RuleBasedProfiler._check_validity_of_batch_requests_in_config(
                config=config):
            raise ge_exceptions.InvalidConfigError(
                f'batch_data found in batch_request cannot be saved to ProfilerStore "{profiler_store.store_name}"'
            )

        # Chetan - 20220204 - DataContext to be removed once it can be decoupled from RBP
        new_profiler: "RuleBasedProfiler" = instantiate_class_from_config(
            config=config.to_json_dict(),
            runtime_environment={
                "data_context": data_context,
            },
            config_defaults={
                "module_name": "great_expectations.rule_based_profiler",
                "class_name": "RuleBasedProfiler",
            },
        )

        key: Union[GeCloudIdentifier, ConfigurationIdentifier]
        if ge_cloud_id:
            key = GeCloudIdentifier(resource_type="contract",
                                    ge_cloud_id=ge_cloud_id)
        else:
            key = ConfigurationIdentifier(configuration_key=config.name, )

        profiler_store.set(key=key, value=config)

        return new_profiler

Exemple #5

0

Afficher le fichier

Fichier : rule_based_profiler.py Projet : alfredo-f/great_expectations

    def __init__(
            self,
            name: str,
            config_version: float,
            variables: Optional[Dict[str, Any]] = None,
            rules: Optional[Dict[str, Dict[str, Any]]] = None,
            data_context: Optional["DataContext"] = None,  # noqa: F821
    ):
        """
        Create a new Profiler using configured rules.
        For a rule or an item in a rule configuration, instantiates the following if
        available: a domain builder, a parameter builder, and a configuration builder.
        These will be used to define profiler computation patterns.

        Args:
            name: The name of the RBP instance
            config_version: The version of the RBP (currently only 1.0 is supported)
            rules: A set of dictionaries, each of which contains its own domain_builder, parameter_builders, and
            expectation_configuration_builders configuration components
            variables: Any variables to be substituted within the rules
            data_context: DataContext object that defines a full runtime environment (data access, etc.)
        """
        profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
            name=name,
            config_version=config_version,
            variables=variables,
            rules=rules,
        )

        super().__init__(
            profiler_config=profiler_config,
            data_context=data_context,
        )

Exemple #6

0

Afficher le fichier

class ExpectColumnUniqueValueCountToBeBetween(ColumnExpectation):
    """Expect the number of unique values to be between a minimum value and a maximum value.

            expect_column_unique_value_count_to_be_between is a \
            :func:`column_aggregate_expectation
            <great_expectations.execution_engine.MetaExecutionEngine.column_aggregate_expectation>`.

            Args:
                column (str): \
                    The column name.
                min_value (int or None): \
                    The minimum number of unique values allowed.
                max_value (int or None): \
                    The maximum number of unique values allowed.

            Other Parameters:
                result_format (str or None): \
                    Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
                    For more detail, see :ref:`result_format <result_format>`.
                include_config (boolean): \
                    If True, then include the expectation config as part of the result object. \
                    For more detail, see :ref:`include_config`.
                catch_exceptions (boolean or None): \
                    If True, then catch exceptions and include them as part of the result object. \
                    For more detail, see :ref:`catch_exceptions`.
                meta (dict or None): \
                    A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
                    modification. For more detail, see :ref:`meta`.

            Returns:
                An ExpectationSuiteValidationResult

                Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
                :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

            Notes:
                These fields in the result object are customized for this expectation:
                ::

                    {
                        "observed_value": (int) The number of unique values in the column
                    }

                * min_value and max_value are both inclusive.
                * If min_value is None, then max_value is treated as an upper bound
                * If max_value is None, then min_value is treated as a lower bound

            See Also:
                :func:`expect_column_proportion_of_unique_values_to_be_between \
                <great_expectations.execution_engine.execution_engine.ExecutionEngine
                .expect_column_proportion_of_unique_values_to_be_between>`

            """

    # This dictionary contains metadata for display in the public gallery
    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column aggregate expectation"],
        "contributors": ["@great_expectations"],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    # Setting necessary computation metric dependencies and defining kwargs, as well as assigning kwargs default values\
    metric_dependencies = ("column.distinct_values.count",)
    success_keys = (
        "min_value",
        "max_value",
        "auto",
        "profiler_config",
    )

    column_unique_values_range_estimator_parameter_builder_config: ParameterBuilderConfig = ParameterBuilderConfig(
        module_name="great_expectations.rule_based_profiler.parameter_builder",
        class_name="NumericMetricRangeMultiBatchParameterBuilder",
        name="column_unique_values_range_estimator",
        metric_name="column.distinct_values.count",
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        enforce_numeric_metric=True,
        replace_nan_with_zero=True,
        reduce_scalar_metric=True,
        false_positive_rate=f"{VARIABLES_KEY}false_positive_rate",
        quantile_statistic_interpolation_method=f"{VARIABLES_KEY}quantile_statistic_interpolation_method",
        estimator=f"{VARIABLES_KEY}estimator",
        n_resamples=f"{VARIABLES_KEY}n_resamples",
        random_seed=f"{VARIABLES_KEY}random_seed",
        include_estimator_samples_histogram_in_details=f"{VARIABLES_KEY}include_estimator_samples_histogram_in_details",
        truncate_values=f"{VARIABLES_KEY}truncate_values",
        round_decimals=f"{VARIABLES_KEY}round_decimals",
        evaluation_parameter_builder_configs=None,
        json_serialize=True,
    )
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        column_unique_values_range_estimator_parameter_builder_config,
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name="expect_column_unique_value_count_to_be_between",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_unique_values_to_be_between_rule": {
                "variables": {
                    "mostly": 1.0,
                    "strict_min": False,
                    "strict_max": False,
                    "false_positive_rate": 0.05,
                    "quantile_statistic_interpolation_method": "auto",
                    "estimator": "bootstrap",
                    "n_resamples": 9999,
                    "random_seed": None,
                    "include_estimator_samples_histogram_in_details": False,
                    "truncate_values": {
                        "lower_bound": 0,
                        "upper_bound": None,
                    },
                    "round_decimals": 0,
                },
                "domain_builder": {
                    "class_name": "ColumnDomainBuilder",
                    "module_name": "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type": "expect_column_unique_value_count_to_be_between",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name": "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs": validation_parameter_builder_configs,
                        "column": f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "min_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[0]",
                        "max_value": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}[1]",
                        "strict_min": f"{VARIABLES_KEY}strict_min",
                        "strict_max": f"{VARIABLES_KEY}strict_max",
                        "meta": {
                            "profiler_details": f"{PARAMETER_KEY}{column_unique_values_range_estimator_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    # Default values
    default_kwarg_values = {
        "row_condition": None,
        "condition_parser": None,
        "min_value": None,
        "max_value": None,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": False,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "min_value",
        "max_value",
    )

    """ A Column Aggregate Metric Decorator for the Unique Value Count"""

    def validate_configuration(
        self, configuration: Optional[ExpectationConfiguration]
    ) -> None:
        """
        Validates that a configuration has been set, and sets a configuration if it has yet to be set. Ensures that
        necessary configuration arguments have been provided for the validation of the expectation.

        Args:
            configuration (OPTIONAL[ExpectationConfiguration]): \
                An optional Expectation Configuration entry that will be used to configure the expectation
        Returns:
            None. Raises InvalidExpectationConfigurationError if the config is not validated successfully
        """
        super().validate_configuration(configuration)
        self.validate_metric_value_between_configuration(configuration=configuration)

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )
        params_with_json_schema = {
            "column": {"schema": {"type": "string"}, "value": params.get("column")},
            "min_value": {
                "schema": {"type": "number"},
                "value": params.get("min_value"),
            },
            "max_value": {
                "schema": {"type": "number"},
                "value": params.get("max_value"),
            },
            "mostly": {"schema": {"type": "number"}, "value": params.get("mostly")},
            "mostly_pct": {
                "schema": {"type": "string"},
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {"type": "string"},
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {"type": "string"},
                "value": params.get("condition_parser"),
            },
            "strict_min": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_min"),
            },
            "strict_max": {
                "schema": {"type": "boolean"},
                "value": params.get("strict_max"),
            },
        }

        at_least_str, at_most_str = handle_strict_min_max(params)

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "may have any number of unique values."
        else:
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                if params["min_value"] is None:
                    template_str = f"must have {at_most_str} $max_value unique values, at least $mostly_pct % of the time."
                elif params["max_value"] is None:
                    template_str = f"must have {at_least_str} $min_value unique values, at least $mostly_pct % of the time."
                else:
                    template_str = f"must have {at_least_str} $min_value and {at_most_str} $max_value unique values, at least $mostly_pct % of the time."
            else:
                if params["min_value"] is None:
                    template_str = f"must have {at_most_str} $max_value unique values."
                elif params["max_value"] is None:
                    template_str = f"must have {at_least_str} $min_value unique values."
                else:
                    template_str = f"must have {at_least_str} $min_value and {at_most_str} $max_value unique values."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True
            )
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get("include_column_name", True)
        include_column_name = (
            include_column_name if include_column_name is not None else True
        )
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "min_value",
                "max_value",
                "mostly",
                "row_condition",
                "condition_parser",
                "strict_min",
                "strict_max",
            ],
        )

        at_least_str, at_most_str = handle_strict_min_max(params)

        if (params["min_value"] is None) and (params["max_value"] is None):
            template_str = "may have any number of unique values."
        else:
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params["mostly_pct"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True
                )
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                if params["min_value"] is None:
                    template_str = f"must have {at_most_str} $max_value unique values, at least $mostly_pct % of the time."
                elif params["max_value"] is None:
                    template_str = f"must have {at_least_str} $min_value unique values, at least $mostly_pct % of the time."
                else:
                    template_str = f"must have {at_least_str} $min_value and {at_most_str} $max_value unique values, at least $mostly_pct % of the time."
            else:
                if params["min_value"] is None:
                    template_str = f"must have {at_most_str} $max_value unique values."
                elif params["max_value"] is None:
                    template_str = f"must have {at_least_str} $min_value unique values."
                else:
                    template_str = f"must have {at_least_str} $min_value and {at_most_str} $max_value unique values."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                }
            )
        ]

    @classmethod
    @renderer(
        renderer_type="renderer.descriptive.column_properties_table.distinct_count_row"
    )
    def _descriptive_column_properties_table_distinct_count_row_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        assert result, "Must pass in result."
        observed_value = result.result["observed_value"]
        template_string_object = RenderedStringTemplateContent(
            **{
                "content_block_type": "string_template",
                "string_template": {
                    "template": "Distinct (n)",
                    "tooltip": {
                        "content": "expect_column_unique_value_count_to_be_between"
                    },
                },
            }
        )
        if not observed_value:
            return [template_string_object, "--"]
        else:
            return [template_string_object, observed_value]

    def _validate(
        self,
        configuration: ExpectationConfiguration,
        metrics: Dict,
        runtime_configuration: dict = None,
        execution_engine: ExecutionEngine = None,
    ):
        return self._validate_metric_value_between(
            metric_name="column.distinct_values.count",
            configuration=configuration,
            metrics=metrics,
            runtime_configuration=runtime_configuration,
            execution_engine=execution_engine,
        )

Exemple #7

0

Afficher le fichier

Fichier : expect_column_values_to_match_strftime_format.py Projet : admariner/great_expectations

class ExpectColumnValuesToMatchStrftimeFormat(ColumnMapExpectation):
    """Expect column entries to be strings representing a date or time with a given format.

    expect_column_values_to_match_strftime_format is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        strftime_format (str): \
            A strftime format string to use for matching

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    """

    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": True,
    }

    map_metric = "column_values.match_strftime_format"
    success_keys = (
        "strftime_format",
        "mostly",
        "auto",
        "profiler_config",
    )

    date_format_string_parameter_builder_config: ParameterBuilderConfig = (
        ParameterBuilderConfig(
            module_name=
            "great_expectations.rule_based_profiler.parameter_builder",
            class_name="SimpleDateFormatStringParameterBuilder",
            name="date_format_string_parameter_builder",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
        ))
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        date_format_string_parameter_builder_config
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name=
        "expect_column_values_to_match_strftime_format",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_match_strftime_format_rule": {
                "variables": {
                    "mostly": 1.0,
                },
                "domain_builder": {
                    "class_name":
                    "ColumnDomainBuilder",
                    "module_name":
                    "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type":
                        "expect_column_values_to_match_strftime_format",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name":
                        "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs":
                        validation_parameter_builder_configs,
                        "column":
                        f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "strftime_format":
                        f"{PARAMETER_KEY}{date_format_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "meta": {
                            "profiler_details":
                            f"{PARAMETER_KEY}{date_format_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser":
        None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": True,
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "strftime_format",
    )

    def validate_configuration(
            self, configuration: Optional[ExpectationConfiguration]) -> None:
        super().validate_configuration(configuration)
        if configuration is None:
            configuration = self.configuration

        assert "strftime_format" in configuration.kwargs, "strftime_format is required"

        strftime_format = configuration.kwargs["strftime_format"]

        try:
            if isinstance(strftime_format, dict):
                assert (
                    "$PARAMETER" in strftime_format
                ), 'Evaluation Parameter dict for strftime_format kwarg must have "$PARAMETER" key.'
            else:
                datetime.strptime(
                    datetime.strftime(datetime.now(), strftime_format),
                    strftime_format,
                )
        except ValueError as e:
            raise ValueError(
                f"Unable to use provided strftime_format. {str(e)}")
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "strftime_format",
                "mostly",
                "row_condition",
                "condition_parser",
            ],
        )
        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "strftime_format": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("strftime_format"),
            },
            "mostly": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly")
            },
            "mostly_pct": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("mostly_pct"),
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
        }

        if not params.get("strftime_format"):
            template_str = "values must match a strftime format but none was specified."
        else:
            template_str = (
                "values must match the following strftime format: $strftime_format"
            )
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True)
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                template_str += ", at least $mostly_pct % of the time."
            else:
                template_str += "."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True)
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            [
                "column",
                "strftime_format",
                "mostly",
                "row_condition",
                "condition_parser",
            ],
        )

        if not params.get("strftime_format"):
            template_str = "values must match a strftime format but none was specified."
        else:
            template_str = (
                "values must match the following strftime format: $strftime_format"
            )
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params["mostly_pct"] = num_to_str(params["mostly"] * 100,
                                                  precision=15,
                                                  no_scientific=True)
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                template_str += ", at least $mostly_pct % of the time."
            else:
                template_str += "."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                })
        ]

Exemple #8

0

Afficher le fichier

Fichier : expect_column_values_to_match_regex.py Projet : admariner/great_expectations

class ExpectColumnValuesToMatchRegex(ColumnMapExpectation):
    """Expect column entries to be strings that match a given regular expression.

    Valid matches can be found \
    anywhere in the string, for example "[at]+" will identify the following strings as expected: "cat", "hat", \
    "aa", "a", and "t", and the following strings as unexpected: "fish", "dog".

    expect_column_values_to_match_regex is a \
    :func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
    .column_map_expectation>`.

    Args:
        column (str): \
            The column name.
        regex (str): \
            The regular expression the column entries should match.

    Keyword Args:
        mostly (None or a float between 0 and 1): \
            Return `"success": True` if at least mostly fraction of values match the expectation. \
            For more detail, see :ref:`mostly`.

    Other Parameters:
        result_format (str or None): \
            Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            For more detail, see :ref:`result_format <result_format>`.
        include_config (boolean): \
            If True, then include the expectation config as part of the result object. \
            For more detail, see :ref:`include_config`.
        catch_exceptions (boolean or None): \
            If True, then catch exceptions and include them as part of the result object. \
            For more detail, see :ref:`catch_exceptions`.
        meta (dict or None): \
            A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
            modification. For more detail, see :ref:`meta`.

    Returns:
        An ExpectationSuiteValidationResult

        Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
        :ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.

    See Also:
        :func:`expect_column_values_to_not_match_regex \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_values_to_not_match_regex>`

        :func:`expect_column_values_to_match_regex_list \
        <great_expectations.execution_engine.execution_engine.ExecutionEngine
        .expect_column_values_to_match_regex_list>`

    """

    library_metadata = {
        "maturity": "production",
        "tags": ["core expectation", "column map expectation"],
        "contributors": [
            "@great_expectations",
        ],
        "requirements": [],
        "has_full_test_suite": True,
        "manually_reviewed_code": False,
    }

    map_metric = "column_values.match_regex"
    success_keys = (
        "regex",
        "mostly",
        "auto",
        "profiler_config",
    )

    regex_pattern_string_parameter_builder_config: ParameterBuilderConfig = (
        ParameterBuilderConfig(
            module_name=
            "great_expectations.rule_based_profiler.parameter_builder",
            class_name="RegexPatternStringParameterBuilder",
            name="regex_pattern_string_parameter_builder",
            metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
            metric_value_kwargs=None,
            evaluation_parameter_builder_configs=None,
        ))
    validation_parameter_builder_configs: List[ParameterBuilderConfig] = [
        regex_pattern_string_parameter_builder_config
    ]
    default_profiler_config: RuleBasedProfilerConfig = RuleBasedProfilerConfig(
        name=
        "expect_column_values_to_match_regex",  # Convention: use "expectation_type" as profiler name.
        config_version=1.0,
        variables={},
        rules={
            "default_expect_column_values_to_match_regex_rule": {
                "variables": {
                    "mostly": 1.0,
                },
                "domain_builder": {
                    "class_name":
                    "ColumnDomainBuilder",
                    "module_name":
                    "great_expectations.rule_based_profiler.domain_builder",
                },
                "expectation_configuration_builders": [
                    {
                        "expectation_type":
                        "expect_column_values_to_match_regex",
                        "class_name": "DefaultExpectationConfigurationBuilder",
                        "module_name":
                        "great_expectations.rule_based_profiler.expectation_configuration_builder",
                        "validation_parameter_builder_configs":
                        validation_parameter_builder_configs,
                        "column":
                        f"{DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}column",
                        "regex":
                        f"{PARAMETER_KEY}{regex_pattern_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_VALUE_KEY}",
                        "mostly": f"{VARIABLES_KEY}mostly",
                        "meta": {
                            "profiler_details":
                            f"{PARAMETER_KEY}{regex_pattern_string_parameter_builder_config.name}{FULLY_QUALIFIED_PARAMETER_NAME_SEPARATOR_CHARACTER}{FULLY_QUALIFIED_PARAMETER_NAME_METADATA_KEY}",
                        },
                    },
                ],
            },
        },
    )

    default_kwarg_values = {
        "row_condition": None,
        "condition_parser":
        None,  # we expect this to be explicitly set whenever a row_condition is passed
        "mostly": 1,
        "result_format": "BASIC",
        "include_config": True,
        "catch_exceptions": True,
        "regex": "(?s).*",
        "auto": False,
        "profiler_config": default_profiler_config,
    }
    args_keys = (
        "column",
        "regex",
    )

    def validate_configuration(
            self, configuration: Optional[ExpectationConfiguration]) -> None:
        super().validate_configuration(configuration)
        if configuration is None:
            configuration = self.configuration

        # supports extensibility by allowing value_set to not be provided in config but captured via child-class default_kwarg_values, e.g. parameterized expectations
        regex = configuration.kwargs.get(
            "regex") or self.default_kwarg_values.get("regex")

        try:
            assert "regex" in configuration.kwargs or regex, "regex is required"
            assert isinstance(regex, (str, dict)), "regex must be a string"
            if isinstance(regex, dict):
                assert (
                    "$PARAMETER" in regex
                ), 'Evaluation Parameter dict for regex kwarg must have "$PARAMETER" key.'
        except AssertionError as e:
            raise InvalidExpectationConfigurationError(str(e))

    @classmethod
    @renderer(renderer_type="renderer.question")
    def _question_renderer(cls,
                           configuration,
                           result=None,
                           language=None,
                           runtime_configuration=None):
        column = configuration.kwargs.get("column")
        mostly = configuration.kwargs.get("mostly")
        regex = configuration.kwargs.get("regex")

        return f'Do at least {mostly * 100}% of values in column "{column}" match the regular expression {regex}?'

    @classmethod
    @renderer(renderer_type="renderer.answer")
    def _answer_renderer(cls,
                         configuration=None,
                         result=None,
                         language=None,
                         runtime_configuration=None):
        column = result.expectation_config.kwargs.get("column")
        mostly = result.expectation_config.kwargs.get("mostly")
        regex = result.expectation_config.kwargs.get("regex")
        if result.success:
            return f'At least {mostly * 100}% of values in column "{column}" match the regular expression {regex}.'
        else:
            return f'Less than {mostly * 100}% of values in column "{column}" match the regular expression {regex}.'

    @classmethod
    def _atomic_prescriptive_template(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            ["column", "regex", "mostly", "row_condition", "condition_parser"],
        )
        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "mostly": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly")
            },
            "mostly_pct": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("mostly_pct"),
            },
            "regex": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("regex")
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
        }

        if not params.get("regex"):
            template_str = (
                "values must match a regular expression but none was specified."
            )
        else:
            template_str = "values must match this regular expression: $regex"
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params_with_json_schema["mostly_pct"]["value"] = num_to_str(
                    params["mostly"] * 100, precision=15, no_scientific=True)
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                template_str += ", at least $mostly_pct % of the time."
            else:
                template_str += "."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"], with_schema=True)
            template_str = f"{conditional_template_str}, then {template_str}"
            params_with_json_schema.update(conditional_params)

        return (template_str, params_with_json_schema, styling)

    @classmethod
    @renderer(renderer_type="renderer.prescriptive")
    @render_evaluation_parameter_string
    def _prescriptive_renderer(
        cls,
        configuration=None,
        result=None,
        language=None,
        runtime_configuration=None,
        **kwargs,
    ):
        runtime_configuration = runtime_configuration or {}
        include_column_name = runtime_configuration.get(
            "include_column_name", True)
        include_column_name = (include_column_name
                               if include_column_name is not None else True)
        styling = runtime_configuration.get("styling")
        params = substitute_none_for_missing(
            configuration.kwargs,
            ["column", "regex", "mostly", "row_condition", "condition_parser"],
        )

        if not params.get("regex"):
            template_str = (
                "values must match a regular expression but none was specified."
            )
        else:
            template_str = "values must match this regular expression: $regex"
            if params["mostly"] is not None and params["mostly"] < 1.0:
                params["mostly_pct"] = num_to_str(params["mostly"] * 100,
                                                  precision=15,
                                                  no_scientific=True)
                # params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
                template_str += ", at least $mostly_pct % of the time."
            else:
                template_str += "."

        if include_column_name:
            template_str = f"$column {template_str}"

        if params["row_condition"] is not None:
            (
                conditional_template_str,
                conditional_params,
            ) = parse_row_condition_string_pandas_engine(
                params["row_condition"])
            template_str = f"{conditional_template_str}, then {template_str}"
            params.update(conditional_params)

        params_with_json_schema = {
            "column": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("column")
            },
            "mostly": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly")
            },
            "mostly_pct": {
                "schema": {
                    "type": "number"
                },
                "value": params.get("mostly_pct"),
            },
            "regex": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("regex")
            },
            "row_condition": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("row_condition"),
            },
            "condition_parser": {
                "schema": {
                    "type": "string"
                },
                "value": params.get("condition_parser"),
            },
        }

        return [
            RenderedStringTemplateContent(
                **{
                    "content_block_type": "string_template",
                    "string_template": {
                        "template": template_str,
                        "params": params,
                        "styling": styling,
                    },
                })
        ]