# Example #1
 def metrics_parameter_builders_by_domain(
     self,
 ) -> Dict[Domain, List[ParameterBuilder]]:
     """Map the TABLE and COLUMN domains to their metric ParameterBuilder lists."""
     # Keyword arguments shared by both multi-batch metric builders.
     shared_builder_kwargs: dict = dict(
         metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
         metric_value_kwargs=None,
         enforce_numeric_metric=True,
         replace_nan_with_zero=True,
         reduce_scalar_metric=True,
         evaluation_parameter_builder_configs=None,
         json_serialize=True,
         data_context=None,
     )
     table_row_count_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
         name="table_row_count",
         metric_name="table.row_count",
         **shared_builder_kwargs,
     )
     column_distinct_values_builder: MetricMultiBatchParameterBuilder = MetricMultiBatchParameterBuilder(
         name="column_distinct_values.count",
         metric_name="column.distinct_values.count",
         **shared_builder_kwargs,
     )
     return {
         Domain(domain_type=MetricDomainTypes.TABLE): [table_row_count_builder],
         Domain(domain_type=MetricDomainTypes.COLUMN): [column_distinct_values_builder],
     }
# Example #2
def test_simple_date_format_parameter_builder_zero_batch_id_error(
    alice_columnar_table_single_batch_context,
):
    """Building parameters with no Batch identifiers must raise ProfilerExecutionError."""
    context: DataContext = alice_columnar_table_single_batch_context

    builder: ParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        data_context=context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {column_domain.id: container}

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        builder.build_parameters(
            domain=column_domain,
            parameters=parameters,
        )

    assert (
        str(e.value)
        == "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of Batch identifiers."
    )
# Example #3
def table_Users_domain():
    """Fixture: a bare TABLE-type Domain tagged with rule "my_rule"."""
    return Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
        domain_kwargs=None,
        details=None,
    )
def test_default_expectation_configuration_builder_alice_null_condition_parameter_builder_validation_dependency_included(
    alice_columnar_table_single_batch_context, ):
    """Verify that a validation ParameterBuilder declared via
    "validation_parameter_builder_configs" is executed as a dependency, and that
    its result ("$parameter.my_min_user_id.value[0]") is resolved into the
    built ExpectationConfiguration kwargs (here: the observed column minimum).
    """
    data_context: DataContext = alice_columnar_table_single_batch_context

    # Single-batch request against the Alice test datasource.
    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name":
        "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs: dict = {"column": "user_id"}

    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    # References the first value produced by the "my_min_user_id" builder below.
    fully_qualified_parameter_name_for_value: str = "$parameter.my_min_user_id.value[0]"

    # "condition" of None means the expectation is emitted unconditionally.
    condition: Optional[str] = None
    max_user_id: int = 999999999999

    min_user_id_parameter_builder_config: ParameterBuilderConfig = (
        ParameterBuilderConfig(
            module_name=
            "great_expectations.rule_based_profiler.parameter_builder",
            class_name="MetricMultiBatchParameterBuilder",
            name="my_min_user_id",
            metric_name="column.min",
            metric_domain_kwargs=metric_domain_kwargs,
        ))
    validation_parameter_builder_configs: Optional[
        List[ParameterBuilderConfig]] = [
            min_user_id_parameter_builder_config,
        ]
    default_expectation_configuration_builder = DefaultExpectationConfigurationBuilder(
        expectation_type="expect_column_values_to_be_between",
        condition=condition,
        min_value=fully_qualified_parameter_name_for_value,
        max_value=max_user_id,
        validation_parameter_builder_configs=
        validation_parameter_builder_configs,
        data_context=data_context,
    )

    expectation_configuration: Optional[
        ExpectationConfiguration] = default_expectation_configuration_builder.build_expectation_configuration(
            domain=domain,
            parameters=parameters,
            batch_request=batch_request,
        )

    # 397433 is the known minimum of "user_id" in the Alice test batch.
    assert expectation_configuration.kwargs["min_value"] == 397433
# Example #5
def test_onboarding_data_assistant_metrics_count(
    bobby_onboarding_data_assistant_result: OnboardingDataAssistantResult,
) -> None:
    """Count metrics for TABLE-superset domains and across all domains."""
    table_domain_key: Domain = Domain(domain_type=MetricDomainTypes.TABLE)
    metrics_by_domain = bobby_onboarding_data_assistant_result.metrics_by_domain

    # Metrics attached to domains that subsume the TABLE domain key.
    table_metric_count: int = sum(
        len(parameter_values)
        for result_domain, parameter_values in metrics_by_domain.items()
        if result_domain.is_superset(table_domain_key)
    )
    assert table_metric_count == 2

    # Metrics across every domain in the result.
    total_metric_count: int = sum(
        len(parameter_values) for parameter_values in metrics_by_domain.values()
    )
    assert total_metric_count == 184
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return a single Domain of this builder's domain type ("variables" is unused)."""
        return [Domain(domain_type=self.domain_type)]
# Example #7
def table_Users_domain():
    """Fixture: TABLE-type Domain; skipped on unsupported Python versions."""
    skip_if_python_below_minimum_version()

    return Domain(
        details=None,
        domain_kwargs=None,
        domain_type=MetricDomainTypes.TABLE,
    )
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_very_small(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A false_positive_rate below NP_EPSILON is clamped to NP_EPSILON with a UserWarning."""
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    # a commonly used defect rate in quality control that equates to 3.4 defects per million opportunities
    six_sigma_false_positive_rate: float = 3.4 / 1000000.0
    assert six_sigma_false_positive_rate > NP_EPSILON

    # what if user tries a false positive rate smaller than NP_EPSILON (by an order of magnitude in this case)?
    tiny_false_positive_rate: float = NP_EPSILON / 10

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=tiny_false_positive_rate,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {table_domain.id: container}

    assert container.parameter_nodes is None

    expected_warning: str = re.escape(
        f"""You have chosen a false_positive_rate of {tiny_false_positive_rate}, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead.""")

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters,
            batch_request=batch_request,
        )
def test_simple_date_format_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """Explicit candidate date formats are evaluated and the winning format recorded."""
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    metric_domain_kwargs: dict = {"column": "pickup_datetime"}
    candidate_strings: set[str] = {
        "%Y-%m-%d",
        "%Y-%m-%d %H:%M:%S",
    }
    threshold: float = 0.9
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_strings=candidate_strings,
        threshold=threshold,
        data_context=context,
        batch_request=batch_request,
    )

    # Explicit candidates override the class-level defaults.
    assert builder.CANDIDATE_STRINGS != candidate_strings
    assert builder._candidate_strings == candidate_strings
    assert builder._threshold == 0.9

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
    )

    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)

    assert len(container.parameter_nodes) == 1

    parameter_name: str = "$parameter.my_simple_date_format_string_parameter_builder"
    expected_value: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }

    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=parameter_name,
            domain=column_domain,
            parameters={column_domain.id: container},
        )
        == expected_value
    )
# Example #10
def column_Date_domain():
    """Fixture: COLUMN-type Domain over the "Date" column, tagged with rule "my_rule"."""
    return Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={"column": "Date"},
        details=None,
    )
# Example #11
def column_pair_Age_Date_domain():
    """Fixture: COLUMN_PAIR-type Domain over columns "Age" and "Date"."""
    pair_kwargs = {
        "column_A": "Age",
        "column_B": "Date",
    }
    return Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.COLUMN_PAIR,
        domain_kwargs=pair_kwargs,
        details=None,
    )
# Example #12
def column_Date_domain():
    """Fixture: COLUMN-type Domain over "Date" pinned to a specific batch id."""
    skip_if_python_below_minimum_version()

    return Domain(
        details=None,
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs={
            "column": "Date",
            "batch_id": "c260e179bb1bc81d84bba72a8110d8e2",
        },
    )
def test_get_fully_qualified_parameter_names(
    parameters_with_different_depth_level_values,
):
    """All parameter names, plus "$variables", are discoverable for a domain."""
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    build_parameter_container(
        parameter_container=container,
        parameter_values=parameters_with_different_depth_level_values,
    )

    column_domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=None,
        details=None,
    )
    # Convert variables argument to ParameterContainer
    variables: ParameterContainer = build_parameter_container_for_variables(
        variables_configs={
            "my_int": 9,
            "my_float": 3.38,
            "my_string": "hello",
        }
    )
    parameters: Dict[str, ParameterContainer] = {column_domain.id: container}

    expected_names: List[str] = [
        "$variables",
        "$parameter.date_strings.yyyy_mm_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.yyyy_mm_dd_date_format",
        "$parameter.date_strings.mm_yyyy_dd_hh_mm_ss_tz_date_format",
        "$parameter.date_strings.mm_yyyy_dd_date_format",
        "$parameter.date_strings.tolerances.max_abs_error_time_milliseconds",
        "$parameter.date_strings.tolerances.max_num_conversion_attempts",
        "$parameter.tolerances.mostly",
        "$parameter.tolerances.financial.usd",
        "$parameter.monthly_taxi_fairs.mean_values",
        "$parameter.daily_taxi_fairs.mean_values",
        "$parameter.weekly_taxi_fairs.mean_values",
        "$mean",
    ]

    actual_names: List[str] = get_fully_qualified_parameter_names(
        domain=column_domain,
        variables=variables,
        parameters=parameters,
    )
    # Order is not guaranteed; compare as sorted sequences.
    assert len(actual_names) == len(expected_names)
    assert sorted(actual_names) == sorted(expected_names)
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_negative(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A negative false_positive_rate is rejected with a ProfilerExecutionError."""
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=-0.05,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {table_domain.id: container}

    assert container.parameter_nodes is None

    expected_error: str = re.escape(
        """false_positive_rate must be a positive decimal number between 0 and 1 inclusive [0, 1],
but -0.05 was provided.""")

    with pytest.raises(
        ge_exceptions.ProfilerExecutionError,
        match=expected_error,
    ):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters,
            batch_request=batch_request,
        )
# Example #15
def multi_column_Age_Date_Description_domain():
    """Fixture: MULTICOLUMN-type Domain over "Age", "Date", and "Description"."""
    column_list = [
        "Age",
        "Date",
        "Description",
    ]
    return Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.MULTICOLUMN,
        domain_kwargs={"column_list": column_list},
        details=None,
    )
def test_bootstrap_numeric_metric_range_multi_batch_parameter_builder_bobby_false_positive_rate_zero(
    bobby_columnar_table_multi_batch_deterministic_data_context,
):
    """A false_positive_rate of exactly 0.0 is clamped to NP_EPSILON with a UserWarning."""
    context: DataContext = bobby_columnar_table_multi_batch_deterministic_data_context

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        metric_multi_batch_parameter_builder_name=None,
        metric_domain_kwargs=DOMAIN_KWARGS_PARAMETER_FULLY_QUALIFIED_NAME,
        metric_value_kwargs=None,
        estimator="bootstrap",
        false_positive_rate=0.0,
        round_decimals=0,
        evaluation_parameter_builder_configs=None,
        data_context=context,
    )

    table_domain: Domain = Domain(
        domain_type=MetricDomainTypes.TABLE,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {table_domain.id: container}

    assert container.parameter_nodes is None

    expected_warning: str = re.escape(
        f"""You have chosen a false_positive_rate of 0.0, which is too close to 0.
A false_positive_rate of {NP_EPSILON} has been selected instead.""")

    with pytest.warns(UserWarning, match=expected_warning):
        range_builder.build_parameters(
            domain=table_domain,
            variables=None,
            parameters=parameters,
            batch_request=batch_request,
        )
def test_simple_date_format_parameter_builder_alice(
    alice_columnar_table_single_batch_context,
):
    """Default candidate strings apply when none are supplied; the winning format is recorded."""
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: dict = {
        "datasource_name": "alice_columnar_table_single_batch_datasource",
        "data_connector_name": "alice_columnar_table_single_batch_data_connector",
        "data_asset_name": "alice_columnar_table_single_batch_data_asset",
    }

    metric_domain_kwargs = {"column": "event_ts"}

    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_date_format",
        metric_domain_kwargs=metric_domain_kwargs,
        data_context=context,
        batch_request=batch_request,
    )

    # Without explicit candidates, the class defaults apply and the override is unset.
    assert builder.CANDIDATE_STRINGS == DEFAULT_CANDIDATE_STRINGS
    assert builder.candidate_strings is None
    assert builder._threshold == 1.0

    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
    )

    assert container.parameter_nodes is None

    builder._build_parameters(parameter_container=container, domain=column_domain)

    # noinspection PyTypeChecker
    assert len(container.parameter_nodes) == 1

    parameter_name: str = "$parameter.my_date_format"
    expected_value: dict = {
        "value": "%Y-%m-%d %H:%M:%S",
        "details": {"success_ratio": 1.0},
    }

    assert (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name=parameter_name,
            domain=column_domain,
            parameters={column_domain.id: container},
        )
        == expected_value
    )
# Example #18
def test_column_values_unique_single_batch(
    alice_columnar_table_single_batch_context,
):
    """MapMetricColumnDomainBuilder yields the six unique-valued Alice columns."""
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: BatchRequest = BatchRequest(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    domain_builder: MapMetricColumnDomainBuilder = MapMetricColumnDomainBuilder(
        map_metric_name="column_values.unique",
        max_unexpected_values=0,
        max_unexpected_ratio=None,
        min_max_unexpected_values_proportion=9.75e-1,
        data_context=context,
    )
    domains: List[Domain] = domain_builder.get_domains(
        rule_name="my_rule",
        batch_request=batch_request,
    )

    # Unit Tests for "inferred_semantic_domain_type" are provided separately.
    for found_domain in domains:
        found_domain.details = {}

    def by_column(candidate: Domain) -> str:
        return candidate.domain_kwargs["column"]

    domains = sorted(domains, key=by_column)

    compliant_column_names: List[str] = [
        "id",
        "event_type",
        "user_id",
        "event_ts",
        "server_ts",
        "device_ts",
    ]

    expected_domains: List[Domain] = sorted(
        [
            Domain(
                domain_type=MetricDomainTypes.COLUMN,
                domain_kwargs={"column": name},
                rule_name="my_rule",
            )
            for name in compliant_column_names
        ],
        key=by_column,
    )

    assert len(domains) == 6
    assert domains == expected_domains
    def _get_domains(
        self,
        rule_name: str,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """Return domains matching the specified tolerance limits.

        Args:
            rule_name: name of Rule object, for which "Domain" objects are obtained.
            variables: Optional variables to substitute when evaluating.

        Returns:
            List of domains that match the desired tolerance limits.
        """
        batch_ids: List[str] = self.get_batch_ids(variables=variables)
        validator: "Validator" = self.get_validator(variables=variables)  # noqa: F821
        effective_column_names: List[str] = self.get_effective_column_names(
            batch_ids=batch_ids,
            validator=validator,
            variables=variables,
        )

        # Both the configured names and the effective result must be non-empty.
        if not (self.include_column_names and effective_column_names):
            raise ge_exceptions.ProfilerExecutionError(
                message=f'Error: "column_list" in {self.__class__.__name__} must not be empty.'
            )

        inferred_types: Dict[str, SemanticDomainTypes] = {
            name: self.semantic_type_filter.table_column_name_to_inferred_semantic_domain_type_map[
                name
            ]
            for name in effective_column_names
        }

        return [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={"column_list": effective_column_names},
                details={INFERRED_SEMANTIC_TYPE_KEY: inferred_types},
                rule_name=rule_name,
            ),
        ]
def test_simple_date_format_parameter_builder_zero_batch_id_error():
    """Without batch identifiers, _build_parameters raises ProfilerExecutionError."""
    builder: SimpleDateFormatStringParameterBuilder = SimpleDateFormatStringParameterBuilder(
        name="my_simple_date_format_string_parameter_builder",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    column_domain: Domain = Domain(domain_type=MetricDomainTypes.COLUMN)

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        builder._build_parameters(
            parameter_container=container,
            domain=column_domain,
        )

    assert (
        str(e.value)
        == "Utilizing a SimpleDateFormatStringParameterBuilder requires a non-empty list of batch identifiers."
    )
# Example #21
def test_excluded_columns_single_batch(
    alice_columnar_table_single_batch_context,
):
    """Excluded columns are dropped; only the remaining VERY_FEW-cardinality columns survive."""
    context: DataContext = alice_columnar_table_single_batch_context

    batch_request: BatchRequest = BatchRequest(
        datasource_name="alice_columnar_table_single_batch_datasource",
        data_connector_name="alice_columnar_table_single_batch_data_connector",
        data_asset_name="alice_columnar_table_single_batch_data_asset",
    )

    domain_builder: DomainBuilder = CategoricalColumnDomainBuilder(
        limit_mode="VERY_FEW",
        exclude_column_names=[
            "id",
            "event_type",
            "user_id",
            "event_ts",
            "server_ts",
        ],
        data_context=context,
    )
    domains: List[Domain] = domain_builder.get_domains(
        rule_name="my_rule",
        batch_request=batch_request,
    )

    surviving_column_names: List[str] = [
        "device_ts",
        "user_agent",
    ]

    expected_domains: List[Domain] = [
        Domain(
            domain_type=MetricDomainTypes.COLUMN,
            domain_kwargs={"column": name},
            rule_name="my_rule",
        )
        for name in surviving_column_names
    ]
    assert len(domains) == 2

    # Unit Tests for "inferred_semantic_domain_type" are provided separately.
    for found_domain in domains:
        found_domain.details = {}

    assert domains == expected_domains
# Example #22
    def __init__(
        self,
        name: str,
        validator: Optional[Validator],
    ) -> None:
        """
        DataAssistant subclasses guide "RuleBasedProfiler" to contain Rule configurations to embody profiling behaviors,
        corresponding to intended exploration and validation goals.  Then executing "RuleBasedProfiler.run()" yields
        "RuleBasedProfilerResult" object, containing "fully_qualified_parameter_names_by_domain",
        "parameter_values_for_fully_qualified_parameter_names_by_domain", "expectation_configurations", and "citation",
        immediately available for composing "ExpectationSuite" and validating underlying data "Batch" objects.

        Args:
            name: the name of this DataAssistant object
            validator: Validator object, containing loaded Batch objects as well as Expectation and Metric operations
        """
        self._name = name

        self._validator = validator

        if validator is None:
            # Without a Validator there is no DataContext or Batch collection to draw from.
            self._data_context = None
            self._batches = None
        else:
            self._data_context = self._validator.data_context
            self._batches = self._validator.batches

        # Seed the profiler with subclass-supplied variables (may be empty).
        variables: Optional[Dict[str, Any]] = self.get_variables() or {}
        self._profiler = RuleBasedProfiler(
            name=self.name,
            config_version=1.0,
            variables=variables,
            data_context=self._data_context,
        )

        # Register each subclass Rule with the profiler and remember which
        # ParameterBuilders serve each (domain_type, rule_name) Domain.
        self._metrics_parameter_builders_by_domain = {}

        rules: Optional[List[Rule]] = self.get_rules() or []

        rule: Rule
        for rule in rules:
            self.profiler.add_rule(rule=rule)
            self._metrics_parameter_builders_by_domain[Domain(
                domain_type=rule.domain_builder.domain_type,
                rule_name=rule.name,
            )] = rule.parameter_builders
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Find the column suffix for each column and return all domains matching the specified suffix.
        """
        suffixes: Union[str, Iterable, List[str]] = self.column_name_suffixes
        if isinstance(suffixes, str):
            # Normalize a single suffix into a one-element list.
            suffixes = [suffixes]
        elif not isinstance(suffixes, (Iterable, List)):
            raise ValueError(
                "Unrecognized column_name_suffixes directive -- must be a list or a string."
            )

        batch_id: str = self.get_batch_id(variables=variables)
        table_column_names: List[str] = self.get_validator(
            variables=variables
        ).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={"batch_id": batch_id},
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        # str.endswith accepts a tuple of suffixes; one pass selects matches.
        suffix_tuple = tuple(suffixes)
        matching_column_names: List[str] = [
            name for name in table_column_names if name.endswith(suffix_tuple)
        ]

        return [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={"column": name},
            )
            for name in matching_column_names
        ]
# Example #24
def test_regex_wrong_domain(mock_data_context: mock.MagicMock, batch_fixture: Batch):
    """Requesting a non-existent column yields empty metric results and raises."""
    batch: Batch = batch_fixture
    mock_data_context.get_batch_list.return_value = [batch]
    mock_data_context.get_validator_using_batch_list.return_value = Validator(
        execution_engine=PandasExecutionEngine(), batches=[batch]
    )

    # column : c does not exist
    metric_domain_kwargs: dict = {"column": "c"}
    candidate_regexes: List[str] = [r"^\d{1}$"]

    regex_builder: ParameterBuilder = RegexPatternStringParameterBuilder(
        name="my_regex_pattern_string_parameter_builder",
        metric_domain_kwargs=metric_domain_kwargs,
        candidate_regexes=candidate_regexes,
        data_context=mock_data_context,
    )

    column_domain: Domain = Domain(
        domain_type=MetricDomainTypes.COLUMN,
        domain_kwargs=metric_domain_kwargs,
        rule_name="my_rule",
    )
    container: ParameterContainer = ParameterContainer(parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {column_domain.id: container}

    with pytest.raises(ge_exceptions.ProfilerExecutionError) as e:
        regex_builder.build_parameters(
            domain=column_domain,
            parameters=parameters,
            batch_list=[batch],
        )

    assert (
        e.value.message
        == "Result of metric computations for RegexPatternStringParameterBuilder is empty."
    )
def test_semantic_domain_consistency():
    """Domain must reject inferred semantic types keyed by columns absent from domain_kwargs."""
    with pytest.raises(ValueError) as excinfo:
        # "num_passengers" is not the column named in domain_kwargs ("passenger_count"),
        # so Domain construction must fail.
        # noinspection PyUnusedLocal
        domain: Domain = Domain(
            domain_type="column",
            domain_kwargs={"column": "passenger_count"},
            details={
                "estimator": "categorical",
                "cardinality": "low",
                INFERRED_SEMANTIC_TYPE_KEY: {
                    "num_passengers": SemanticDomainTypes.NUMERIC,
                },
            },
            rule_name="my_rule",
        )

    expected_fragment: str = """Cannot instantiate Domain (domain_type "MetricDomainTypes.COLUMN" of type "<enum 'MetricDomainTypes'>" -- key "num_passengers", detected in "inferred_semantic_domain_type" dictionary, does not exist as value of appropriate key in "domain_kwargs" dictionary."""
    assert expected_fragment in str(excinfo.value)
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Build one column Domain per configured column, or per table column when none are configured.

        Raises:
            ProfilerExecutionError: if a configured column is missing from the batch.
        """
        batch_id: str = self.get_batch_id(variables=variables)
        table_columns: List[str] = self.get_validator(variables=variables).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        if self.column_names is None:
            # No explicit configuration: profile every column found in the table.
            self.column_names = table_columns
        else:
            # Validate that each configured column actually exists in the batch.
            requested_name: str
            for requested_name in self.column_names:
                if requested_name not in table_columns:
                    raise ge_exceptions.ProfilerExecutionError(
                        message=f'Error: The column "{requested_name}" in BatchData does not exist.'
                    )

        return [
            Domain(
                domain_type=self.domain_type,
                domain_kwargs={
                    "column": name,
                },
            )
            for name in self.column_names
        ]
    def _get_domains(
        self,
        variables: Optional[ParameterContainer] = None,
    ) -> List[Domain]:
        """
        Return a column Domain for every table column whose name ends with one of the
        configured suffixes in "self.column_name_suffixes" (e.g., "_id").
        """
        batch_id: str = self.get_batch_id(variables=variables)
        table_column_names: List[str] = self.get_validator(
            variables=variables
        ).get_metric(
            metric=MetricConfiguration(
                metric_name="table.columns",
                metric_domain_kwargs={
                    "batch_id": batch_id,
                },
                metric_value_kwargs=None,
                metric_dependencies=None,
            )
        )

        # str.endswith accepts a tuple, so a single call tests all configured suffixes.
        suffixes: tuple = tuple(self.column_name_suffixes)
        candidate_column_names: List[str] = [
            name for name in table_column_names if name.endswith(suffixes)
        ]

        return [
            Domain(
                domain_type=MetricDomainTypes.COLUMN,
                domain_kwargs={
                    "column": name,
                },
            )
            for name in candidate_column_names
        ]
Example #28
0
def build_domains_from_column_names(
    rule_name: str,
    column_names: List[str],
    domain_type: MetricDomainTypes,
    table_column_name_to_inferred_semantic_domain_type_map: Optional[Dict[
        str, SemanticDomainTypes]] = None,
) -> List[Domain]:
    """
    Build "simple" Domain objects -- required fields only; no caller-supplied "details" metadata.

    :param rule_name: name of Rule object, for which "Domain" objects are obtained.
    :param column_names: list of column names to serve as values for "column" keys in "domain_kwargs" dictionary
    :param domain_type: type of Domain objects (same "domain_type" must be applicable to all Domain objects returned)
    :param table_column_name_to_inferred_semantic_domain_type_map: map from column name to inferred semantic type
    :return: list of resulting Domain objects
    """

    def _details_for(column_name: str) -> dict:
        # The inferred-type key is always present; its value is None when no map was supplied.
        if table_column_name_to_inferred_semantic_domain_type_map:
            inferred: Optional[Dict[str, SemanticDomainTypes]] = {
                column_name: table_column_name_to_inferred_semantic_domain_type_map[
                    column_name
                ],
            }
        else:
            inferred = None
        return {INFERRED_SEMANTIC_TYPE_KEY: inferred}

    return [
        Domain(
            domain_type=domain_type,
            domain_kwargs={
                "column": column_name,
            },
            details=_details_for(column_name),
            rule_name=rule_name,
        )
        for column_name in column_names
    ]
Example #29
0
def test_numeric_metric_range_multi_batch_parameter_builder_bobby_kde_bw_method(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """
    This tests whether a change to bw_method results in a change to the range.

    The original body duplicated ~55 lines verbatim for the default and custom
    bw_method runs; the shared flow is factored into one nested helper so the
    two runs cannot drift apart.
    """

    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    metric_domain_kwargs: dict = {"column": "fare_amount"}

    def _column_min_range_value(bw_method: Optional[float]) -> np.ndarray:
        """Build a "column.min" KDE range estimate and return its computed value.

        bw_method is forwarded to the builder when given; None uses the builder default.
        """
        builder_kwargs: dict = {
            "name": "column_min_range",
            "metric_name": "column.min",
            "metric_domain_kwargs": metric_domain_kwargs,
            "estimator": "kde",
            "false_positive_rate": 5.0e-2,
            "round_decimals": 0,
            "json_serialize": False,
            "data_context": data_context,
        }
        if bw_method is not None:
            builder_kwargs["bw_method"] = bw_method

        builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
            **builder_kwargs
        )

        domain: Domain = Domain(
            rule_name="my_rule",
            domain_type=MetricDomainTypes.TABLE,
        )
        parameter_container: ParameterContainer = ParameterContainer(
            parameter_nodes=None)
        parameters: Dict[str, ParameterContainer] = {
            domain.id: parameter_container,
        }

        assert parameter_container.parameter_nodes is None

        builder.build_parameters(
            domain=domain,
            variables=None,
            parameters=parameters,
            batch_request=batch_request,
        )

        parameter_nodes: Optional[Dict[str, ParameterNode]] = (
            parameter_container.parameter_nodes or {})
        assert len(parameter_nodes) == 1

        parameter_node: ParameterNode = (
            get_parameter_value_by_fully_qualified_parameter_name(
                fully_qualified_parameter_name="$parameter.column_min_range",
                domain=domain,
                parameters=parameters,
            ))
        return parameter_node.pop("value")

    default_bw_method_value: np.ndarray = _column_min_range_value(bw_method=None)
    other_bw_method_value: np.ndarray = _column_min_range_value(bw_method=0.5)

    # A different bandwidth must shift at least the lower bound of the estimated range.
    assert default_bw_method_value[0] != other_bw_method_value[0]
Example #30
0
def test_kde_numeric_metric_range_multi_batch_parameter_builder_bobby(
    bobby_columnar_table_multi_batch_deterministic_data_context, ):
    """KDE-estimated "table.row_count" range over three batches stays within 1% of the
    known-good bounds, and its estimation histogram matches the reference distribution."""
    data_context: DataContext = (
        bobby_columnar_table_multi_batch_deterministic_data_context)

    # BatchRequest yielding three batches
    batch_request: dict = {
        "datasource_name": "taxi_pandas",
        "data_connector_name": "monthly",
        "data_asset_name": "my_reports",
    }

    range_builder: ParameterBuilder = NumericMetricRangeMultiBatchParameterBuilder(
        name="row_count_range",
        metric_name="table.row_count",
        estimator="kde",
        include_estimator_samples_histogram_in_details=True,
        false_positive_rate=1.0e-2,
        round_decimals=0,
        json_serialize=False,
        data_context=data_context,
    )

    domain: Domain = Domain(
        rule_name="my_rule",
        domain_type=MetricDomainTypes.TABLE,
    )
    parameter_container: ParameterContainer = ParameterContainer(
        parameter_nodes=None)
    parameters: Dict[str, ParameterContainer] = {
        domain.id: parameter_container,
    }

    assert parameter_container.parameter_nodes is None

    range_builder.build_parameters(
        domain=domain,
        variables=None,
        parameters=parameters,
        batch_request=batch_request,
    )

    built_nodes: Optional[Dict[str, ParameterNode]] = (
        parameter_container.parameter_nodes or {})
    assert len(built_nodes) == 1

    parameter_node: ParameterNode = (
        get_parameter_value_by_fully_qualified_parameter_name(
            fully_qualified_parameter_name="$parameter.row_count_range",
            domain=domain,
            parameters=parameters,
        ))

    # Pull the nondeterministic pieces out of the node before the structural comparison.
    actual_value: np.ndarray = parameter_node.pop("value")
    parameter_node["value"] = None
    actual_estimation_histogram: np.ndarray = parameter_node.details.pop(
        "estimation_histogram")

    assert parameter_node == {
        "value": None,
        "details": {
            "metric_configuration": {
                "domain_kwargs": {},
                "metric_name": "table.row_count",
                "metric_value_kwargs": None,
                "metric_dependencies": None,
            },
            "num_batches": 3,
        },
    }

    expected_value: np.ndarray = np.array([6180, 10277])

    # Measure of "closeness" between "actual" and "desired" is computed as: atol + rtol * abs(desired)
    # (see "https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_allclose.html" for details).
    rtol: float = 1.0e-2
    atol: float = 0

    # kde results should be stable +/- 1%
    np.testing.assert_allclose(
        actual=actual_value,
        desired=expected_value,
        rtol=rtol,
        atol=atol,
        err_msg=
        f"Actual value of {actual_value} differs from expected value of {expected_value} by more than {atol + rtol * abs(expected_value)} tolerance.",
    )

    expected_estimation_histogram: np.ndarray = np.array(
        [13.0, 155.0, 719.0, 1546.0, 2221.0, 2570.0, 1946.0, 683.0, 137.0, 9.0]
    )

    # Assert no significant difference between expected (null hypothesis) and actual estimation histograms.
    p_value: float = stats.ks_2samp(
        data1=actual_estimation_histogram,
        data2=expected_estimation_histogram,
    )[1]
    assert p_value > 9.5e-1