def _self_check_fetch_batch(
        self,
        pretty_print: bool,
        example_data_reference: Any,
        data_asset_name: str,
    ):
        """
        Helper function for self_check() that retrieves a batch using example_data_reference and data_asset_name,
        printing helpful messages along the way. The first 5 rows of batch_data are printed by default.

        Args:
            pretty_print (bool): print to console?
            example_data_reference (Any): data_reference to retrieve
            data_asset_name (str): data_asset_name to retrieve

        """
        if pretty_print:
            print(f"\n\t\tFetching batch data...")

        batch_definition_list = self._map_data_reference_to_batch_definition_list(
            data_reference=example_data_reference,
            data_asset_name=data_asset_name,
        )
        assert len(batch_definition_list) == 1
        batch_definition = batch_definition_list[0]

        # _execution_engine might be None for some tests
        if batch_definition is None or self._execution_engine is None:
            return {}
        batch_data, batch_spec, _ = self.get_batch_data_and_metadata(
            batch_definition=batch_definition
        )

        # Note: get_batch_data_and_metadata will have loaded the data into the currently-defined execution engine.
        # Consequently, when we build a Validator, we do not need to specifically load the batch into it to
        # resolve metrics.
        validator = Validator(execution_engine=batch_data.execution_engine)
        df = validator.get_metric(
            MetricConfiguration(
                "table.head", {"batch_id": batch_definition.id}, {"n_rows": 5}
            )
        )
        n_rows = validator.get_metric(
            MetricConfiguration("table.row_count", {"batch_id": batch_definition.id})
        )

        if pretty_print and df is not None:
            print(f"\n\t\tShowing 5 rows")
            print(df)

        return {
            "batch_spec": batch_spec,
            "n_rows": n_rows,
        }
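
A hedged sketch of how this helper is reached in practice: it is normally invoked internally by a data connector's self_check(), but it can be called directly for illustration. The connector instance, data reference, and asset name below are illustrative, not taken from the source above.

# Illustrative only: `connector` stands for an already-configured DataConnector.
report = connector._self_check_fetch_batch(
    pretty_print=True,
    example_data_reference="yellow_tripdata_sample_2020-01.csv",  # hypothetical file
    data_asset_name="yellow_tripdata_sample",                     # hypothetical asset
)
print(report["batch_spec"], report["n_rows"])
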
Example 2
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        selectable, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE
        )
        df = None
        table_name = getattr(selectable, "name", None)
        if table_name is not None:
            try:
                if metric_value_kwargs["fetch_all"]:
                    df = pd.read_sql_table(
                        table_name=table_name,
                        schema=getattr(selectable, "schema", None),
                        con=execution_engine.engine,
                    )
                else:
                    df = next(
                        pd.read_sql_table(
                            table_name=table_name,
                            schema=getattr(selectable, "schema", None),
                            con=execution_engine.engine,
                            chunksize=metric_value_kwargs["n_rows"],
                        )
                    )
            except (ValueError, NotImplementedError):
                # The SQLAlchemy MetaData reflection used by pd.read_sql_table
                # cannot operate on a temp table. If it fails, fall back to
                # fetching the data with pd.read_sql below.
                df = None
            except StopIteration:
                validator = Validator(execution_engine=execution_engine)
                columns = validator.get_metric(
                    MetricConfiguration("table.columns", metric_domain_kwargs)
                )
                df = pd.DataFrame(columns=columns)
        if df is None:
            # read_sql_table failed (or the selectable is not a named table),
            # so compile the selectable to raw SQL and read it via pd.read_sql
            stmt = sa.select(["*"]).select_from(selectable)
            if not metric_value_kwargs["fetch_all"]:
                stmt = stmt.limit(metric_value_kwargs["n_rows"])
            sql = stmt.compile(
                dialect=execution_engine.engine.dialect,
                compile_kwargs={"literal_binds": True},
            )
            df = pd.read_sql(sql, con=execution_engine.engine)

        return df
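
The fallback logic above leans on two pandas behaviors: pd.read_sql_table returns an iterator of DataFrames when chunksize is set, and raises for objects it cannot reflect. A minimal, self-contained sketch of the chunked head-read pattern against an in-memory SQLite table (the table and column names are illustrative):

import pandas as pd
import sqlalchemy as sa

# Build a throwaway table to read back.
engine = sa.create_engine("sqlite://")
pd.DataFrame({"a": range(100)}).to_sql("demo", engine, index=False)

# With chunksize set, read_sql_table yields DataFrames lazily, so next()
# fetches only the first n_rows rows instead of materializing the whole table.
n_rows = 5
head = next(pd.read_sql_table(table_name="demo", con=engine, chunksize=n_rows))
print(len(head))  # 5
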
Example 3
class UserConfigurableProfiler:
    """
    The UserConfigurableProfiler is used to build an expectation suite from a dataset. The expectations built are
    strict - they can be used to determine whether two tables are the same.

    The profiler may be instantiated with or without a number of configuration arguments. Once a profiler is
    instantiated, if these arguments change, a new profiler will be needed.

    A profiler is used to build a suite without a config as follows:

    profiler = UserConfigurableProfiler(dataset)
    suite = profiler.build_suite()


    A profiler is used to build a suite with a semantic_types dict, as follows:

    semantic_types_dict = {
                "numeric": ["c_acctbal"],
                "string": ["c_address","c_custkey"],
                "value_set": ["c_nationkey","c_mktsegment", 'c_custkey', 'c_name', 'c_address', 'c_phone'],
            }

    profiler = UserConfigurableProfiler(dataset, semantic_types_dict=semantic_types_dict)
    suite = profiler.build_suite()
    """
    def __init__(
        self,
        profile_dataset,
        excluded_expectations: list = None,
        ignored_columns: list = None,
        not_null_only: bool = False,
        primary_or_compound_key: list = None,
        semantic_types_dict: dict = None,
        table_expectations_only: bool = False,
        value_set_threshold: str = "MANY",
    ):
        """
                The UserConfigurableProfiler is used to build an expectation suite from a dataset. The profiler may be
                instantiated with or without a config. The config may contain a semantic_types dict or not. Once a profiler is
                instantiated, if config items change, a new profiler will be needed.
        Write an entry on how to use the profiler for the GE docs site
                Args:
                    profile_dataset: A Great Expectations Dataset or Validator object
                    excluded_expectations: A list of expectations to not include in the suite
                    ignored_columns: A list of columns for which you would like to NOT create expectations
                    not_null_only: Boolean, default False. By default, each column is evaluated for nullity. If the column
                        values contain fewer than 50% null values, then the profiler will add
                        `expect_column_values_to_not_be_null`; if greater than 50% it will add
                        `expect_column_values_to_be_null`. If not_null_only is set to True, the profiler will add a
                        not_null expectation irrespective of the percent nullity (and therefore will not add an
                        `expect_column_values_to_be_null`
                    primary_or_compound_key: A list containing one or more columns which are a dataset's primary or
                        compound key. This will create an `expect_column_values_to_be_unique` or
                        `expect_compound_columns_to_be_unique` expectation. This will occur even if one or more of the
                        primary_or_compound_key columns are specified in ignored_columns
                    semantic_types_dict: A dictionary where the keys are available semantic_types (see profiler.base.profiler_semantic_types)
                        and the values are lists of columns for which you would like to create semantic_type specific
                        expectations e.g.:
                        "semantic_types": { "value_set": ["state","country"], "numeric":["age", "amount_due"]}
                    table_expectations_only: Boolean, default False. If True, this will only create the two table level expectations
                        available to this profiler (`expect_table_columns_to_match_ordered_list` and
                        `expect_table_row_count_to_be_between`). If a primary_or_compound key is specified, it will create
                        a uniqueness expectation for that column as well
                    value_set_threshold: Takes a string from the following ordered list - "none", "one", "two",
                        "very_few", "few", "many", "very_many", "unique". When the profiler runs without a semantic_types
                        dict, each column is profiled for cardinality. This threshold determines the greatest cardinality
                        for which to add `expect_column_values_to_be_in_set`. For example, if value_set_threshold is set to
                        "unique", it will add a value_set expectation for every included column. If set to "few", it will
                        add a value_set expectation for columns whose cardinality is one of "one", "two", "very_few" or
                        "few". The default value is "many". For the purposes of comparing whether two tables are identical,
                        it might make the most sense to set this to "unique"
        """
        self.column_info = {}
        self.profile_dataset = profile_dataset
        assert isinstance(self.profile_dataset, (Dataset, Validator, Batch))

        if isinstance(self.profile_dataset, Batch):
            self.profile_dataset = Validator(
                execution_engine=self.profile_dataset.data.execution_engine,
                batches=[self.profile_dataset],
            )
        if isinstance(self.profile_dataset, Validator):
            self.all_table_columns = self.profile_dataset.get_metric(
                MetricConfiguration("table.columns", dict()))
        else:
            self.all_table_columns = self.profile_dataset.get_table_columns()

        self.semantic_types_dict = semantic_types_dict
        assert isinstance(self.semantic_types_dict, (dict, type(None)))

        self.ignored_columns = ignored_columns or []
        assert isinstance(self.ignored_columns, list)

        self.excluded_expectations = excluded_expectations or []
        assert isinstance(self.excluded_expectations, list)

        self.value_set_threshold = value_set_threshold.upper()
        assert isinstance(self.value_set_threshold, str)

        self.not_null_only = not_null_only
        assert isinstance(self.not_null_only, bool)

        self.table_expectations_only = table_expectations_only
        assert isinstance(self.table_expectations_only, bool)
        if self.table_expectations_only:
            logger.info(
                "table_expectations_only is set to True. When used to build a suite, this profiler will ignore all "
                "columns and create expectations only at the table level. If you would also like to create expectations "
                "at the column level, you can instantiate a new profiler with table_expectations_only set to False"
            )

        self.primary_or_compound_key = primary_or_compound_key or []
        assert isinstance(self.primary_or_compound_key, list)

        if self.table_expectations_only:
            self.ignored_columns = self.all_table_columns

        if self.primary_or_compound_key:
            for column in self.primary_or_compound_key:
                if column not in self.all_table_columns:
                    raise ValueError(
                        f"Column {column} not found. Please ensure that this column is in the {type(profile_dataset).__name__} "
                        f"if you would like to use it as a primary_or_compound_key."
                    )

        included_columns = [
            column_name for column_name in self.all_table_columns
            if column_name not in self.ignored_columns
        ]

        for column_name in included_columns:
            self._add_column_cardinality_to_column_info(
                self.profile_dataset, column_name)
            self._add_column_type_to_column_info(self.profile_dataset,
                                                 column_name)

        if self.semantic_types_dict is not None:
            self._validate_semantic_types_dict(self.profile_dataset)
            for column_name in included_columns:
                self._add_semantic_types_by_column_from_config_to_column_info(
                    column_name)
        self.semantic_type_functions = {
            "DATETIME": self._build_expectations_datetime,
            "NUMERIC": self._build_expectations_numeric,
            "STRING": self._build_expectations_string,
            "VALUE_SET": self._build_expectations_value_set,
            "BOOLEAN": self._build_expectations_value_set,
        }

    def build_suite(self):
        """
        User-facing expectation-suite building function. Works with an instantiated UserConfigurableProfiler object.
        Args:

        Returns:
            An expectation suite built either with or without a semantic_types dict

        """
        if len(self.profile_dataset.get_expectation_suite().expectations) > 0:
            suite_name = self.profile_dataset._expectation_suite.expectation_suite_name
            self.profile_dataset._expectation_suite = ExpectationSuite(
                suite_name)

        if self.semantic_types_dict:
            return self._build_expectation_suite_from_semantic_types_dict()

        return self._profile_and_build_expectation_suite()
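
    # Typical call pattern, per the class docstring: instantiate once, then
    #     suite = profiler.build_suite()
    # Note that build_suite() starts from a fresh ExpectationSuite whenever the
    # profile_dataset already carries expectations, so nothing added beforehand
    # (e.g., during profiling in __init__) leaks into the result.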

    def _build_expectation_suite_from_semantic_types_dict(self):
        """
        Uses a semantic_type dict to determine which expectations to add to the suite, then builds the suite
        Args:

        Returns:
            An expectation suite built from a semantic_types dict
        """
        if not self.semantic_types_dict:
            raise ValueError(
                "A config with a semantic_types dict must be included in order to use this profiler."
            )
        self._build_expectations_table(self.profile_dataset)

        if self.value_set_threshold:
            logger.info(
                "Using this profiler with a semantic_types dict will ignore the value_set_threshold parameter. If "
                "you would like to include value_set expectations, you can include a 'value_set' entry in your "
                "semantic_types dict with any columns for which you would like a value_set expectation, or you can "
                "remove the semantic_types dict from the config.")

        if self.primary_or_compound_key:
            self._build_expectations_primary_or_compound_key(
                self.profile_dataset, self.primary_or_compound_key)

        for column_name, column_info in self.column_info.items():
            semantic_types = column_info.get("semantic_types")
            for semantic_type in semantic_types:
                semantic_type_fn = self.semantic_type_functions.get(
                    semantic_type)
                semantic_type_fn(profile_dataset=self.profile_dataset,
                                 column=column_name)

        for column_name in self.column_info.keys():
            self._build_expectations_for_all_column_types(
                self.profile_dataset, column_name)

        expectation_suite = self._build_column_description_metadata(
            self.profile_dataset)
        self._display_suite_by_column(suite=expectation_suite)
        return expectation_suite

    def _profile_and_build_expectation_suite(self):
        """
        Profiles the provided dataset to determine which expectations to add to the suite, then builds the suite
        Args:

        Returns:
            An expectation suite built after profiling the dataset
        """
        if self.primary_or_compound_key:
            self._build_expectations_primary_or_compound_key(
                profile_dataset=self.profile_dataset,
                column_list=self.primary_or_compound_key,
            )
        self._build_expectations_table(profile_dataset=self.profile_dataset)
        for column_name, column_info in self.column_info.items():
            data_type = column_info.get("type")
            cardinality = column_info.get("cardinality")

            if data_type in ("FLOAT", "INT", "NUMERIC"):
                self._build_expectations_numeric(
                    profile_dataset=self.profile_dataset,
                    column=column_name,
                )

            if data_type == "DATETIME":
                self._build_expectations_datetime(
                    profile_dataset=self.profile_dataset,
                    column=column_name,
                )

            if (OrderedProfilerCardinality[self.value_set_threshold] >=
                    OrderedProfilerCardinality[cardinality]):
                self._build_expectations_value_set(
                    profile_dataset=self.profile_dataset, column=column_name)

            self._build_expectations_for_all_column_types(
                profile_dataset=self.profile_dataset, column=column_name)

        expectation_suite = self._build_column_description_metadata(
            self.profile_dataset)
        self._display_suite_by_column(
            suite=expectation_suite)  # include in the actual profiler
        return expectation_suite

    def _validate_semantic_types_dict(self, profile_dataset):
        """
        Validates a semantic_types dict to ensure correct formatting, that all semantic_types are recognized, and that
        the semantic_types align with the column data types
        Args:
            profile_dataset: A GE dataset

        Returns:
            The validated semantic_types dictionary

        """
        if not isinstance(self.semantic_types_dict, dict):
            raise ValueError(
                f"The semantic_types dict in the config must be a dictionary, but is currently a "
                f"{type(self.semantic_types_dict)}. Please reformat.")
        for k, v in self.semantic_types_dict.items():
            assert isinstance(v, list), (
                "Entries in semantic type dict must be lists of column names e.g. "
                "{'semantic_types': {'numeric': ['number_of_transactions']}}")
            if k.upper() not in profiler_semantic_types:
                raise ValueError(
                    f"{k} is not a recognized semantic_type. Please only include one of "
                    f"{profiler_semantic_types}")

        selected_columns = [
            column for column_list in self.semantic_types_dict.values()
            for column in column_list
        ]
        if selected_columns:
            for column in selected_columns:
                if column not in self.all_table_columns:
                    raise ProfilerError(f"Column {column} does not exist.")
                elif column in self.ignored_columns:
                    raise ValueError(
                        f"Column {column} is specified in both the semantic_types_dict and the list of "
                        f"ignored columns. Please remove one of these entries to proceed."
                    )

        for semantic_type, column_list in self.semantic_types_dict.items():
            # normalize case so validation applies regardless of how the user
            # capitalized the semantic type keys
            semantic_type = semantic_type.lower()
            for column_name in column_list:
                processed_column = self.column_info.get(column_name)
                if semantic_type == "datetime":
                    assert processed_column.get("type") in (
                        "DATETIME",
                        "STRING",
                    ), (f"Column {column_name} must be a datetime column or a string but appears to be "
                        f"{processed_column.get('type')}")
                elif semantic_type == "numeric":
                    assert processed_column.get("type") in (
                        "INT",
                        "FLOAT",
                        "NUMERIC",
                    ), f"Column {column_name} must be an int or a float but appears to be {processed_column.get('type')}"
                elif semantic_type in ("string", "value_set"):
                    pass
        return self.semantic_types_dict
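
    # An illustrative semantic_types_dict that would pass the validation above
    # (column names are hypothetical):
    #     {
    #         "numeric": ["age", "amount_due"],
    #         "datetime": ["created_at"],
    #         "value_set": ["state", "country"],
    #     }
    # Keys must be recognized semantic types, and values must be lists of
    # existing, non-ignored columns whose observed types are compatible.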

    def _add_column_type_to_column_info(self, profile_dataset, column_name):
        """
        Adds the data type of a column to the column_info dictionary on self
        Args:
            profile_dataset: A GE dataset
            column_name: The name of the column for which to retrieve the data type

        Returns:
            The type of the column
        """
        if "expect_column_values_to_be_in_type_list" in self.excluded_expectations:
            logger.info(
                "expect_column_values_to_be_in_type_list is in the excluded_expectations list. This"
                "expectation is required to establish column data, so it will be run and then removed from the"
                "expectation suite.")
        column_info_entry = self.column_info.get(column_name)
        if not column_info_entry:
            column_info_entry = {}
            self.column_info[column_name] = column_info_entry
        column_type = column_info_entry.get("type")
        if not column_type:
            column_type = self._get_column_type(profile_dataset, column_name)
            column_info_entry["type"] = column_type

        return column_type

    def _get_column_type(self, profile_dataset, column):
        """
        Determines the data type of a column by evaluating the success of `expect_column_values_to_be_in_type_list`.
        In the case of type Decimal, this data type is returned as NUMERIC, which contains the type lists for both INTs
        and FLOATs.

        The type_list expectation used here is removed, since it will need to be built once the build_suite function is
        actually called. This is because calling build_suite wipes any existing expectations, so expectations called
        during the init of the profiler do not persist.

        Args:
            profile_dataset: A GE dataset
            column: The column for which to get the data type

        Returns:
            The data type of the specified column
        """
        # list of types is used to support pandas and sqlalchemy
        type_ = None
        try:

            if (profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(list(
                        ProfilerTypeMapping.INT_TYPE_NAMES))).success and
                    profile_dataset.expect_column_values_to_be_in_type_list(
                        column,
                        type_list=sorted(
                            list(
                                ProfilerTypeMapping.FLOAT_TYPE_NAMES))).success
                ):
                type_ = "NUMERIC"

            elif profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(list(
                        ProfilerTypeMapping.INT_TYPE_NAMES))).success:
                type_ = "INT"

            elif profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(list(
                        ProfilerTypeMapping.FLOAT_TYPE_NAMES))).success:
                type_ = "FLOAT"

            elif profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(
                        list(ProfilerTypeMapping.STRING_TYPE_NAMES))).success:
                type_ = "STRING"

            elif profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(
                        list(ProfilerTypeMapping.BOOLEAN_TYPE_NAMES))).success:
                type_ = "BOOLEAN"

            elif profile_dataset.expect_column_values_to_be_in_type_list(
                    column,
                    type_list=sorted(
                        list(
                            ProfilerTypeMapping.DATETIME_TYPE_NAMES))).success:
                type_ = "DATETIME"

            else:
                type_ = "UNKNOWN"
        except NotImplementedError:
            # keep the sentinel uppercase so downstream checks against "UNKNOWN" match
            type_ = "UNKNOWN"

        if type_ == "NUMERIC":
            profile_dataset.expect_column_values_to_be_in_type_list(
                column,
                type_list=sorted(list(ProfilerTypeMapping.INT_TYPE_NAMES)) +
                sorted(list(ProfilerTypeMapping.FLOAT_TYPE_NAMES)),
            )

        profile_dataset._expectation_suite.remove_expectation(
            ExpectationConfiguration(
                expectation_type="expect_column_values_to_be_in_type_list",
                kwargs={"column": column},
            ))
        return type_
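
    # Illustrative walk-through of the cascade above: a column whose values pass
    # both the INT and FLOAT type lists (e.g., a Decimal column) is labeled
    # "NUMERIC"; one passing only the INT list is "INT"; FLOAT, STRING, BOOLEAN,
    # and DATETIME are then tried in turn, with "UNKNOWN" as the fallback.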

    def _add_column_cardinality_to_column_info(self, profile_dataset,
                                               column_name):
        """
        Adds the cardinality of a column to the column_info dictionary on self
        Args:
            profile_dataset: A GE Dataset
            column_name: The name of the column for which to add cardinality

        Returns:
            The cardinality of the column
        """
        column_info_entry = self.column_info.get(column_name)
        if not column_info_entry:
            column_info_entry = {}
            self.column_info[column_name] = column_info_entry
        column_cardinality = column_info_entry.get("cardinality")
        if not column_cardinality:
            column_cardinality = self._get_column_cardinality(
                profile_dataset, column_name)
            column_info_entry["cardinality"] = column_cardinality
            # remove the expectations
            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type=
                    "expect_column_unique_value_count_to_be_between",
                    kwargs={"column": column_name},
                ))
            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type=
                    "expect_column_proportion_of_unique_values_to_be_between",
                    kwargs={"column": column_name},
                ))

        return column_cardinality

    def _get_column_cardinality(self, profile_dataset, column):
        """
        Determines the cardinality of a column using the get_basic_column_cardinality method from
        OrderedProfilerCardinality
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to get cardinality

        Returns:
            The cardinality of the specified column
        """
        num_unique = None
        pct_unique = None

        try:
            num_unique = profile_dataset.expect_column_unique_value_count_to_be_between(
                column, None, None).result["observed_value"]
            pct_unique = (
                profile_dataset.
                expect_column_proportion_of_unique_values_to_be_between(
                    column, None, None).result["observed_value"])
        except KeyError:  # if observed_value value is not set
            logger.error(
                "Failed to get cardinality of column {:s} - continuing...".
                format(column))
        # Previously, if we had 25 possible categories out of 1000 rows, this would come up as many, because of its
        #  percentage, so it was tweaked here, but is still experimental.
        cardinality = OrderedProfilerCardinality.get_basic_column_cardinality(
            num_unique, pct_unique)

        return cardinality.name

    def _add_semantic_types_by_column_from_config_to_column_info(
            self, column_name):
        """
        Adds the semantic type of a column to the column_info dict on self, for display purposes after suite creation
        Args:
            column_name: The name of the column

        Returns:
            A list of semantic_types for a given column
        """
        column_info_entry = self.column_info.get(column_name)
        if not column_info_entry:
            column_info_entry = {}
            self.column_info[column_name] = column_info_entry

        semantic_types = column_info_entry.get("semantic_types")

        if not semantic_types:
            assert isinstance(
                self.semantic_types_dict, dict
            ), f"The semantic_types dict in the config must be a dictionary, but is currently a {type(self.semantic_types_dict)}. Please reformat."
            semantic_types = []
            for semantic_type, column_list in self.semantic_types_dict.items():
                if column_name in column_list:
                    semantic_types.append(semantic_type.upper())
            column_info_entry["semantic_types"] = semantic_types
            if all(i in column_info_entry.get("semantic_types")
                   for i in ["BOOLEAN", "VALUE_SET"]):
                logger.info(
                    f"Column {column_name} has both 'BOOLEAN' and 'VALUE_SET' specified as semantic_types."
                    f"As these are currently the same in function, the 'VALUE_SET' type will be removed."
                )
                column_info_entry["semantic_types"].remove("VALUE_SET")

        self.column_info[column_name] = column_info_entry

        return semantic_types

    def _build_column_description_metadata(self, profile_dataset):
        """
        Adds column description metadata to the suite on a Dataset object
        Args:
            profile_dataset: A GE Dataset

        Returns:
            An expectation suite with column description metadata
        """
        columns = self.all_table_columns
        expectation_suite = profile_dataset.get_expectation_suite(
            suppress_warnings=True, discard_failed_expectations=False)

        meta_columns = {}
        for column in columns:
            meta_columns[column] = {"description": ""}
        if not expectation_suite.meta:
            expectation_suite.meta = {"columns": meta_columns, "notes": {""}}
        else:
            expectation_suite.meta["columns"] = meta_columns

        return expectation_suite

    def _display_suite_by_column(self, suite):
        """
        Displays the expectations of a suite by column, along with the column cardinality, and semantic or data type so
        that a user can easily see which expectations were created for which columns
        Args:
            suite: An ExpectationSuite

        Returns:
            The ExpectationSuite
        """
        expectations = suite.expectations
        expectations_by_column = {}
        for expectation in expectations:
            domain = expectation["kwargs"].get(
                "column") or "table_level_expectations"
            if expectations_by_column.get(domain) is None:
                expectations_by_column[domain] = [expectation]
            else:
                expectations_by_column[domain].append(expectation)

        if not expectations_by_column:
            print("No expectations included in suite.")
        else:
            print(
                "Creating an expectation suite with the following expectations:\n"
            )

        if "table_level_expectations" in expectations_by_column:
            table_level_expectations = expectations_by_column.pop(
                "table_level_expectations")
            print("Table-Level Expectations")
            for expectation in sorted(table_level_expectations,
                                      key=lambda x: x.expectation_type):
                print(expectation.expectation_type)

        if expectations_by_column:
            print("\nExpectations by Column")

        contains_semantic_types = [
            v for v in self.column_info.values() if v.get("semantic_types")
        ]
        for column in sorted(expectations_by_column):
            info_column = self.column_info.get(column) or {}

            semantic_types = info_column.get(
                "semantic_types") or "not_specified"
            type_ = info_column.get("type")
            cardinality = info_column.get("cardinality")

            if len(contains_semantic_types) > 0:
                type_string = f" | Semantic Type: {semantic_types[0] if len(semantic_types)==1 else semantic_types}"
            elif type_:
                type_string = f" | Column Data Type: {type_}"
            else:
                type_string = ""

            if cardinality:
                cardinality_string = f" | Cardinality: {cardinality}"
            else:
                cardinality_string = ""

            column_string = f"Column Name: {column}{type_string}{cardinality_string}"
            print(column_string)

            for expectation in sorted(expectations_by_column.get(column),
                                      key=lambda x: x.expectation_type):
                print(expectation.expectation_type)
            print("\n")

        return True

    def _build_expectations_value_set(self, profile_dataset, column, **kwargs):
        """
        Adds a value_set expectation for a given column
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to add an expectation
            **kwargs:

        Returns:
            The GE Dataset
        """
        if "expect_column_values_to_be_in_set" not in self.excluded_expectations:
            value_set = profile_dataset.expect_column_distinct_values_to_be_in_set(
                column, value_set=None,
                result_format="SUMMARY").result["observed_value"]

            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type=
                    "expect_column_distinct_values_to_be_in_set",
                    kwargs={"column": column},
                ),
                match_type="domain",
            )

            profile_dataset.expect_column_values_to_be_in_set(
                column, value_set=value_set)
        return profile_dataset

    def _build_expectations_numeric(self, profile_dataset, column, **kwargs):
        """
        Adds a set of numeric expectations for a given column
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to add expectations
            **kwargs:

        Returns:
            The GE Dataset
        """

        # min
        if "expect_column_min_to_be_between" not in self.excluded_expectations:
            observed_min = profile_dataset.expect_column_min_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not self._is_nan(observed_min):

                profile_dataset.expect_column_min_to_be_between(
                    column,
                    min_value=observed_min,
                    max_value=observed_min,
                )

            else:
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_min_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                logger.debug(
                    f"Skipping expect_column_min_to_be_between because observed value is nan: {observed_min}"
                )

        # max
        if "expect_column_max_to_be_between" not in self.excluded_expectations:
            observed_max = profile_dataset.expect_column_max_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not self._is_nan(observed_max):
                profile_dataset.expect_column_max_to_be_between(
                    column,
                    min_value=observed_max,
                    max_value=observed_max,
                )

            else:
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_max_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                logger.debug(
                    f"Skipping expect_column_max_to_be_between because observed value is nan: {observed_max}"
                )

        # mean
        if "expect_column_mean_to_be_between" not in self.excluded_expectations:
            observed_mean = profile_dataset.expect_column_mean_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not self._is_nan(observed_mean):
                profile_dataset.expect_column_mean_to_be_between(
                    column,
                    min_value=observed_mean,
                    max_value=observed_mean,
                )

            else:
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_mean_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                logger.debug(
                    f"Skipping expect_column_mean_to_be_between because observed value is nan: {observed_mean}"
                )

        # median
        if "expect_column_median_to_be_between" not in self.excluded_expectations:
            observed_median = profile_dataset.expect_column_median_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY").result["observed_value"]
            if not self._is_nan(observed_median):

                profile_dataset.expect_column_median_to_be_between(
                    column,
                    min_value=observed_median,
                    max_value=observed_median,
                )

            else:
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type="expect_column_median_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                logger.debug(
                    f"Skipping expect_column_median_to_be_between because observed value is nan: {observed_median}"
                )

        if ("expect_column_quantile_values_to_be_between"
                not in self.excluded_expectations):
            if isinstance(profile_dataset, Dataset):
                if isinstance(profile_dataset, PandasDataset):
                    allow_relative_error = "lower"
                else:
                    allow_relative_error = (
                        profile_dataset.attempt_allowing_relative_error())
            elif isinstance(profile_dataset, Validator):
                if isinstance(profile_dataset.execution_engine,
                              PandasExecutionEngine):
                    allow_relative_error = "lower"
                if isinstance(profile_dataset.execution_engine,
                              SparkDFExecutionEngine):
                    allow_relative_error = 0.0
                if isinstance(profile_dataset.execution_engine,
                              SqlAlchemyExecutionEngine):
                    allow_relative_error = attempt_allowing_relative_error(
                        profile_dataset.execution_engine.engine.dialect)

            quantile_result = (
                profile_dataset.expect_column_quantile_values_to_be_between(
                    column,
                    quantile_ranges={
                        "quantiles": [0.05, 0.25, 0.5, 0.75, 0.95],
                        "value_ranges": [
                            [None, None],
                            [None, None],
                            [None, None],
                            [None, None],
                            [None, None],
                        ],
                    },
                    allow_relative_error=allow_relative_error,
                    result_format="SUMMARY",
                ))
            if quantile_result.exception_info and (
                    quantile_result.exception_info["exception_traceback"]
                    or quantile_result.exception_info["exception_message"]):
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type=
                        "expect_column_quantile_values_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )
                logger.debug(
                    quantile_result.exception_info["exception_traceback"])
                logger.debug(
                    quantile_result.exception_info["exception_message"])
            else:

                profile_dataset.expect_column_quantile_values_to_be_between(
                    column,
                    quantile_ranges={
                        "quantiles":
                        quantile_result.result["observed_value"]["quantiles"],
                        "value_ranges":
                        [[v, v] for v in
                         quantile_result.result["observed_value"]["values"]],
                    },
                    allow_relative_error=allow_relative_error,
                )
        return profile_dataset
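
    # The pattern above is "observe, then pin": each statistic is first requested
    # with min_value=None and max_value=None (an always-true expectation whose
    # SUMMARY result carries the observed value), then re-issued with
    # min_value == max_value == that observed value, leaving the suite with an
    # exact-match expectation for min, max, mean, median, and the quantiles.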

    def _build_expectations_primary_or_compound_key(self, profile_dataset,
                                                    column_list, **kwargs):
        """
        Adds a uniqueness expectation for a given column or set of columns
        Args:
            profile_dataset: A GE Dataset
            column_list: A list containing one or more columns for which to add a uniqueness expectation
            **kwargs:

        Returns:
            The GE Dataset
        """
        # uniqueness
        if len(column_list) < 1:
            raise ValueError(
                "When specifying a primary or compound key, column_list must not be empty"
            )
        elif len(column_list) > 1:
            if ("expect_compound_columns_to_be_unique"
                    not in self.excluded_expectations):
                profile_dataset.expect_compound_columns_to_be_unique(
                    column_list)
        else:
            [column] = column_list
            if "expect_column_values_to_be_unique" not in self.excluded_expectations:
                profile_dataset.expect_column_values_to_be_unique(column)
        return profile_dataset

    def _build_expectations_string(self, profile_dataset, column, **kwargs):
        """
        Adds a set of string expectations for a given column. Currently does not do anything.
        With the 0.12 API there isn't a quick way to introspect for value_lengths - if we did that, we could build a
        potentially useful value_lengths expectation here.
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to add expectations
            **kwargs:

        Returns:
            The GE Dataset
        """

        if ("expect_column_value_lengths_to_be_between"
                not in self.excluded_expectations):

            pass
        return profile_dataset

    def _build_expectations_datetime(self, profile_dataset, column, **kwargs):
        """
        Adds `expect_column_values_to_be_between` for a given column
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to add the expectation
            **kwargs:

        Returns:
            The GE Dataset
        """

        if "expect_column_values_to_be_between" not in self.excluded_expectations:
            min_value = profile_dataset.expect_column_min_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY",
                parse_strings_as_datetimes=True,
            ).result["observed_value"]

            if min_value is not None:
                try:
                    min_value = parse(min_value)
                except TypeError:
                    pass

            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_min_to_be_between",
                    kwargs={"column": column},
                ),
                match_type="domain",
            )

            max_value = profile_dataset.expect_column_max_to_be_between(
                column,
                min_value=None,
                max_value=None,
                result_format="SUMMARY",
                parse_strings_as_datetimes=True,
            ).result["observed_value"]
            if max_value is not None:
                try:
                    max_value = parse(max_value)
                except TypeError:
                    pass

            profile_dataset._expectation_suite.remove_expectation(
                ExpectationConfiguration(
                    expectation_type="expect_column_max_to_be_between",
                    kwargs={"column": column},
                ),
                match_type="domain",
            )
            if min_value is not None or max_value is not None:
                profile_dataset.expect_column_values_to_be_between(
                    column,
                    min_value=min_value,
                    max_value=max_value,
                    parse_strings_as_datetimes=True,
                )
        return profile_dataset

    def _build_expectations_for_all_column_types(self, profile_dataset, column,
                                                 **kwargs):
        """
        Adds these expectations for all included columns irrespective of type. Includes:
            - `expect_column_values_to_not_be_null` (or `expect_column_values_to_be_null`)
            - `expect_column_proportion_of_unique_values_to_be_between`
            - `expect_column_values_to_be_in_type_list`
        Args:
            profile_dataset: A GE Dataset
            column: The column for which to add the expectations
            **kwargs:

        Returns:
            The GE Dataset
        """
        if "expect_column_values_to_not_be_null" not in self.excluded_expectations:
            not_null_result = profile_dataset.expect_column_values_to_not_be_null(
                column)
            if not not_null_result.success:
                unexpected_percent = float(
                    not_null_result.result["unexpected_percent"])
                if unexpected_percent >= 50 and not self.not_null_only:
                    potential_mostly_value = unexpected_percent / 100.0
                    profile_dataset._expectation_suite.remove_expectation(
                        ExpectationConfiguration(
                            expectation_type=
                            "expect_column_values_to_not_be_null",
                            kwargs={"column": column},
                        ),
                        match_type="domain",
                    )
                    if ("expect_column_values_to_be_null"
                            not in self.excluded_expectations):
                        profile_dataset.expect_column_values_to_be_null(
                            column, mostly=potential_mostly_value)
                else:
                    potential_mostly_value = (100.0 -
                                              unexpected_percent) / 100.0
                    safe_mostly_value = round(
                        max(0.001, potential_mostly_value), 3)
                    profile_dataset.expect_column_values_to_not_be_null(
                        column, mostly=safe_mostly_value)
        if ("expect_column_proportion_of_unique_values_to_be_between"
                not in self.excluded_expectations):
            pct_unique = (
                profile_dataset.
                expect_column_proportion_of_unique_values_to_be_between(
                    column, None, None).result["observed_value"])

            if not self._is_nan(pct_unique):
                profile_dataset.expect_column_proportion_of_unique_values_to_be_between(
                    column, min_value=pct_unique, max_value=pct_unique)
            else:
                profile_dataset._expectation_suite.remove_expectation(
                    ExpectationConfiguration(
                        expectation_type=
                        "expect_column_proportion_of_unique_values_to_be_between",
                        kwargs={"column": column},
                    ),
                    match_type="domain",
                )

                logger.debug(
                    f"Skipping expect_column_proportion_of_unique_values_to_be_between because observed value is nan: {pct_unique}"
                )

        if "expect_column_values_to_be_in_type_list" not in self.excluded_expectations:
            col_type = self.column_info.get(column).get("type")
            if col_type != "UNKNOWN":
                type_list = profiler_data_types_with_mapping.get(col_type)
                profile_dataset.expect_column_values_to_be_in_type_list(
                    column, type_list=type_list)
            else:
                logger.info(
                    f"Column type for column {column} is unknown. "
                    f"Skipping expect_column_values_to_be_in_type_list for this column."
                )

    def _build_expectations_table(self, profile_dataset, **kwargs):
        """
        Adds two table level expectations to the dataset
        Args:
            profile_dataset: A GE Dataset
            **kwargs:

        Returns:
            The GE Dataset
        """

        if ("expect_table_columns_to_match_ordered_list"
                not in self.excluded_expectations):
            columns = self.all_table_columns
            profile_dataset.expect_table_columns_to_match_ordered_list(columns)

        if "expect_table_row_count_to_be_between" not in self.excluded_expectations:
            row_count = profile_dataset.expect_table_row_count_to_be_between(
                min_value=0, max_value=None).result["observed_value"]
            min_value = max(0, int(row_count))
            max_value = int(row_count)

            profile_dataset.expect_table_row_count_to_be_between(
                min_value=min_value, max_value=max_value)
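
    # Because min_value == max_value == the observed row count, the resulting
    # expectation passes only on a table with exactly the same number of rows,
    # consistent with the strict "determine whether two tables are the same"
    # intent described in the class docstring.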

    def _is_nan(self, value):
        """
        If value is an array, test element-wise for NaN and return result as a boolean array.
        If value is a scalar, return boolean.
        Args:
            value: The value to test

        Returns:
            The results of the test
        """
        try:
            return np.isnan(value)
        except TypeError:
            return True
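
A minimal end-to-end sketch of using the profiler, assuming a pandas-backed Great Expectations Dataset; the CSV path and column name are hypothetical:

import great_expectations as ge

# ge.read_csv returns a PandasDataset, one of the accepted profile_dataset types.
dataset = ge.read_csv("data/customers.csv")  # hypothetical path

profiler = UserConfigurableProfiler(
    dataset,
    ignored_columns=["internal_id"],  # hypothetical column
    value_set_threshold="FEW",
)
suite = profiler.build_suite()
print(len(suite.expectations))
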
Example 4
    def get_metrics(
        self,
        batch_ids: List[str],
        validator: Validator,
        metric_name: str,
        metric_domain_kwargs: Optional[Union[str, dict]] = None,
        metric_value_kwargs: Optional[Union[str, dict]] = None,
        enforce_numeric_metric: Optional[Union[str, bool]] = False,
        replace_nan_with_zero: Optional[Union[str, bool]] = False,
        domain: Optional[Domain] = None,
        variables: Optional[ParameterContainer] = None,
        parameters: Optional[Dict[str, ParameterContainer]] = None,
    ) -> Dict[str, Union[np.ndarray, List[Union[Any, Number]], Dict[str, Any]]]:
        domain_kwargs = build_metric_domain_kwargs(
            batch_id=None,
            metric_domain_kwargs=metric_domain_kwargs,
            domain=domain,
            variables=variables,
            parameters=parameters,
        )

        metric_domain_kwargs: dict = copy.deepcopy(domain_kwargs)

        # Obtain value kwargs from rule state (i.e., variables and parameters); from instance variable otherwise.
        metric_value_kwargs = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=metric_value_kwargs,
            expected_return_type=None,
            variables=variables,
            parameters=parameters,
        )

        # Obtain enforce_numeric_metric from rule state (i.e., variables and parameters); from instance variable otherwise.
        enforce_numeric_metric = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=enforce_numeric_metric,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        # Obtain replace_nan_with_zero from rule state (i.e., variables and parameters); from instance variable otherwise.
        replace_nan_with_zero = get_parameter_value_and_validate_return_type(
            domain=domain,
            parameter_reference=replace_nan_with_zero,
            expected_return_type=bool,
            variables=variables,
            parameters=parameters,
        )

        metric_values: List[Union[Any, Number]] = []

        metric_value: Union[Any, Number]
        batch_id: str
        for batch_id in batch_ids:
            metric_domain_kwargs["batch_id"] = batch_id
            metric_configuration_arguments: Dict[str, Any] = {
                "metric_name": metric_name,
                "metric_domain_kwargs": metric_domain_kwargs,
                "metric_value_kwargs": metric_value_kwargs,
                "metric_dependencies": None,
            }
            metric_value = validator.get_metric(metric=MetricConfiguration(
                **metric_configuration_arguments))
            if enforce_numeric_metric:
                if not is_numeric(value=metric_value):
                    raise ge_exceptions.ProfilerExecutionError(
                        message=
                        f"""Applicability of {self.__class__.__name__} is restricted to numeric-valued metrics \
(value of type "{str(type(metric_value))}" was computed).
""")
                if np.isnan(metric_value):
                    if not replace_nan_with_zero:
                        raise ValueError(
                            f"""Computation of metric "{metric_name}" resulted in NaN ("not a number") value.
""")
                    metric_value = 0.0

            metric_values.append(metric_value)

        return {
            "metric_values": metric_values,
            "details": {
                "metric_configuration": {
                    "metric_name": metric_name,
                    "domain_kwargs": domain_kwargs,
                    "metric_value_kwargs": metric_value_kwargs,
                    "metric_dependencies": None,
                },
                "num_batches": len(metric_values),
            },
        }
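
A hedged sketch of calling get_metrics() to collect one numeric metric across several batches; `builder` stands for whatever object owns this method, and `validator` and `batch_ids` are assumed to come from an already-configured batch request. None of these variable names appear in the source above.

results = builder.get_metrics(
    batch_ids=batch_ids,
    validator=validator,
    metric_name="table.row_count",
    enforce_numeric_metric=True,
    replace_nan_with_zero=True,
)
row_counts = results["metric_values"]           # one value per batch id
num_batches = results["details"]["num_batches"]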