Esempio n. 1
0
    def wrapper(metric_fn: Callable):
        @wraps(metric_fn)
        def inner_func(*args, **kwargs):
            return metric_fn(*args, **kwargs)

        inner_func.metric_engine = engine
        inner_func.metric_fn_type = MetricPartialFunctionTypes(
            partial_fn_type)  # raises ValueError if unknown type
        inner_func.domain_type = MetricDomainTypes(domain_type)
        inner_func.metric_definition_kwargs = kwargs
        return inner_func
Esempio n. 2
0
    def __init__(
        self,
        domain_type: Union[str, MetricDomainTypes],
        domain_kwargs: Optional[Union[Dict[str, Any], DomainKwargs]] = None,
        details: Optional[Dict[str, Any]] = None,
        rule_name: Optional[str] = None,
    ) -> None:
        if isinstance(domain_type, str):
            try:
                domain_type = MetricDomainTypes(domain_type)
            except (TypeError, KeyError) as e:
                raise ValueError(
                    f""" {e}: Cannot instantiate Domain (domain_type "{str(domain_type)}" of type \
"{str(type(domain_type))}" is not supported).
""")
        elif not isinstance(domain_type, MetricDomainTypes):
            raise ValueError(
                f"""Cannot instantiate Domain (domain_type "{str(domain_type)}" of type "{str(type(domain_type))}" is \
not supported).
""")

        if domain_kwargs is None:
            domain_kwargs = DomainKwargs({})
        elif isinstance(domain_kwargs, dict):
            domain_kwargs = DomainKwargs(domain_kwargs)

        domain_kwargs_dot_dict: DomainKwargs = (
            self._convert_dictionaries_to_domain_kwargs(source=domain_kwargs))

        if details is None:
            details = {}

        inferred_semantic_domain_type: Dict[str, Union[
            str,
            SemanticDomainTypes]] = details.get(INFERRED_SEMANTIC_TYPE_KEY)
        if inferred_semantic_domain_type:
            semantic_domain_key: str
            metric_domain_key: str
            metric_domain_value: Any
            is_consistent: bool
            for semantic_domain_key in inferred_semantic_domain_type:
                is_consistent = False
                for (
                        metric_domain_key,
                        metric_domain_value,
                ) in domain_kwargs_dot_dict.items():
                    if (isinstance(metric_domain_value, (list, set, tuple))
                            and semantic_domain_key in metric_domain_value
                        ) or (semantic_domain_key == metric_domain_value):
                        is_consistent = True
                        break

                if not is_consistent:
                    raise ValueError(
                        f"""Cannot instantiate Domain (domain_type "{str(domain_type)}" of type \
"{str(type(domain_type))}" -- key "{semantic_domain_key}", detected in "{INFERRED_SEMANTIC_TYPE_KEY}" dictionary, does \
not exist as value of appropriate key in "domain_kwargs" dictionary.
""")

        super().__init__(
            domain_type=domain_type,
            domain_kwargs=domain_kwargs_dot_dict,
            details=details,
            rule_name=rule_name,
        )
Esempio n. 3
0
    def get_compute_domain(
        self,
        domain_kwargs: Dict,
        domain_type: Union[str, "MetricDomainTypes"],
        accessor_keys: Optional[Iterable[str]] = None,
    ) -> Tuple["sa.sql.Selectable", dict, dict]:
        """Uses a given batch dictionary and domain kwargs to obtain a SqlAlchemy column object.

        Args:
            domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
            domain_type (str or "MetricDomainTypes") - an Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity", "column",
            "column_pair", "table" and "other". Enum types include capitalized versions of these from the class
            MetricDomainTypes.
            accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when describing
            the domain and simply transferred with their associated values into accessor_domain_kwargs.

        Returns:
            SqlAlchemy column
        """
        # Extracting value from enum if it is given for future computation
        domain_type = MetricDomainTypes(domain_type)
        batch_id = domain_kwargs.get("batch_id")
        if batch_id is None:
            # We allow no batch id specified if there is only one batch
            if self.active_batch_data:
                data_object = self.active_batch_data
            else:
                raise GreatExpectationsError(
                    "No batch is specified, but could not identify a loaded batch."
                )
        else:
            if batch_id in self.loaded_batch_data_dict:
                data_object = self.loaded_batch_data_dict[batch_id]
            else:
                raise GreatExpectationsError(
                    f"Unable to find batch with batch_id {batch_id}"
                )

        compute_domain_kwargs = copy.deepcopy(domain_kwargs)
        accessor_domain_kwargs = dict()
        if "table" in domain_kwargs and domain_kwargs["table"] is not None:
            if domain_kwargs["table"] != data_object.record_set_name:
                raise ValueError("Unrecognized table name.")
            else:
                selectable = data_object.selectable
        elif "query" in domain_kwargs:
            raise ValueError(
                "query is not currently supported by SqlAlchemyExecutionEngine"
            )
        else:
            selectable = data_object.selectable

        if (
            "row_condition" in domain_kwargs
            and domain_kwargs["row_condition"] is not None
        ):
            condition_parser = domain_kwargs["condition_parser"]
            if condition_parser == "great_expectations__experimental__":
                parsed_condition = parse_condition_to_sqlalchemy(
                    domain_kwargs["row_condition"]
                )
                selectable = sa.select(
                    "*", from_obj=selectable, whereclause=parsed_condition
                )

            else:
                raise GreatExpectationsError(
                    "SqlAlchemyExecutionEngine only supports the great_expectations condition_parser."
                )

        # Warning user if accessor keys are in any domain that is not of type table, will be ignored
        if (
            domain_type != MetricDomainTypes.TABLE
            and accessor_keys is not None
            and len(accessor_keys) > 0
        ):
            logger.warning(
                "Accessor keys ignored since Metric Domain Type is not 'table'"
            )

        if domain_type == MetricDomainTypes.TABLE:
            if accessor_keys is not None and len(accessor_keys) > 0:
                for key in accessor_keys:
                    accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
            if len(domain_kwargs.keys()) > 0:
                for key in compute_domain_kwargs.keys():
                    # Warning user if kwarg not "normal"
                    if key not in [
                        "batch_id",
                        "table",
                        "row_condition",
                        "condition_parser",
                    ]:
                        logger.warning(
                            f"Unexpected key {key} found in domain_kwargs for domain type {domain_type.value}"
                        )
            return selectable, compute_domain_kwargs, accessor_domain_kwargs

        # If user has stated they want a column, checking if one is provided, and
        elif domain_type == MetricDomainTypes.COLUMN:
            if "column" in compute_domain_kwargs:
                # Checking if case- sensitive and using appropriate name
                if self.active_batch_data.use_quoted_name:
                    accessor_domain_kwargs["column"] = quoted_name(
                        compute_domain_kwargs.pop("column")
                    )
                else:
                    accessor_domain_kwargs["column"] = compute_domain_kwargs.pop(
                        "column"
                    )
            else:
                # If column not given
                raise GreatExpectationsError(
                    "Column not provided in compute_domain_kwargs"
                )

        # Else, if column pair values requested
        elif domain_type == MetricDomainTypes.COLUMN_PAIR:
            # Ensuring column_A and column_B parameters provided
            if (
                "column_A" in compute_domain_kwargs
                and "column_B" in compute_domain_kwargs
            ):
                if self.active_batch_data.use_quoted_name:
                    # If case matters...
                    accessor_domain_kwargs["column_A"] = quoted_name(
                        compute_domain_kwargs.pop("column_A")
                    )
                    accessor_domain_kwargs["column_B"] = quoted_name(
                        compute_domain_kwargs.pop("column_B")
                    )
                else:
                    accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                        "column_A"
                    )
                    accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                        "column_B"
                    )
            else:
                raise GreatExpectationsError(
                    "column_A or column_B not found within compute_domain_kwargs"
                )

        # Checking if table or identity or other provided, column is not specified. If it is, warning the user
        elif domain_type == MetricDomainTypes.MULTICOLUMN:
            if "columns" in compute_domain_kwargs:
                # If columns exist
                accessor_domain_kwargs["columns"] = compute_domain_kwargs.pop("columns")

        # Filtering if identity
        elif domain_type == MetricDomainTypes.IDENTITY:
            # If we would like our data to become a single column
            if "column" in compute_domain_kwargs:
                if self.active_batch_data.use_quoted_name:
                    selectable = sa.select(
                        [sa.column(quoted_name(compute_domain_kwargs["column"]))]
                    ).select_from(selectable)
                else:
                    selectable = sa.select(
                        [sa.column(compute_domain_kwargs["column"])]
                    ).select_from(selectable)

            # If we would like our data to now become a column pair
            elif ("column_A" in compute_domain_kwargs) and (
                "column_B" in compute_domain_kwargs
            ):
                if self.active_batch_data.use_quoted_name:
                    selectable = sa.select(
                        [
                            sa.column(quoted_name(compute_domain_kwargs["column_A"])),
                            sa.column(quoted_name(compute_domain_kwargs["column_B"])),
                        ]
                    ).select_from(selectable)
                else:
                    selectable = sa.select(
                        [
                            sa.column(compute_domain_kwargs["column_A"]),
                            sa.column(compute_domain_kwargs["column_B"]),
                        ]
                    ).select_from(selectable)
            else:
                # If we would like our data to become a multicolumn
                if "columns" in compute_domain_kwargs:
                    if self.active_batch_data.use_quoted_name:
                        # Building a list of column objects used for sql alchemy selection
                        to_select = [
                            sa.column(quoted_name(col))
                            for col in compute_domain_kwargs["columns"]
                        ]
                        selectable = sa.select(to_select).select_from(selectable)
                    else:
                        to_select = [
                            sa.column(col) for col in compute_domain_kwargs["columns"]
                        ]
                        selectable = sa.select(to_select).select_from(selectable)

        # Letting selectable fall through
        return selectable, compute_domain_kwargs, accessor_domain_kwargs
Esempio n. 4
0
    def get_compute_domain(
        self,
        domain_kwargs: dict,
        domain_type: Union[str, MetricDomainTypes],
        accessor_keys: Optional[Iterable[str]] = None,
    ) -> Tuple["pyspark.sql.DataFrame", dict, dict]:
        """Uses a given batch dictionary and domain kwargs (which include a row condition and a condition parser)
        to obtain and/or query a batch. Returns in the format of a Pandas Series if only a single column is desired,
        or otherwise a Data Frame.

        Args:
            domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
            domain_type (str or MetricDomainTypes) - an Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity",
            "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from the
            class MetricDomainTypes.
            accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
            describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

        Returns:
            A tuple including:
              - a DataFrame (the data on which to compute)
              - a dictionary of compute_domain_kwargs, describing the DataFrame
              - a dictionary of accessor_domain_kwargs, describing any accessors needed to
                identify the domain within the compute domain
        """
        # Extracting value from enum if it is given for future computation
        domain_type = MetricDomainTypes(domain_type)

        batch_id = domain_kwargs.get("batch_id")
        if batch_id is None:
            # We allow no batch id specified if there is only one batch
            if self.active_batch_data:
                data = self.active_batch_data.dataframe
            else:
                raise ValidationError(
                    "No batch is specified, but could not identify a loaded batch."
                )
        else:
            if batch_id in self.loaded_batch_data_dict:
                data = self.loaded_batch_data_dict[batch_id].dataframe
            else:
                raise ValidationError(
                    f"Unable to find batch with batch_id {batch_id}")

        compute_domain_kwargs = copy.deepcopy(domain_kwargs)
        accessor_domain_kwargs = dict()
        table = domain_kwargs.get("table", None)
        if table:
            raise ValueError(
                "SparkDFExecutionEngine does not currently support multiple named tables."
            )

        row_condition = domain_kwargs.get("row_condition", None)
        if row_condition:
            condition_parser = domain_kwargs.get("condition_parser", None)
            if condition_parser == "spark":
                data = data.filter(row_condition)
            elif condition_parser == "great_expectations__experimental__":
                parsed_condition = parse_condition_to_spark(row_condition)
                data = data.filter(parsed_condition)
            else:
                raise GreatExpectationsError(
                    f"unrecognized condition_parser {str(condition_parser)}for Spark execution engine"
                )

        # Warning user if accessor keys are in any domain that is not of type table, will be ignored
        if (domain_type != MetricDomainTypes.TABLE
                and accessor_keys is not None
                and len(list(accessor_keys)) > 0):
            logger.warning(
                'Accessor keys ignored since Metric Domain Type is not "table"'
            )

        if domain_type == MetricDomainTypes.TABLE:
            if accessor_keys is not None and len(list(accessor_keys)) > 0:
                for key in accessor_keys:
                    accessor_domain_kwargs[key] = compute_domain_kwargs.pop(
                        key)
            if len(compute_domain_kwargs.keys()) > 0:
                # Warn user if kwarg not "normal".
                unexpected_keys: set = set(
                    compute_domain_kwargs.keys()).difference({
                        "batch_id",
                        "table",
                        "row_condition",
                        "condition_parser",
                    })
                if len(unexpected_keys) > 0:
                    unexpected_keys_str: str = ", ".join(
                        map(lambda element: f'"{element}"', unexpected_keys))
                    logger.warning(
                        f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                    )
            return data, compute_domain_kwargs, accessor_domain_kwargs

        # If user has stated they want a column, checking if one is provided, and
        elif domain_type == MetricDomainTypes.COLUMN:
            if "column" in compute_domain_kwargs:
                accessor_domain_kwargs["column"] = compute_domain_kwargs.pop(
                    "column")
            else:
                # If column not given
                raise GreatExpectationsError(
                    "Column not provided in compute_domain_kwargs")

        # Else, if column pair values requested
        elif domain_type == MetricDomainTypes.COLUMN_PAIR:
            # Ensuring column_A and column_B parameters provided
            if ("column_A" in compute_domain_kwargs
                    and "column_B" in compute_domain_kwargs):
                accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                    "column_A")
                accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                    "column_B")
            else:
                raise GreatExpectationsError(
                    "column_A or column_B not found within compute_domain_kwargs"
                )

        # Checking if table or identity or other provided, column is not specified. If it is, warning the user
        elif domain_type == MetricDomainTypes.MULTICOLUMN:
            if "column_list" in compute_domain_kwargs:
                # If column_list exists
                accessor_domain_kwargs[
                    "column_list"] = compute_domain_kwargs.pop("column_list")

        # Filtering if identity
        elif domain_type == MetricDomainTypes.IDENTITY:

            # If we would like our data to become a single column
            if "column" in compute_domain_kwargs:
                data = data.select(compute_domain_kwargs["column"])

            # If we would like our data to now become a column pair
            elif ("column_A"
                  in compute_domain_kwargs) and ("column_B"
                                                 in compute_domain_kwargs):
                data = data.select(compute_domain_kwargs["column_A"],
                                   compute_domain_kwargs["column_B"])
            else:

                # If we would like our data to become a multicolumn
                if "column_list" in compute_domain_kwargs:
                    data = data.select(compute_domain_kwargs["column_list"])

        return data, compute_domain_kwargs, accessor_domain_kwargs
Esempio n. 5
0
    def get_compute_domain(
        self,
        domain_kwargs: Dict,
        domain_type: Union[str, MetricDomainTypes],
        accessor_keys: Optional[Iterable[str]] = None,
    ) -> Tuple[Selectable, dict, dict]:
        """Uses a given batch dictionary and domain kwargs to obtain a SqlAlchemy column object.

        Args:
            domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
            domain_type (str or MetricDomainTypes) - an Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity",
            "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from the
            class MetricDomainTypes.
            accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
            describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

        Returns:
            SqlAlchemy column
        """
        selectable = self.get_domain_records(domain_kwargs=domain_kwargs, )
        # Extracting value from enum if it is given for future computation
        domain_type = MetricDomainTypes(domain_type)

        # Warning user if accessor keys are in any domain that is not of type table, will be ignored
        if (domain_type != MetricDomainTypes.TABLE
                and accessor_keys is not None
                and len(list(accessor_keys)) > 0):
            logger.warning(
                'Accessor keys ignored since Metric Domain Type is not "table"'
            )

        compute_domain_kwargs = copy.deepcopy(domain_kwargs)
        accessor_domain_kwargs = {}
        if domain_type == MetricDomainTypes.TABLE:
            if accessor_keys is not None and len(list(accessor_keys)) > 0:
                for key in accessor_keys:
                    accessor_domain_kwargs[key] = compute_domain_kwargs.pop(
                        key)
            if len(domain_kwargs.keys()) > 0:
                # Warn user if kwarg not "normal".
                unexpected_keys: set = set(
                    compute_domain_kwargs.keys()).difference({
                        "batch_id",
                        "table",
                        "row_condition",
                        "condition_parser",
                    })
                if len(unexpected_keys) > 0:
                    unexpected_keys_str: str = ", ".join(
                        map(lambda element: f'"{element}"', unexpected_keys))
                    logger.warning(
                        f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                    )
            return selectable, compute_domain_kwargs, accessor_domain_kwargs

        elif domain_type == MetricDomainTypes.COLUMN:
            if "column" not in compute_domain_kwargs:
                raise GreatExpectationsError(
                    "Column not provided in compute_domain_kwargs")

            # Checking if case-sensitive and using appropriate name
            if self.active_batch_data.use_quoted_name:
                accessor_domain_kwargs["column"] = quoted_name(
                    compute_domain_kwargs.pop("column"), quote=True)
            else:
                accessor_domain_kwargs["column"] = compute_domain_kwargs.pop(
                    "column")

            return selectable, compute_domain_kwargs, accessor_domain_kwargs

        elif domain_type == MetricDomainTypes.COLUMN_PAIR:
            if not ("column_A" in compute_domain_kwargs
                    and "column_B" in compute_domain_kwargs):
                raise GreatExpectationsError(
                    "column_A or column_B not found within compute_domain_kwargs"
                )

            # Checking if case-sensitive and using appropriate name
            if self.active_batch_data.use_quoted_name:
                accessor_domain_kwargs["column_A"] = quoted_name(
                    compute_domain_kwargs.pop("column_A"), quote=True)
                accessor_domain_kwargs["column_B"] = quoted_name(
                    compute_domain_kwargs.pop("column_B"), quote=True)
            else:
                accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop(
                    "column_A")
                accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop(
                    "column_B")

            return selectable, compute_domain_kwargs, accessor_domain_kwargs

        elif domain_type == MetricDomainTypes.MULTICOLUMN:
            if "column_list" not in domain_kwargs:
                raise GreatExpectationsError(
                    "column_list not found within domain_kwargs")

            column_list = compute_domain_kwargs.pop("column_list")

            if len(column_list) < 2:
                raise GreatExpectationsError(
                    "column_list must contain at least 2 columns")

            # Checking if case-sensitive and using appropriate name
            if self.active_batch_data.use_quoted_name:
                accessor_domain_kwargs["column_list"] = [
                    quoted_name(column_name, quote=True)
                    for column_name in column_list
                ]
            else:
                accessor_domain_kwargs["column_list"] = column_list

            return selectable, compute_domain_kwargs, accessor_domain_kwargs

        # Letting selectable fall through
        return selectable, compute_domain_kwargs, accessor_domain_kwargs
    def get_compute_domain(
        self,
        domain_kwargs: dict,
        domain_type: Union[str, MetricDomainTypes],
        accessor_keys: Optional[Iterable[str]] = None,
    ) -> Tuple[DataFrame, dict, dict]:
        """Uses a given batch dictionary and domain kwargs (which include a row condition and a condition parser)
        to obtain and/or query a batch. Returns in the format of a Spark DataFrame.

        Args:
            domain_kwargs (dict) - A dictionary consisting of the domain kwargs specifying which data to obtain
            domain_type (str or MetricDomainTypes) - an Enum value indicating which metric domain the user would
            like to be using, or a corresponding string value representing it. String types include "identity",
            "column", "column_pair", "table" and "other". Enum types include capitalized versions of these from the
            class MetricDomainTypes.
            accessor_keys (str iterable) - keys that are part of the compute domain but should be ignored when
            describing the domain and simply transferred with their associated values into accessor_domain_kwargs.

        Returns:
            A tuple including:
              - a DataFrame (the data on which to compute)
              - a dictionary of compute_domain_kwargs, describing the DataFrame
              - a dictionary of accessor_domain_kwargs, describing any accessors needed to
                identify the domain within the compute domain
        """
        data = self.get_domain_records(
            domain_kwargs=domain_kwargs,
        )
        # Extracting value from enum if it is given for future computation
        domain_type = MetricDomainTypes(domain_type)

        compute_domain_kwargs = copy.deepcopy(domain_kwargs)
        accessor_domain_kwargs = {}
        table = domain_kwargs.get("table", None)
        if table:
            raise ValueError(
                "SparkDFExecutionEngine does not currently support multiple named tables."
            )

        # Warning user if accessor keys are in any domain that is not of type table, will be ignored
        if (
            domain_type != MetricDomainTypes.TABLE
            and accessor_keys is not None
            and len(list(accessor_keys)) > 0
        ):
            logger.warning(
                'Accessor keys ignored since Metric Domain Type is not "table"'
            )

        if domain_type == MetricDomainTypes.TABLE:
            if accessor_keys is not None and len(list(accessor_keys)) > 0:
                for key in accessor_keys:
                    accessor_domain_kwargs[key] = compute_domain_kwargs.pop(key)
            if len(compute_domain_kwargs.keys()) > 0:
                # Warn user if kwarg not "normal".
                unexpected_keys: set = set(compute_domain_kwargs.keys()).difference(
                    {
                        "batch_id",
                        "table",
                        "row_condition",
                        "condition_parser",
                    }
                )
                if len(unexpected_keys) > 0:
                    unexpected_keys_str: str = ", ".join(
                        map(lambda element: f'"{element}"', unexpected_keys)
                    )
                    logger.warning(
                        f'Unexpected key(s) {unexpected_keys_str} found in domain_kwargs for domain type "{domain_type.value}".'
                    )
            return data, compute_domain_kwargs, accessor_domain_kwargs

        elif domain_type == MetricDomainTypes.COLUMN:
            if "column" not in compute_domain_kwargs:
                raise GreatExpectationsError(
                    "Column not provided in compute_domain_kwargs"
                )

            accessor_domain_kwargs["column"] = compute_domain_kwargs.pop("column")

        elif domain_type == MetricDomainTypes.COLUMN_PAIR:
            if not (
                "column_A" in compute_domain_kwargs
                and "column_B" in compute_domain_kwargs
            ):
                raise GreatExpectationsError(
                    "column_A or column_B not found within compute_domain_kwargs"
                )

            accessor_domain_kwargs["column_A"] = compute_domain_kwargs.pop("column_A")
            accessor_domain_kwargs["column_B"] = compute_domain_kwargs.pop("column_B")

        elif domain_type == MetricDomainTypes.MULTICOLUMN:
            if "column_list" not in domain_kwargs:
                raise ge_exceptions.GreatExpectationsError(
                    "column_list not found within domain_kwargs"
                )

            column_list = compute_domain_kwargs.pop("column_list")

            if len(column_list) < 2:
                raise ge_exceptions.GreatExpectationsError(
                    "column_list must contain at least 2 columns"
                )

            accessor_domain_kwargs["column_list"] = column_list

        return data, compute_domain_kwargs, accessor_domain_kwargs