コード例 #1
0
    def _sqlalchemy(
        cls,
        column,
        min_value=None,
        max_value=None,
        strict_min=None,
        strict_max=None,
        parse_strings_as_datetimes: bool = False,
        **kwargs
    ):
        if parse_strings_as_datetimes:
            # deprecated-v0.13.41
            warnings.warn(
                """The parameter "parse_strings_as_datetimes" is deprecated as of v0.13.41 in \
v0.16. As part of the V3 API transition, we've moved away from input transformation. For more information, \
please see: https://greatexpectations.io/blog/why_we_dont_do_transformations_for_expectations/
""",
                DeprecationWarning,
            )

            if min_value is not None:
                try:
                    min_value = parse(min_value)
                except TypeError:
                    pass

            if max_value is not None:
                try:
                    max_value = parse(max_value)
                except TypeError:
                    pass

        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        if min_value is None:
            if strict_max:
                return column < max_value
            else:
                return column <= max_value

        elif max_value is None:
            if strict_min:
                return min_value < column
            else:
                return min_value <= column

        else:
            if strict_min and strict_max:
                return sa.and_(min_value < column, column < max_value)
            elif strict_min:
                return sa.and_(min_value < column, column <= max_value)
            elif strict_max:
                return sa.and_(min_value <= column, column < max_value)
            else:
                return sa.and_(min_value <= column, column <= max_value)
コード例 #2
0
    def _sqlalchemy(cls,
                    column,
                    _metrics,
                    min_value=None,
                    max_value=None,
                    strict_min=None,
                    strict_max=None,
                    **kwargs):
        column_lengths, _, _ = _metrics.get("column_values.value_length.map")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        # Assert that min_value and max_value are integers
        try:
            if min_value is not None and not float(min_value).is_integer():
                raise ValueError("min_value and max_value must be integers")

            if max_value is not None and not float(max_value).is_integer():
                raise ValueError("min_value and max_value must be integers")

        except ValueError:
            raise ValueError("min_value and max_value must be integers")

        if min_value is not None and max_value is not None:
            return sa.and_(
                column_lengths >= min_value,
                column_lengths <= max_value,
            )

        elif min_value is None and max_value is not None:
            return column_lengths <= max_value

        elif min_value is not None and max_value is None:
            return column_lengths >= min_value
コード例 #3
0
    def _sqlalchemy(cls, column, regex_list, match_on, _dialect, **kwargs):
        """Build a condition matching ``column`` against any/all regexes in ``regex_list``."""
        if match_on not in ["any", "all"]:
            raise ValueError("match_on must be any or all")

        if len(regex_list) == 0:
            raise ValueError("At least one regex must be supplied in the regex_list.")

        # Probe with the first regex: a None expression means this dialect has
        # no regex support, so fail fast before building the full condition.
        if get_dialect_regex_expression(column, regex_list[0], _dialect) is None:
            logger.warning(f"Regex is not supported for dialect {str(_dialect)}")
            raise NotImplementedError

        expressions = [
            get_dialect_regex_expression(column, regex, _dialect)
            for regex in regex_list
        ]
        combine = sa.or_ if match_on == "any" else sa.and_
        return combine(*expressions)
    def _sqlalchemy(cls, column, like_pattern_list, match_on, _dialect,
                    **kwargs):
        """Build a condition matching ``column`` against any/all LIKE patterns.

        ``match_on`` defaults to "any" when falsy.
        """
        if not match_on:
            match_on = "any"

        if match_on not in ["any", "all"]:
            raise ValueError("match_on must be any or all")

        if len(like_pattern_list) == 0:
            raise ValueError(
                "At least one like_pattern must be supplied in the like_pattern_list."
            )

        # Probe with the first pattern: a None expression means the dialect
        # cannot express LIKE patterns, so fail fast.
        probe = get_dialect_like_pattern_expression(
            column, _dialect, like_pattern_list[0])
        if probe is None:
            logger.warning(
                f"Like patterns are not supported for dialect {str(_dialect.dialect.name)}"
            )
            raise NotImplementedError

        expressions = [
            get_dialect_like_pattern_expression(column, _dialect, like_pattern)
            for like_pattern in like_pattern_list
        ]
        combine = sa.or_ if match_on == "any" else sa.and_
        return combine(*expressions)
コード例 #5
0
    def _sqlalchemy(
        cls,
        column,
        min_value=None,
        max_value=None,
        strict_min=None,
        strict_max=None,
        parse_strings_as_datetimes=None,
        **kwargs
    ):
        if parse_strings_as_datetimes:
            if min_value:
                min_value = parse(min_value)

            if max_value:
                max_value = parse(max_value)

        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")

        if min_value is None:
            if strict_max:
                return column < max_value
            else:
                return column <= max_value

        elif max_value is None:
            if strict_min:
                return min_value < column
            else:
                return min_value <= column

        else:
            if strict_min and strict_max:
                return sa.and_(min_value < column, column < max_value)
            elif strict_min:
                return sa.and_(min_value < column, column <= max_value)
            elif strict_max:
                return sa.and_(min_value <= column, column < max_value)
            else:
                return sa.and_(min_value <= column, column <= max_value)
コード例 #6
0
    def _sqlalchemy(cls, column, regex_list, _dialect, **kwargs):
        """Build a condition asserting ``column`` matches NONE of the regexes."""
        if len(regex_list) == 0:
            raise ValueError(
                "At least one regex must be supplied in the regex_list.")

        # Probe with the first regex; None means the dialect cannot express
        # (negated) regex matching at all.
        probe = get_dialect_regex_expression(column,
                                             regex_list[0],
                                             _dialect,
                                             positive=False)
        if probe is None:
            logger.warning("Regex is not supported for dialect %s" %
                           str(_dialect))
            raise NotImplementedError

        negated_matches = [
            get_dialect_regex_expression(column, regex, _dialect, positive=False)
            for regex in regex_list
        ]
        return sa.and_(*negated_matches)
コード例 #7
0
    def _sqlalchemy(cls, column, like_pattern_list, _dialect, **kwargs):
        """Build a condition asserting ``column`` matches NONE of the LIKE patterns."""
        if len(like_pattern_list) == 0:
            raise ValueError(
                "At least one like_pattern must be supplied in the like_pattern_list."
            )

        # Probe with the first pattern; None means the dialect cannot express
        # (negated) LIKE matching.
        probe = get_dialect_like_pattern_expression(
            column, _dialect, like_pattern_list[0], positive=False)
        if probe is None:
            logger.warning("Like patterns are not supported for dialect %s" %
                           str(_dialect.name))
            raise NotImplementedError

        negated_matches = [
            get_dialect_like_pattern_expression(
                column, _dialect, like_pattern, positive=False)
            for like_pattern in like_pattern_list
        ]
        return sa.and_(*negated_matches)
コード例 #8
0
    def _sqlalchemy(cls, column_A, column_B, **kwargs):
        """Build a condition asserting the (column_A, column_B) pair is in value_pairs_set."""
        value_pairs_set = kwargs.get("value_pairs_set")

        if value_pairs_set is None:
            # No set supplied: the expectation is vacuously true for every row.
            return sa.case([(column_A == column_B, True)], else_=True)

        # or_ implementation was required due to mssql issues with in_
        pair_conditions = []
        for expected_a, expected_b in value_pairs_set:
            pair_conditions.append(
                sa.or_(sa.and_(column_A == expected_a, column_B == expected_b))
            )

        return sa.or_(*pair_conditions)
コード例 #9
0
    def _sqlalchemy(cls, column_A, column_B, **kwargs):
        allow_cross_type_comparisons: bool = (
            kwargs.get("allow_cross_type_comparisons") or False
        )
        if allow_cross_type_comparisons:
            raise NotImplementedError

        parse_strings_as_datetimes: bool = (
            kwargs.get("parse_strings_as_datetimes") or False
        )
        if parse_strings_as_datetimes:
            raise NotImplementedError

        or_equal: bool = kwargs.get("or_equal") or False
        if or_equal:
            return sa.or_(
                column_A >= column_B, sa.and_(column_A == None, column_B == None)
            )
        else:
            return column_A > column_B
    def _sqlalchemy(cls, column_list, **kwargs):
        """
        The present approach relies on an inefficient query condition construction implementation, whose computational
        cost is O(num_columns^2).  However, until a more efficient implementation compatible with SQLAlchemy is
        available, this is the only feasible mechanism under the current architecture, where map metric providers must
        return a condition.  Nevertheless, SQL query length limit is 1GB (sufficient for most practical scenarios).
        """
        num_columns = len(column_list)

        # An arbitrary "num_columns" value used for issuing an explanatory message as a warning.
        if num_columns > 100:
            logger.warning(
                f"""Batch data with {num_columns} columns is detected.  Computing the "{cls.condition_metric_name}" \
metric for wide tables using SQLAlchemy leads to long WHERE clauses for the underlying database engine to process.
""")

        conditions = sa.or_(*(sa.or_(
            column_list[idx_src] == column_list[idx_dest],
            sa.and_(column_list[idx_src] == None, column_list[idx_dest] ==
                    None),
        ) for idx_src in range(num_columns - 1)
                              for idx_dest in range(idx_src + 1, num_columns)))
        row_wise_cond = sa.not_(conditions)
        return row_wise_cond
コード例 #11
0
 def _sqlalchemy(cls, column_A, column_B, **kwargs):
     """Build a condition asserting column_A equals column_B and neither is NULL."""
     neither_null = sa.not_(sa.or_(column_A == None, column_B == None))
     return sa.and_(column_A == column_B, neither_null)
コード例 #12
0
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """Count rows whose column value lies between min_value and max_value.

        Bounds and strictness flags are read from ``metric_value_kwargs``;
        ``strict_min`` / ``strict_max`` select exclusive (<, >) versus inclusive
        (<=, >=) comparisons.  The count is computed directly against the
        database and returned as a scalar.

        Raises:
            ValueError: if both bounds are None, or min_value > max_value.
        """
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        dialect_name = execution_engine.engine.dialect.name.lower()

        # Translate API-level -Infinity sentinels ("api_np" / "api_cast"
        # schemas) for min_value into this dialect's own representation.
        if (min_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    min_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True)

        # Same translation for a +Infinity sentinel appearing as min_value.
        if (min_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    min_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False)

        # Translate a -Infinity sentinel appearing as max_value.
        if (max_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    max_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True)

        # Translate a +Infinity sentinel appearing as max_value.
        if (max_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    max_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False)

        # Resolve the selectable and the target column name for this domain.
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = sa.column(accessor_domain_kwargs["column"])

        # Assemble the WHERE condition from whichever bounds are present.
        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = sa.and_(column > min_value, column < max_value)
            elif strict_min:
                condition = sa.and_(column > min_value, column <= max_value)
            elif strict_max:
                condition = sa.and_(column >= min_value, column < max_value)
            else:
                condition = sa.and_(column >= min_value, column <= max_value)

        # SELECT COUNT(*) FROM <selectable> WHERE <condition> -> scalar count.
        return execution_engine.engine.execute(
            sa.select([sa.func.count()
                       ]).select_from(selectable).where(condition)).scalar()
コード例 #13
0
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """return a list of counts corresponding to bins

        Args:
            column: the name of the column for which to get the histogram
            bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching

        Note:
            "column" is resolved from the metric domain kwargs and "bins" from
            ``metric_value_kwargs``.  Interior bins are half-open
            [left, right); the last bin is closed on the right unless its upper
            edge is an infinity sentinel.  NULL values are excluded.
        """
        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]
        bins = metric_value_kwargs["bins"]

        case_conditions = []
        idx = 0
        # Normalize the bin edges to a plain list (they may arrive as ndarray).
        if isinstance(bins, np.ndarray):
            bins = bins.tolist()
        else:
            bins = list(bins)

        # If we have an infinite lower bound, don't express that in sql
        if (bins[0] == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    bins[0] == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            # First bin becomes an unbounded "< bins[1]" bucket.
            case_conditions.append(
                sa.func.sum(
                    sa.case([(sa.column(column) < bins[idx + 1], 1)],
                            else_=0)).label("bin_" + str(idx)))
            idx += 1

        # Interior bins: count values in the half-open interval
        # [bins[idx], bins[idx + 1]).
        for idx in range(idx, len(bins) - 2):
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [(
                            sa.and_(
                                bins[idx] <= sa.column(column),
                                sa.column(column) < bins[idx + 1],
                            ),
                            1,
                        )],
                        else_=0,
                    )).label("bin_" + str(idx)))

        # If the upper edge is an infinity sentinel, drop the upper comparison
        # from the SQL; otherwise the last bin is closed on both sides.
        if (bins[-1] == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    bins[-1] == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(bins[-2] <= sa.column(column), 1)],
                            else_=0)).label("bin_" + str(len(bins) - 1)))
        else:
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [(
                            sa.and_(
                                bins[-2] <= sa.column(column),
                                sa.column(column) <= bins[-1],
                            ),
                            1,
                        )],
                        else_=0,
                    )).label("bin_" + str(len(bins) - 1)))

        # Exclude NULLs so they are not counted in any bin.
        query = (sa.select(case_conditions).where(
            sa.column(column) != None, ).select_from(selectable))

        # Run the data through convert_to_json_serializable to ensure we do not have Decimal types
        hist = convert_to_json_serializable(
            list(execution_engine.engine.execute(query).fetchone()))
        return hist
 def _sqlalchemy(cls, column_A, column_B, **kwargs):
     """Build a condition asserting |column_A - column_B| equals an expected difference.

     The expected difference was previously hard-coded to 3; it now defaults to 3
     (backward compatible) and can be overridden via the "diff" keyword argument.
     Rows where either column is NULL never satisfy the condition.
     """
     expected_diff = kwargs.get("diff", 3)
     row_wise_cond = sa.and_(
         sa.func.abs(column_A - column_B) == expected_diff,
         sa.not_(sa.or_(column_A == None, column_B == None)),
     )
     return row_wise_cond
コード例 #15
0
    def _sqlalchemy_function(self, column_list, **kwargs):
        """
        Computes the "map" between the specified "column_list" (treated as a group so as to model the "compound" aspect)
        and the number of occurrences of every permutation of the values of "column_list" as the grouped subset of all
        rows of the table.  In the present context, the term "compound" refers to having to treat the specified columns
        as unique together (e.g., as a multi-column primary key).  For example, suppose that in the example below, all
        three columns ("A", "B", and "C") of the table are included as part of the "compound" columns list (i.e.,
        column_list = ["A", "B", "C"]):

            A B C _num_rows
            1 1 2 2
            1 2 3 1
            1 1 2 2
            2 2 2 1
            3 2 3 1

        The fourth column, "_num_rows", holds the value of the "map" function -- the number of rows the group occurs in.

        Returns:
            A SQLAlchemy "FromClause" (aliased sub-query) that augments every row of
            the original table with the "_num_rows" group-occurrence count column.
        """

        # Needed as keys (hence, string valued) to access "ColumnElement" objects contained within the "FROM" clauses.
        column_names = kwargs.get("_column_names")

        # Need all columns of the table for the purposes of reporting entire rows satisfying unexpected condition logic.
        table_columns = kwargs.get("_table_columns")

        table = kwargs.get(
            "_table"
        )  # Note that here, "table" is of the "sqlalchemy.sql.selectable.Subquery" type.

        # First, obtain the SQLAlchemy "FromClause" version of the original "table" for the purposes of gaining the
        # "FromClause.c" attribute, which is a namespace of all the columns contained within the "FROM" clause (these
        # elements are themselves subclasses of the SQLAlchemy "ColumnElement" class).
        table_columns_selector = [
            sa.column(column_name) for column_name in table_columns
        ]
        original_table_clause = (sa.select(table_columns_selector).select_from(
            table).alias("original_table_clause"))

        # Second, "SELECT FROM" the original table, represented by the "FromClause" object, querying all columns of the
        # table and the count of occurrences of distinct "compound" (i.e., group, as specified by "column_list") values.
        # Give this aggregated group count a distinctive label.
        # Give the resulting sub-query a unique alias in order to disambiguate column names in subsequent queries.
        count_selector = column_list + [sa.func.count().label("_num_rows")]
        group_count_query = (sa.select(count_selector).group_by(
            *column_list).select_from(original_table_clause).alias(
                "group_counts_subquery"))

        # The above "group_count_query", if executed, will produce the result set containing the number of rows that
        # equals the number of distinct values of the group -- unique grouping (e.g., as in a multi-column primary key).
        # Hence, in order for the "_num_rows" column values to provide an entry for each row of the original table, the
        # "SELECT FROM" of "group_count_query" must undergo an "INNER JOIN" operation with the "original_table_clause"
        # object, whereby all table columns in the two "FromClause" objects must match, respectively, as the conditions.
        conditions = sa.and_(
            *(group_count_query.c[name] == original_table_clause.c[name]
              for name in column_names))
        # noinspection PyProtectedMember
        compound_columns_count_query = (sa.select([
            original_table_clause,
            group_count_query.c._num_rows.label("_num_rows"),
        ]).select_from(
            original_table_clause.join(
                right=group_count_query, onclause=conditions,
                isouter=False)).alias(
                    "records_with_grouped_column_counts_subquery"))

        # The returned SQLAlchemy "FromClause" "compound_columns_count_query" object realizes the "map" metric function.
        return compound_columns_count_query