import logging
import traceback
from typing import Any, Dict, Iterable, List, Tuple

import numpy as np
import pandas as pd
import sqlalchemy as sa
from sqlalchemy.dialects import mysql  # noqa: F401 -- makes sa.dialects.mysql resolvable
from sqlalchemy.exc import ProgrammingError
from sqlalchemy.sql.elements import Label, WithinGroup
from sqlalchemy.sql.expression import Select
from sqlalchemy.sql.selectable import CTE

try:
    from sqlalchemy.engine.row import Row  # SQLAlchemy >= 1.4
except ImportError:
    from sqlalchemy.engine import RowProxy as Row  # SQLAlchemy 1.3

# Great Expectations internals used below; these import paths follow the
# 0.13.x-era layout that this code resembles and may differ in other versions.
from great_expectations.core.util import convert_to_json_serializable
from great_expectations.execution_engine import SqlAlchemyExecutionEngine
from great_expectations.execution_engine.execution_engine import MetricDomainTypes
from great_expectations.expectations.metrics.util import (
    get_sql_dialect_floating_point_infinity_value,
)

logger = logging.getLogger(__name__)


def _get_column_quantiles_mysql(
    column, quantiles: Iterable, selectable, sqlalchemy_engine
) -> list:
    # MySQL does not support "percentile_disc", so we implement it as a compound query.
    # Please see https://stackoverflow.com/questions/19770026/calculate-percentile-value-using-mysql for reference.
    percent_rank_query: CTE = (
        sa.select(
            [
                column,
                sa.cast(
                    sa.func.percent_rank().over(order_by=column.asc()),
                    sa.dialects.mysql.DECIMAL(18, 15),
                ).label("p"),
            ]
        )
        .order_by(sa.column("p").asc())
        .select_from(selectable)
        .cte("t")
    )
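    # The CTE above compiles to SQL roughly like (sketch, not the exact output):
    #   SELECT <column>,
    #          CAST(PERCENT_RANK() OVER (ORDER BY <column> ASC) AS DECIMAL(18, 15)) AS p
    #   FROM <selectable>
    #   ORDER BY p ASC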

    selects: List[Label] = []  # each appended element is a labeled window expression
    for idx, quantile in enumerate(quantiles):
        # pymysql cannot handle conversion of numpy float64 to float; convert just in case
        if np.issubdtype(type(quantile), np.floating):  # np.float_ was removed in NumPy 2.0
            quantile = float(quantile)
        quantile_column: Label = (
            sa.func.first_value(column)
            .over(
                order_by=sa.case(
                    [
                        (
                            percent_rank_query.c.p
                            <= sa.cast(quantile, sa.dialects.mysql.DECIMAL(18, 15)),
                            percent_rank_query.c.p,
                        )
                    ],
                    else_=None,
                ).desc()
            )
            .label(f"q_{idx}")
        )
        selects.append(quantile_column)
    quantiles_query: Select = (
        sa.select(selects).distinct().order_by(percent_rank_query.c.p.desc())
    )

    try:
        quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone()
        return list(quantiles_results)
    except ProgrammingError as pe:
        exception_message: str = "An SQL syntax Exception occurred."
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)
        raise pe
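

# The skew metric below relies on a "_get_query_result" helper that is not part
# of this excerpt.  A minimal sketch, assuming it runs a single aggregate
# expression against the selectable and unwraps the scalar result (the error
# handling mirrors the MySQL quantiles helper above):
def _get_query_result(func, selectable, sqlalchemy_engine):
    simple_query: Select = sa.select([func]).select_from(selectable)
    try:
        result: Row = sqlalchemy_engine.execute(simple_query).fetchone()
        return result[0]
    except ProgrammingError as pe:
        exception_message: str = "An SQL syntax Exception occurred."
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)
        raise pe
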
    def _sqlalchemy(
        cls,
        execution_engine: "SqlAlchemyExecutionEngine",
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )

        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        dialect = sqlalchemy_engine.dialect

        column_mean = _get_query_result(
            func=sa.func.avg(column * 1.0),
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )

        column_count = _get_query_result(
            func=sa.func.count(column),
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )

        if dialect.name.lower() == "mssql":
            standard_deviation = sa.func.stdev(column)
        else:
            standard_deviation = sa.func.stddev_samp(column)

        column_std = _get_query_result(
            func=standard_deviation,
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )

        column_third_moment = _get_query_result(
            func=sa.func.sum(sa.func.pow(column - column_mean, 3)),
            selectable=selectable,
            sqlalchemy_engine=sqlalchemy_engine,
        )

        column_skew = column_third_moment / (column_std ** 3) / (column_count - 1)
        if metric_value_kwargs["abs"]:
            return np.abs(column_skew)
        else:
            return column_skew
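
# Sanity check of the arithmetic above: the query computes sum((x - mean)**3), so
# column_skew = sum((x - mean)**3) / (s**3 * (n - 1)), i.e., the standardized third
# moment estimated with the sample standard deviation s.  For values [1, 2, 9]
# (mean 4, s = sqrt(19) ~ 4.36), the third-moment sum is (-3)**3 + (-2)**3 + 5**3 = 90,
# giving a skew of roughly 90 / (82.8 * 2) ~ 0.54.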
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        sort = metric_value_kwargs.get("sort", cls.default_kwarg_values["sort"])
        collate = metric_value_kwargs.get("collate", cls.default_kwarg_values["collate"])

        if sort not in ["value", "count", "none"]:
            raise ValueError("sort must be either 'value', 'count', or 'none'")
        if collate is not None:
            raise ValueError(
                "collate parameter is not supported in the SQLAlchemy implementation"
            )

        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )
        column = accessor_domain_kwargs["column"]

        query = (
            sa.select(
                [
                    sa.column(column).label("value"),
                    sa.func.count(sa.column(column)).label("count"),
                ]
            )
            .where(sa.column(column) != None)  # noqa: E711 -- SQLAlchemy needs "!= None" to emit IS NOT NULL
            .group_by(sa.column(column))
        )
        if sort == "value":
            # NOTE: depending on the way the underlying database collates columns,
            # ordering can vary. postgresql collate "C" matches default sort
            # for python and most other systems, but is not universally supported,
            # so we use the default sort for the system, unless specifically overridden
            if collate is not None:
                query = query.order_by(sa.column(column).collate(collate))
            else:
                query = query.order_by(sa.column(column))
        elif sort == "count":
            query = query.order_by(sa.column("count").desc())
        results = execution_engine.engine.execute(
            query.select_from(selectable)
        ).fetchall()
        series = pd.Series(
            [row[1] for row in results],
            index=pd.Index(data=[row[0] for row in results], name="value"),
            name="count",
        )
        return series
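
# For sort="count", the statement built above renders roughly as (sketch):
#   SELECT <column> AS value, count(<column>) AS count
#   FROM <selectable>
#   WHERE <column> IS NOT NULL
#   GROUP BY <column>
#   ORDER BY count DESC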
    def _sqlalchemy(
        cls,
        execution_engine: "SqlAlchemyExecutionEngine",
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        """SqlAlchemy Median Implementation"""
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )
        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        dialect = sqlalchemy_engine.dialect
        if dialect.name.lower() == "awsathena":
            raise NotImplementedError("AWS Athena does not support OFFSET.")
        nonnull_count = metrics.get("column_values.nonnull.count")
        if not nonnull_count:
            return None
        element_values = sqlalchemy_engine.execute(
            sa.select([column])
            .order_by(column)
            .where(column != None)
            .offset(max(nonnull_count // 2 - 1, 0))
            .limit(2)
            .select_from(selectable)
        )

        column_values = list(element_values.fetchall())

        if len(column_values) == 0:
            column_median = None
        elif nonnull_count % 2 == 0:
            # An even number of column values: take the average of the two center values
            column_median = (
                float(
                    column_values[0][0]  # left center value
                    + column_values[1][0]  # right center value
                )
                / 2.0
            )
        elif len(column_values) == 1:
            # A single non-null value is its own median (guards the index below)
            column_median = column_values[0][0]
        else:
            # An odd number of column values: the second fetched row is the true center value
            column_median = column_values[1][0]
        return column_median
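
# Worked example of the OFFSET/LIMIT arithmetic above: with nonnull_count = 5, the
# query skips max(5 // 2 - 1, 0) = 1 row and fetches 2, i.e., the 2nd and 3rd ordered
# values; the median is the 3rd value, column_values[1][0].  With nonnull_count = 4,
# it also skips 1 row, and the two fetched center values are averaged.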
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.COLUMN
        )

        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine

        query = sa.select([sa.func.max(column)]).select_from(selectable)
        result = sqlalchemy_engine.execute(query).fetchone()

        return result[0]
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        dialect_name = execution_engine.engine.dialect.name.lower()

        # Normalize the library's +/-Infinity sentinels (expressed in the "api_np"
        # and "api_cast" schemas) to the representation used by this dialect.
        negative_infinity_values = [
            get_sql_dialect_floating_point_infinity_value(schema=schema, negative=True)
            for schema in ("api_np", "api_cast")
        ]
        positive_infinity_values = [
            get_sql_dialect_floating_point_infinity_value(schema=schema, negative=False)
            for schema in ("api_np", "api_cast")
        ]

        if min_value in negative_infinity_values:
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )
        if min_value in positive_infinity_values:
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )
        if max_value in negative_infinity_values:
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )
        if max_value in positive_infinity_values:
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )

        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
        )
        column = sa.column(accessor_domain_kwargs["column"])

        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = sa.and_(column > min_value, column < max_value)
            elif strict_min:
                condition = sa.and_(column > min_value, column <= max_value)
            elif strict_max:
                condition = sa.and_(column >= min_value, column < max_value)
            else:
                condition = sa.and_(column >= min_value, column <= max_value)

        return execution_engine.engine.execute(
            sa.select([sa.func.count()]).select_from(selectable).where(condition)
        ).scalar()
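
# Example: min_value=0, max_value=10, strict_min=True, strict_max=False produces
# (sketch):  SELECT count(*) FROM <selectable> WHERE <column> > 0 AND <column> <= 10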
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """return a list of counts corresponding to bins

        Args:
            column: the name of the column for which to get the histogram
            bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching
        """
        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]
        bins = metric_value_kwargs["bins"]

        case_conditions = []
        idx = 0
        if isinstance(bins, np.ndarray):
            bins = bins.tolist()
        else:
            bins = list(bins)

        # If we have an infinite lower bound, don't express that in SQL
        if bins[0] == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=True
        ) or bins[0] == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=True
        ):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(sa.column(column) < bins[idx + 1], 1)], else_=0)
                ).label(f"bin_{idx}")
            )
            idx += 1

        for idx in range(idx, len(bins) - 2):
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [
                            (
                                sa.and_(
                                    bins[idx] <= sa.column(column),
                                    sa.column(column) < bins[idx + 1],
                                ),
                                1,
                            )
                        ],
                        else_=0,
                    )
                ).label(f"bin_{idx}")
            )

        if bins[-1] == get_sql_dialect_floating_point_infinity_value(
            schema="api_np", negative=False
        ) or bins[-1] == get_sql_dialect_floating_point_infinity_value(
            schema="api_cast", negative=False
        ):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(bins[-2] <= sa.column(column), 1)], else_=0)
                ).label(f"bin_{len(bins) - 1}")
            )
        else:
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [
                            (
                                sa.and_(
                                    bins[-2] <= sa.column(column),
                                    sa.column(column) <= bins[-1],
                                ),
                                1,
                            )
                        ],
                        else_=0,
                    )
                ).label(f"bin_{len(bins) - 1}")
            )

        query = (
            sa.select(case_conditions)
            .where(sa.column(column) != None)
            .select_from(selectable)
        )

        # Run the data through convert_to_json_serializable to ensure we do not have Decimal types
        hist = convert_to_json_serializable(
            list(execution_engine.engine.execute(query).fetchone())
        )
        return hist
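
# Worked example: bins = [0, 5, 10] with no infinite edges yields two CASE sums:
#   bin_0: rows with 0 <= <column> < 5   (interior bins are half-open)
#   bin_2: rows with 5 <= <column> <= 10 (the last bin is closed on the right; its
#          label index is len(bins) - 1 by construction, so "bin_1" is skipped)
# and the returned list holds the two counts in bin order.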
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.COLUMN
        )
        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        dialect = sqlalchemy_engine.dialect
        quantiles = metric_value_kwargs["quantiles"]
        allow_relative_error = metric_value_kwargs.get("allow_relative_error", False)
        table_row_count = metrics.get("table.row_count")
        if dialect.name.lower() == "mssql":
            return _get_column_quantiles_mssql(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "bigquery":
            return _get_column_quantiles_bigquery(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "mysql":
            return _get_column_quantiles_mysql(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "snowflake":
            # NOTE: 20201216 - JPC - snowflake has a representation/precision limitation
            # in its percentile_disc implementation that causes an error when we do
            # not round. It is unclear to me *how* the call to round affects the behavior --
            # the binary representation should be identical before and after, and I do
            # not observe a type difference. However, the issue is replicable in the
            # snowflake console and directly observable in side-by-side comparisons with
            # and without the call to round()
            quantiles = [round(x, 10) for x in quantiles]
            return _get_column_quantiles_generic_sqlalchemy(
                column=column,
                quantiles=quantiles,
                allow_relative_error=allow_relative_error,
                dialect=dialect,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
        elif dialect.name.lower() == "sqlite":
            return _get_column_quantiles_sqlite(
                column=column,
                quantiles=quantiles,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
                table_row_count=table_row_count,
            )
        else:
            return _get_column_quantiles_generic_sqlalchemy(
                column=column,
                quantiles=quantiles,
                allow_relative_error=allow_relative_error,
                dialect=dialect,
                selectable=selectable,
                sqlalchemy_engine=sqlalchemy_engine,
            )
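
# The dispatcher above calls several per-dialect helpers that are not part of this
# excerpt.  A minimal sketch of the generic fallback, assuming it uses the standard
# "percentile_disc(...) WITHIN GROUP (ORDER BY ...)" aggregate; the real helper also
# consults "allow_relative_error" to fall back to approximate percentiles where the
# dialect offers them, which this sketch omits:
def _get_column_quantiles_generic_sqlalchemy(
    column, quantiles: Iterable, allow_relative_error, dialect, selectable, sqlalchemy_engine
) -> list:
    selects: List[WithinGroup] = [
        sa.func.percentile_disc(quantile).within_group(column.asc())
        for quantile in quantiles
    ]
    quantiles_query: Select = sa.select(selects).select_from(selectable)
    try:
        quantiles_results: Row = sqlalchemy_engine.execute(quantiles_query).fetchone()
        return list(quantiles_results)
    except ProgrammingError as pe:
        exception_message: str = "An SQL syntax Exception occurred."
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)
        raise pe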
    def _sqlalchemy_function(self, column_list, **kwargs):
        """
        Computes the "map" between the specified "column_list" (treated as a group so as to model the "compound" aspect)
        and the number of occurrences of every permutation of the values of "column_list" as the grouped subset of all
        rows of the table.  In the present context, the term "compound" refers to having to treat the specified columns
        as unique together (e.g., as a multi-column primary key).  For example, suppose that in the example below, all
        three columns ("A", "B", and "C") of the table are included as part of the "compound" columns list (i.e.,
        column_list = ["A", "B", "C"]):

            A B C _num_rows
            1 1 2 2
            1 2 3 1
            1 1 2 2
            2 2 2 1
            3 2 3 1

        The fourth column, "_num_rows", holds the value of the "map" function: the number of rows in which the group's
        value combination occurs.
        """

        # Needed as keys (hence, string valued) to access "ColumnElement" objects contained within the "FROM" clauses.
        column_names = kwargs.get("_column_names")

        # Need all columns of the table for the purposes of reporting entire rows satisfying unexpected condition logic.
        table_columns = kwargs.get("_table_columns")

        table = kwargs.get(
            "_table"
        )  # Note that here, "table" is of the "sqlalchemy.sql.selectable.Subquery" type.

        # First, obtain the SQLAlchemy "FromClause" version of the original "table" for the purposes of gaining the
        # "FromClause.c" attribute, which is a namespace of all the columns contained within the "FROM" clause (these
        # elements are themselves subclasses of the SQLAlchemy "ColumnElement" class).
        table_columns_selector = [
            sa.column(column_name) for column_name in table_columns
        ]
        original_table_clause = (
            sa.select(table_columns_selector)
            .select_from(table)
            .alias("original_table_clause")
        )

        # Second, "SELECT FROM" the original table, represented by the "FromClause" object, querying all columns of the
        # table and the count of occurrences of distinct "compound" (i.e., group, as specified by "column_list") values.
        # Give this aggregated group count a distinctive label.
        # Give the resulting sub-query a unique alias in order to disambiguate column names in subsequent queries.
        count_selector = column_list + [sa.func.count().label("_num_rows")]
        group_count_query = (
            sa.select(count_selector)
            .group_by(*column_list)
            .select_from(original_table_clause)
            .alias("group_counts_subquery")
        )

        # The above "group_count_query", if executed, will produce the result set containing the number of rows that
        # equals the number of distinct values of the group -- unique grouping (e.g., as in a multi-column primary key).
        # Hence, in order for the "_num_rows" column values to provide an entry for each row of the original table, the
        # "SELECT FROM" of "group_count_query" must undergo an "INNER JOIN" operation with the "original_table_clause"
        # object, whereby all table columns in the two "FromClause" objects must match, respectively, as the conditions.
        conditions = sa.and_(
            *(
                group_count_query.c[name] == original_table_clause.c[name]
                for name in column_names
            )
        )
        # noinspection PyProtectedMember
        compound_columns_count_query = (
            sa.select(
                [
                    original_table_clause,
                    group_count_query.c._num_rows.label("_num_rows"),
                ]
            )
            .select_from(
                original_table_clause.join(
                    right=group_count_query, onclause=conditions, isouter=False
                )
            )
            .alias("records_with_grouped_column_counts_subquery")
        )

        # The returned SQLAlchemy "FromClause" "compound_columns_count_query" object realizes the "map" metric function.
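        # The emitted SQL is roughly (sketch):
        #   SELECT original_table_clause.*, group_counts_subquery._num_rows AS _num_rows
        #   FROM original_table_clause
        #   JOIN (
        #       SELECT <column_list>, count(*) AS _num_rows
        #       FROM original_table_clause
        #       GROUP BY <column_list>
        #   ) AS group_counts_subquery
        #   ON <each column named in column_names matches pairwise>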
        return compound_columns_count_query