Example #1
def _get_column_quantiles_mssql(column, quantiles: Iterable, selectable,
                                sqlalchemy_engine) -> list:
    # mssql requires over(), so we add an empty over() clause
    selects: List[WithinGroup] = [
        sa.func.percentile_disc(quantile).within_group(column.asc()).over()
        for quantile in quantiles
    ]
    quantiles_query: Select = sa.select(selects).select_from(selectable)

    try:
        quantiles_results: Row = sqlalchemy_engine.execute(
            quantiles_query).fetchone()
        return list(quantiles_results)
    except ProgrammingError as pe:
        exception_message: str = "An SQL syntax Exception occurred."
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)
        raise pe
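The PERCENTILE_DISC ... WITHIN GROUP ... OVER () construct built above can be inspected without a live SQL Server connection by compiling it against the mssql dialect. A minimal sketch, assuming illustrative "yellow_tripdata"/"fare_amount" names and the list-style select() API used throughout these examples:

import sqlalchemy as sa
from sqlalchemy.dialects import mssql

column = sa.column("fare_amount")  # hypothetical column
selectable = sa.table("yellow_tripdata", sa.column("fare_amount"))  # hypothetical table

selects = [
    sa.func.percentile_disc(quantile).within_group(column.asc()).over()
    for quantile in (0.25, 0.5, 0.75)
]
quantiles_query = sa.select(selects).select_from(selectable)
print(quantiles_query.compile(dialect=mssql.dialect(),
                              compile_kwargs={"literal_binds": True}))
# Roughly: SELECT percentile_disc(0.25) WITHIN GROUP (ORDER BY fare_amount ASC) OVER (), ...
#          FROM yellow_tripdata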
Example #2
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """SqlAlchemy Median Implementation"""
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(metric_domain_kwargs,
                                                MetricDomainTypes.COLUMN)
        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine
        nonnull_count = metrics.get("column_values.nonnull.count")
        if not nonnull_count:
            return None
        element_values = sqlalchemy_engine.execute(
            sa.select([column]).order_by(column).where(column != None).offset(
                max(nonnull_count // 2 - 1,
                    0)).limit(2).select_from(selectable))

        column_values = list(element_values.fetchall())

        if len(column_values) == 0:
            column_median = None
        elif nonnull_count % 2 == 0:
            # An even number of column values: take the average of the two center values
            column_median = (
                float(
                    column_values[0][0]  # left center value
                    + column_values[1][0]  # right center value
                )
                / 2.0
            )  # average of the two center values
        else:
            # An odd number of column values, we can just take the center value
            column_median = column_values[1][0]  # True center value
        return column_median
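The OFFSET/LIMIT arithmetic above can be checked in isolation. Below is a DB-free sketch (not part of the source) of the same center-row logic, with a guard for the single-row case, compared against statistics.median:

import statistics

def center_rows_median(sorted_values):
    n = len(sorted_values)
    offset = max(n // 2 - 1, 0)              # same offset used in the query above
    rows = sorted_values[offset:offset + 2]  # stands in for OFFSET/LIMIT 2
    if not rows:
        return None
    if n % 2 == 0:
        return (rows[0] + rows[1]) / 2.0     # average of the two center values
    return rows[1] if len(rows) > 1 else rows[0]  # true center value

for values in ([3], [1, 2, 3, 4], [5, 1, 9, 2, 7]):
    ordered = sorted(values)
    assert center_rows_median(ordered) == statistics.median(ordered)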
def _get_column_quantiles_sqlite(
    column, quantiles: Iterable, selectable, sqlalchemy_engine, table_row_count
) -> list:
    """
    The present implementation is somewhat inefficient, because it requires as many calls to
    "sqlalchemy_engine.execute()" as the number of partitions in the "quantiles" parameter (albeit, typically,
    only a few).  However, this is the only mechanism available for SQLite at the present time (11/17/2021), because
    the analytical processing is not a very strongly represented capability of the SQLite database management system.
    """
    offsets: List[int] = [quantile * table_row_count - 1 for quantile in quantiles]
    quantile_queries: List[Select] = [
        sa.select([column])
        .order_by(column.asc())
        .offset(offset)
        .limit(1)
        .select_from(selectable)
        for offset in offsets
    ]

    quantile_result: Row
    quantile_query: Select
    try:
        quantiles_results: List[Row] = [
            sqlalchemy_engine.execute(quantile_query).fetchone()
            for quantile_query in quantile_queries
        ]
        return list(
            itertools.chain.from_iterable(
                [list(quantile_result) for quantile_result in quantiles_results]
            )
        )
    except ProgrammingError as pe:
        exception_message: str = "An SQL syntax Exception occurred."
        exception_traceback: str = traceback.format_exc()
        exception_message += (
            f'{type(pe).__name__}: "{str(pe)}".  Traceback: "{exception_traceback}".'
        )
        logger.error(exception_message)
        raise pe
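The same offset-based lookup can be exercised end to end against an in-memory SQLite database. A minimal sketch, assuming the 1.x-style engine.execute() API used throughout these examples, with a hypothetical table "t" and the offsets truncated to integers:

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
engine.execute("CREATE TABLE t (x INTEGER)")
engine.execute(
    "INSERT INTO t (x) VALUES " + ", ".join(f"({i})" for i in range(1, 101))
)

quantiles = [0.25, 0.5, 0.75]
table_row_count = engine.execute("SELECT COUNT(*) FROM t").scalar()
offsets = [int(quantile * table_row_count) - 1 for quantile in quantiles]

values = [
    engine.execute(
        f"SELECT x FROM t ORDER BY x ASC LIMIT 1 OFFSET {offset}"
    ).scalar()
    for offset in offsets
]
print(values)  # [25, 50, 75]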
Example #4
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs,
        metric_value_kwargs,
        metrics,
        runtime_configuration,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(metric_domain_kwargs,
                                                MetricDomainTypes.COLUMN)

        column_name = accessor_domain_kwargs["column"]
        column = sa.column(column_name)
        sqlalchemy_engine = execution_engine.engine

        query = sa.select(sa.func.max(column)).select_from(selectable)
        result = sqlalchemy_engine.execute(query).fetchone()

        return result[0]
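For reference, the statement above compiles to a plain aggregate query. A quick sketch with illustrative "yellow_tripdata"/"fare_amount" names:

import sqlalchemy as sa

query = sa.select(sa.func.max(sa.column("fare_amount"))).select_from(
    sa.table("yellow_tripdata"))
print(query)  # SELECT max(fare_amount) AS max_1 FROM yellow_tripdata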
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        dialect_name = execution_engine.engine.dialect.name.lower()

        if (min_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    min_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True)

        if (min_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    min_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False)

        if (max_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    max_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True)

        if (max_value == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    max_value == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False)

        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = sa.column(accessor_domain_kwargs["column"])

        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = sa.and_(column > min_value, column < max_value)
            elif strict_min:
                condition = sa.and_(column > min_value, column <= max_value)
            elif strict_max:
                condition = sa.and_(column >= min_value, column < max_value)
            else:
                condition = sa.and_(column >= min_value, column <= max_value)

        return execution_engine.engine.execute(
            sa.select([sa.func.count()
                       ]).select_from(selectable).where(condition)).scalar()
# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "main.yellow_tripdata_sample_2019_01"

context.create_expectation_suite(
    expectation_suite_name="test_suite", overwrite_existing=True
)
validator = context.get_validator(
    batch_request=batch_request, expectation_suite_name="test_suite"
)
print(validator.head(n_rows=10))

batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1
batch_data = batch_list[0].data
num_rows = batch_data.execution_engine.engine.execute(
    sa.select([sa.func.count()]).select_from(batch_data.selectable)
).one()[0]
assert num_rows == 10000

# Here is a BatchRequest naming an inferred data_asset partitioned by day.
# This BatchRequest specifies multiple batches, which is useful for dataset exploration.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="daily",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "main.yellow_tripdata_sample_2019_01"
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        """return a list of counts corresponding to bins

        Args:
            column: the name of the column for which to get the histogram
            bins: tuple of bin edges for which to get histogram values; *must* be tuple to support caching
        """
        selectable, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = accessor_domain_kwargs["column"]
        bins = metric_value_kwargs["bins"]

        case_conditions = []
        idx = 0
        if isinstance(bins, np.ndarray):
            bins = bins.tolist()
        else:
            bins = list(bins)

        # If we have an infinite lower bound, don't express that in sql
        if (bins[0] == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=True)) or (
                    bins[0] == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=True)):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(sa.column(column) < bins[idx + 1], 1)],
                            else_=0)).label("bin_" + str(idx)))
            idx += 1

        for idx in range(idx, len(bins) - 2):
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [(
                            sa.and_(
                                bins[idx] <= sa.column(column),
                                sa.column(column) < bins[idx + 1],
                            ),
                            1,
                        )],
                        else_=0,
                    )).label("bin_" + str(idx)))

        if (bins[-1] == get_sql_dialect_floating_point_infinity_value(
                schema="api_np", negative=False)) or (
                    bins[-1] == get_sql_dialect_floating_point_infinity_value(
                        schema="api_cast", negative=False)):
            case_conditions.append(
                sa.func.sum(
                    sa.case([(bins[-2] <= sa.column(column), 1)],
                            else_=0)).label("bin_" + str(len(bins) - 1)))
        else:
            case_conditions.append(
                sa.func.sum(
                    sa.case(
                        [(
                            sa.and_(
                                bins[-2] <= sa.column(column),
                                sa.column(column) <= bins[-1],
                            ),
                            1,
                        )],
                        else_=0,
                    )).label("bin_" + str(len(bins) - 1)))

        query = (
            sa.select(case_conditions)
            .where(sa.column(column) != None)
            .select_from(selectable)
        )

        # Run the data through convert_to_json_serializable to ensure we do not have Decimal types
        hist = convert_to_json_serializable(
            list(execution_engine.engine.execute(query).fetchone()))
        return hist
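Each bin above becomes a SUM over a CASE expression. Below is a sketch of a single interior bin, assuming an illustrative "fare_amount" column, hypothetical edges 10 and 20, and the 1.x-style sa.case([...]) signature used in this example:

import sqlalchemy as sa

column = "fare_amount"  # hypothetical column name
lower, upper = 10, 20   # hypothetical bin edges

bin_expr = sa.func.sum(
    sa.case(
        [(sa.and_(lower <= sa.column(column), sa.column(column) < upper), 1)],
        else_=0,
    )
).label("bin_0")
query = sa.select([bin_expr])
print(query.compile(compile_kwargs={"literal_binds": True}))
# SELECT sum(CASE WHEN (fare_amount >= 10 AND fare_amount < 20) THEN 1 ELSE 0 END) AS bin_0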
# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019_01"

context.create_expectation_suite(expectation_suite_name="test_suite",
                                 overwrite_existing=True)
validator = context.get_validator(batch_request=batch_request,
                                  expectation_suite_name="test_suite")
print(validator.head(n_rows=10))

batch_list = context.get_batch_list(batch_request=batch_request)
assert len(batch_list) == 1
batch_data = batch_list[0].data
num_rows = batch_data.execution_engine.engine.execute(
    sa.select([sa.func.count()]).select_from(batch_data.selectable)).one()[0]
assert num_rows == 10000

# Here is a BatchRequest naming an inferred data_asset partitioned by day.
# This BatchRequest specifies multiple batches, which is useful for dataset exploration.
batch_request = BatchRequest(
    datasource_name="taxi_datasource",
    data_connector_name="daily",
    data_asset_name="<YOUR_DATA_ASSET_NAME>",
)

# Please note this override is only to provide good UX for docs and tests.
# In normal usage you'd set your data asset name directly in the BatchRequest above.
batch_request.data_asset_name = "yellow_tripdata_sample_2019_01"

batch_list = context.get_batch_list(batch_request=batch_request)
Example #9
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[str, Any],
        runtime_configuration: Dict,
    ):
        selectable, _, _ = execution_engine.get_compute_domain(
            metric_domain_kwargs, domain_type=MetricDomainTypes.TABLE)
        df = None
        table_name = getattr(selectable, "name", None)
        if table_name is None:
            # if a custom query was passed
            try:
                if metric_value_kwargs["fetch_all"]:
                    df = pd.read_sql_query(
                        sql=selectable,
                        con=execution_engine.engine,
                    )
                else:
                    df = next(
                        pd.read_sql_query(
                            sql=selectable,
                            con=execution_engine.engine,
                            chunksize=metric_value_kwargs["n_rows"],
                        ))
            except (ValueError, NotImplementedError):
                # It appears that the MetaData used by pd.read_sql_query cannot
                # work on a temp table; if it fails, fall back to fetching the
                # data with pd.read_sql below.
                df = None
            except StopIteration:
                validator = Validator(execution_engine=execution_engine)
                columns = validator.get_metric(
                    MetricConfiguration("table.columns", metric_domain_kwargs))
                df = pd.DataFrame(columns=columns)
        else:
            try:
                if metric_value_kwargs["fetch_all"]:
                    df = pd.read_sql_table(
                        table_name=getattr(selectable, "name", None),
                        schema=getattr(selectable, "schema", None),
                        con=execution_engine.engine,
                    )
                else:
                    df = next(
                        pd.read_sql_table(
                            table_name=getattr(selectable, "name", None),
                            schema=getattr(selectable, "schema", None),
                            con=execution_engine.engine,
                            chunksize=metric_value_kwargs["n_rows"],
                        ))
            except (ValueError, NotImplementedError):
                # It appears that the MetaData used by pd.read_sql_table cannot
                # work on a temp table; if it fails, fall back to fetching the
                # data with pd.read_sql below.
                df = None
            except StopIteration:
                validator = Validator(execution_engine=execution_engine)
                columns = validator.get_metric(
                    MetricConfiguration("table.columns", metric_domain_kwargs))
                df = pd.DataFrame(columns=columns)

        if df is None:
            # we want to compile our selectable
            stmt = sa.select(["*"]).select_from(selectable)
            if metric_value_kwargs["fetch_all"]:
                sql = stmt.compile(
                    dialect=execution_engine.engine.dialect,
                    compile_kwargs={"literal_binds": True},
                )
            elif execution_engine.engine.dialect.name.lower() == "mssql":
                # limit doesn't compile properly for mssql
                sql = str(
                    stmt.compile(
                        dialect=execution_engine.engine.dialect,
                        compile_kwargs={"literal_binds": True},
                    ))
                sql = f"SELECT TOP {metric_value_kwargs['n_rows']}{sql[6:]}"
            else:
                stmt = stmt.limit(metric_value_kwargs["n_rows"])
                sql = stmt.compile(
                    dialect=execution_engine.engine.dialect,
                    compile_kwargs={"literal_binds": True},
                )

            df = pd.read_sql(sql, con=execution_engine.engine)

        return df
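The mssql branch above works around the LIMIT compilation issue by splicing a TOP clause into the already-compiled statement. A tiny sketch of that string surgery, using a hypothetical statement in place of the compiled SQL:

sql = "SELECT * FROM yellow_tripdata"        # stand-in for the compiled statement
n_rows = 5
mssql_sql = f"SELECT TOP {n_rows}{sql[6:]}"  # sql[6:] keeps everything after "SELECT"
print(mssql_sql)                             # SELECT TOP 5 * FROM yellow_tripdata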
Example #10
    def _sqlalchemy_function(self, column_list, **kwargs):
        """
        Computes the "map" between the specified "column_list" (treated as a group so as to model the "compound" aspect)
        and the number of occurrences of every permutation of the values of "column_list" as the grouped subset of all
        rows of the table.  In the present context, the term "compound" refers to having to treat the specified columns
        as unique together (e.g., as a multi-column primary key).  For example, suppose that in the example below, all
        three columns ("A", "B", and "C") of the table are included as part of the "compound" columns list (i.e.,
        column_list = ["A", "B", "C"]):

            A B C _num_rows
            1 1 2 2
            1 2 3 1
            1 1 2 2
            2 2 2 1
            3 2 3 1

        The fourth column, "_num_rows", holds the value of the "map" function -- the number of rows the group occurs in.
        """

        # Needed as keys (hence, string valued) to access "ColumnElement" objects contained within the "FROM" clauses.
        column_names = kwargs.get("_column_names")

        # Need all columns of the table for the purposes of reporting entire rows satisfying unexpected condition logic.
        table_columns = kwargs.get("_table_columns")

        table = kwargs.get(
            "_table"
        )  # Note that here, "table" is of the "sqlalchemy.sql.selectable.Subquery" type.

        # First, obtain the SQLAlchemy "FromClause" version of the original "table" for the purposes of gaining the
        # "FromClause.c" attribute, which is a namespace of all the columns contained within the "FROM" clause (these
        # elements are themselves subclasses of the SQLAlchemy "ColumnElement" class).
        table_columns_selector = [
            sa.column(column_name) for column_name in table_columns
        ]
        original_table_clause = (sa.select(table_columns_selector).select_from(
            table).alias("original_table_clause"))

        # Second, "SELECT FROM" the original table, represented by the "FromClause" object, querying all columns of the
        # table and the count of occurrences of distinct "compound" (i.e., group, as specified by "column_list") values.
        # Give this aggregated group count a distinctive label.
        # Give the resulting sub-query a unique alias in order to disambiguate column names in subsequent queries.
        count_selector = column_list + [sa.func.count().label("_num_rows")]
        group_count_query = (sa.select(count_selector).group_by(
            *column_list).select_from(original_table_clause).alias(
                "group_counts_subquery"))

        # The above "group_count_query", if executed, will produce the result set containing the number of rows that
        # equals the number of distinct values of the group -- unique grouping (e.g., as in a multi-column primary key).
        # Hence, in order for the "_num_rows" column values to provide an entry for each row of the original table, the
        # "SELECT FROM" of "group_count_query" must undergo an "INNER JOIN" operation with the "original_table_clause"
        # object, whereby all table columns in the two "FromClause" objects must match, respectively, as the conditions.
        conditions = sa.and_(
            *(group_count_query.c[name] == original_table_clause.c[name]
              for name in column_names))
        # noinspection PyProtectedMember
        compound_columns_count_query = (sa.select([
            original_table_clause,
            group_count_query.c._num_rows.label("_num_rows"),
        ]).select_from(
            original_table_clause.join(
                right=group_count_query, onclause=conditions,
                isouter=False)).alias(
                    "records_with_grouped_column_counts_subquery"))

        # The returned SQLAlchemy "FromClause" "compound_columns_count_query" object realizes the "map" metric function.
        return compound_columns_count_query
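The SQL equivalent of the query built above, for the docstring's example table, is a join of the original rows against their GROUP BY counts. A self-contained sketch against in-memory SQLite, assuming the 1.x-style engine.execute() API used throughout these examples and a hypothetical table "t":

import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
engine.execute("CREATE TABLE t (A INTEGER, B INTEGER, C INTEGER)")
engine.execute(
    "INSERT INTO t (A, B, C) VALUES (1,1,2), (1,2,3), (1,1,2), (2,2,2), (3,2,3)"
)

rows = engine.execute(
    """
    SELECT t.A, t.B, t.C, g._num_rows
    FROM t
    JOIN (SELECT A, B, C, COUNT(*) AS _num_rows FROM t GROUP BY A, B, C) AS g
      ON t.A = g.A AND t.B = g.B AND t.C = g.C
    """
).fetchall()
for row in rows:
    print(tuple(row))
# Each original row paired with its group count, e.g. (1, 1, 2, 2) and (2, 2, 2, 1).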