Example 1
def test_instantiation_via_url_and_retrieve_data_with_other_dialect(sa):
    """Ensure that we can still retrieve data when the dialect is not recognized."""

    # 1. Create engine with sqlite db
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets",
                     "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    # 2. Change dialect to one not listed in GESqlDialect
    my_execution_engine.engine.dialect.name = "other_dialect"

    # 3. Get data
    num_rows_in_sample: int = 10
    batch_data, _ = my_execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": num_rows_in_sample},
        ))

    # 4. Assert dialect and data are as expected

    assert batch_data.dialect == GESqlDialect.OTHER

    my_execution_engine.load_batch_data("__", batch_data)
    validator = Validator(my_execution_engine)
    assert len(validator.head(fetch_all=True)) == num_rows_in_sample
Example 2
def _build_sa_engine(df, sa):
    eng = sa.create_engine("sqlite://", echo=False)
    df.to_sql("test", eng, index=False)
    engine = SqlAlchemyExecutionEngine(engine=eng)
    batch_data = SqlAlchemyBatchData(execution_engine=engine, table_name="test")
    engine.load_batch_data("my_id", batch_data)
    return engine
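A quick usage sketch for the helper above (illustrative only; the call site is hypothetical):

import pandas as pd
import sqlalchemy as sa

# Hypothetical call site for _build_sa_engine: load a small frame into an
# in-memory sqlite engine; the returned engine has batch "my_id" loaded.
engine = _build_sa_engine(pd.DataFrame({"a": [1, 2, 3]}), sa)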
Example 3
def test_sa_expect_column_value_z_scores_to_be_less_than_impl(postgresql_engine):
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10]})
    df.to_sql(
        name="z_score_test_data",
        con=postgresql_engine,
        index=False,
        if_exists="replace",
    )
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    engine = SqlAlchemyExecutionEngine(engine=postgresql_engine)
    engine.load_batch_data(
        "my_id",
        SqlAlchemyBatchData(execution_engine=engine, table_name="z_score_test_data"),
    )
    result = expectation.validate(Validator(execution_engine=engine))
    assert result == ExpectationValidationResult(
        success=True,
    )
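The chosen data makes the expectation pass; a minimal pandas sketch of why, assuming the usual z-score definition (GE's exact std-dev convention, e.g. ddof, may differ):

import pandas as pd

# Double-sided z-scores for the column above: even the outlier 22 scores
# well below the threshold of 4, so far more than 90% of values qualify.
s = pd.Series([1, 5, 22, 3, 5, 10])
z = (s - s.mean()) / s.std()
assert (z.abs() < 4).mean() >= 0.9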
Example 4
def test_instantiation_via_connection_string(sa, test_db_connection_string):
    my_execution_engine = SqlAlchemyExecutionEngine(
        connection_string=test_db_connection_string)
    assert my_execution_engine.connection_string == test_db_connection_string
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url is None

    my_execution_engine.get_batch_data_and_markers(
        BatchSpec(
            table_name="main.table_1",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        ))
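The "_sample_using_limit" sampling method presumably reduces to a LIMIT clause; a standalone sqlalchemy sketch of the same effect (illustrative only, 1.3-style select):

import sqlalchemy as sa

# Roughly what sampling_kwargs={"n": 5} amounts to, expressed directly.
stmt = sa.select([sa.text("*")]).select_from(sa.table("table_1")).limit(5)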
Example 5
def test_cases_for_sql_data_connector_sqlite_execution_engine(sa):
    if sa is None:
        raise ValueError(
            "SQL Database tests require sqlalchemy to be installed.")

    db_file_path: str = file_relative_path(
        __file__,
        os.path.join("..", "test_sets",
                     "test_cases_for_sql_data_connector.db"),
    )

    engine: sa.engine.Engine = sa.create_engine(
        get_sqlite_connection_url(db_file_path))
    conn: sa.engine.Connection = engine.connect()
    raw_connection = engine.raw_connection()
    raw_connection.create_function("sqrt", 1, lambda x: math.sqrt(x))
    raw_connection.create_function(
        "md5", 2,
        lambda x, d: hashlib.md5(str(x).encode("utf-8")).hexdigest()[-1 * d:])

    # Build a SqlAlchemyDataset using that database
    return SqlAlchemyExecutionEngine(
        name="test_sql_execution_engine",
        engine=conn,
    )
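The raw-connection function registration above can be reproduced on a bare sqlite3 connection; a self-contained sketch:

import math
import sqlite3

# Register a Python callable as a SQL function, the same mechanism the
# fixture reaches through engine.raw_connection().
conn = sqlite3.connect(":memory:")
conn.create_function("sqrt", 1, math.sqrt)
assert conn.execute("SELECT sqrt(9.0)").fetchone()[0] == 3.0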
Example 6
def test_map_of_type_sa(sa):
    eng = sa.create_engine("sqlite://")
    df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
    df.to_sql("test", eng, index=False)
    batch_data = SqlAlchemyBatchData(engine=eng, table_name="test")
    engine = SqlAlchemyExecutionEngine(engine=eng,
                                       batch_data_dict={"my_id": batch_data})
    desired_metric = MetricConfiguration(
        metric_name="table.column_types",
        metric_domain_kwargs={},
        metric_value_kwargs={},
    )

    results = engine.resolve_metrics(metrics_to_resolve=(desired_metric, ))
    assert results[desired_metric.id][0]["name"] == "a"
    assert isinstance(results[desired_metric.id][0]["type"], sa.FLOAT)
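The FLOAT assertion follows from how pandas stores missing values; a minimal check:

import pandas as pd

# An integer column containing None is held as float64 by pandas, so
# to_sql creates a FLOAT column in sqlite, which the metric then reports.
df = pd.DataFrame({"a": [1, 2, 3, 3, None]})
assert str(df["a"].dtype) == "float64"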
Example 7
def _build_sa_engine(df, sa):
    eng = sa.create_engine("sqlite://", echo=False)
    df.to_sql("test", eng, index=False)
    batch_data = SqlAlchemyBatchData(engine=eng, table_name="test")
    engine = SqlAlchemyExecutionEngine(engine=eng,
                                       batch_data_dict={"my_id": batch_data})
    return engine
Example 8
def ge_validator_sqlalchemy() -> Validator:
    validator = Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"),
        batches=[
            Batch(
                data=None,
                batch_request=BatchRequest(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                ),
                batch_definition=BatchDefinition(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                    batch_identifiers=IDDict(),
                ),
                batch_spec=SqlAlchemyDatasourceBatchSpec({
                    "data_asset_name": "foo2",
                    "table_name": "foo2",
                    "batch_identifiers": {},
                    "schema_name": "public",
                    "type": "table",
                }),
            )
        ],
    )
    return validator
Example 9
            def inner_func(
                cls,
                execution_engine: SqlAlchemyExecutionEngine,
                metric_domain_kwargs: Dict,
                metric_value_kwargs: Dict,
                metrics: Dict[str, Any],
                runtime_configuration: Dict,
            ):
                filter_column_isnull = kwargs.get(
                    "filter_column_isnull",
                    getattr(cls, "filter_column_isnull", False))
                if filter_column_isnull:
                    compute_domain_kwargs = execution_engine.add_column_row_condition(
                        metric_domain_kwargs)
                else:
                    # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
                    compute_domain_kwargs = metric_domain_kwargs
                (
                    selectable,
                    compute_domain_kwargs,
                    accessor_domain_kwargs,
                ) = execution_engine.get_compute_domain(
                    compute_domain_kwargs, domain_type=domain_type)

                column_name: str = accessor_domain_kwargs["column"]

                sqlalchemy_engine: sa.engine.Engine = execution_engine.engine

                if column_name not in metrics["table.columns"]:
                    raise ge_exceptions.InvalidMetricAccessorDomainKwargsKeyError(
                        message=f'Error: The column "{column_name}" in BatchData does not exist.'
                    )

                dialect = sqlalchemy_engine.dialect
                metric_aggregate = metric_fn(
                    cls,
                    column=sa.column(column_name),
                    **metric_value_kwargs,
                    _dialect=dialect,
                    _table=selectable,
                    _column_name=column_name,
                    _sqlalchemy_engine=sqlalchemy_engine,
                    _metrics=metrics,
                )
                return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
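inner_func reads metric_fn, domain_type, and kwargs from the enclosing decorator's scope rather than from its own parameters; a minimal sketch of that closure pattern (all names here are hypothetical):

# The outer factory captures metric_fn and kwargs; the returned inner_func
# uses them without receiving them as arguments.
def make_provider(metric_fn, domain_type, **kwargs):
    def inner_func(value):
        return metric_fn(value), domain_type, kwargs.get("filter_column_isnull", False)
    return inner_func

provider = make_provider(abs, "column", filter_column_isnull=True)
assert provider(-3) == (3, "column", True)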
Example 10
def test_instantiation_via_url(sa):
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    my_execution_engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials is None
    assert my_execution_engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    my_execution_engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": 5},
        )
    )
Example 11
def test_get_batch_data_and_markers_using_query(sqlite_view_engine, test_df):
    my_execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine)
    test_df.to_sql("test_table_0", con=my_execution_engine.engine)

    query: str = "SELECT * FROM test_table_0"
    batch_data, batch_markers = my_execution_engine.get_batch_data_and_markers(
        batch_spec=RuntimeQueryBatchSpec(query=query)
    )

    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 2
    assert batch_markers.get("ge_load_time") is not None
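The temp-table count suggests the engine materializes the query as a sqlite temp table; a hypothetical helper in the spirit of get_sqlite_temp_table_names (the real helper's implementation is not shown here):

# Illustrative only: list sqlite temp tables via the temp schema catalog,
# using the 1.3-style engine.execute seen in the examples above.
def list_sqlite_temp_tables(engine):
    rows = engine.execute(
        "SELECT name FROM sqlite_temp_master WHERE type = 'table'"
    ).fetchall()
    return {row[0] for row in rows}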
Example 12
def test_cases_for_sql_data_connector_sqlite_execution_engine(sa):
    if sa is None:
        raise ValueError(
            "SQL Database tests require sqlalchemy to be installed.")

    db_file_path: str = file_relative_path(
        __file__,
        os.path.join("..", "test_sets",
                     "test_cases_for_sql_data_connector.db"),
    )

    engine: sa.engine.Engine = sa.create_engine(
        get_sqlite_connection_url(db_file_path))
    conn: sa.engine.Connection = engine.connect()

    # Build a SqlAlchemyDataset using that database
    return SqlAlchemyExecutionEngine(
        name="test_sql_execution_engine",
        engine=conn,
    )
Example 13
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            metric_domain_kwargs, MetricDomainTypes.TABLE
        )

        # Get all of the table's column names using the existing "table.columns" metric.
        columns = metrics.get("table.columns")
        ignore_columns = runtime_configuration["ignore_columns"]
        dialect_name = execution_engine.engine.dialect.name

        selectcolumns = select_column_list(columns, ignore_columns)

        if dialect_name == "sqlite":
            cksumquery = get_sqlite_checksum_query(
                selectable.name, selectcolumns, ignore_columns
            )
        elif dialect_name == "bigquery":
            cksumquery = get_bigquery_checksum_query(
                selectable.name, selectcolumns, ignore_columns
            )
        else:
            logger.error("sql dialect is not supported: " + dialect_name)
            return 0

        if DEBUG:
            logger.error("\n***********cksumquery***********\n" + cksumquery)

        return int(execution_engine.engine.execute(cksumquery).scalar())
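get_sqlite_checksum_query and get_bigquery_checksum_query are not shown here; a simplified, client-side stand-in that only illustrates the idea of a row-content checksum (hypothetical, not the helpers' actual implementation):

import hashlib

# Fetch the selected columns and hash the row contents in Python; the real
# helpers build a dialect-specific SQL query that does this server-side.
def table_checksum(engine, table_name, columns):
    cols = ", ".join(columns)
    rows = engine.execute(f"SELECT {cols} FROM {table_name}").fetchall()
    digest = hashlib.md5(repr(sorted(tuple(r) for r in rows)).encode("utf-8"))
    return int(digest.hexdigest(), 16)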
Example 14
def test_instantiation_via_credentials(sa, test_backends, test_df):
    if "postgresql" not in test_backends:
        pytest.skip(
            "test_database_store_backend_get_url_for_key requires postgresql")

    my_execution_engine = SqlAlchemyExecutionEngine(
        credentials={
            "drivername": "postgresql",
            "username": "******",
            "password": "",
            "host": os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost"),
            "port": "5432",
            "database": "test_ci",
        })
    assert my_execution_engine.connection_string is None
    assert my_execution_engine.credentials == {
        "username": "******",
        "password": "",
        "host": os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost"),
        "port": "5432",
        "database": "test_ci",
    }
    assert my_execution_engine.url is None
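The credentials dict maps directly onto sqlalchemy's URL fields (drivername is presumably consumed when the engine is built, which would explain its absence from the stored credentials); an illustrative construction:

from sqlalchemy.engine.url import URL

# URL.create requires SQLAlchemy 1.4+; 1.3 used URL(...) directly.
# Values are placeholders mirroring the masked credentials above.
url = URL.create(
    drivername="postgresql",
    username="user",
    password="",
    host="localhost",
    port=5432,
    database="test_ci",
)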
Example 15
def test_sample_using_random(sqlite_view_engine, test_df):
    my_execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine)

    p: float
    batch_spec: SqlAlchemyDatasourceBatchSpec
    batch_data: SqlAlchemyBatchData
    num_rows: int
    rows_0: List[tuple]
    rows_1: List[tuple]

    # First, verify the degenerate case: with p=1.0 on a single-row table,
    # the same (whole-table) sample is returned every time.

    test_df_0: pd.DataFrame = test_df.iloc[:1]
    test_df_0.to_sql("test_table_0", con=my_execution_engine.engine)

    p = 1.0
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_0",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_0.shape[0])

    rows_0 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_0.shape[0])

    rows_1 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    assert len(rows_0) == len(rows_1) == 1

    assert rows_0 == rows_1

    # Second, verify that the realistic case returns a different random sample of rows on each draw.

    test_df_1: pd.DataFrame = test_df
    test_df_1.to_sql("test_table_1", con=my_execution_engine.engine)

    p = 2.0e-1
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_1",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_1.shape[0])

    rows_0 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_1.shape[0])

    rows_1 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    assert len(rows_0) == len(rows_1)

    assert rows_0 != rows_1
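"_sample_using_random" presumably filters rows on a per-row pseudo-random draw; a standalone illustration of such a predicate (sketch only, not GE's exact SQL, and the kept row count only matches p in expectation):

import sqlalchemy as sa

# Keep roughly a fraction p of rows; sqlite's RANDOM() returns a signed
# 64-bit integer, hence the ABS/modulo normalization.
p = 0.2
stmt = (
    sa.select([sa.text("*")])
    .select_from(sa.table("test_table_1"))
    .where(sa.text(f"(ABS(RANDOM()) % 100) < {int(p * 100)}"))
)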
Example 16
def test_instantiation_error_states(sa, test_db_connection_string):
    with pytest.raises(ge_exceptions.InvalidConfigError):
        SqlAlchemyExecutionEngine()
Example 17
    def _sqlalchemy(
        cls,
        execution_engine: SqlAlchemyExecutionEngine,
        metric_domain_kwargs: Dict,
        metric_value_kwargs: Dict,
        metrics: Dict[Tuple, Any],
        runtime_configuration: Dict,
    ):
        min_value = metric_value_kwargs.get("min_value")
        max_value = metric_value_kwargs.get("max_value")
        strict_min = metric_value_kwargs.get("strict_min")
        strict_max = metric_value_kwargs.get("strict_max")
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("min_value cannot be greater than max_value")

        if min_value is None and max_value is None:
            raise ValueError("min_value and max_value cannot both be None")
        dialect_name = execution_engine.engine.dialect.name.lower()

        if min_value in (
            get_sql_dialect_floating_point_infinity_value(schema="api_np", negative=True),
            get_sql_dialect_floating_point_infinity_value(schema="api_cast", negative=True),
        ):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )

        if min_value in (
            get_sql_dialect_floating_point_infinity_value(schema="api_np", negative=False),
            get_sql_dialect_floating_point_infinity_value(schema="api_cast", negative=False),
        ):
            min_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )

        if max_value in (
            get_sql_dialect_floating_point_infinity_value(schema="api_np", negative=True),
            get_sql_dialect_floating_point_infinity_value(schema="api_cast", negative=True),
        ):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=True
            )

        if max_value in (
            get_sql_dialect_floating_point_infinity_value(schema="api_np", negative=False),
            get_sql_dialect_floating_point_infinity_value(schema="api_cast", negative=False),
        ):
            max_value = get_sql_dialect_floating_point_infinity_value(
                schema=dialect_name, negative=False
            )

        (
            selectable,
            compute_domain_kwargs,
            accessor_domain_kwargs,
        ) = execution_engine.get_compute_domain(
            domain_kwargs=metric_domain_kwargs,
            domain_type=MetricDomainTypes.COLUMN)
        column = sa.column(accessor_domain_kwargs["column"])

        if min_value is None:
            if strict_max:
                condition = column < max_value
            else:
                condition = column <= max_value

        elif max_value is None:
            if strict_min:
                condition = column > min_value
            else:
                condition = column >= min_value

        else:
            if strict_min and strict_max:
                condition = sa.and_(column > min_value, column < max_value)
            elif strict_min:
                condition = sa.and_(column > min_value, column <= max_value)
            elif strict_max:
                condition = sa.and_(column >= min_value, column < max_value)
            else:
                condition = sa.and_(column >= min_value, column <= max_value)

        return execution_engine.engine.execute(
            sa.select([sa.func.count()]).select_from(selectable).where(condition)
        ).scalar()
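A quick look at how the strict/non-strict branches above render (illustrative):

import sqlalchemy as sa

# The combined bounds compile to a parameterized conjunction.
col = sa.column("a")
print(sa.and_(col > 0, col <= 10))  # -> a > :a_1 AND a <= :a_2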