def test_get_table_metric_provider_metric_dependencies(empty_sqlite_db):
    mp = ColumnMax()
    metric = MetricConfiguration("column.max", {}, {})
    dependencies = mp.get_evaluation_dependencies(
        metric,
        execution_engine=SqlAlchemyExecutionEngine(engine=empty_sqlite_db))
    assert dependencies["metric_partial_fn"].id[0] == "column.max.aggregate_fn"

    mp = ColumnMax()
    metric = MetricConfiguration("column.max", {}, {})
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=PandasExecutionEngine())

    table_column_types_metric: MetricConfiguration = dependencies[
        "table.column_types"]
    table_columns_metric: MetricConfiguration = dependencies["table.columns"]
    table_row_count_metric: MetricConfiguration = dependencies[
        "table.row_count"]
    assert dependencies == {
        "table.column_types": table_column_types_metric,
        "table.columns": table_columns_metric,
        "table.row_count": table_row_count_metric,
    }
    assert dependencies["table.columns"].id == (
        "table.columns",
        (),
        (),
    )
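
These snippets assume Great Expectations imports along the following lines; exact module paths vary across GE versions (e.g. MetricConfiguration has lived in validator.validation_graph in older releases), so treat this as a sketch rather than the canonical import list:

from great_expectations.execution_engine import (
    PandasExecutionEngine,
    SqlAlchemyExecutionEngine,
)
from great_expectations.expectations.metrics.column_aggregate_metrics.column_max import (
    ColumnMax,
)
from great_expectations.validator.metric_configuration import MetricConfiguration
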
def test_table_column_introspection(sa):
    db_file = file_relative_path(
        __file__,
        "../../test_sets/test_cases_for_sql_data_connector.db",
    )
    eng = sa.create_engine(f"sqlite:///{db_file}")
    engine = SqlAlchemyExecutionEngine(engine=eng)
    batch_data = SqlAlchemyBatchData(
        execution_engine=engine,
        table_name="table_partitioned_by_date_column__A")
    engine.load_batch_data("__", batch_data)
    assert isinstance(batch_data.selectable, sa.Table)
    assert batch_data.selectable.name == "table_partitioned_by_date_column__A"
    assert batch_data.selectable.schema is None

    insp = reflection.Inspector.from_engine(eng)
    columns = insp.get_columns(batch_data.selectable.name,
                               schema=batch_data.selectable.schema)
    assert [x["name"] for x in columns] == [
        "index",
        "id",
        "date",
        "event_type",
        "favorite_color",
    ]
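
A side note on the introspection call above: reflection.Inspector.from_engine() is deprecated as of SQLAlchemy 1.4 in favor of sa.inspect(). A minimal sketch of the equivalent call, reusing the eng created in the test:

import sqlalchemy as sa

insp = sa.inspect(eng)  # same Inspector that reflection.Inspector.from_engine(eng) returns
columns = insp.get_columns("table_partitioned_by_date_column__A", schema=None)
assert [c["name"] for c in columns][:2] == ["index", "id"]
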
Example 3
def test_instantiation_with_unknown_dialect(sqlite_view_engine):
    execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine)
    execution_engine.engine.dialect.name = "not_a_supported_dialect"
    batch_data: SqlAlchemyBatchData = SqlAlchemyBatchData(
        execution_engine=execution_engine,
        table_name="test_table",
    )

    assert batch_data.dialect == GESqlDialect.OTHER
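
GESqlDialect normalizes dialect names into an enum, with OTHER as the fallback for unrecognized backends. A hypothetical sketch of that classification; the import path and the assumption that enum values are lowercase dialect names both vary by GE version:

from great_expectations.execution_engine.sqlalchemy_dialect import GESqlDialect

def classify_dialect(dialect_name: str) -> GESqlDialect:
    # Assumes enum values are lowercase dialect names, e.g. GESqlDialect.SQLITE == "sqlite".
    try:
        return GESqlDialect(dialect_name.lower())
    except ValueError:
        return GESqlDialect.OTHER

assert classify_dialect("not_a_supported_dialect") is GESqlDialect.OTHER
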
def test_get_table_metric_provider_metric_dependencies(empty_sqlite_db):
    mp = ColumnMax()
    metric = MetricConfiguration("column.max", dict(), dict())
    dependencies = mp.get_evaluation_dependencies(
        metric,
        execution_engine=SqlAlchemyExecutionEngine(engine=empty_sqlite_db))
    assert dependencies["metric_partial_fn"].id[0] == "column.max.aggregate_fn"

    mp = ColumnMax()
    metric = MetricConfiguration("column.max", dict(), dict())
    dependencies = mp.get_evaluation_dependencies(
        metric, execution_engine=PandasExecutionEngine())
    assert dependencies == {}
Example 5
def clean_up_tables_with_prefix(connection_string: str, table_prefix: str) -> List[str]:
    """Drop all tables starting with the provided table_prefix.
    Note: Uses private method InferredAssetSqlDataConnector._introspect_db()
    to get the table names to not duplicate code, but should be refactored in the
    future to not use a private method.

    Args:
        connection_string: To connect to the database.
        table_prefix: First characters of the tables you want to remove.

    Returns:
        List of deleted tables.
    """
    execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        connection_string=connection_string
    )
    data_connector = instantiate_class_from_config(
        config={
            "class_name": "InferredAssetSqlDataConnector",
            "name": "temp_data_connector",
        },
        runtime_environment={
            "execution_engine": execution_engine,
            "datasource_name": "temp_datasource",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )
    introspection_output = data_connector._introspect_db()

    tables_to_drop: List[str] = []
    tables_dropped: List[str] = []

    for table in introspection_output:
        if table["table_name"].startswith(table_prefix):
            tables_to_drop.append(table["table_name"])

    connection = execution_engine.engine.connect()
    for table_name in tables_to_drop:
        print(f"Dropping table {table_name}")
        connection.execute(f"DROP TABLE IF EXISTS {table_name}")
        tables_dropped.append(table_name)

    tables_skipped: List[str] = list(set(tables_to_drop) - set(tables_dropped))
    if len(tables_skipped) > 0:
        warnings.warn(f"Warning: Tables skipped: {tables_skipped}")

    return tables_dropped
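
A hypothetical usage sketch, assuming a reachable local Postgres test database (the connection string mirrors the one used elsewhere in these examples):

dropped = clean_up_tables_with_prefix(
    connection_string="postgresql://postgres@localhost/test_ci",
    table_prefix="test_data_",
)
print(f"Dropped {len(dropped)} table(s): {dropped}")
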
Example 6
def test_instantiation_with_table_name(sqlite_view_engine):
    engine = SqlAlchemyExecutionEngine(engine=sqlite_view_engine)
    batch_data = SqlAlchemyBatchData(
        execution_engine=engine,
        table_name="test_table",
    )

    # This is a very hacky type check.
    # A better way would be to figure out the proper parent class for dialects within SQLAlchemy.
    assert str(type(batch_data.sql_engine_dialect)).startswith(
        "<class 'sqlalchemy.dialects.")

    assert isinstance(batch_data.selectable, sqlalchemy.Table)

    assert isinstance(batch_data.record_set_name, str)
    assert batch_data.record_set_name == "great_expectations_sub_selection"

    assert batch_data.use_quoted_name is False
Example 7
def test_instantiation_with_and_without_temp_table(sqlite_view_engine, sa):
    print(get_sqlite_temp_table_names(sqlite_view_engine))
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1
    assert get_sqlite_temp_table_names(sqlite_view_engine) == {
        "test_temp_view"
    }

    engine = SqlAlchemyExecutionEngine(engine=sqlite_view_engine)
    # When the SqlAlchemyBatchData object is based on a table, a new temp table is NOT created, even if create_temp_table=True
    SqlAlchemyBatchData(
        execution_engine=engine,
        table_name="test_table",
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    selectable = sa.select("*").select_from(sa.text("test_table"))

    # If create_temp_table=False, a new temp table should NOT be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        create_temp_table=False,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 2

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=engine,
        selectable=selectable,
        # create_temp_table defaults to True
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 3
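
The get_sqlite_temp_table_names() helper used above is not shown in this listing. A minimal sketch of what such a helper likely does, reading SQLite's sqlite_temp_master catalog of temporary objects (written in the SQLAlchemy 1.x engine.execute() style used throughout these examples):

def get_sqlite_temp_table_names(engine):
    # sqlite_temp_master lists temporary tables and views for the current connection;
    # no type filter, so temp views such as "test_temp_view" are counted too.
    rows = engine.execute("SELECT name FROM sqlite_temp_master").fetchall()
    return {row[0] for row in rows}
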
Example 8
def test_sample_using_random(sqlite_view_engine, test_df):
    my_execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine
    )

    p: float
    batch_spec: SqlAlchemyDatasourceBatchSpec
    batch_data: SqlAlchemyBatchData
    num_rows: int
    rows_0: List[tuple]
    rows_1: List[tuple]

    # First, verify the degenerate case: with p=1.0 and a single row, every sample must return the same (full) result.

    test_df_0: pd.DataFrame = test_df.iloc[:1]
    test_df_0.to_sql("test_table_0", con=my_execution_engine.engine)

    p = 1.0
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_0",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_0.shape[0])

    rows_0: List[tuple] = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_0.shape[0])

    rows_1: List[tuple] = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    assert len(rows_0) == len(rows_1) == 1

    assert rows_0 == rows_1

    # Second, verify that a realistic case returns a different random sample of rows on each draw.

    test_df_1: pd.DataFrame = test_df
    test_df_1.to_sql("test_table_1", con=my_execution_engine.engine)

    p = 2.0e-1
    batch_spec = SqlAlchemyDatasourceBatchSpec(
        table_name="test_table_1",
        schema_name="main",
        sampling_method="_sample_using_random",
        sampling_kwargs={"p": p},
    )

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_1.shape[0])

    rows_0 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    batch_data = my_execution_engine.get_batch_data(batch_spec=batch_spec)
    num_rows = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.func.count()]).select_from(batch_data.selectable)
    ).scalar()
    assert num_rows == round(p * test_df_1.shape[0])

    rows_1 = batch_data.execution_engine.engine.execute(
        sqlalchemy.select([sqlalchemy.text("*")]).select_from(batch_data.selectable)
    ).fetchall()

    assert len(rows_0) == len(rows_1)

    assert rows_0 != rows_1
def test_instantiation_with_and_without_temp_table(sqlite_view_engine, sa):
    print(get_sqlite_temp_table_names(sqlite_view_engine))
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1
    assert get_sqlite_temp_table_names(sqlite_view_engine) == {
        "test_temp_view"
    }

    execution_engine: SqlAlchemyExecutionEngine = SqlAlchemyExecutionEngine(
        engine=sqlite_view_engine)
    # When the SqlAlchemyBatchData object is based on a table, a new temp table is NOT created, even if create_temp_table=True
    SqlAlchemyBatchData(
        execution_engine=execution_engine,
        table_name="test_table",
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    selectable = sa.select("*").select_from(sa.text("main.test_table"))

    # If create_temp_table=False, a new temp table should NOT be created
    SqlAlchemyBatchData(
        execution_engine=execution_engine,
        selectable=selectable,
        create_temp_table=False,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 1

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=execution_engine,
        selectable=selectable,
        create_temp_table=True,
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 2

    # If create_temp_table=True, a new temp table should be created
    SqlAlchemyBatchData(
        execution_engine=execution_engine,
        selectable=selectable,
        # create_temp_table defaults to True
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 3

    # testing whether schema is supported
    selectable = sa.select("*").select_from(
        sa.table(name="test_table", schema="main"))
    SqlAlchemyBatchData(
        execution_engine=execution_engine,
        selectable=selectable,
        # create_temp_table defaults to True
    )
    assert len(get_sqlite_temp_table_names(sqlite_view_engine)) == 4

    # test schema with execution engine
    # TODO : Will20210222 Add tests for specifying schema with non-sqlite backend that actually supports new schema creation
    my_batch_spec = SqlAlchemyDatasourceBatchSpec(
        **{
            "table_name": "test_table",
            "batch_identifiers": {},
            "schema_name": "main",
        })
    res = execution_engine.get_batch_data_and_markers(batch_spec=my_batch_spec)
    assert len(res) == 2
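
get_batch_data_and_markers() returns a 2-tuple, which is what the length assertion above checks. A short sketch of unpacking it:

batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
    batch_spec=my_batch_spec
)
assert isinstance(batch_data, SqlAlchemyBatchData)
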
Example 10
def get_sqlalchemy_runtime_validator_postgresql(df,
                                                schemas=None,
                                                caching=True,
                                                table_name=None):
    sa_engine_name = "postgresql"
    db_hostname = os.getenv("GE_TEST_LOCAL_DB_HOSTNAME", "localhost")
    try:
        engine = connection_manager.get_engine(
            f"postgresql://postgres@{db_hostname}/test_ci")
    except sqlalchemy.exc.OperationalError:
        return None

    sql_dtypes = {}

    if (schemas and sa_engine_name in schemas
            and isinstance(engine.dialect, postgresqltypes.dialect)):
        schema = schemas[sa_engine_name]
        sql_dtypes = {
            col: POSTGRESQL_TYPES[dtype]
            for (col, dtype) in schema.items()
        }

        for col in schema:
            type_ = schema[col]
            if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                df[col] = pd.to_numeric(df[col], downcast="signed")
            elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                df[col] = pd.to_numeric(df[col])
                min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                    schema=sa_engine_name, negative=True)
                max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                    schema=sa_engine_name, negative=False)
                for api_schema_type in ["api_np", "api_cast"]:
                    min_value_api = get_sql_dialect_floating_point_infinity_value(
                        schema=api_schema_type, negative=True)
                    max_value_api = get_sql_dialect_floating_point_infinity_value(
                        schema=api_schema_type, negative=False)
                    df.replace(
                        to_replace=[min_value_api, max_value_api],
                        value=[min_value_dbms, max_value_dbms],
                        inplace=True,
                    )
            elif type_ in ["DATETIME", "TIMESTAMP"]:
                df[col] = pd.to_datetime(df[col])

    if table_name is None:
        table_name = "test_data_" + "".join([
            random.choice(string.ascii_letters + string.digits)
            for _ in range(8)
        ])
    df.to_sql(
        name=table_name,
        con=engine,
        index=False,
        dtype=sql_dtypes,
        if_exists="replace",
    )
    execution_engine = SqlAlchemyExecutionEngine(caching=caching,
                                                 engine=engine)
    batch_data = SqlAlchemyBatchData(execution_engine=execution_engine,
                                     table_name=table_name)
    batch = Batch(data=batch_data)

    return Validator(execution_engine=execution_engine, batches=(batch,))
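
A hypothetical usage sketch: build a Validator over a small DataFrame and run one expectation. This requires a reachable local Postgres; the function returns None when the connection fails, so guard for that:

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
validator = get_sqlalchemy_runtime_validator_postgresql(df, table_name="demo_table")
if validator is not None:
    result = validator.expect_table_row_count_to_equal(3)
    assert result.success
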