def test_standalone_sqlalchemy_datasource(test_db_connection_string, sa):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy",
        connection_string=test_db_connection_string,
        echo=False,
        batch_kwargs_generators={
            "default": {
                "class_name": "TableBatchKwargsGenerator"
            }
        },
    )

    assert set(
        datasource.get_available_data_asset_names()["default"]["names"]) == {
            ("main.table_1", "table"),
            ("main.table_2", "table"),
        }
    batch_kwargs = datasource.build_batch_kwargs("default", "main.table_1")
    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    batch_data = batch.data
    assert isinstance(
        batch_data,
        great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference,
    )
    dataset = SqlAlchemyDataset(**batch.data.get_init_kwargs())
    assert len(dataset.head(10)) == 5
Example #2
def test_dataset_attempt_allowing_relative_error_when_redshift_library_not_installed(
    sa, ):
    engine = sa.create_engine("sqlite://")
    dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")

    assert isinstance(dataset, SqlAlchemyDataset)
    assert dataset.attempt_allowing_relative_error() is False
Example #3
def test_pybigquery_module_type_import(sqlitedb_engine):
    # We're really just testing that this particular hack works
    dataset = SqlAlchemyDataset(engine=sqlitedb_engine,
                                custom_sql='SELECT "cat" as "pet_name"')
    dataset.engine.dialect = pybigquery.sqlalchemy_bigquery.BigQueryDialect()
    assert getattr(dataset._get_dialect_type_module(),
                   "RECORD") == sa.types.JSON
def test_expect_compound_columns_to_be_unique(sa):
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({
        "col1": [1, 2, 3, 1, 2, 3, 4, 5, None],
        "col2": [1, 2, 2, 2, 2, 3, None, None, None],
        "col3": [1, 1, 2, 2, 3, 2, None, None, None],
        "col4": [1, None, 2, 2, None, None, None, None, None],
    })

    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)

    assert not dataset.expect_compound_columns_to_be_unique(["col1", "col2"
                                                             ]).success
    assert not dataset.expect_compound_columns_to_be_unique(["col2", "col3"
                                                             ]).success
    assert not dataset.expect_compound_columns_to_be_unique(["col1", "col3"
                                                             ]).success
    assert dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col3"]).success
    assert not dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"]).success
    assert dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"],
        ignore_row_if="any_value_is_missing",
    ).success
    assert not dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"],
        ignore_row_if="never",
    ).success
def pyathena_dataset(sa):
    from pyathena import sqlalchemy_athena

    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({"col": ["test_val1", "test_val2"]})

    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    dataset.dialect = sqlalchemy_athena
    return dataset
def test_sqlalchemydataset_with_custom_sql():
    engine = sa.create_engine('sqlite://')

    data = pd.DataFrame({
        "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
        "age": [16, 21, 38, 22, 10],
        "pet": ["fish", "python", "cat", "python", "frog"]
    })

    data.to_sql(name='test_sql_data', con=engine, index=False)

    custom_sql = "SELECT name, pet FROM test_sql_data WHERE age > 25"
    custom_sql_dataset = SqlAlchemyDataset('test_sql_data',
                                           engine=engine,
                                           custom_sql=custom_sql)

    custom_sql_dataset._initialize_expectations()
    custom_sql_dataset.set_default_expectation_argument(
        "result_format", {"result_format": "COMPLETE"})

    result = custom_sql_dataset.expect_column_values_to_be_in_set(
        "pet", ["fish", "cat", "python"])
    assert result['success'] is True

    result = custom_sql_dataset.expect_column_to_exist("age")
    assert result['success'] is False
Example #7
def test_expect_compound_columns_to_be_unique_with_no_rows(sa):
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({
        "col1": [],
        "col2": [],
        "col3": [],
        "col4": [],
    })

    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)

    assert dataset.expect_compound_columns_to_be_unique(["col1",
                                                         "col2"]).success
Example #8
def test_sqlalchemydataset_builds_guid_for_table_name_on_custom_sql(sa):
    engine = sa.create_engine('sqlite://')
    with mock.patch("uuid.uuid4") as mock_uuid:
        mock_uuid.return_value = "a-guid-with-dashes-that-will-break-sql"

        dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
        assert dataset._table.name == "ge_tmp_a_guid_with_dashes_that_will_break_sql"
Example #9
def get_dataset(dataset_type, data):
    """For Pandas, data should be either a DataFrame or a dictionary that can be instantiated as a DataFrame
    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }

    """
    if dataset_type == 'PandasDataset':
        return PandasDataset(data)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database

        engine = create_engine('sqlite://')

        # Add the data to the database as a new table
        df = pd.DataFrame(data)
        df.to_sql(name='test_data', con=engine, index=False)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset('test_data', engine=engine)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example #10
def test_sqlalchemydataset_builds_guid_for_table_name_on_custom_sql(sa):
    engine = sa.create_engine("sqlite://")
    with mock.patch("uuid.uuid4") as mock_uuid:
        mock_uuid.return_value = "12345678-lots-more-stuff"

        dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
        assert dataset._table.name == "ge_tmp_12345678"
def test_column(sa):
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame(
        {
            "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
            "age": [16, 21, 38, 22, 10],
            "pet": ["fish", "python", "cat", "python", "frog"],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    assert set(dataset.get_table_columns()) == {"name", "age", "pet"}

    obs = dataset.columns
    # Hacks to check instances of types
    for col in obs:
        col["type"] = str(col["type"])

    assert len(obs) == 3
    for column in [
        {
            "name": "name",
            "type": "TEXT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
        {
            "name": "age",
            "type": "BIGINT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
        {
            "name": "pet",
            "type": "TEXT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
    ]:
        assert column in obs
def test_adding_expectation_to_sqlalchemy_dataset_not_send_usage_message(
        mock_emit, sa):
    """
    What does this test and why?

    When an Expectation is called using a SqlAlchemyDataset, it validates the dataset using the implementation of
    the Expectation. As part of the process, it also adds the Expectation to the active
    ExpectationSuite. This test ensures that this in-direct way of adding an Expectation to the ExpectationSuite
    (ie not calling add_expectations() directly) does not emit a usage_stats event.
    """
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({
        "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
        "age": [16, 21, 38, 22, 10],
        "pet": ["fish", "python", "cat", "python", "frog"],
    })
    data.to_sql(name="test_sql_data", con=engine, index=False)

    custom_sql = "SELECT name, pet FROM test_sql_data WHERE age > 12"
    custom_sql_dataset = SqlAlchemyDataset(engine=engine,
                                           custom_sql=custom_sql)

    custom_sql_dataset._initialize_expectations()
    custom_sql_dataset.set_default_expectation_argument(
        "result_format", {"result_format": "COMPLETE"})

    result = custom_sql_dataset.expect_column_values_to_be_in_set(
        "pet", ["fish", "cat", "python"])
    # add_expectation() will not send usage_statistics event when called from a SqlAlchemy Dataset
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
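
The mock_emit argument above comes from a mock.patch decorator that is not part of this excerpt; a minimal sketch of that setup follows, with the patch target being an assumption that depends on the great_expectations version.

from unittest import mock

# Assumed patch target (not shown in the excerpt); the decorator passes the mock
# as the first positional argument, ahead of the pytest fixtures.
@mock.patch(
    "great_expectations.core.usage_statistics.usage_statistics.UsageStatisticsHandler.emit"
)
def test_adding_expectation_to_sqlalchemy_dataset_not_send_usage_message(
        mock_emit, sa):
    ...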
Example #13
def test_schema_custom_sql_error(sa):
    engine = sa.create_engine('sqlite://')

    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset('test_schema_custom',
                          schema='example',
                          engine=engine,
                          custom_sql='SELECT * FROM example.fake')
        assert "Cannot specify both schema and custom_sql." in str(err.value)
def custom_dataset():
    engine = sa.create_engine('sqlite:///ge_custom_sql.db')
    query = '''
        select p.id, p.name, a.street_name
        from person as p 
            inner join address as a
            on p.id = a.id
    '''
    
    # Error
    dataset = SqlAlchemyDataset(table_name='person', engine=engine, custom_sql=query)   

    return dataset
Example #15
def test_dataset_from_sql_source(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)

    ds = SqlAlchemyDataset(table_name=TABLE_NAME, engine=engine)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 1
    input_ds = datasets[0]
    assert input_ds.name == TABLE_NAME
    assert input_ds.namespace == "sqlite"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('birthdate', 'TEXT'),
                         SchemaField('address', 'TEXT'),
                         SchemaField('size', 'INTEGER')])

    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])
Example #16
def test_sqlalchemy_dataset_view(sqlite_view_engine):
    # This test demonstrates that a view can be used as a SqlAlchemyDataset table for purposes of validation
    dataset = SqlAlchemyDataset("test_view", engine=sqlite_view_engine)
    res = dataset.expect_table_row_count_to_equal(1)
    assert res.success is True

    # A temp view can also be used, though generators will not see it
    dataset = SqlAlchemyDataset("test_temp_view", engine=sqlite_view_engine)
    res = dataset.expect_table_row_count_to_equal(3)
    assert res.success is True
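
The sqlite_view_engine fixture is not shown in this excerpt; a minimal sketch of what it might provide, assuming one base table, a one-row view, and a three-row temporary view to match the row counts asserted above.

import pandas as pd
import sqlalchemy as sa


def sqlite_view_engine():  # would normally be registered as a pytest fixture
    engine = sa.create_engine("sqlite://")
    pd.DataFrame({"a": [1, 2, 3]}).to_sql(name="test_table", con=engine, index=False)
    # One-row view plus a three-row temp view; names match the test above.
    engine.execute("CREATE VIEW test_view AS SELECT * FROM test_table WHERE a = 1;")
    engine.execute("CREATE TEMP VIEW test_temp_view AS SELECT * FROM test_table;")
    return engine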
Example #17
def test_only_connection_string(sa):
    SqlAlchemyDataset('test_engine', connection_string='sqlite://')
Example #18
def get_dataset(dataset_type,
                data,
                schemas=None,
                profiler=ColumnsExistProfiler,
                caching=True):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == 'PandasDataset':
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine
        engine = create_engine('sqlite://')
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(
                engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'postgresql':
        from sqlalchemy import create_engine
        # Create a new database
        engine = create_engine('postgresql://postgres@localhost/test_ci')
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(
                engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'mysql':
        engine = create_engine('mysql://root@localhost/test_ci')
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(
                engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == 'SparkDFDataset':
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example #19
def test_only_connection_string(titanic_sqlite_db, sa):
    conn_string = titanic_sqlite_db.url
    SqlAlchemyDataset("titanic", connection_string=conn_string)
Example #20
def test_missing_engine_error(sa):
    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset("test_engine", schema="example")
    assert "Engine or connection_string must be provided." in str(err.value)
Example #21
def test_column_fallback(sa):
    engine = sa.create_engine("sqlite://")

    data = pd.DataFrame({
        "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
        "age": [16, 21, 38, 22, 10],
        "pet": ["fish", "python", "cat", "python", "frog"],
    })

    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    assert set(dataset.get_table_columns()) == {"name", "age", "pet"}

    fallback_dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    # override columns attribute to test fallback
    fallback_dataset.columns = fallback_dataset.column_reflection_fallback()
    assert set(fallback_dataset.get_table_columns()) == {"name", "age", "pet"}

    # check that the results are the same for a few expectations
    assert dataset.expect_column_to_exist(
        "age") == fallback_dataset.expect_column_to_exist("age")

    assert dataset.expect_column_mean_to_be_between(
        "age",
        min_value=10) == fallback_dataset.expect_column_mean_to_be_between(
            "age", min_value=10)

    # Test a failing expectation
    assert dataset.expect_table_row_count_to_equal(
        value=3) == fallback_dataset.expect_table_row_count_to_equal(value=3)
Example #22
def test_sqlalchemydataset_raises_error_on_missing_table_name(sa):
    with pytest.raises(ValueError) as ve:
        SqlAlchemyDataset(table_name=None,
                          engine="foo",
                          connection_string="bar")
    assert str(ve.value) == "No table_name provided."
def test_missing_engine_error():
    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset('test_engine', schema='example')
        assert "Engine or connection_string must be provided." in str(err)
Example #24
def get_dataset(dataset_type,
                data,
                schemas=None,
                autoinspect_func=autoinspect.columns_exist,
                caching=False):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.
    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }

    """
    if dataset_type == 'PandasDataset':
        df = pd.DataFrame(data)
        if schemas and "pandas" in schemas:
            pandas_schema = {
                key: np.dtype(value)
                for (key, value) in schemas["pandas"].items()
            }
            df = df.astype(pandas_schema)
        return PandasDataset(df,
                             autoinspect_func=autoinspect_func,
                             caching=caching)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database

        # Try to use a local postgres instance (e.g. on Travis); this will allow more testing than sqlite
        try:
            engine = create_engine('postgresql://*****:*****@localhost/test_ci')
            conn = engine.connect()
        except SQLAlchemyError:
            warnings.warn("Falling back to sqlite database.")
            engine = create_engine('sqlite://')
            conn = engine.connect()

        # Add the data to the database as a new table
        df = pd.DataFrame(data)

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(
                engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type_ == "datetime":
                    df[col] = pd.to_datetime(df[col])
        elif schemas and "postgresql" in schemas and isinstance(
                engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type_ == "timestamp":
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join([
            random.choice(string.ascii_letters + string.digits)
            for n in range(8)
        ])
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename,
                                 engine=conn,
                                 autoinspect_func=autoinspect_func,
                                 caching=caching)

    elif dataset_type == 'SparkDFDataset':
        spark = SparkSession.builder.getOrCreate()
        data_reshaped = list(zip(*[v for _, v in data.items()]))
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]]())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
Example #25
def test_dataset_from_custom_sql(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)
    engine.execute("""CREATE TABLE join_table (name text, workplace text, position text)""")
    custom_sql = f"""SELECT * FROM {TABLE_NAME} t INNER JOIN join_table j ON t.name=j.name"""

    # note the batch_kwarg key is 'query', but the constructor arg is 'custom_sql'
    ds = SqlAlchemyDataset(engine=engine,
                           custom_sql=custom_sql,
                           batch_kwargs={'query': custom_sql})

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(ctx,
                                         openlineage_host='http://localhost:5000',
                                         openlineage_namespace='test_ns',
                                         job_name='test_job')
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 2
    assert all(name in [TABLE_NAME, 'join_table'] for name in [ds.name for ds in datasets])

    input_ds = next(ds for ds in datasets if ds.name == TABLE_NAME)

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('birthdate', 'TEXT'),
                         SchemaField('address', 'TEXT'),
                         SchemaField('size', 'INTEGER')])
    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(a in input_ds.inputFacets['greatExpectations_assertions'].assertions
               for a in [GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
                         GreatExpectationsAssertion('expect_column_sum_to_be_between', True,
                                                    'size')])

    input_ds = next(ds for ds in datasets if ds.name == 'join_table')
    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 3
    assert all(f in input_ds.facets['schema'].fields
               for f in [SchemaField('name', 'TEXT'),
                         SchemaField('workplace', 'TEXT'),
                         SchemaField('position', 'TEXT')])
    assert len(input_ds.inputFacets) == 3
    assert all(k in input_ds.inputFacets for k in
               ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics'])
Example #26
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests.
    """
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in [
                        "datetime", "datetime64", "datetime64[ns]"
                ]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)

    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()
        # Add the data to the database as a new table

        sql_dtypes = {}
        if (schemas and "sqlite" in schemas
                and isinstance(engine.dialect, sqlitetypes.dialect)):
            schema = schemas["sqlite"]
            sql_dtypes = {
                col: SQLITE_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "postgresql" in schemas
                and isinstance(engine.dialect, postgresqltypes.dialect)):
            schema = schemas["postgresql"]
            sql_dtypes = {
                col: POSTGRESQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and "mysql" in schemas
                and isinstance(engine.dialect, mysqltypes.dialect)):
            schema = schemas["mysql"]
            sql_dtypes = {
                col: MYSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )

        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))

        conn = engine.connect()

        sql_dtypes = {}
        if (schemas and dataset_type in schemas
                and isinstance(engine.dialect, mssqltypes.dialect)):
            schema = schemas[dataset_type]
            sql_dtypes = {
                col: MSSQL_TYPES[dtype]
                for (col, dtype) in schema.items()
            }
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True)
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False)
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True)
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False)
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join([
                random.choice(string.ascii_letters + string.digits)
                for _ in range(8)
            ])
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name,
                                 engine=conn,
                                 profiler=profiler,
                                 caching=caching)

    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }

        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(
            zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType([
                    sparktypes.StructField(column,
                                           SPARK_TYPES[schema[column]](), True)
                    for column in schema
                ])
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(
                    zip(*[v
                          for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped,
                                                 schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType([
                    sparktypes.StructField(column, sparktypes.StringType())
                    for column in schema
                ])
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(
                        c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType([
                sparktypes.StructField(column, sparktypes.StringType())
                for column in columns
            ])
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)

    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))