def test_standalone_sqlalchemy_datasource(test_db_connection_string, sa):
    datasource = SqlAlchemyDatasource(
        "SqlAlchemy",
        connection_string=test_db_connection_string,
        echo=False,
        batch_kwargs_generators={"default": {"class_name": "TableBatchKwargsGenerator"}},
    )

    assert set(datasource.get_available_data_asset_names()["default"]["names"]) == {
        ("main.table_1", "table"),
        ("main.table_2", "table"),
    }

    batch_kwargs = datasource.build_batch_kwargs("default", "main.table_1")
    batch = datasource.get_batch(batch_kwargs=batch_kwargs)
    assert isinstance(batch, Batch)
    batch_data = batch.data
    assert isinstance(
        batch_data,
        great_expectations.dataset.sqlalchemy_dataset.SqlAlchemyBatchReference,
    )
    dataset = SqlAlchemyDataset(**batch.data.get_init_kwargs())
    assert len(dataset.head(10)) == 5
def test_dataset_attempt_allowing_relative_error_when_redshift_library_not_installed(
    sa,
):
    engine = sa.create_engine("sqlite://")
    dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
    assert isinstance(dataset, SqlAlchemyDataset)
    assert dataset.attempt_allowing_relative_error() is False
def test_pybigquery_module_type_import(sqlitedb_engine):
    # We're really just testing that this particular hack works
    dataset = SqlAlchemyDataset(
        engine=sqlitedb_engine, custom_sql='SELECT "cat" as "pet_name"'
    )
    dataset.engine.dialect = pybigquery.sqlalchemy_bigquery.BigQueryDialect()
    assert getattr(dataset._get_dialect_type_module(), "RECORD") == sa.types.JSON
def test_expect_compound_columns_to_be_unique(sa):
    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame(
        {
            "col1": [1, 2, 3, 1, 2, 3, 4, 5, None],
            "col2": [1, 2, 2, 2, 2, 3, None, None, None],
            "col3": [1, 1, 2, 2, 3, 2, None, None, None],
            "col4": [1, None, 2, 2, None, None, None, None, None],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)

    assert not dataset.expect_compound_columns_to_be_unique(["col1", "col2"]).success
    assert not dataset.expect_compound_columns_to_be_unique(["col2", "col3"]).success
    assert not dataset.expect_compound_columns_to_be_unique(["col1", "col3"]).success
    assert dataset.expect_compound_columns_to_be_unique(["col1", "col2", "col3"]).success
    assert not dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"]
    ).success
    assert dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"],
        ignore_row_if="any_value_is_missing",
    ).success
    assert not dataset.expect_compound_columns_to_be_unique(
        ["col1", "col2", "col4"],
        ignore_row_if="never",
    ).success
def pyathena_dataset(sa):
    from pyathena import sqlalchemy_athena

    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame({"col": ["test_val1", "test_val2"]})
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    dataset.dialect = sqlalchemy_athena
    return dataset
def test_sqlalchemydataset_with_custom_sql():
    engine = sa.create_engine('sqlite://')
    data = pd.DataFrame(
        {
            "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
            "age": [16, 21, 38, 22, 10],
            "pet": ["fish", "python", "cat", "python", "frog"],
        }
    )
    data.to_sql(name='test_sql_data', con=engine, index=False)

    custom_sql = "SELECT name, pet FROM test_sql_data WHERE age > 25"
    custom_sql_dataset = SqlAlchemyDataset(
        'test_sql_data', engine=engine, custom_sql=custom_sql
    )
    custom_sql_dataset._initialize_expectations()
    custom_sql_dataset.set_default_expectation_argument(
        "result_format", {"result_format": "COMPLETE"}
    )

    result = custom_sql_dataset.expect_column_values_to_be_in_set(
        "pet", ["fish", "cat", "python"]
    )
    assert result['success'] is True

    result = custom_sql_dataset.expect_column_to_exist("age")
    assert result['success'] is False
def test_expect_compound_columns_to_be_unique_with_no_rows(sa):
    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame(
        {
            "col1": [],
            "col2": [],
            "col3": [],
            "col4": [],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)

    assert dataset.expect_compound_columns_to_be_unique(["col1", "col2"]).success
def test_sqlalchemydataset_builds_guid_for_table_name_on_custom_sql(sa):
    engine = sa.create_engine('sqlite://')
    with mock.patch("uuid.uuid4") as mock_uuid:
        mock_uuid.return_value = "a-guid-with-dashes-that-will-break-sql"
        dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
    assert dataset._table.name == "ge_tmp_a_guid_with_dashes_that_will_break_sql"
def get_dataset(dataset_type, data):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.

    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }
    """
    if dataset_type == 'PandasDataset':
        return PandasDataset(data)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database
        engine = create_engine('sqlite://')

        # Add the data to the database as a new table
        df = pd.DataFrame(data)
        df.to_sql(name='test_data', con=engine, index=False)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset('test_data', engine=engine)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
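# Usage sketch (hypothetical, not from the original suite): how the helper above
# might be called from a test. The column names and the expectation call are
# illustrative only, and the dict-style result check assumes the older result
# format used by the surrounding snippets (e.g., result['success']).
def test_get_dataset_sqlalchemy_usage_sketch():
    data = {"col_a": [1, 2, 3], "col_b": ["x", "y", "z"]}
    dataset = get_dataset("SqlAlchemyDataset", data)
    result = dataset.expect_column_to_exist("col_a")
    assert result["success"] is True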
def test_sqlalchemydataset_builds_guid_for_table_name_on_custom_sql(sa):
    engine = sa.create_engine("sqlite://")
    with mock.patch("uuid.uuid4") as mock_uuid:
        mock_uuid.return_value = "12345678-lots-more-stuff"
        dataset = SqlAlchemyDataset(engine=engine, custom_sql="select 1")
    assert dataset._table.name == "ge_tmp_12345678"
def test_column(sa):
    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame(
        {
            "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
            "age": [16, 21, 38, 22, 10],
            "pet": ["fish", "python", "cat", "python", "frog"],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    assert set(dataset.get_table_columns()) == {"name", "age", "pet"}

    obs = dataset.columns
    # Hacks to check instances of types
    for col in obs:
        col["type"] = str(col["type"])

    assert len(obs) == 3
    for column in [
        {
            "name": "name",
            "type": "TEXT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
        {
            "name": "age",
            "type": "BIGINT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
        {
            "name": "pet",
            "type": "TEXT",
            "nullable": True,
            "default": None,
            "autoincrement": "auto",
            "primary_key": 0,
        },
    ]:
        assert column in obs
def test_adding_expectation_to_sqlalchemy_dataset_not_send_usage_message(mock_emit, sa):
    """
    What does this test and why?

    When an Expectation is called using a SqlAlchemyDataset, it validates the dataset
    using the implementation of the Expectation. As part of the process, it also adds
    the Expectation to the active ExpectationSuite. This test ensures that this
    indirect way of adding an Expectation to the ExpectationSuite (i.e., not calling
    add_expectations() directly) does not emit a usage_stats event.
    """
    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame(
        {
            "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
            "age": [16, 21, 38, 22, 10],
            "pet": ["fish", "python", "cat", "python", "frog"],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)

    custom_sql = "SELECT name, pet FROM test_sql_data WHERE age > 12"
    custom_sql_dataset = SqlAlchemyDataset(engine=engine, custom_sql=custom_sql)
    custom_sql_dataset._initialize_expectations()
    custom_sql_dataset.set_default_expectation_argument(
        "result_format", {"result_format": "COMPLETE"}
    )

    result = custom_sql_dataset.expect_column_values_to_be_in_set(
        "pet", ["fish", "cat", "python"]
    )

    # add_expectation() will not send a usage_statistics event when called from a
    # SqlAlchemy Dataset
    assert mock_emit.call_count == 0
    assert mock_emit.call_args_list == []
def test_schema_custom_sql_error(sa):
    engine = sa.create_engine('sqlite://')
    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset(
            'test_schema_custom',
            schema='example',
            engine=engine,
            custom_sql='SELECT * FROM example.fake',
        )
    assert "Cannot specify both schema and custom_sql." in str(err.value)
def custom_dataset():
    engine = sa.create_engine('sqlite:///ge_custom_sql.db')
    query = '''
        select
            p.id,
            p.name,
            a.street_name
        from person as p
        inner join address as a
            on p.id = a.id
    '''
    # Error
    dataset = SqlAlchemyDataset(table_name='person', engine=engine, custom_sql=query)
    return dataset
def test_dataset_from_sql_source(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)
    ds = SqlAlchemyDataset(table_name=TABLE_NAME, engine=engine)

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(
        ctx,
        openlineage_host='http://localhost:5000',
        openlineage_namespace='test_ns',
        job_name='test_job',
    )
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 1

    input_ds = datasets[0]
    assert input_ds.name == TABLE_NAME
    assert input_ds.namespace == "sqlite"

    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(
        f in input_ds.facets['schema'].fields
        for f in [
            SchemaField('name', 'TEXT'),
            SchemaField('birthdate', 'TEXT'),
            SchemaField('address', 'TEXT'),
            SchemaField('size', 'INTEGER'),
        ]
    )

    assert len(input_ds.inputFacets) == 3
    assert all(
        k in input_ds.inputFacets
        for k in ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics']
    )
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(
        a in input_ds.inputFacets['greatExpectations_assertions'].assertions
        for a in [
            GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
            GreatExpectationsAssertion('expect_column_sum_to_be_between', True, 'size'),
        ]
    )
def test_sqlalchemy_dataset_view(sqlite_view_engine):
    # This test demonstrates that a view can be used as a SqlAlchemyDataset table
    # for purposes of validation
    dataset = SqlAlchemyDataset("test_view", engine=sqlite_view_engine)
    res = dataset.expect_table_row_count_to_equal(1)
    assert res.success is True

    # A temp view can also be used, though generators will not see it
    dataset = SqlAlchemyDataset("test_temp_view", engine=sqlite_view_engine)
    res = dataset.expect_table_row_count_to_equal(3)
    assert res.success is True
def test_only_connection_string(sa):
    SqlAlchemyDataset('test_engine', connection_string='sqlite://')
def get_dataset(dataset_type, data, schemas=None, profiler=ColumnsExistProfiler, caching=True):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == 'PandasDataset':
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)
    elif dataset_type == "sqlite":
        from sqlalchemy import create_engine

        engine = create_engine('sqlite://')
        conn = engine.connect()
        # Add the data to the database as a new table
        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'postgresql':
        from sqlalchemy import create_engine

        # Create a new database
        engine = create_engine('postgresql://postgres@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'mysql':
        engine = create_engine('mysql://root@localhost/test_ci')
        conn = engine.connect()
        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])
        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == 'SparkDFDataset':
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType(
                    [
                        sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                        for column in schema
                    ]
                )
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, sparktypes.StringType()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType(
                [sparktypes.StructField(column, sparktypes.StringType()) for column in columns]
            )
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
def test_only_connection_string(titanic_sqlite_db, sa):
    conn_string = titanic_sqlite_db.url
    SqlAlchemyDataset("titanic", connection_string=conn_string)
def test_missing_engine_error(sa):
    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset("test_engine", schema="example")
    assert "Engine or connection_string must be provided." in str(err.value)
def test_column_fallback(sa):
    engine = sa.create_engine("sqlite://")
    data = pd.DataFrame(
        {
            "name": ["Frank", "Steve", "Jane", "Frank", "Michael"],
            "age": [16, 21, 38, 22, 10],
            "pet": ["fish", "python", "cat", "python", "frog"],
        }
    )
    data.to_sql(name="test_sql_data", con=engine, index=False)
    dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    assert set(dataset.get_table_columns()) == {"name", "age", "pet"}

    fallback_dataset = SqlAlchemyDataset("test_sql_data", engine=engine)
    # override columns attribute to test fallback
    fallback_dataset.columns = fallback_dataset.column_reflection_fallback()
    assert set(fallback_dataset.get_table_columns()) == {"name", "age", "pet"}

    # check that the results are the same for a few expectations
    assert dataset.expect_column_to_exist(
        "age"
    ) == fallback_dataset.expect_column_to_exist("age")
    assert dataset.expect_column_mean_to_be_between(
        "age", min_value=10
    ) == fallback_dataset.expect_column_mean_to_be_between("age", min_value=10)

    # Test a failing expectation
    assert dataset.expect_table_row_count_to_equal(
        value=3
    ) == fallback_dataset.expect_table_row_count_to_equal(value=3)
def test_sqlalchemydataset_raises_error_on_missing_table_name(sa):
    with pytest.raises(ValueError) as ve:
        SqlAlchemyDataset(table_name=None, engine="foo", connection_string="bar")
    assert str(ve.value) == "No table_name provided."
def test_missing_engine_error():
    with pytest.raises(ValueError) as err:
        SqlAlchemyDataset('test_engine', schema='example')
    assert "Engine or connection_string must be provided." in str(err)
def get_dataset(dataset_type, data, schemas=None, autoinspect_func=autoinspect.columns_exist, caching=False):
    """For Pandas, data should be either a DataFrame or a dictionary that can
    be instantiated as a DataFrame.

    For SQL, data should have the following shape:
        {
            'table':
                'table': SqlAlchemy Table object
                named_column: [list of values]
        }
    """
    if dataset_type == 'PandasDataset':
        df = pd.DataFrame(data)
        if schemas and "pandas" in schemas:
            pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, autoinspect_func=autoinspect_func, caching=caching)
    elif dataset_type == 'SqlAlchemyDataset':
        # Create a new database
        # Try to use a local postgres instance (e.g. on Travis); this will allow more testing than sqlite
        try:
            engine = create_engine('postgresql://*****:*****@localhost/test_ci')
            conn = engine.connect()
        except SQLAlchemyError:
            warnings.warn("Falling back to sqlite database.")
            engine = create_engine('sqlite://')
            conn = engine.connect()

        # Add the data to the database as a new table
        df = pd.DataFrame(data)

        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type = schema[col]
                if type == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type == "datetime":
                    df[col] = pd.to_datetime(df[col])
        elif schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type = schema[col]
                if type == "int":
                    df[col] = pd.to_numeric(df[col], downcast='signed')
                elif type == "float":
                    df[col] = pd.to_numeric(df[col], downcast='float')
                elif type == "timestamp":
                    df[col] = pd.to_datetime(df[col])

        tablename = "test_data_" + ''.join(
            [random.choice(string.ascii_letters + string.digits) for n in range(8)]
        )
        df.to_sql(name=tablename, con=conn, index=False, dtype=sql_dtypes)

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(tablename, engine=conn, autoinspect_func=autoinspect_func, caching=caching)
    elif dataset_type == 'SparkDFDataset':
        spark = SparkSession.builder.getOrCreate()
        data_reshaped = list(zip(*[v for _, v in data.items()]))
        if schemas and 'spark' in schemas:
            schema = schemas['spark']
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, SPARK_TYPES[schema[column]]()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, sparktypes.StringType()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType(
                [sparktypes.StructField(column, sparktypes.StringType()) for column in columns]
            )
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, caching=caching)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
def test_dataset_from_custom_sql(test_db_file, tmpdir):
    connection_url = f'sqlite:///{test_db_file}'
    engine = create_engine(connection_url)
    engine.execute(
        """CREATE TABLE join_table (name text, workplace text, position text)"""
    )
    custom_sql = f"""SELECT * FROM {TABLE_NAME} t INNER JOIN join_table j ON t.name=j.name"""

    # note the batch_kwarg key is 'query', but the constructor arg is 'custom_sql'
    ds = SqlAlchemyDataset(
        engine=engine, custom_sql=custom_sql, batch_kwargs={'query': custom_sql}
    )

    store_defaults = FilesystemStoreBackendDefaults(root_directory=tmpdir)
    project_config.stores = store_defaults.stores
    project_config.expectations_store_name = store_defaults.expectations_store_name
    project_config.validations_store_name = store_defaults.validations_store_name
    project_config.checkpoint_store_name = store_defaults.checkpoint_store_name

    ctx = BaseDataContext(project_config=project_config)
    action = OpenLineageValidationAction(
        ctx,
        openlineage_host='http://localhost:5000',
        openlineage_namespace='test_ns',
        job_name='test_job',
    )
    datasets = action._fetch_datasets_from_sql_source(ds, result_suite)
    assert datasets is not None
    assert len(datasets) == 2
    assert all(name in [TABLE_NAME, 'join_table'] for name in [ds.name for ds in datasets])

    input_ds = next(ds for ds in datasets if ds.name == TABLE_NAME)
    assert "dataSource" in input_ds.facets
    assert input_ds.facets["dataSource"].name == "sqlite"
    assert input_ds.facets["dataSource"].uri == "sqlite:/" + test_db_file

    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 4
    assert all(
        f in input_ds.facets['schema'].fields
        for f in [
            SchemaField('name', 'TEXT'),
            SchemaField('birthdate', 'TEXT'),
            SchemaField('address', 'TEXT'),
            SchemaField('size', 'INTEGER'),
        ]
    )

    assert len(input_ds.inputFacets) == 3
    assert all(
        k in input_ds.inputFacets
        for k in ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics']
    )
    assert input_ds.inputFacets['dataQuality'].rowCount == 10
    assert 'size' in input_ds.inputFacets['dataQuality'].columnMetrics
    assert input_ds.inputFacets['dataQuality'].columnMetrics['size'].sum == 60

    assert len(input_ds.inputFacets['greatExpectations_assertions'].assertions) == 2
    assert all(
        a in input_ds.inputFacets['greatExpectations_assertions'].assertions
        for a in [
            GreatExpectationsAssertion('expect_table_row_count_to_equal', True),
            GreatExpectationsAssertion('expect_column_sum_to_be_between', True, 'size'),
        ]
    )

    input_ds = next(ds for ds in datasets if ds.name == 'join_table')
    assert 'schema' in input_ds.facets
    assert len(input_ds.facets['schema'].fields) == 3
    assert all(
        f in input_ds.facets['schema'].fields
        for f in [
            SchemaField('name', 'TEXT'),
            SchemaField('workplace', 'TEXT'),
            SchemaField('position', 'TEXT'),
        ]
    )

    assert len(input_ds.inputFacets) == 3
    assert all(
        k in input_ds.inputFacets
        for k in ['dataQuality', 'greatExpectations_assertions', 'dataQualityMetrics']
    )
def get_dataset(
    dataset_type,
    data,
    schemas=None,
    profiler=ColumnsExistProfiler,
    caching=True,
    table_name=None,
    sqlite_db_path=None,
):
    """Utility to create datasets for json-formatted tests."""
    df = pd.DataFrame(data)
    if dataset_type == "PandasDataset":
        if schemas and "pandas" in schemas:
            schema = schemas["pandas"]
            pandas_schema = {}
            for (key, value) in schema.items():
                # Note, these are just names used in our internal schemas to build datasets *for internal tests*
                # Further, some changes in pandas internal about how datetimes are created means to support pandas
                # pre- 0.25, we need to explicitly specify when we want timezone.

                # We will use timestamp for timezone-aware (UTC only) dates in our tests
                if value.lower() in ["timestamp", "datetime64[ns, tz]"]:
                    df[key] = pd.to_datetime(df[key], utc=True)
                    continue
                elif value.lower() in ["datetime", "datetime64", "datetime64[ns]"]:
                    df[key] = pd.to_datetime(df[key])
                    continue
                try:
                    type_ = np.dtype(value)
                except TypeError:
                    type_ = getattr(pd.core.dtypes.dtypes, value)
                    # If this raises AttributeError it's okay: it means someone built a bad test
                pandas_schema[key] = type_
            # pandas_schema = {key: np.dtype(value) for (key, value) in schemas["pandas"].items()}
            df = df.astype(pandas_schema)
        return PandasDataset(df, profiler=profiler, caching=caching)
    elif dataset_type == "sqlite":
        if not create_engine:
            return None

        if sqlite_db_path is not None:
            engine = create_engine(f"sqlite:////{sqlite_db_path}")
        else:
            engine = create_engine("sqlite://")
        conn = engine.connect()

        # Add the data to the database as a new table
        sql_dtypes = {}
        if schemas and "sqlite" in schemas and isinstance(engine.dialect, sqlitetypes.dialect):
            schema = schemas["sqlite"]
            sql_dtypes = {col: SQLITE_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True
                    )
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False
                    )
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True
                        )
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False
                        )
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join(
                [random.choice(string.ascii_letters + string.digits) for _ in range(8)]
            )
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == "postgresql":
        if not create_engine:
            return None

        # Create a new database
        engine = create_engine("postgresql://postgres@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "postgresql" in schemas and isinstance(engine.dialect, postgresqltypes.dialect):
            schema = schemas["postgresql"]
            sql_dtypes = {col: POSTGRESQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True
                    )
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False
                    )
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True
                        )
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False
                        )
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join(
                [random.choice(string.ascii_letters + string.digits) for _ in range(8)]
            )
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == "mysql":
        if not create_engine:
            return None

        engine = create_engine("mysql+pymysql://root@localhost/test_ci")
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and "mysql" in schemas and isinstance(engine.dialect, mysqltypes.dialect):
            schema = schemas["mysql"]
            sql_dtypes = {col: MYSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT", "DOUBLE", "DOUBLE_PRECISION"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True
                    )
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False
                    )
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True
                        )
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False
                        )
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join(
                [random.choice(string.ascii_letters + string.digits) for _ in range(8)]
            )
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == "mssql":
        if not create_engine:
            return None

        engine = create_engine(
            "mssql+pyodbc://sa:ReallyStrongPwd1234%^&*@localhost:1433/test_ci?driver=ODBC Driver 17 for SQL Server&charset=utf8&autocommit=true",
            # echo=True,
        )
        # If "autocommit" is not desired to be on by default, then use the following pattern when explicit "autocommit"
        # is desired (e.g., for temporary tables, "autocommit" is off by default, so the override option may be useful).
        # engine.execute(sa.text(sql_query_string).execution_options(autocommit=True))
        conn = engine.connect()

        sql_dtypes = {}
        if schemas and dataset_type in schemas and isinstance(engine.dialect, mssqltypes.dialect):
            schema = schemas[dataset_type]
            sql_dtypes = {col: MSSQL_TYPES[dtype] for (col, dtype) in schema.items()}
            for col in schema:
                type_ = schema[col]
                if type_ in ["INTEGER", "SMALLINT", "BIGINT"]:
                    df[col] = pd.to_numeric(df[col], downcast="signed")
                elif type_ in ["FLOAT"]:
                    df[col] = pd.to_numeric(df[col])
                    min_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=True
                    )
                    max_value_dbms = get_sql_dialect_floating_point_infinity_value(
                        schema=dataset_type, negative=False
                    )
                    for api_schema_type in ["api_np", "api_cast"]:
                        min_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=True
                        )
                        max_value_api = get_sql_dialect_floating_point_infinity_value(
                            schema=api_schema_type, negative=False
                        )
                        df.replace(
                            to_replace=[min_value_api, max_value_api],
                            value=[min_value_dbms, max_value_dbms],
                            inplace=True,
                        )
                elif type_ in ["DATETIME", "TIMESTAMP"]:
                    df[col] = pd.to_datetime(df[col])

        if table_name is None:
            table_name = "test_data_" + "".join(
                [random.choice(string.ascii_letters + string.digits) for _ in range(8)]
            )
        df.to_sql(
            name=table_name,
            con=conn,
            index=False,
            dtype=sql_dtypes,
            if_exists="replace",
        )

        # Build a SqlAlchemyDataset using that database
        return SqlAlchemyDataset(table_name, engine=conn, profiler=profiler, caching=caching)
    elif dataset_type == "SparkDFDataset":
        from pyspark.sql import SparkSession
        import pyspark.sql.types as sparktypes

        SPARK_TYPES = {
            "StringType": sparktypes.StringType,
            "IntegerType": sparktypes.IntegerType,
            "LongType": sparktypes.LongType,
            "DateType": sparktypes.DateType,
            "TimestampType": sparktypes.TimestampType,
            "FloatType": sparktypes.FloatType,
            "DoubleType": sparktypes.DoubleType,
            "BooleanType": sparktypes.BooleanType,
            "DataType": sparktypes.DataType,
            "NullType": sparktypes.NullType,
        }
        spark = SparkSession.builder.getOrCreate()
        # We need to allow null values in some column types that do not support them natively, so we skip
        # use of df in this case.
        data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
        if schemas and "spark" in schemas:
            schema = schemas["spark"]
            # sometimes first method causes Spark to throw a TypeError
            try:
                spark_schema = sparktypes.StructType(
                    [
                        sparktypes.StructField(column, SPARK_TYPES[schema[column]](), True)
                        for column in schema
                    ]
                )
                # We create these every time, which is painful for testing
                # However nuance around null treatment as well as the desire
                # for real datetime support in tests makes this necessary
                data = copy.deepcopy(data)
                if "ts" in data:
                    print(data)
                    print(schema)
                for col in schema:
                    type_ = schema[col]
                    if type_ in ["IntegerType", "LongType"]:
                        # Ints cannot be None...but None can be valid in Spark (as Null)
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(int(val))
                        data[col] = vals
                    elif type_ in ["FloatType", "DoubleType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(float(val))
                        data[col] = vals
                    elif type_ in ["DateType", "TimestampType"]:
                        vals = []
                        for val in data[col]:
                            if val is None:
                                vals.append(val)
                            else:
                                vals.append(parse(val))
                        data[col] = vals
                # Do this again, now that we have done type conversion using the provided schema
                data_reshaped = list(zip(*[v for _, v in data.items()]))  # create a list of rows
                spark_df = spark.createDataFrame(data_reshaped, schema=spark_schema)
            except TypeError:
                string_schema = sparktypes.StructType(
                    [sparktypes.StructField(column, sparktypes.StringType()) for column in schema]
                )
                spark_df = spark.createDataFrame(data_reshaped, string_schema)
                for c in spark_df.columns:
                    spark_df = spark_df.withColumn(c, spark_df[c].cast(SPARK_TYPES[schema[c]]()))
        elif len(data_reshaped) == 0:
            # if we have an empty dataset and no schema, need to assign an arbitrary type
            columns = list(data.keys())
            spark_schema = sparktypes.StructType(
                [sparktypes.StructField(column, sparktypes.StringType()) for column in columns]
            )
            spark_df = spark.createDataFrame(data_reshaped, spark_schema)
        else:
            # if no schema provided, uses Spark's schema inference
            columns = list(data.keys())
            spark_df = spark.createDataFrame(data_reshaped, columns)
        return SparkDFDataset(spark_df, profiler=profiler, caching=caching)
    else:
        raise ValueError("Unknown dataset_type " + str(dataset_type))
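# Usage sketch (hypothetical, not from the original suite): exercising the "sqlite"
# branch of the helper above with an explicit schema and table name. The data,
# schema, and table name are illustrative; assumes the module-level imports the
# helper relies on (create_engine, SQLITE_TYPES, etc.) are available.
def test_get_dataset_sqlite_usage_sketch():
    data = {"id": [1, 2, 3], "price": [9.99, 19.99, None]}
    schemas = {"sqlite": {"id": "INTEGER", "price": "FLOAT"}}
    dataset = get_dataset("sqlite", data, schemas=schemas, table_name="example_prices")
    assert dataset.expect_column_to_exist("price").success
    assert dataset.expect_table_row_count_to_equal(3).success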