def test_sqlalchemy_source_limit(sqlitedb_engine):
    """Batch specs with `limit`/`offset` yield a one-row temp table dataset."""
    frame_a = pd.DataFrame(
        {"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]}
    )
    frame_b = pd.DataFrame(
        {"col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"]}
    )
    frame_a.to_sql("table_1", con=sqlitedb_engine, index=True)
    frame_b.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")

    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch(
        {"table": "table_1", "limit": 1, "offset": 2}
    )
    assert isinstance(limited_batch, Batch)

    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()

    # A temporary table should have been generated for the limited selection.
    assert limited_dataset._table.name.startswith("ge_tmp_")
    # The limit keeps exactly one row...
    assert len(limited_dataset.head(10)) == 1
    # ...and the offset skipped the first two rows, so col_1 starts at 3.
    assert limited_dataset.head(10)["col_1"][0] == 3
def test_sqlalchemy_source_limit(sqlitedb_engine):
    """Batch specs with `limit`/`offset` yield a one-row temp table dataset."""
    first_frame = pd.DataFrame(
        {"col_1": [1, 2, 3, 4, 5], "col_2": ["a", "b", "c", "d", "e"]}
    )
    second_frame = pd.DataFrame(
        {"col_1": [0, 1, 2, 3, 4], "col_2": ["b", "c", "d", "e", "f"]}
    )
    first_frame.to_sql("table_1", con=sqlitedb_engine, index=True)
    second_frame.to_sql("table_2", con=sqlitedb_engine, index=True, schema="main")

    datasource = SqlAlchemyDatasource("SqlAlchemy", engine=sqlitedb_engine)
    limited_batch = datasource.get_batch(
        {"table": "table_1", "limit": 1, "offset": 2}
    )
    assert isinstance(limited_batch, Batch)

    limited_dataset = Validator(
        limited_batch,
        expectation_suite=ExpectationSuite("test"),
        expectation_engine=SqlAlchemyDataset,
    ).get_dataset()

    # We have generated a temporary table for the limited selection.
    assert limited_dataset._table.name.startswith("ge_tmp_")
    # Exactly one row survives the limit...
    assert len(limited_dataset.head(10)) == 1
    # ...and the offset was applied, skipping the first two rows.
    assert limited_dataset.head(10)["col_1"][0] == 3
def test_to_make_sure_splitter_and_sampler_methods_are_optional(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """Splitter and sampler entries in a batch spec are each optional.

    The same load-and-count sequence was repeated three times inline; it is
    factored into a local helper so each case reads as one assertion.
    """
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine

    def _row_count_for_spec(spec_dict):
        # Load the batch described by spec_dict and return its row count.
        batch_data, _ = execution_engine.get_batch_data_and_markers(
            batch_spec=SqlAlchemyDatasourceBatchSpec(spec_dict)
        )
        execution_engine.load_batch_data("__", batch_data)
        return len(Validator(execution_engine).head(fetch_all=True))

    # Sampler only (no splitter): mod-10 sampling on id keeps 12 rows.
    assert (
        _row_count_for_spec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "sampling_method": "_sample_using_mod",
                "sampling_kwargs": {"column_name": "id", "mod": 10, "value": 8},
            }
        )
        == 12
    )

    # Neither splitter nor sampler: the full table (120 rows) is returned.
    assert (
        _row_count_for_spec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
            }
        )
        == 120
    )

    # Splitter only (whole-table split): still the full table.
    assert (
        _row_count_for_spec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "batch_identifiers": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
            }
        )
        == 120
    )
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """`_sample_using_limit` caps the batch at n rows."""
    engine = test_cases_for_sql_data_connector_sqlite_execution_engine
    spec = SqlAlchemyDatasourceBatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "batch_identifiers": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_limit",
            "sampling_kwargs": {"n": 20},
        }
    )
    batch_data, _batch_markers = engine.get_batch_data_and_markers(batch_spec=spec)

    validator = Validator(engine, batches=[Batch(data=batch_data)])
    # Exactly the requested sample size is loaded.
    assert len(validator.head(fetch_all=True)) == 20
    # The sampled rows are not all from a single date, so this must fail.
    result = validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    )
    assert not result.success
def test_instantiation_via_url_and_retrieve_data_with_other_dialect(sa):
    """Ensure that we can still retrieve data when the dialect is not recognized."""
    # 1. Create an engine backed by the sqlite test database.
    db_file = file_relative_path(
        __file__,
        os.path.join("..", "test_sets", "test_cases_for_sql_data_connector.db"),
    )
    engine = SqlAlchemyExecutionEngine(url="sqlite:///" + db_file)
    assert engine.connection_string is None
    assert engine.credentials is None
    assert engine.url[-36:] == "test_cases_for_sql_data_connector.db"

    # 2. Pretend the dialect is one not listed in GESqlDialect.
    engine.engine.dialect.name = "other_dialect"

    # 3. Fetch a limited sample of rows.
    sample_size: int = 10
    batch_data, _ = engine.get_batch_data_and_markers(
        batch_spec=SqlAlchemyDatasourceBatchSpec(
            table_name="table_partitioned_by_date_column__A",
            sampling_method="_sample_using_limit",
            sampling_kwargs={"n": sample_size},
        )
    )

    # 4. The unknown dialect maps to OTHER, and data still comes back.
    assert batch_data.dialect == GESqlDialect.OTHER
    engine.load_batch_data("__", batch_data)
    validator = Validator(engine)
    assert len(validator.head(fetch_all=True)) == sample_size
def test_sampling_method__limit(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """`_sample_using_limit` caps the loaded batch at n rows.

    Also asserts that an expectation over the sampled rows fails, proving the
    sample is not restricted to the single date in the value set.
    """
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine
    batch_data, batch_markers = execution_engine.get_batch_data_and_markers(
        batch_spec=BatchSpec(
            {
                "table_name": "table_partitioned_by_date_column__A",
                "partition_definition": {},
                "splitter_method": "_split_on_whole_table",
                "splitter_kwargs": {},
                "sampling_method": "_sample_using_limit",
                "sampling_kwargs": {"n": 20},
            }
        )
    )
    execution_engine.load_batch_data("__", batch_data)
    validator = Validator(execution_engine)
    assert len(validator.head(fetch_all=True)) == 20
    # Idiomatic falsiness check instead of `== False` (flake8 E712).
    assert not validator.expect_column_values_to_be_in_set(
        "date", value_set=["2020-01-02"]
    ).success
def test_sampling_method__a_list(
    test_cases_for_sql_data_connector_sqlite_execution_engine,
):
    """`_sample_using_a_list` keeps only the rows whose id is in the list."""
    execution_engine = test_cases_for_sql_data_connector_sqlite_execution_engine
    spec = BatchSpec(
        {
            "table_name": "table_partitioned_by_date_column__A",
            "partition_definition": {},
            "splitter_method": "_split_on_whole_table",
            "splitter_kwargs": {},
            "sampling_method": "_sample_using_a_list",
            "sampling_kwargs": {
                "column_name": "id",
                "value_list": [10, 20, 30, 40],
            },
        }
    )
    batch_data, _markers = execution_engine.get_batch_data_and_markers(
        batch_spec=spec
    )
    execution_engine.load_batch_data("__", batch_data)

    # Four ids in the value list -> four rows in the sampled batch.
    assert len(Validator(execution_engine).head(fetch_all=True)) == 4