def test_basic_operation(basic_sqlalchemy_datasource):
    table_generator = TableBatchKwargsGenerator(
        datasource=basic_sqlalchemy_datasource,
        assets={
            "my_asset": {
                "table": "my_table",
                "schema": "$schema",  # Note the use of python $-template to allow substitution
            },
            "my_no_schema_asset": {"table": "important_data"},
            "dangerous.named_asset": {
                # Usually, a period in the name will be interpreted as implying the schema
                "table": "named_asset",
                "schema": "$schema",
            },
        },
    )

    batch_kwargs = table_generator.yield_batch_kwargs(
        "my_asset", query_parameters={"schema": "foo"}
    )
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema == "foo"
    assert batch_kwargs.table == "my_table"

    # Note that schema is ignored in this case -- it's not part of the defined asset
    batch_kwargs = table_generator.yield_batch_kwargs(
        "my_no_schema_asset", query_parameters={"schema": "foo"}
    )
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema is None
    assert batch_kwargs.table == "important_data"

    # Here, it's just a classic acceptable case
    batch_kwargs = table_generator.yield_batch_kwargs("my_no_schema_asset")
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema is None
    assert batch_kwargs.table == "important_data"

    # Note that in this case we have a confusingly named asset, since it "could" be a schema + table name.
    # Since it's not available to be found via introspection, however, and it *is* a valid name, this works fine.
    batch_kwargs = table_generator.yield_batch_kwargs(
        "dangerous.named_asset", query_parameters={"schema": "bar"}
    )
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema == "bar"
    assert batch_kwargs.table == "named_asset"

    # When a data asset is configured to require a template key but it is not provided, we should
    # fail with an informative message
    with pytest.raises(BatchKwargsError) as exc:
        table_generator.yield_batch_kwargs("my_asset")
    assert "missing template key" in exc.value.message
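
# A minimal sketch of the $-placeholder substitution the "$schema" values above rely on,
# using only the standard library. Assumption: the generator resolves placeholders with
# string.Template (or an equivalent $-style mechanism); the config and parameter values
# below are illustrative.
def _schema_template_substitution_sketch():
    from string import Template  # stdlib $-style templating

    asset_config = {"table": "my_table", "schema": "$schema"}
    query_parameters = {"schema": "foo"}

    # Substitute every configured value against the supplied query_parameters
    resolved = {
        key: Template(value).substitute(query_parameters)
        for key, value in asset_config.items()
    }
    assert resolved == {"table": "my_table", "schema": "foo"}

    # A missing template key raises KeyError -- the analogue of the BatchKwargsError
    # ("missing template key") path exercised at the end of test_basic_operation
    try:
        Template("$schema").substitute({})
    except KeyError:
        pass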

def test_db_introspection(postgresql_sqlalchemy_datasource, caplog):
    table_generator = TableBatchKwargsGenerator(
        datasource=postgresql_sqlalchemy_datasource
    )

    # Get a list of tables visible inside the defined database
    assets = table_generator.get_available_data_asset_names()
    assert len(assets["names"]) > 0
    table_name = assets["names"].pop()[0]

    # We should be able to get kwargs based on discovery, without having them specifically configured
    batch_kwargs = table_generator.build_batch_kwargs(table_name)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"

    # ... and that should work with and without explicit inclusion of the schema
    batch_kwargs = table_generator.yield_batch_kwargs("public." + table_name)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"

    # We should be able to pass a limit; calling yield again with different kwargs should log a warning
    caplog.clear()
    batch_kwargs = table_generator.yield_batch_kwargs("public." + table_name, limit=10)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"
    assert batch_kwargs.limit == 10
    assert [rec.message for rec in caplog.records] == [
        "Asked to yield batch_kwargs using different supplemental kwargs. Resetting iterator to "
        "use new supplemental kwargs."
    ]
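
# The introspection test above and the CLI helper below both consume the entries returned by
# get_available_data_asset_names()["names"] as two-element sequences (name[0] is the table
# name, name[1] its type). A small sketch of consuming that shape; the example entries are
# assumed for illustration only.
def _available_asset_names_sketch():
    # Hypothetical return value mirroring the structure used by the test and the CLI helper
    assets = {"names": [("titanic", "table"), ("npi_small", "view")]}

    # Formatting mirrors the "{} ({})" display built for the CLI table listing
    labels = ["{} ({})".format(name, asset_type) for name, asset_type in assets["names"]]
    assert labels == ["titanic (table)", "npi_small (view)"]

    # Pulling the bare table name, as test_db_introspection does with assets["names"].pop()[0]
    table_name = assets["names"].pop()[0]
    assert table_name == "npi_small"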

def _get_batch_kwargs_for_sqlalchemy_datasource(
    context, datasource_name, additional_batch_kwargs=None
):
    data_asset_name = None
    sql_query = None
    datasource = context.get_datasource(datasource_name)
    msg_prompt_how_to_connect_to_data = """
You have selected a datasource that is a SQL database. How would you like to specify the data?
1. Enter a table name and schema
2. Enter a custom SQL query
3. List all tables in the database (this may take a very long time)
"""
    default_schema = _get_default_schema(datasource)
    temp_generator = TableBatchKwargsGenerator(name="temp", datasource=datasource)

    while data_asset_name is None:
        single_or_multiple_data_asset_selection = click.prompt(
            msg_prompt_how_to_connect_to_data,
            type=click.Choice(["1", "2", "3"]),
            show_choices=False,
        )
        if single_or_multiple_data_asset_selection == "1":  # name the table and schema
            schema_name = click.prompt(
                "Please provide the schema name of the table (this is optional)",
                default=default_schema,
            )
            table_name = click.prompt("Please provide the table name (this is required)")
            data_asset_name = f"{schema_name}.{table_name}"

        elif single_or_multiple_data_asset_selection == "2":  # SQL query
            sql_query = click.prompt("Please provide the SQL query")
            data_asset_name = "custom_sql_query"

        elif single_or_multiple_data_asset_selection == "3":  # list it all
            msg_prompt_warning = (
                "Warning: If you have a large number of tables in your datasource, "
                "this may take a very long time.\n"
                "Would you like to continue?"
            )
            confirmation = click.prompt(
                msg_prompt_warning, type=click.Choice(["y", "n"]), show_choices=True
            )
            if confirmation == "y":
                # avoid this call until necessary
                available_data_asset_names = temp_generator.get_available_data_asset_names()[
                    "names"
                ]
                available_data_asset_names_str = [
                    "{} ({})".format(name[0], name[1])
                    for name in available_data_asset_names
                ]

                data_asset_names_to_display = available_data_asset_names_str
                choices = "\n".join(
                    [
                        " {}. {}".format(i, name)
                        for i, name in enumerate(data_asset_names_to_display, 1)
                    ]
                )
                msg_prompt_enter_data_asset_name = (
                    "\nWhich table would you like to use? (Choose one)\n"
                )
                prompt = msg_prompt_enter_data_asset_name + choices + os.linesep
                selection = click.prompt(prompt, show_default=False)

                selection = selection.strip()
                try:
                    data_asset_index = int(selection) - 1
                    try:
                        data_asset_name = [
                            name[0] for name in available_data_asset_names
                        ][data_asset_index]
                    except IndexError:
                        print(
                            f"You have specified {selection}, which is an incorrect index"
                        )
                except ValueError:
                    print(
                        f"You have specified {selection}, which is an incorrect value"
                    )

    if additional_batch_kwargs is None:
        additional_batch_kwargs = {}

    # Some backends require named temporary table parameters. We specifically elicit those and add them
    # where appropriate.
    temp_table_kwargs = dict()
    if datasource.engine.dialect.name.lower() == "bigquery":
        # bigquery requires special handling: a named temporary table for validation
        bigquery_temp_table = click.prompt(
            "Great Expectations will create a table to use for validation."
            + os.linesep
            + "Please enter a name for this table: ",
            default="SOME_PROJECT.SOME_DATASET.ge_tmp_" + str(uuid.uuid4())[:8],
        )
        temp_table_kwargs = {
            "bigquery_temp_table": bigquery_temp_table,
        }

    # now build the actual batch_kwargs
    if sql_query is None:
        batch_kwargs = temp_generator.build_batch_kwargs(
            data_asset_name, **additional_batch_kwargs
        )
        batch_kwargs.update(temp_table_kwargs)
    else:
        batch_kwargs = {"query": sql_query, "datasource": datasource_name}
        batch_kwargs.update(temp_table_kwargs)
        # Validate the custom query by building a throwaway dataset from it
        BridgeValidator(
            batch=datasource.get_batch(batch_kwargs),
            expectation_suite=ExpectationSuite("throwaway"),
        ).get_dataset()

    batch_kwargs["data_asset_name"] = data_asset_name
    return data_asset_name, batch_kwargs
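
# A minimal usage sketch of the helper above, assuming an existing DataContext and a SQL
# datasource registered under the (illustrative) name "my_postgres_db". The flow is
# interactive -- click prompts drive the table/query selection -- so this shows the call
# shape rather than something to run unattended.
def _example_interactive_flow(context):
    data_asset_name, batch_kwargs = _get_batch_kwargs_for_sqlalchemy_datasource(
        context, "my_postgres_db", additional_batch_kwargs={"limit": 1000}
    )
    # batch_kwargs now carries the table/schema (or custom query), any temp-table settings,
    # and the chosen data_asset_name, ready to hand to datasource.get_batch(...)
    assert batch_kwargs["data_asset_name"] == data_asset_name
    return data_asset_name, batch_kwargs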