def test_basic_operation(basic_sqlalchemy_datasource):
    table_generator = TableBatchKwargsGenerator(
        datasource=basic_sqlalchemy_datasource,
        assets={
            "my_asset": {
                "table": "my_table",
                "schema":
                "$schema"  # Note the use of python $-template to allow substitution
            },
            "my_no_schema_asset": {
                "table": "important_data"
            },
            "dangerous.named_asset":
            {  # Usually, a period in the name will be interpreted as implying the schema
                "table": "named_asset",
                "schema": "$schema"
            }
        })

    batch_kwargs = table_generator.yield_batch_kwargs(
        "my_asset", query_parameters={"schema": "foo"})
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema == "foo"
    assert batch_kwargs.table == "my_table"

    # Note that schema is ignored in this case -- it's not part of the defined asset
    batch_kwargs = table_generator.yield_batch_kwargs(
        "my_no_schema_asset", query_parameters={"schema": "foo"})
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema is None
    assert batch_kwargs.table == "important_data"

    # Here, it's just a classic acceptable case
    batch_kwargs = table_generator.yield_batch_kwargs("my_no_schema_asset")
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema is None
    assert batch_kwargs.table == "important_data"

    # Note that in this case, we have a confusingly named asset, since it "could" be a schema + table name
    # Since it's not available to be found via introspection, however, and it *is* a valid name, this works fine
    batch_kwargs = table_generator.yield_batch_kwargs(
        "dangerous.named_asset", query_parameters={"schema": "bar"})
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.schema == "bar"
    assert batch_kwargs.table == "named_asset"

    # When a data asset is configured to require a template but it is not available, we should
    # fail with an informative message
    with pytest.raises(BatchKwargsError) as exc:
        table_generator.yield_batch_kwargs("my_asset")
    assert "missing template key" in exc.value.message
def test_db_introspection(postgresql_sqlalchemy_datasource, caplog):
    table_generator = TableBatchKwargsGenerator(
        datasource=postgresql_sqlalchemy_datasource)

    # Get a list of tables visible inside the defined database
    assets = table_generator.get_available_data_asset_names()
    assert len(assets["names"]) > 0
    table_name = assets["names"].pop()[0]

    # We should be able to get kwargs without having them specifically configured based on discovery
    batch_kwargs = table_generator.build_batch_kwargs(table_name)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"

    # ... and that should work with and without explicit inclusion of the schema
    batch_kwargs = table_generator.yield_batch_kwargs("public." + table_name)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"

    # We should be able to pass a limit; but calling yield again with different kwargs should yield a warning
    caplog.clear()
    batch_kwargs = table_generator.yield_batch_kwargs("public." + table_name,
                                                      limit=10)
    assert isinstance(batch_kwargs, SqlAlchemyDatasourceTableBatchKwargs)
    assert batch_kwargs.table == table_name
    assert batch_kwargs.schema == "public"
    assert batch_kwargs.limit == 10
    assert [rec.message for rec in caplog.records] == [
        "Asked to yield batch_kwargs using different supplemental kwargs. Resetting iterator to "
        "use new supplemental kwargs."
    ]
Example #3
0
def _get_batch_kwargs_for_sqlalchemy_datasource(context,
                                                datasource_name,
                                                additional_batch_kwargs=None):
    data_asset_name = None
    sql_query = None
    datasource = context.get_datasource(datasource_name)
    msg_prompt_how_to_connect_to_data = """
You have selected a datasource that is a SQL database. How would you like to specify the data?
1. Enter a table name and schema
2. Enter a custom SQL query
3. List all tables in the database (this may take a very long time)
"""
    default_schema = _get_default_schema(datasource)
    temp_generator = TableBatchKwargsGenerator(name="temp",
                                               datasource=datasource)

    while data_asset_name is None:
        single_or_multiple_data_asset_selection = click.prompt(
            msg_prompt_how_to_connect_to_data,
            type=click.Choice(["1", "2", "3"]),
            show_choices=False,
        )
        if single_or_multiple_data_asset_selection == "1":  # name the table and schema
            schema_name = click.prompt(
                "Please provide the schema name of the table (this is optional)",
                default=default_schema,
            )
            table_name = click.prompt(
                "Please provide the table name (this is required)")
            data_asset_name = f"{schema_name}.{table_name}"

        elif single_or_multiple_data_asset_selection == "2":  # SQL query
            sql_query = click.prompt("Please provide the SQL query")
            data_asset_name = "custom_sql_query"

        elif single_or_multiple_data_asset_selection == "3":  # list it all
            msg_prompt_warning = fr"""Warning: If you have a large number of tables in your datasource, this may take a very long time. \m
                    Would you like to continue?"""
            confirmation = click.prompt(msg_prompt_warning,
                                        type=click.Choice(["y", "n"]),
                                        show_choices=True)
            if confirmation == "y":
                # avoid this call until necessary
                available_data_asset_names = (
                    temp_generator.get_available_data_asset_names()["names"])
                available_data_asset_names_str = [
                    "{} ({})".format(name[0], name[1])
                    for name in available_data_asset_names
                ]

                data_asset_names_to_display = available_data_asset_names_str
                choices = "\n".join([
                    "    {}. {}".format(i, name)
                    for i, name in enumerate(data_asset_names_to_display, 1)
                ])
                msg_prompt_enter_data_asset_name = (
                    "\nWhich table would you like to use? (Choose one)\n")
                prompt = msg_prompt_enter_data_asset_name + choices + os.linesep
                selection = click.prompt(prompt, show_default=False)
                selection = selection.strip()
                try:
                    data_asset_index = int(selection) - 1
                    try:
                        data_asset_name = [
                            name[0] for name in available_data_asset_names
                        ][data_asset_index]

                    except IndexError:
                        print(
                            f"You have specified {selection}, which is an incorrect index"
                        )
                        pass
                except ValueError:
                    print(
                        f"You have specified {selection}, which is an incorrect value"
                    )
                    pass

    if additional_batch_kwargs is None:
        additional_batch_kwargs = {}

    # Some backends require named temporary table parameters. We specifically elicit those and add them
    # where appropriate.
    temp_table_kwargs = dict()
    datasource = context.get_datasource(datasource_name)

    if datasource.engine.dialect.name.lower() == "bigquery":
        # bigquery also requires special handling
        bigquery_temp_table = click.prompt(
            "Great Expectations will create a table to use for "
            "validation." + os.linesep +
            "Please enter a name for this table: ",
            default="SOME_PROJECT.SOME_DATASET.ge_tmp_" +
            str(uuid.uuid4())[:8],
        )
        temp_table_kwargs = {
            "bigquery_temp_table": bigquery_temp_table,
        }

    # now building the actual batch_kwargs
    if sql_query is None:
        batch_kwargs = temp_generator.build_batch_kwargs(
            data_asset_name, **additional_batch_kwargs)
        batch_kwargs.update(temp_table_kwargs)
    else:
        batch_kwargs = {"query": sql_query, "datasource": datasource_name}
        batch_kwargs.update(temp_table_kwargs)
        BridgeValidator(
            batch=datasource.get_batch(batch_kwargs),
            expectation_suite=ExpectationSuite("throwaway"),
        ).get_dataset()

    batch_kwargs["data_asset_name"] = data_asset_name
    return data_asset_name, batch_kwargs