def initialized_sqlite_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))

    engine = create_engine("sqlite:///{}".format(titanic_sqlite_db_file))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(engine.url),
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
    assert_no_logging_messages_or_tracebacks(caplog, result)

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {"class_name": "SqlAlchemyDatasource", "name": "titanic"}
    ]
    return project_dir

def initialized_sqlite_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file, sa
):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))

    engine = sa.create_engine(
        "sqlite:///{}".format(titanic_sqlite_db_file), pool_recycle=3600
    )
    inspector = sa.inspect(engine)

    # get the default schema and table for testing
    schemas = inspector.get_schema_names()
    default_schema = schemas[0]

    tables = [
        table_name for table_name in inspector.get_table_names(schema=default_schema)
    ]
    default_table = tables[0]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{url}\n\n\n1\n{schema}\n{table}\nwarning\n\n\n\n".format(
            url=engine.url, schema=default_schema, table=default_table
        ),
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
    assert_no_logging_messages_or_tracebacks(caplog, result)

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {
            "class_name": "SqlAlchemyDatasource",
            "name": "titanic",
            "module_name": "great_expectations.datasource",
            "credentials": {"url": str(engine.url)},
            "data_asset_type": {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            },
        }
    ]
    return project_dir

def test_cli_datasource_new_connection_string(
    mock_subprocess, empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    root_dir = empty_data_context.root_directory
    context = DataContext(root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input="2\n6\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(["jupyter", "notebook", expected_notebook])

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)

    # mock the user adding a connection string into the notebook by overwriting
    # the right cell
    assert "connection_string" in nb["cells"][5]["source"]
    nb["cells"][5]["source"] = 'connection_string = "sqlite://"'

    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)

    assert context.list_datasources() == [
        {
            "class_name": "SimpleSqlalchemyDatasource",
            "connection_string": "sqlite://",
            "introspection": {
                "whole_table": {"data_asset_name_suffix": "__whole_table"}
            },
            "module_name": "great_expectations.datasource",
            "name": "my_datasource",
        }
    ]
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser,
    caplog,
    initialized_sqlite_project,
    titanic_sqlite_db,
    empty_sqlite_db,
):
    project_dir = initialized_sqlite_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    context = DataContext(ge_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )
    assert len(context.list_datasources()) == 2

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 0

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "This looks like an existing project that" in stdout
    assert "appears complete" in stdout
    assert "Would you like to build & view this project's Data Docs" in stdout

    assert_no_logging_messages_or_tracebacks(caplog, result)

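# `_add_datasource_and_credentials_to_context` is called by several tests in
# this section but never defined here. A minimal sketch of what such a helper
# could look like, assuming the v2 SqlAlchemyDatasource API and that the
# *_sqlite_db fixtures expose a SQLAlchemy engine URL; the real helper in the
# test suite may differ:
def _add_datasource_and_credentials_to_context(context, datasource_name, sqlite_engine):
    original_datasources = context.list_datasources()

    # Store the connection string as a config variable so the datasource config
    # can reference it via "${...}" substitution instead of hard-coding it.
    credentials = {"url": str(sqlite_engine.url)}
    context.save_config_variable(datasource_name, credentials)

    context.add_datasource(
        datasource_name,
        initialize=False,
        module_name="great_expectations.datasource",
        class_name="SqlAlchemyDatasource",
        data_asset_type={"class_name": "SqlAlchemyDataset"},
        credentials="${" + datasource_name + "}",
    )
    assert len(context.list_datasources()) == len(original_datasources) + 1
    return context
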
def test_cli_datasorce_list(empty_data_context, empty_sqlite_db, caplog):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()

    assert "[]" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()

    if PY2:
        # deal with legacy python dictionary sorting: check both keys
        # independently rather than relying on their order
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': u'SqlAlchemyDatasource'" in stdout
        assert 60 <= len(stdout) <= 75
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'SqlAlchemyDatasource'}]"
            in stdout
        )
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_delete_on_project_with_one_datasource_declining_prompt_does_not_delete(
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource delete my_datasource",
        input="n\n",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0
    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource `my_datasource` was not deleted." in stdout

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 1
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_delete_on_project_with_one_datasource_assume_yes_flag(
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api --assume-yes datasource delete my_datasource",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0

    assert "Would you like to proceed? [Y/n]:" not in stdout
    # This assertion is extra assurance since this test is too permissive if we
    # change the confirmation message
    assert "[Y/n]" not in stdout

    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource deleted successfully." in stdout

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 0
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasorce_new(caplog, empty_data_context, filesystem_csv_2):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "new", "-d", project_root_dir],
        input="1\n1\n%s\nmynewsource\n" % str(filesystem_csv_2),
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "What are you processing your files with?" in stdout
    assert "Give your new data source a short name." in stdout
    assert "A new datasource 'mynewsource' was added to your project." in stdout

    assert result.exit_code == 0

    config_path = os.path.join(project_root_dir, DataContext.GE_YML)
    with open(config_path) as f:
        config = yaml.load(f)
    datasources = config["datasources"]
    assert "mynewsource" in datasources.keys()
    data_source_class = datasources["mynewsource"]["data_asset_type"]["class_name"]
    assert data_source_class == "PandasDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    capsys,
    caplog,
    initialized_project,
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    # mangle the project to remove all traces of a suite and validations
    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "Titanic", "warning.json"))
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    validations_dir = os.path.join(uncommitted_dir, "validations")
    shutil.rmtree(validations_dir)
    os.mkdir(validations_dir)
    shutil.rmtree(os.path.join(uncommitted_dir, "data_docs", "local_site"))

    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    csv_path = os.path.join(project_dir, "data", "Titanic.csv")

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="1\n1\n{}\nmy_suite\n\n".format(csv_path),
        catch_exceptions=False,
    )

    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    stdout = result.stdout
    assert result.exit_code == 0

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Enter the path (relative or absolute) of a data file" in stdout
    assert "Name the new expectation suite [Titanic.warning]:" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout
    )
    assert "A new Expectation suite 'my_suite' was added to your project" in stdout
    assert "Great Expectations is now set up." in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "files_datasource" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert context.list_datasources() == [
        {"name": "files_datasource", "class_name": "PandasDatasource"}
    ]
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)

def select_datasource(context: DataContext, datasource_name: str = None) -> Datasource:
    """Select a datasource interactively."""
    # TODO consolidate all the myriad CLI tests into this
    data_source = None

    if datasource_name is None:
        data_sources = sorted(context.list_datasources(), key=lambda x: x["name"])
        if len(data_sources) == 0:
            cli_message(
                "<red>No datasources found in the context. To add a datasource, run "
                "`great_expectations datasource new`</red>"
            )
        elif len(data_sources) == 1:
            datasource_name = data_sources[0]["name"]
        else:
            choices = "\n".join(
                [
                    "    {}. {}".format(i, data_source["name"])
                    for i, data_source in enumerate(data_sources, 1)
                ]
            )
            option_selection = click.prompt(
                "Select a datasource" + "\n" + choices + "\n",
                type=click.Choice(
                    [str(i) for i, data_source in enumerate(data_sources, 1)]
                ),
                show_choices=False,
            )
            datasource_name = data_sources[int(option_selection) - 1]["name"]

    if datasource_name is not None:
        data_source = context.get_datasource(datasource_name)

    return data_source

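# A hypothetical illustration of how `select_datasource` resolves each case;
# not part of the original test suite:
#
#     context = DataContext("/path/to/project/great_expectations")
#
#     select_datasource(context, datasource_name="titanic")
#     # -> returns context.get_datasource("titanic") without prompting
#
#     select_datasource(context)
#     # 0 datasources  -> prints a red "No datasources found" message, returns None
#     # 1 datasource   -> returns it directly, no prompt
#     # 2+ datasources -> numbered click.prompt; returns the chosen datasource
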
def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser, caplog, initialized_project, filesystem_csv_2
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    context = DataContext(ge_dir)
    context.add_datasource(
        "another_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )
    assert len(context.list_datasources()) == 2

    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="n\n",
            catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 0

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "This looks like an existing project that" in stdout
    assert "appears complete" in stdout
    assert "Would you like to build & view this project's Data Docs" in stdout

    assert_no_logging_messages_or_tracebacks(caplog, result)

def initialized_project(mock_webbrowser, tmp_path_factory):
    """This is an initialized project through the CLI."""
    project_dir = str(tmp_path_factory.mktemp("my_rad_project"))
    os.makedirs(os.path.join(project_dir, "data"))
    data_folder_path = os.path.join(project_dir, "data")
    data_path = os.path.join(project_dir, "data/Titanic.csv")
    fixture_path = file_relative_path(__file__, "../test_sets/Titanic.csv")
    shutil.copy(fixture_path, data_path)

    runner = CliRunner(mix_stderr=False)
    _ = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n1\n1\n{}\n\n\n\n2\n{}\n\n\n\n".format(data_folder_path, data_path),
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    context = DataContext(os.path.join(project_dir, DataContext.GE_DIR))
    assert isinstance(context, DataContext)
    assert len(context.list_datasources()) == 1
    return project_dir

def test_cli_datasource_profile_with_valid_data_asset_arg(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    context = empty_data_context

    project_root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "f1",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Profiling 'my_datasource'" in stdout
    assert "The following Data Docs sites were built:\n- local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    suite_name = validation.meta["expectation_suite_name"]
    assert suite_name == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)

def test_cli_datasource_profile_with_skip_prompt_flag(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "profile", "-d", project_root_dir, "--no-view", "-y"],
        input="Y\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" not in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)

def generate_basic_suites(
    kedro_context: KedroContext,
    ge_context: DataContext,
    empty=False,
    replace=False,
    batch_kwargs=None,
):
    from great_expectations.profile import BasicSuiteBuilderProfiler

    if batch_kwargs is None:
        batch_kwargs = {}

    catalog = kedro_context.catalog
    existing_datasource_names = {ds["name"] for ds in ge_context.list_datasources()}

    for dataset_name in catalog.list():
        suite_name = generate_basic_suite_name(dataset_name)
        if suite_name in ge_context.list_expectation_suite_names() and not replace:
            continue

        datasource_name = generate_datasource_name(dataset_name)
        if datasource_name not in existing_datasource_names:
            continue

        dataset = catalog._get_dataset(dataset_name)
        data_path = str(dataset._filepath)
        dataasset_name, _ = os.path.splitext(os.path.basename(data_path))
        suite_batch_kwargs = {
            "datasource": datasource_name,
            "data_asset_name": dataasset_name,
            "path": data_path,
            "reader_options": dataset._load_args,
        }

        batch_kwargs_generator_name = "path"
        profiler_configuration = "demo"
        additional_batch_kwargs = batch_kwargs
        run_id = datetime.datetime.now(datetime.timezone.utc).strftime(
            "%Y%m%dT%H%M%S.%fZ"
        )

        if empty:
            create_empty_suite(ge_context, suite_name, suite_batch_kwargs)
        else:
            ge_context.profile_data_asset(
                datasource_name,
                batch_kwargs_generator_name=batch_kwargs_generator_name,
                data_asset_name=dataasset_name,
                batch_kwargs=suite_batch_kwargs,
                profiler=BasicSuiteBuilderProfiler,
                profiler_configuration=profiler_configuration,
                expectation_suite_name=suite_name,
                run_id=run_id,
                additional_batch_kwargs=additional_batch_kwargs,
            )

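# `generate_basic_suite_name` and `generate_datasource_name` are referenced
# above (and in `generate_datasources` below) but not defined in this section.
# Hypothetical sketches, assuming a simple per-catalog-entry naming convention;
# the real helpers may use a different scheme:
def generate_datasource_name(dataset_name: str) -> str:
    # e.g. "cars" -> "cars_datasource"
    return "{}_datasource".format(dataset_name)


def generate_basic_suite_name(dataset_name: str) -> str:
    # e.g. "cars" -> "cars.basic"
    return "{}.basic".format(dataset_name)
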
def test_cli_datasource_profile_with_valid_data_asset_arg(
    empty_data_context, titanic_sqlite_db, caplog
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--data-assets",
            "main.titanic",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites were built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 51

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 10
    assert_no_tracebacks(result)

def test_cli_datasorce_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()

    assert "[]" in stdout
    assert context.list_datasources() == []

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    assert context.list_datasources() == [
        {"name": "wow_a_datasource", "class_name": "PandasDatasource"}
    ]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()

    if PY2:
        # deal with legacy python dictionary sorting: check both keys
        # independently rather than relying on their order
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': 'PandasDatasource'" in stdout
        assert 60 <= len(stdout) <= 70
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'PandasDatasource'}]" in stdout
        )
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    caplog,
    initialized_sqlite_project,
    titanic_sqlite_db_file,
):
    project_dir = initialized_sqlite_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "warning.json"))
    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="2\n5\nsqlite\nsqlite:///{}\n1\nmy_suite\n\n".format(
                titanic_sqlite_db_file
            ),
            catch_exceptions=False,
        )
    stdout = result.stdout

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    assert "Error: invalid input" not in stdout
    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert (
        "Next, we will configure database credentials and store them in the `sqlite` section"
        in stdout
    )
    assert "What is the url/connection string for the sqlalchemy connection?" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "A new Expectation suite 'my_suite' was added to your project" in stdout
    assert "This looks like an existing project that" not in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "sqlite" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert context.list_datasources() == [
        {"class_name": "SqlAlchemyDatasource", "name": "sqlite"}
    ]
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_profile_with_datasource_arg_and_generator_name_arg(
    empty_data_context, titanic_sqlite_db, caplog
):
    """
    Here we are verifying that the generator_name argument is passed to
    the methods down the stack.

    We use a datasource with two generators. This way we can check that the
    name of the expectation suite created by the profiler corresponds to
    the name of the data asset listed by the generator that we told
    the profiler to use.

    The logic of processing this argument is tested in tests/profile.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource__with_two_generators_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    second_generator_name = "second_generator"

    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--generator-name",
            second_generator_name,
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="Y\n",
    )

    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.second_generator.asset_one.BasicDatasetProfiler"
    )

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 7
    assert_no_tracebacks(result)

def test_cli_datasource_delete_on_project_with_one_datasource_declining_prompt_does_not_delete(
    mock_emit,
    caplog,
    monkeypatch,
    titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    context = titanic_pandas_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled
    assert "my_datasource" in [ds["name"] for ds in context.list_datasources()]
    assert len(context.list_datasources()) == 1

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource delete my_datasource",
        input="n\n",
        catch_exceptions=False,
    )

    stdout = result.output
    assert result.exit_code == 0
    assert "Using v3 (Batch Request) API" in stdout
    assert "Datasource `my_datasource` was not deleted." in stdout

    expected_call_args_list = [
        mock.call(
            {"event_payload": {}, "event": "data_context.__init__", "success": True}
        ),
        mock.call(
            {
                "event": "cli.datasource.delete.begin",
                "event_payload": {"api_version": "v3"},
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "cli.datasource.delete.end",
                "event_payload": {"cancelled": True, "api_version": "v3"},
                "success": True,
            }
        ),
    ]
    assert mock_emit.call_args_list == expected_call_args_list
    assert mock_emit.call_count == len(expected_call_args_list)

    # reload context from disk to see if the datasource was in fact deleted
    root_directory = context.root_directory
    del context
    context = DataContext(root_directory)
    assert len(context.list_datasources()) == 1
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_list(empty_data_context, empty_sqlite_db, caplog, monkeypatch):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "datasource",
            "list",
        ],
        catch_exceptions=False,
    )

    stdout = result.output.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "datasource", "list"],
        catch_exceptions=False,
    )
    url = str(empty_sqlite_db.engine.url)
    # The CLI colorizes its output, so the expected string contains raw ANSI
    # escape sequences (\x1b[36m = cyan, \x1b[0m = reset).
    expected_output = """\
1 Datasource found:\x1b[0m
\x1b[0m
 - \x1b[36mname:\x1b[0m wow_a_datasource\x1b[0m
   \x1b[36mmodule_name:\x1b[0m great_expectations.datasource\x1b[0m
   \x1b[36mclass_name:\x1b[0m SqlAlchemyDatasource\x1b[0m
   \x1b[36mbatch_kwargs_generators:\x1b[0m\x1b[0m
     \x1b[36mdefault:\x1b[0m\x1b[0m
       \x1b[36mclass_name:\x1b[0m TableBatchKwargsGenerator\x1b[0m
   \x1b[36mcredentials:\x1b[0m\x1b[0m
     \x1b[36murl:\x1b[0m {}\x1b[0m
   \x1b[36mdata_asset_type:\x1b[0m\x1b[0m
     \x1b[36mclass_name:\x1b[0m SqlAlchemyDataset\x1b[0m
     \x1b[36mmodule_name:\x1b[0m None\x1b[0m""".format(
        url
    ).strip()

    stdout = result.output.strip()
    assert stdout == expected_output
    assert_no_logging_messages_or_tracebacks(caplog, result)

def _remove_all_datasources(ge_dir):
    config_path = os.path.join(ge_dir, DataContext.GE_YML)

    config = _load_config_file(config_path)
    config["datasources"] = {}

    with open(config_path, "w") as f:
        yaml.dump(config, f)

    context = DataContext(ge_dir)
    assert context.list_datasources() == []

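# `_load_config_file` is called above and in the init-flow tests but not
# defined in this section. A minimal sketch, assuming `yaml` is the same
# module-level YAML handler already used by these tests; the real helper may
# differ:
def _load_config_file(config_path):
    assert os.path.isfile(config_path), "Config file is missing. Check path."
    with open(config_path) as f:
        config = yaml.load(f)
    assert isinstance(config, dict)
    return config
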
def _run_cli_datasource_new_path_test(
    context: DataContext, args: str, invocation_input: str, base_path: str
) -> None:
    root_dir = context.root_directory
    runner = CliRunner(mix_stderr=True)
    result = runner.invoke(
        cli,
        args=args,
        input=invocation_input,
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    _run_notebook(context)

    # Renew a context since we executed a notebook in a different process
    del context
    context = DataContext(root_dir)
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "default_inferred_data_connector_name": {
                    "default_regex": {
                        "group_names": ["data_asset_name"],
                        "pattern": "(.*)",
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"../../{base_path}",
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "default_runtime_data_connector_name": {
                    "assets": {
                        "my_runtime_asset_name": {
                            "batch_identifiers": ["runtime_batch_identifier_name"],
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "class_name": "RuntimeDataConnector",
                    "module_name": "great_expectations.datasource.data_connector",
                },
            },
        }
    ]

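# `_run_notebook` is called above but not defined in this section. A minimal
# sketch following the nbformat/ExecutePreprocessor pattern used in
# test_cli_datasource_new_connection_string; the real helper may differ:
def _run_notebook(context: DataContext) -> None:
    uncommitted_dir = os.path.join(context.root_directory, context.GE_UNCOMMITTED_DIR)
    notebook_path = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)
    # Execute the generated notebook in place so it writes its datasource
    # config back into great_expectations.yml
    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})
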
def test_cli_datasource_profile_with_invalid_data_asset_arg_answering_no(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "bad-bad-asset",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="2\n",
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert (
        "Some of the data assets you specified were not found: bad-bad-asset" in stdout
    )
    assert "Choose how to proceed" in stdout
    assert "Skipping profiling for now." in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_list_on_project_with_no_datasources(
    caplog, monkeypatch, empty_data_context, filesystem_csv_2
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource list",
        catch_exceptions=False,
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

def test_cli_datasource_profile_with_invalid_data_asset_arg_answering_no(
    empty_data_context, titanic_sqlite_db, caplog
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--data-assets",
            "bad-bad-asset",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="2\n",
        catch_exceptions=False,
    )

    stdout = result.stdout
    assert (
        "Some of the data assets you specified were not found: bad-bad-asset" in stdout
    )
    assert "Choose how to proceed" in stdout
    assert "Skipping profiling for now." in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 0
    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasorce_new_connection_string(
    empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "datasource",
            "new",
        ],
        input=f"2\n6\nmynewsource\n{empty_sqlite_db.url}\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "Give your new Datasource a short name." in stdout
    assert (
        "Next, we will configure database credentials and store them in the `mynewsource` section"
        in stdout
    )
    assert "What is the url/connection string for the sqlalchemy connection?" in stdout
    assert "Attempting to connect to your database. This may take a moment" in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "A new datasource 'mynewsource' was added to your project." in stdout

    assert result.exit_code == 0

    config_path = os.path.join(project_root_dir, DataContext.GE_YML)
    with open(config_path) as f:
        config = yaml.load(f)
    datasources = config["datasources"]
    assert "mynewsource" in datasources.keys()
    data_source_class = datasources["mynewsource"]["data_asset_type"]["class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

def test_cli_datasource_list(empty_data_context, empty_sqlite_db, caplog, monkeypatch):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource list",
        catch_exceptions=False,
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    datasource_name = "wow_a_datasource"
    _add_datasource_and_credentials_to_context(
        context, datasource_name, empty_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "datasource", "list"],
        catch_exceptions=False,
    )
    # The CLI colorizes its output, so the expected string contains raw ANSI
    # escape sequences (\x1b[36m = cyan, \x1b[0m = reset).
    expected_output = """\
Using v3 (Batch Request) API\x1b[0m
1 Datasource found:\x1b[0m
\x1b[0m
 - \x1b[36mname:\x1b[0m wow_a_datasource\x1b[0m
   \x1b[36mclass_name:\x1b[0m SqlAlchemyDatasource\x1b[0m""".strip()
    stdout = result.stdout.strip()
    assert stdout == expected_output

    assert_no_logging_messages_or_tracebacks(caplog, result)

def generate_datasources(
    kedro_context: KedroContext, ge_context: DataContext
) -> List[str]:
    catalog = kedro_context.catalog
    new_datasources = []
    existing_datasource_names = {ds["name"] for ds in ge_context.list_datasources()}

    for dataset_name in catalog.list():
        datasource_name = generate_datasource_name(dataset_name)
        if datasource_name in existing_datasource_names:
            continue

        dataset = catalog._get_dataset(dataset_name)
        datasource_type = identify_dataset_type(dataset)
        if datasource_type == DatasourceTypes.PANDAS:
            name = _add_pandas_datasource(datasource_name, dataset, ge_context)
            new_datasources.append(name)
        elif datasource_type == DatasourceTypes.SPARK:
            name = _add_spark_datasource(datasource_name, dataset, ge_context)
            new_datasources.append(name)
    return new_datasources

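# `identify_dataset_type` is used above but not defined in this section. A
# hypothetical sketch, assuming the Kedro dataset's class name is enough to
# tell pandas-backed from Spark-backed catalog entries; the real helper may
# inspect the dataset more carefully:
def identify_dataset_type(dataset) -> DatasourceTypes:
    class_name = type(dataset).__name__
    if "Spark" in class_name:
        return DatasourceTypes.SPARK
    # Kedro's CSVDataSet, ParquetDataSet, etc. load into pandas by default
    return DatasourceTypes.PANDAS
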
def test_cli_init_on_new_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="Y\n2\n5\ntitanic\n{}\n1\nwarning\n\n".format(engine.url),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 3000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new data source a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new expectation suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "A new Expectation suite 'warning' was added to your project" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {"class_name": "SqlAlchemyDatasource", "name": "titanic"}
    ]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 13

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(
        project_dir, "great_expectations/great_expectations.yml"
    )
    assert os.path.isfile(config_path)

    with open(config_path) as f:
        config = yaml.load(f)
    data_source_class = config["datasources"]["titanic"]["data_asset_type"][
        "class_name"
    ]
    assert data_source_class == "SqlAlchemyDataset"

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching datetime, just regex out the time directories
    date_safe_obs_tree = re.sub(r"\d*T\d*\.\d*Z", "9999.9999", obs_tree)
    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", date_safe_obs_tree
    )
    assert (
        guid_safe_obs_tree
        == """\
great_expectations/
    .gitignore
    great_expectations.yml
    expectations/
        warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        9999.9999/
                            foobarbazguid.html
        validations/
            warning/
                9999.9999/
                    foobarbazguid.json
"""
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)
    assert result.exit_code == 0

    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )