from click.testing import CliRunner

from great_expectations import DataContext
from great_expectations.cli import cli

# NOTE: assumed import location for this shared test helper; adjust if it
# lives elsewhere in the test suite.
from tests.cli.utils import assert_no_tracebacks


def test_cli_datasource_profile_with_valid_data_asset_arg(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        # "batch_kwargs_generators" is the keyword the other tests in this
        # module use; the original "generators" spelling is the deprecated name.
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    context = empty_data_context
    project_root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "f1",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )
    assert result.exit_code == 0

    stdout = result.stdout
    assert "Profiling 'my_datasource'" in stdout
    assert "The following Data Docs sites were built:\n- local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    suite_name = validation.meta["expectation_suite_name"]
    assert suite_name == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)


def test_cli_datasource_profile_with_skip_prompt_flag(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context
    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    # input is supplied as a safety net, but "-y" should prevent the prompt
    # from ever appearing (asserted below).
    result = runner.invoke(
        cli,
        ["datasource", "profile", "-d", project_root_dir, "--no-view", "-y"],
        input="Y\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0

    stdout = result.stdout
    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" not in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
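

# As a point of reference for the test above, here is a minimal, hypothetical
# sketch of how a prompt-skipping flag is typically wired up in click. This is
# an illustration of the pattern only -- not Great Expectations' actual command
# definition; the "--assume-yes" option name and the command name are invented.
import click


@click.command()
@click.option("--assume-yes", "-y", is_flag=True, help="Skip confirmation prompts.")
def _example_profile_command(assume_yes):
    # When the flag is absent, the user is prompted; when present, the
    # confirmation is skipped entirely, so the prompt text never reaches stdout.
    if not assume_yes and not click.confirm(
        "Would you like to profile 'my_datasource'?"
    ):
        raise SystemExit(0)
    click.echo("Profiling 'my_datasource' will create expectations and documentation.")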


def test_cli_datasource_profile_with_valid_data_asset_arg_on_sql(
    empty_data_context, titanic_sqlite_db, caplog
):
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--data-assets",
            "main.titanic",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites were built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 51

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 10
    assert_no_tracebacks(result)


def test_cli_datasource_profile_with_datasource_arg_and_generator_name_arg(
    empty_data_context, titanic_sqlite_db, caplog
):
    """
    Verify that the generator_name argument is passed down the stack to the
    profiling methods.

    We use a datasource with two generators so that we can check that the name
    of the expectation suite created by the profiler corresponds to the name
    of the data asset listed by the generator we told the profiler to use. The
    logic of processing this argument is tested in tests/profile.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource__with_two_generators_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )
    second_generator_name = "second_generator"

    runner = CliRunner()
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            datasource_name,
            "--generator-name",
            second_generator_name,
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="Y\n",
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.second_generator.asset_one.BasicDatasetProfiler"
    )

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 7
    assert_no_tracebacks(result)
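

# Inferred from the assertions in these tests (an observation, not documented
# API): profiler-created suite names appear to follow the pattern
#   <datasource_name>.<generator_name>.<data_asset_name>.<profiler_class_name>
# so passing --generator-name changes the second component. The helper below
# is hypothetical and exists only to make that convention explicit.
def _expected_profiler_suite_name(datasource, generator, data_asset):
    return ".".join([datasource, generator, data_asset, "BasicDatasetProfiler"])


assert (
    _expected_profiler_suite_name("wow_a_datasource", "second_generator", "asset_one")
    == "wow_a_datasource.second_generator.asset_one.BasicDatasetProfiler"
)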


def test_cli_datasource_profile_with_additional_batch_kwargs(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context
    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--additional-batch-kwargs",
            '{"reader_options": {"sep": ",", "parse_dates": [0]}}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert (
        "Profiling 'my_datasource' will create expectations and documentation."
        in stdout
    )
    assert "Would you like to profile 'my_datasource'" in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    expected_suite_name = "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert suites[0].expectation_suite_name == expected_suite_name

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert validation.meta["expectation_suite_name"] == expected_suite_name
    assert validation.success is False
    assert len(validation.results) == 9

    batch_id = validation_keys[0].batch_identifier
    evr = context.get_validation_result(
        expectation_suite_name=expected_suite_name, batch_identifier=batch_id
    )
    reader_options = evr.meta["batch_kwargs"]["reader_options"]
    assert reader_options["parse_dates"] == [0]
    assert reader_options["sep"] == ","

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
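

# A minimal sketch of what the test above relies on (an assumption about the
# CLI's behavior, not its verbatim implementation): --additional-batch-kwargs
# takes a JSON string, which is presumably json.loads-ed and merged into the
# batch kwargs before profiling -- hence reader_options surfacing in
# evr.meta["batch_kwargs"]. The merge and file path shown here are illustrative.
import json

_additional = json.loads('{"reader_options": {"sep": ",", "parse_dates": [0]}}')
_merged_batch_kwargs = {"path": "some/file.csv", **_additional}  # hypothetical merge
assert _merged_batch_kwargs["reader_options"]["parse_dates"] == [0]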


def test_cli_datasource_profile_with_data_asset_and_additional_batch_kwargs_with_limit(
    empty_data_context, titanic_sqlite_db, caplog
):
    """
    The user can pass additional batch kwargs (e.g., "limit") to a SQL backend.

    Here we verify that passing "limit" affects the query correctly: the row
    count in the batch that the profiler uses to profile the data asset must
    match the limit passed by the user.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--data-assets",
            "main.titanic",
            "--additional-batch-kwargs",
            '{"limit": 97}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites were built:\n- local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )
    assert validation.success is False

    row_count_validation_results = [
        validation_result
        for validation_result in validation.results
        if validation_result.expectation_config.expectation_type
        == "expect_table_row_count_to_be_between"
    ]
    assert len(row_count_validation_results) == 1
    assert row_count_validation_results[0].result["observed_value"] == 97

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 7
    assert_no_tracebacks(result)
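

# Self-contained sanity check of the semantics the test above asserts
# (assumption: the profiler applies the "limit" batch kwarg as a SQL LIMIT
# clause on the backend), showing why observed_value comes out as exactly 97.
# The table and data here are illustrative, not the titanic fixture.
import sqlite3

_conn = sqlite3.connect(":memory:")
_conn.execute("CREATE TABLE titanic_like (x INTEGER)")
_conn.executemany("INSERT INTO titanic_like VALUES (?)", [(i,) for i in range(200)])
assert len(_conn.execute("SELECT * FROM titanic_like LIMIT 97").fetchall()) == 97
_conn.close()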