saves validation results to your results store and then updates Data Docs.

This makes viewing validation results easy for you and your team.

Usage:
- Run this file: `python {0}`.
- This can be run manually or via a scheduler such as cron.
- If your pipeline runner supports python snippets you can paste this into your pipeline.
"""
import sys

from great_expectations import DataContext

# tap configuration
context = DataContext("{1}")
suite = context.get_expectation_suite("{2}")
# You can modify your BatchKwargs to select different data
batch_kwargs = {3}

# tap validation process
batch = context.get_batch(batch_kwargs, suite)
results = context.run_validation_operator("action_list_operator", [batch])

if not results["success"]:
    print("Validation Failed!")
    sys.exit(1)

print("Validation Succeeded!")
sys.exit(0)
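The numbered placeholders (`{0}`-`{3}`) indicate this template is rendered with `str.format` before being written to disk. A minimal sketch of that rendering step, assuming a hypothetical `TAP_TEMPLATE` variable holding the text above and illustrative values for the filename, context directory, suite name, and BatchKwargs:

# Hypothetical rendering of the tap template above; "TAP_TEMPLATE" and every
# value below are illustrative assumptions, not names taken from the library.
tap_filename = "my_tap.py"           # fills {0}: the generated script's own name
context_root = "great_expectations"  # fills {1}: the DataContext root directory
suite_name = "warning"               # fills {2}: the suite the tap validates with
batch_kwargs = {"datasource": "mydatasource", "path": "data/Titanic.csv"}  # fills {3}

with open(tap_filename, "w") as f:
    f.write(TAP_TEMPLATE.format(tap_filename, context_root, suite_name, batch_kwargs))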
def test_cli_init_on_new_project_extra_whitespace_in_url(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))
    engine_url_with_added_whitespace = " " + str(engine.url) + " "

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{}\n\n\n1\nwarning\n\n\n\n".format(
            engine_url_with_added_whitespace
        ),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {
            "class_name": "SqlAlchemyDatasource",
            "name": "titanic",
            "module_name": "great_expectations.datasource",
            "credentials": {"url": str(engine.url)},
            "data_asset_type": {
                "class_name": "SqlAlchemyDataset",
                "module_name": "great_expectations.dataset",
            },
        }
    ]

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    with open(config_path) as f:
        config = yaml.load(f)
    data_source_class = config["datasources"]["titanic"]["data_asset_type"]["class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
def test_cli_init_on_new_project(
    mock_webbrowser, caplog, tmp_path_factory, titanic_sqlite_db_file
):
    project_dir = str(tmp_path_factory.mktemp("test_cli_init_diff"))
    ge_dir = os.path.join(project_dir, "great_expectations")

    database_path = os.path.join(project_dir, "titanic.db")
    shutil.copy(titanic_sqlite_db_file, database_path)
    engine = create_engine("sqlite:///{}".format(database_path))

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["init", "-d", project_dir],
        input="\n\n2\n6\ntitanic\n{}\n\n\n1\nwarning\n\n\n\n".format(engine.url),
        catch_exceptions=False,
    )
    stdout = result.output
    assert len(stdout) < 6000, "CLI output is unreasonably long."

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert "Which database backend are you using" in stdout
    assert "Give your new Datasource a short name" in stdout
    assert "What is the url/connection string for the sqlalchemy connection" in stdout
    assert "Attempting to connect to your database." in stdout
    assert "Great Expectations connected to your database" in stdout
    assert "Which table would you like to use?" in stdout
    assert "Name the new Expectation Suite [main.titanic.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations about them"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "Data Docs" in stdout
    assert "Great Expectations is now set up" in stdout

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources()[0]["class_name"] == "SqlAlchemyDatasource"
    assert context.list_datasources()[0]["name"] == "titanic"

    first_suite = context.list_expectation_suites()[0]
    suite = context.get_expectation_suite(first_suite.expectation_suite_name)
    assert len(suite.expectations) == 14

    assert os.path.isdir(ge_dir)
    config_path = os.path.join(project_dir, "great_expectations/great_expectations.yml")
    assert os.path.isfile(config_path)

    with open(config_path) as f:
        config = yaml.load(f)
    data_source_class = config["datasources"]["titanic"]["data_asset_type"]["class_name"]
    assert data_source_class == "SqlAlchemyDataset"

    obs_tree = gen_directory_tree_str(ge_dir)

    # Instead of monkey patching guids, just regex out the guids
    guid_safe_obs_tree = re.sub(
        r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", obs_tree
    )
    # print(guid_safe_obs_tree)
    assert (
        guid_safe_obs_tree
        == """\
great_expectations/
    .gitignore
    great_expectations.yml
    checkpoints/
    expectations/
        warning.json
    notebooks/
        pandas/
            validation_playground.ipynb
        spark/
            validation_playground.ipynb
        sql/
            validation_playground.ipynb
    plugins/
        custom_data_docs/
            renderers/
            styles/
                data_docs_custom_styles.css
            views/
    uncommitted/
        config_variables.yml
        data_docs/
            local_site/
                index.html
                expectations/
                    warning.html
                static/
                    fonts/
                        HKGrotesk/
                            HKGrotesk-Bold.otf
                            HKGrotesk-BoldItalic.otf
                            HKGrotesk-Italic.otf
                            HKGrotesk-Light.otf
                            HKGrotesk-LightItalic.otf
                            HKGrotesk-Medium.otf
                            HKGrotesk-MediumItalic.otf
                            HKGrotesk-Regular.otf
                            HKGrotesk-SemiBold.otf
                            HKGrotesk-SemiBoldItalic.otf
                    images/
                        favicon.ico
                        glossary_scroller.gif
                        iterative-dev-loop.png
                        logo-long-vector.svg
                        logo-long.png
                        short-logo-vector.svg
                        short-logo.png
                        validation_failed_unexpected_values.gif
                    styles/
                        data_docs_custom_styles_template.css
                        data_docs_default_styles.css
                validations/
                    warning/
                        20190926T134241.000000Z/
                            20190926T134241.000000Z/
                                foobarbazguid.html
        validations/
            warning/
                20190926T134241.000000Z/
                    20190926T134241.000000Z/
                        foobarbazguid.json
"""
    )

    assert_no_logging_messages_or_tracebacks(caplog, result)

    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/warning/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
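The lookahead substitution above swaps any 32-character hex GUID for a fixed token while leaving the `.json`/`.html` extension untouched, which keeps the directory-tree snapshot stable across runs. A quick illustration of the same pattern in isolation (the filename is made up):

import re

# The (?=\.(json|html)) lookahead matches the extension without consuming it,
# so only the GUID itself is replaced.
name = "3f8a9b2c1d4e5f60718293a4b5c6d7e8.html"
print(re.sub(r"[a-z0-9]{32}(?=\.(json|html))", "foobarbazguid", name))
# -> foobarbazguid.html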
def test_checkpoint_new_happy_path_generates_checkpoint_yml_with_comments(
    mock_emit, caplog, titanic_data_context_stats_enabled, titanic_expectation_suite
):
    context = titanic_data_context_stats_enabled
    root_dir = context.root_directory
    assert context.list_checkpoints() == []
    context.save_expectation_suite(titanic_expectation_suite)
    assert context.list_expectation_suite_names() == ["Titanic.warning"]
    mock_emit.reset_mock()

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        f"checkpoint new passengers Titanic.warning -d {root_dir}",
        input="1\n1\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "A checkpoint named `passengers` was added to your project" in stdout

    assert mock_emit.call_count == 2
    assert mock_emit.call_args_list == [
        mock.call(
            {"event_payload": {}, "event": "data_context.__init__", "success": True}
        ),
        mock.call(
            {"event": "cli.checkpoint.new", "event_payload": {}, "success": True}
        ),
    ]

    expected_checkpoint = os.path.join(root_dir, context.CHECKPOINTS_DIR, "passengers.yml")
    assert os.path.isfile(expected_checkpoint)

    # New up a context for additional assertions
    context = DataContext(root_dir)
    assert context.list_checkpoints() == ["passengers"]

    with open(expected_checkpoint) as f:
        obs_file = f.read()

    # This is snapshot-ish to prove that comments remain in place
    assert (
        """\
# This checkpoint was created by the command `great_expectations checkpoint new`.
#
# A checkpoint is a list of one or more batches paired with one or more
# Expectation Suites and a configurable Validation Operator.
#
# It can be run with the `great_expectations checkpoint run` command.
# You can edit this file to add batches of data and expectation suites.
#
# For more details please see
# https://docs.greatexpectations.io/en/latest/how_to_guides/validation/how_to_add_validations_data_or_suites_to_a_checkpoint.html
validation_operator_name: action_list_operator
# Batches are a list of batch_kwargs paired with a list of one or more suite
# names. A checkpoint can have one or more batches. This makes deploying
# Great Expectations in your pipelines easy!
batches:
  - batch_kwargs:"""
        in obs_file
    )

    assert "/data/Titanic.csv" in obs_file
    assert (
        """datasource: mydatasource
      data_asset_name: Titanic
    expectation_suite_names: # one or more suites may validate against a single batch
      - Titanic.warning
"""
        in obs_file
    )
    assert_no_logging_messages_or_tracebacks(caplog, result)
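Stitching the two snapshot fragments together, the generated checkpoint file presumably looks something like the following sketch; the CSV path under `batch_kwargs` is illustrative, not asserted by the test:

# A sketch of the full passengers.yml the snapshots above assert against
# (comments elided); the path value is a placeholder.
EXPECTED_CHECKPOINT_YML = """\
validation_operator_name: action_list_operator
batches:
  - batch_kwargs:
      path: /some/project/data/Titanic.csv
      datasource: mydatasource
      data_asset_name: Titanic
    expectation_suite_names: # one or more suites may validate against a single batch
      - Titanic.warning
"""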
def init(ctx, view, usage_stats):
    """
    Initialize a new Great Expectations project.

    This guided input walks the user through setting up a new project and also
    onboards a new developer in an existing project.

    It scaffolds directories, sets up notebooks, creates a project file, and
    appends to a `.gitignore` file.
    """
    display_not_implemented_message_and_exit()
    directory = toolkit.parse_cli_config_file_location(
        config_file_location=ctx.obj.config_file_location
    ).get("directory")
    target_directory = os.path.abspath(directory)
    ge_dir = _get_full_path_to_ge_dir(target_directory)
    cli_message(GREETING)

    if DataContext.does_config_exist_on_disk(ge_dir):
        try:
            if DataContext.is_project_initialized(ge_dir):
                # Ensure the context can be instantiated
                cli_message(PROJECT_IS_COMPLETE)
        except (DataContextError, DatasourceInitializationError) as e:
            cli_message("<red>{}</red>".format(e.message))
            sys.exit(1)

        try:
            context = DataContext.create(
                target_directory, usage_statistics_enabled=usage_stats
            )
            cli_message(ONBOARDING_COMPLETE)
            # TODO if this is correct, ensure this is covered by a test
            # cli_message(SETUP_SUCCESS)
            # exit(0)
        except DataContextError as e:
            cli_message("<red>{}</red>".format(e.message))
            # TODO ensure this is covered by a test
            exit(5)
    else:
        if not click.confirm(LETS_BEGIN_PROMPT, default=True):
            cli_message(RUN_INIT_AGAIN)
            # TODO ensure this is covered by a test
            exit(0)

        try:
            context = DataContext.create(
                target_directory, usage_statistics_enabled=usage_stats
            )
            toolkit.send_usage_message(
                data_context=context, event="cli.init.create", success=True
            )
        except DataContextError as e:
            # TODO ensure this is covered by a test
            cli_message("<red>{}</red>".format(e))

    try:
        # if expectations exist, offer to build docs
        context = DataContext(ge_dir)
        if context.list_expectation_suites():
            if click.confirm(BUILD_DOCS_PROMPT, default=True):
                build_docs(context, view=view)
        else:
            datasources = context.list_datasources()
            if len(datasources) == 0:
                cli_message(SECTION_SEPARATOR)
                if not click.confirm(
                    "Would you like to configure a Datasource?", default=True
                ):
                    cli_message("Okay, bye!")
                    sys.exit(1)
                datasource_name, data_source_type = add_datasource_impl(
                    context, choose_one_data_asset=False
                )
                if not datasource_name:  # no datasource was created
                    sys.exit(1)

            datasources = context.list_datasources()
            if len(datasources) == 1:
                datasource_name = datasources[0]["name"]

                cli_message(SECTION_SEPARATOR)
                if not click.confirm(
                    "Would you like to profile new Expectations for a single data asset within your new Datasource?",
                    default=True,
                ):
                    cli_message(
                        "Okay, exiting now. To learn more about Profilers, run great_expectations profile --help or visit docs.greatexpectations.io!"
                    )
                    sys.exit(1)

                (
                    success,
                    suite_name,
                    profiling_results,
                ) = toolkit.create_expectation_suite(
                    context,
                    datasource_name=datasource_name,
                    additional_batch_kwargs={"limit": 1000},
                    flag_build_docs=False,
                    open_docs=False,
                )

                cli_message(SECTION_SEPARATOR)
                if not click.confirm("Would you like to build Data Docs?", default=True):
                    cli_message(
                        "Okay, exiting now. To learn more about Data Docs, run great_expectations docs --help or visit docs.greatexpectations.io!"
                    )
                    sys.exit(1)

                build_docs(context, view=False)

                if not click.confirm(
                    "\nWould you like to view your new Expectations in Data Docs? This will open a new browser window.",
                    default=True,
                ):
                    cli_message(
                        "Okay, exiting now. You can view the site that has been created in a browser, or visit docs.greatexpectations.io for more information!"
                    )
                    sys.exit(1)
                toolkit.attempt_to_open_validation_results_in_data_docs(
                    context, profiling_results
                )

        cli_message(SECTION_SEPARATOR)
        cli_message(SETUP_SUCCESS)
        sys.exit(0)
    except (
        DataContextError,
        ge_exceptions.ProfilerError,
        OSError,
        SQLAlchemyError,
    ) as e:
        cli_message("<red>{}</red>".format(e))
        sys.exit(1)
def test_cli_datasource_profile_with_additional_batch_kwargs(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--additional-batch-kwargs",
            '{"reader_options": {"sep": ",", "parse_dates": [0]}}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0
    assert (
        "Profiling 'my_datasource' will create expectations and documentation." in stdout
    )
    assert "Would you like to profile 'my_datasource'" in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    expected_suite_name = "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert suites[0].expectation_suite_name == expected_suite_name

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert validation.meta["expectation_suite_name"] == expected_suite_name
    assert validation.success is False
    assert len(validation.results) == 9

    batch_id = validation_keys[0].batch_identifier
    evr = context.get_validation_result(
        expectation_suite_name=expected_suite_name, batch_identifier=batch_id
    )
    reader_options = evr.meta["batch_kwargs"]["reader_options"]
    assert reader_options["parse_dates"] == [0]
    assert reader_options["sep"] == ","

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
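The `--additional-batch-kwargs` value travels as a JSON string on the command line, and the assertions above confirm it ends up inside the batch kwargs recorded in the validation result. Presumably the CLI decodes it and overlays it on the generated kwargs; a sketch of that idea, not the library's actual implementation:

import json

def merge_additional_batch_kwargs(batch_kwargs: dict, cli_value: str) -> dict:
    # Decode the CLI's JSON payload and overlay it on the generated kwargs.
    additional = json.loads(cli_value)
    merged = dict(batch_kwargs)
    merged.update(additional)
    return merged

print(merge_additional_batch_kwargs(
    {"path": "data/f1.csv", "datasource": "my_datasource"},
    '{"reader_options": {"sep": ",", "parse_dates": [0]}}',
))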
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one(
    mock_webbrowser,
    capsys,
    caplog,
    initialized_project,
):
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    # mangle the project to remove all traces of a suite and validations
    _remove_all_datasources(ge_dir)
    os.remove(os.path.join(ge_dir, "expectations", "Titanic", "warning.json"))
    uncommitted_dir = os.path.join(ge_dir, "uncommitted")
    validations_dir = os.path.join(ge_dir, uncommitted_dir, "validations")
    shutil.rmtree(validations_dir)
    os.mkdir(validations_dir)
    shutil.rmtree(os.path.join(uncommitted_dir, "data_docs", "local_site"))

    context = DataContext(ge_dir)
    assert not context.list_expectation_suites()

    data_folder_path = os.path.join(project_dir, "data")
    csv_path = os.path.join(project_dir, "data", "Titanic.csv")
    runner = CliRunner(mix_stderr=False)
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="\n1\n1\n{}\n\n\n\n2\n{}\nmy_suite\n\n\n\n\n".format(
                data_folder_path, csv_path
            ),
            catch_exceptions=False,
        )

    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format(
            project_dir
        )
        in mock_webbrowser.call_args[0][0]
    )

    stdout = result.stdout
    assert result.exit_code == 0
    assert "Error: invalid input" not in stdout

    assert "Always know what to expect from your data" in stdout
    assert "What data would you like Great Expectations to connect to" in stdout
    assert (
        "Enter the path of a data file (relative or absolute, s3a:// and gs:// paths are ok too)"
        in stdout
    )
    assert "Name the new Expectation Suite [Titanic.warning]:" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout
    )
    assert "Great Expectations is now set up." in stdout

    config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML))
    assert "data__dir" in config["datasources"].keys()

    context = DataContext(ge_dir)
    assert len(context.list_datasources()) == 1
    assert context.list_datasources()[0]["name"] == "data__dir"
    assert context.list_datasources()[0]["class_name"] == "PandasDatasource"
    assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite"
    assert len(context.list_expectation_suites()) == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_suite_demo_answer_suite_name_prompts_with_name_of_existing_suite(
    mock_webbrowser, mock_subprocess, caplog, data_context, filesystem_csv_2
):
    """
    We call the "suite demo" command without the suite name argument

    The command should:
    - prompt us to enter the name of the expectation suite that will be
      created. We answer the prompt with the name of an existing expectation
      suite.
    - display an error message and let us retry until we answer with a name
      that is not "taken".
    - create an example suite
    - NOT open jupyter
    - open DataDocs to the new example suite page
    """
    not_so_empty_data_context = data_context
    root_dir = not_so_empty_data_context.root_directory
    os.mkdir(os.path.join(root_dir, "uncommitted"))

    runner = CliRunner(mix_stderr=False)
    csv_path = os.path.join(filesystem_csv_2, "f1.csv")

    existing_suite_name = "my_dag_node.default"
    context = DataContext(root_dir)
    assert context.list_expectation_suite_names() == [existing_suite_name]

    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input=f"{csv_path}\n{existing_suite_name}\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert (
        f"An expectation suite named `{existing_suite_name}` already exists." in stdout
    )
    assert (
        f"If you intend to edit the suite please use `great_expectations suite edit {existing_suite_name}`"
        in stdout
    )
    assert "Enter the path" in stdout
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout
    assert "open a notebook for you now" not in stdout

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)
    assert mock_subprocess.call_count == 0
    assert mock_webbrowser.call_count == 1
    foo = os.path.join(
        root_dir, "uncommitted/data_docs/local_site/validations/my_new_suite/"
    )
    assert f"file://{foo}" in mock_webbrowser.call_args[0][0]

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_suite_new_empty_with_no_jupyter(
    mock_webbrowser, mock_subprocess, caplog, data_context, filesystem_csv_2
):
    """
    Running "suite new --no-jupyter" should:
    - make an empty suite
    - NOT open jupyter
    - NOT open data docs
    """
    os.mkdir(os.path.join(data_context.root_directory, "uncommitted"))
    root_dir = data_context.root_directory
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")

    # TODO this test must be updated to remove the --empty flag in the next major release
    result = runner.invoke(
        cli,
        ["suite", "new", "-d", root_dir, "--empty", "--suite", "foo", "--no-jupyter"],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout
    )
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert "A new Expectation suite 'foo' was added to your project" in stdout
    assert "open a notebook for you now" not in stdout

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "edit_foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 0
    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_notebook_execution_with_pandas_backend(
    titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled,
):
    """
    To set this test up we:

    - create a suite using profiling
    - verify that no validations have happened
    - create the suite edit notebook by hijacking the private cli method

    We then:
    - execute that notebook (Note this will raise various errors like
      CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    context: DataContext = titanic_v013_multi_datasource_pandas_data_context_with_checkpoints_v1_with_empty_store_stats_enabled
    root_dir: str = context.root_directory
    uncommitted_dir: str = os.path.join(root_dir, "uncommitted")
    expectation_suite_name: str = "warning"

    context.create_expectation_suite(expectation_suite_name=expectation_suite_name)
    batch_request: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "my_basic_data_connector",
        "data_asset_name": "Titanic_1912",
    }

    # Sanity check test setup
    original_suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    assert len(original_suite.expectations) == 0
    assert context.list_expectation_suite_names() == [expectation_suite_name]
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine",
                "module_name": "great_expectations.execution_engine",
            },
            "data_connectors": {
                "my_basic_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "class_name": "InferredAssetFilesystemDataConnector",
                },
                "my_special_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
                            "group_names": ["name", "timestamp", "size"],
                            "class_name": "Asset",
                            "base_directory": f"{root_dir}/../data/titanic",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {"pattern": "(.+)\\.csv", "group_names": ["name"]},
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_other_data_connector": {
                    "glob_directive": "*.csv",
                    "assets": {
                        "users": {
                            "class_name": "Asset",
                            "module_name": "great_expectations.datasource.data_connector.asset",
                        }
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": f"{root_dir}/../data/titanic",
                    "default_regex": {"pattern": "(.+)\\.csv", "group_names": ["name"]},
                    "class_name": "ConfiguredAssetFilesystemDataConnector",
                },
                "my_runtime_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "batch_identifiers": ["pipeline_stage_name", "airflow_run_id"],
                    "class_name": "RuntimeDataConnector",
                },
            },
        },
        {
            "name": "my_additional_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_additional_data_connector": {
                    "module_name": "great_expectations.datasource.data_connector",
                    "default_regex": {
                        "pattern": "(.*)\\.csv",
                        "group_names": ["data_asset_name"],
                    },
                    "base_directory": f"{root_dir}/../data/titanic",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        },
    ]

    assert context.get_validation_result(expectation_suite_name="warning") == {}

    # Create notebook
    # do not want to actually send usage_message, since the function call is not the result of actual usage
    _suite_edit_workflow(
        context=context,
        expectation_suite_name=expectation_suite_name,
        profile=True,
        profiler_name=None,
        usage_event="test_notebook_execution",
        interactive_mode=CLISuiteInteractiveFlagCombinations.UNPROMPTED_INTERACTIVE_FALSE_MANUAL_TRUE,
        no_jupyter=True,
        create_if_not_exist=False,
        datasource_name=None,
        batch_request=batch_request,
        additional_batch_request_args=None,
        suppress_usage_message=True,
        assume_yes=True,
    )
    edit_notebook_path: str = os.path.join(uncommitted_dir, "edit_warning.ipynb")
    assert os.path.isfile(edit_notebook_path)

    run_notebook(
        notebook_path=edit_notebook_path,
        notebook_dir=uncommitted_dir,
        string_to_be_replaced="context.open_data_docs(resource_identifier=validation_result_identifier)",
        replacement_string="",
    )

    # Assertions about output
    context = DataContext(context_root_dir=root_dir)
    obs_validation_result: ExpectationSuiteValidationResult = (
        context.get_validation_result(expectation_suite_name="warning")
    )
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100.0,
    }
    suite: ExpectationSuite = context.get_expectation_suite(
        expectation_suite_name=expectation_suite_name
    )
    suite["meta"].pop("citations", None)
    assert suite.expectations == [
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_columns_to_match_ordered_list",
                "kwargs": {
                    "column_list": [
                        "Unnamed: 0",
                        "Name",
                        "PClass",
                        "Age",
                        "Sex",
                        "Survived",
                        "SexCode",
                    ]
                },
                "meta": {},
            }
        ),
        ExpectationConfiguration(
            **{
                "expectation_type": "expect_table_row_count_to_be_between",
                "kwargs": {"max_value": 1313, "min_value": 1313},
                "meta": {},
            }
        ),
    ]

    columns_with_expectations: Set[str]
    expectations_from_suite: Set[str]
    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite=suite)

    expected_expectations: Set[str] = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
def test_cli_init_on_existing_project_with_no_uncommitted_dirs_answering_yes_to_fixing_them(
    mock_webbrowser,
    caplog,
    monkeypatch,
    tmp_path_factory,
):
    """
    This test walks through the onboarding experience.

    The user just checked an existing project out of source control and does
    not yet have an uncommitted directory.
    """
    root_dir = tmp_path_factory.mktemp("hiya")
    root_dir = str(root_dir)
    os.makedirs(os.path.join(root_dir, "data"))
    data_folder_path = os.path.join(root_dir, "data")
    data_path = os.path.join(root_dir, "data", "Titanic.csv")
    fixture_path = file_relative_path(
        __file__, os.path.join("..", "test_sets", "Titanic.csv")
    )
    shutil.copy(fixture_path, data_path)

    # Create a new project from scratch that we will use for the test in the next step
    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(root_dir)
    result = runner.invoke(
        cli,
        ["--v3-api", "init"],
        input=f"\n\n1\n1\n{data_folder_path}\n\n\n\n2\n{data_path}\n\n\n\n",
        catch_exceptions=False,
    )
    stdout = result.output
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert (
        "{}/great_expectations/uncommitted/data_docs/local_site/validations/Titanic/warning/".format(
            root_dir
        )
        in mock_webbrowser.call_args[0][0]
    )
    assert "Great Expectations is now set up." in stdout

    context = DataContext(os.path.join(root_dir, DataContext.GE_DIR))
    uncommitted_dir = os.path.join(context.root_directory, "uncommitted")
    shutil.rmtree(uncommitted_dir)
    assert not os.path.isdir(uncommitted_dir)

    # Test the second invocation of init
    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        result = runner.invoke(
            cli, ["--v3-api", "init"], input="Y\nn\n", catch_exceptions=False
        )
    stdout = result.stdout

    assert result.exit_code == 0

    assert "Great Expectations added some missing files required to run." in stdout
    assert "You may see new files in" in stdout
    assert "OK. You must run" not in stdout
    assert "great_expectations init" not in stdout
    assert "to fix the missing files!" not in stdout
    assert "Would you like to build & view this project's Data Docs!?" in stdout

    assert os.path.isdir(uncommitted_dir)
    config_var_path = os.path.join(uncommitted_dir, "config_variables.yml")
    assert os.path.isfile(config_var_path)
    with open(config_var_path) as f:
        assert f.read() == CONFIG_VARIABLES_TEMPLATE

    assert_no_logging_messages_or_tracebacks(caplog, result)
def upgrade_project(context_root_dir, ge_config_version, from_cli_upgrade_command=False):
    continuation_message = (
        "\nOk, exiting now. To upgrade at a later time, use the following command: "
        "<cyan>great_expectations project upgrade</cyan>\n\nTo learn more about the upgrade "
        "process, visit "
        "<cyan>https://docs.greatexpectations.io/en/latest/how_to_guides/migrating_versions.html"
        "</cyan>.\n"
    )
    if from_cli_upgrade_command:
        message = (
            f"<red>\nYour project appears to have an out-of-date config version ({ge_config_version}) - "
            f"the version "
            f"number must be at least {MINIMUM_SUPPORTED_CONFIG_VERSION}.</red>"
        )
    else:
        message = (
            f"<red>\nYour project appears to have an out-of-date config version ({ge_config_version}) - "
            f"the version "
            f"number must be at least {MINIMUM_SUPPORTED_CONFIG_VERSION}.\nIn order to proceed, "
            f"your project must be upgraded.</red>"
        )

    cli_message(message)
    upgrade_prompt = (
        "\nWould you like to run the Upgrade Helper to bring your project up-to-date?"
    )
    confirm_proceed_or_exit(
        confirm_prompt=upgrade_prompt, continuation_message=continuation_message
    )
    cli_message(SECTION_SEPARATOR)

    # use loop in case multiple upgrades need to take place
    while ge_config_version < MINIMUM_SUPPORTED_CONFIG_VERSION:
        upgrade_helper_class = GE_UPGRADE_HELPER_VERSION_MAP.get(int(ge_config_version))
        if not upgrade_helper_class:
            break
        target_ge_config_version = int(ge_config_version) + 1
        # set version temporarily to MINIMUM_SUPPORTED_CONFIG_VERSION to get functional DataContext
        DataContext.set_ge_config_version(
            config_version=MINIMUM_SUPPORTED_CONFIG_VERSION,
            context_root_dir=context_root_dir,
        )
        upgrade_helper = upgrade_helper_class(context_root_dir=context_root_dir)
        upgrade_overview, confirmation_required = upgrade_helper.get_upgrade_overview()

        if confirmation_required:
            upgrade_confirmed = confirm_proceed_or_exit(
                confirm_prompt=upgrade_overview,
                continuation_message=continuation_message,
                exit_on_no=False,
            )
        else:
            upgrade_confirmed = True

        if upgrade_confirmed:
            cli_message("\nUpgrading project...")
            cli_message(SECTION_SEPARATOR)

            # run upgrade and get report of what was done, if version number should be incremented
            upgrade_report, increment_version = upgrade_helper.upgrade_project()
            # display report to user
            cli_message(upgrade_report)
            # set config version to target version
            if increment_version:
                DataContext.set_ge_config_version(
                    target_ge_config_version,
                    context_root_dir,
                    validate_config_version=False,
                )
                ge_config_version += 1
            else:
                # restore version number to current number
                DataContext.set_ge_config_version(
                    ge_config_version, context_root_dir, validate_config_version=False
                )
                break
        else:
            # restore version number to current number
            DataContext.set_ge_config_version(
                ge_config_version, context_root_dir, validate_config_version=False
            )
            cli_message(continuation_message)
            sys.exit(0)

    cli_message(SECTION_SEPARATOR)
    upgrade_success_message = "<green>Upgrade complete. Exiting...</green>\n"
    upgrade_incomplete_message = f"""\
<red>The Upgrade Helper was unable to perform a complete project upgrade. Next steps:</red>

    - Please perform any manual steps outlined in the Upgrade Overview and/or Upgrade Report above
    - When complete, increment the config_version key in your <cyan>great_expectations.yml</cyan> to <cyan>{ge_config_version + 1}</cyan>\n
To learn more about the upgrade process, visit \
<cyan>https://docs.greatexpectations.io/en/latest/how_to_guides/migrating_versions.html</cyan>
"""
    if ge_config_version < MINIMUM_SUPPORTED_CONFIG_VERSION:
        cli_message(upgrade_incomplete_message)
    else:
        cli_message(upgrade_success_message)
    sys.exit(0)
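The loop above looks up one upgrade helper per integer config version and applies them in sequence until the minimum supported version is reached, breaking out if no helper exists for the current version. Conceptually the map is keyed like this; the class names are illustrative assumptions, not verified against the library:

# Illustrative shape of GE_UPGRADE_HELPER_VERSION_MAP: each key is the config
# version a helper upgrades *from*; a missing key breaks the loop above.
GE_UPGRADE_HELPER_VERSION_MAP = {
    1: UpgradeHelperV11,  # upgrades config_version 1 -> 2 (hypothetical name)
    2: UpgradeHelperV13,  # upgrades config_version 2 -> 3 (hypothetical name)
}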
def titanic_spark_data_context_with_v013_datasource_with_checkpoints_v1_with_empty_store_stats_enabled(
    tmp_path_factory,
    monkeypatch,
    spark_session,
):
    # Re-enable GE_USAGE_STATS
    monkeypatch.delenv("GE_USAGE_STATS")

    project_path: str = str(tmp_path_factory.mktemp("titanic_data_context"))
    context_path: str = os.path.join(project_path, "great_expectations")
    os.makedirs(os.path.join(context_path, "expectations"), exist_ok=True)
    data_path: str = os.path.join(context_path, "..", "data", "titanic")
    os.makedirs(os.path.join(data_path), exist_ok=True)
    shutil.copy(
        file_relative_path(
            __file__,
            os.path.join(
                "..",
                "test_fixtures",
                "great_expectations_v013_no_datasource_stats_enabled.yml",
            ),
        ),
        str(os.path.join(context_path, "great_expectations.yml")),
    )
    shutil.copy(
        file_relative_path(__file__, os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(
                context_path, "..", "data", "titanic", "Titanic_19120414_1313.csv"
            )
        ),
    )
    shutil.copy(
        file_relative_path(__file__, os.path.join("..", "test_sets", "Titanic.csv")),
        str(
            os.path.join(context_path, "..", "data", "titanic", "Titanic_19120414_1313")
        ),
    )
    shutil.copy(
        file_relative_path(__file__, os.path.join("..", "test_sets", "Titanic.csv")),
        str(os.path.join(context_path, "..", "data", "titanic", "Titanic_1911.csv")),
    )
    shutil.copy(
        file_relative_path(__file__, os.path.join("..", "test_sets", "Titanic.csv")),
        str(os.path.join(context_path, "..", "data", "titanic", "Titanic_1912.csv")),
    )

    context: DataContext = DataContext(context_root_dir=context_path)
    assert context.root_directory == context_path

    datasource_config: str = f"""
    class_name: Datasource

    execution_engine:
        class_name: SparkDFExecutionEngine

    data_connectors:
        my_basic_data_connector:
            class_name: InferredAssetFilesystemDataConnector
            base_directory: {data_path}
            default_regex:
                pattern: (.*)\\.csv
                group_names:
                    - data_asset_name

        my_special_data_connector:
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: {data_path}
            glob_directive: "*.csv"
            default_regex:
                pattern: (.+)\\.csv
                group_names:
                    - name
            assets:
                users:
                    base_directory: {data_path}
                    pattern: (.+)_(\\d+)_(\\d+)\\.csv
                    group_names:
                        - name
                        - timestamp
                        - size

        my_other_data_connector:
            class_name: ConfiguredAssetFilesystemDataConnector
            base_directory: {data_path}
            glob_directive: "*.csv"
            default_regex:
                pattern: (.+)\\.csv
                group_names:
                    - name
            assets:
                users: {{}}

        my_runtime_data_connector:
            module_name: great_expectations.datasource.data_connector
            class_name: RuntimeDataConnector
            batch_identifiers:
                - pipeline_stage_name
                - airflow_run_id
    """

    # noinspection PyUnusedLocal
    context.test_yaml_config(
        name="my_datasource", yaml_config=datasource_config, pretty_print=False
    )
    # noinspection PyProtectedMember
    context._save_project_config()

    return context
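Given the connectors this fixture configures, a test receiving it could presumably request a batch through the 0.13-era v3 API along these lines; a sketch, with the asset name taken from the fixture's `assets` config:

from great_expectations.core.batch import BatchRequest

# Ask the configured-asset connector for the "users" asset; the regex groups
# (name, timestamp, size) come from the fixture's default_regex/assets config.
batch_request = BatchRequest(
    datasource_name="my_datasource",
    data_connector_name="my_special_data_connector",
    data_asset_name="users",
)
batch_list = context.get_batch_list(batch_request=batch_request)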
def test_cli_datasource_profile_with_data_asset_and_additional_batch_kwargs_with_limit(
    empty_data_context, titanic_sqlite_db, caplog
):
    """
    User can pass additional batch kwargs (e.g., limit) to a sql backend.

    Here we are verifying that passing "limit" affects the query correctly -
    the row count in the batch that the profiler uses to profile the data
    asset must match the limit passed by the user.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    datasource_name = "wow_a_datasource"
    context = _add_datasource_and_credentials_to_context(
        context, datasource_name, titanic_sqlite_db
    )

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "-d",
            project_root_dir,
            "--data-assets",
            "main.titanic",
            "--additional-batch-kwargs",
            '{"limit": 97}',
            "--no-view",
        ],
        input="Y\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert "Profiling '{}'".format(datasource_name) in stdout
    assert "The following Data Docs sites will be built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "wow_a_datasource.default.main.titanic.BasicDatasetProfiler"
    )
    assert validation.success is False

    row_count_validation_results = [
        validation_result
        for validation_result in validation.results
        if validation_result.expectation_config.expectation_type
        == "expect_table_row_count_to_be_between"
    ]
    assert len(row_count_validation_results) == 1
    assert row_count_validation_results[0].result["observed_value"] == 97

    assert "Preparing column 1 of 7" in caplog.messages[0]
    assert len(caplog.messages) == 10
    assert_no_tracebacks(result)
def test_cli_datasource_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )

    stdout = result.stdout.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    base_directory = str(filesystem_csv_2)

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": base_directory,
            }
        },
    )

    datasources = context.list_datasources()

    assert datasources == [
        {
            "name": "wow_a_datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "class_name": "PandasDataset",
                "module_name": "great_expectations.dataset",
            },
            "batch_kwargs_generators": {
                "subdir_reader": {
                    "base_directory": base_directory,
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                }
            },
            "module_name": "great_expectations.datasource",
        }
    ]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    expected_output = """
1 Datasource found:\x1b[0m
\x1b[0m
 - \x1b[36mname:\x1b[0m wow_a_datasource\x1b[0m
   \x1b[36mmodule_name:\x1b[0m great_expectations.datasource\x1b[0m
   \x1b[36mclass_name:\x1b[0m PandasDatasource\x1b[0m
   \x1b[36mbatch_kwargs_generators:\x1b[0m\x1b[0m
     \x1b[36msubdir_reader:\x1b[0m\x1b[0m
       \x1b[36mclass_name:\x1b[0m SubdirReaderBatchKwargsGenerator\x1b[0m
       \x1b[36mbase_directory:\x1b[0m {}\x1b[0m
   \x1b[36mdata_asset_type:\x1b[0m\x1b[0m
     \x1b[36mmodule_name:\x1b[0m great_expectations.dataset\x1b[0m
     \x1b[36mclass_name:\x1b[0m PandasDataset\x1b[0m""".format(
        base_directory
    ).strip()

    stdout = result.stdout.strip()
    assert stdout == expected_output

    assert_no_logging_messages_or_tracebacks(caplog, result)
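The escape sequences embedded in `expected_output` above are ANSI SGR color codes: `\x1b[36m` switches the terminal to cyan for the key names and `\x1b[0m` resets it, which is why they appear in the snapshot of the CLI's colorized output. A two-line illustration of the convention:

CYAN, RESET = "\x1b[36m", "\x1b[0m"
# Prints " - name: wow_a_datasource" with "name:" rendered in cyan.
print(f" - {CYAN}name:{RESET} wow_a_datasource{RESET}")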
def test_suite_demo_one_datasource_without_generator_without_suite_name_argument(
    mock_webbrowser, mock_subprocess, caplog, empty_data_context, filesystem_csv_2
):
    """
    We call the "suite demo" command without the suite name argument

    The command should:
    - NOT prompt us to choose a datasource (because there is only one)
    - prompt us only to enter the path (the datasource has no generator
      configured, so there is no generator list of available data assets to
      choose from). We enter the path of the file we want the command to use
      as the batch to create the expectation suite.
    - prompt us to enter the name of the expectation suite that will be created
    - open Data Docs
    - NOT open jupyter
    """
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )
    context = empty_data_context
    root_dir = context.root_directory
    context = DataContext(root_dir)

    runner = CliRunner(mix_stderr=False)
    csv_path = os.path.join(filesystem_csv_2, "f1.csv")
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input=f"{csv_path}\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout

    obs_urls = context.get_docs_sites_urls()
    assert len(obs_urls) == 1
    assert (
        "great_expectations/uncommitted/data_docs/local_site/index.html"
        in obs_urls[0]["site_url"]
    )

    expected_index_path = os.path.join(
        root_dir, "uncommitted", "data_docs", "local_site", "index.html"
    )
    assert os.path.isfile(expected_index_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_datasource_profile_with_skip_prompt_flag(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    not_so_empty_data_context = empty_data_context

    project_root_dir = not_so_empty_data_context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["datasource", "profile", "-d", project_root_dir, "--no-view", "-y"],
        input="Y\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert (
        "Profiling 'my_datasource' will create expectations and documentation." in stdout
    )
    assert "Would you like to profile 'my_datasource'" not in stdout
    assert (
        "Great Expectations is building Data Docs from the data you just profiled!"
        in stdout
    )

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    assert (
        validation.meta["expectation_suite_name"]
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
def test_suite_demo_multiple_datasources_with_generator_without_suite_name_argument(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    We call the "suite demo" command without the suite name argument

    - The data context has two datasources - we choose one of them.
    - It has a generator configured. We choose to use the generator and select
      a generator asset from the list.
    - The command should prompt us to enter the name of the expectation suite
      that will be created.
    - open Data Docs
    - NOT open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    context = DataContext(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir],
        input="1\n1\n1\nmy_new_suite\n\n",
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert result.exit_code == 0
    assert (
        """Select a datasource
    1. random
    2. titanic"""
        in stdout
    )
    assert (
        """Which data would you like to use?
    1. f1 (file)
    2. f2 (file)"""
        in stdout
    )
    assert "Name the new expectation suite [f1.warning]" in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        in stdout
    )
    assert "Generating example Expectation Suite..." in stdout
    assert "Building" in stdout
    assert "The following Data Docs sites were built" in stdout
    assert "A new Expectation suite 'my_new_suite' was added to your project" in stdout

    obs_urls = context.get_docs_sites_urls()
    assert len(obs_urls) == 1
    assert (
        "great_expectations/uncommitted/data_docs/local_site/index.html"
        in obs_urls[0]["site_url"]
    )

    expected_index_path = os.path.join(
        root_dir, "uncommitted", "data_docs", "local_site", "index.html"
    )
    assert os.path.isfile(expected_index_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_datasource_profile_with_valid_data_asset_arg(
    caplog, empty_data_context, filesystem_csv_2
):
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )
    context = empty_data_context

    project_root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        [
            "datasource",
            "profile",
            "my_datasource",
            "--data-assets",
            "f1",
            "-d",
            project_root_dir,
            "--no-view",
        ],
        input="\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Profiling 'my_datasource'" in stdout
    assert "The following Data Docs sites will be built:\n" in stdout
    assert "local_site:" in stdout

    context = DataContext(project_root_dir)
    assert len(context.list_datasources()) == 1

    expectations_store = context.stores["expectations_store"]
    suites = expectations_store.list_keys()
    assert len(suites) == 1
    assert (
        suites[0].expectation_suite_name
        == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    )

    validations_store = context.stores["validations_store"]
    validation_keys = validations_store.list_keys()
    assert len(validation_keys) == 1

    validation = validations_store.get(validation_keys[0])
    suite_name = validation.meta["expectation_suite_name"]
    assert suite_name == "my_datasource.subdir_reader.f1.BasicDatasetProfiler"
    assert validation.success is False
    assert len(validation.results) == 8

    assert "Preparing column 1 of 1" in caplog.messages[0]
    assert len(caplog.messages) == 1
    assert_no_tracebacks(result)
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_without_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    site_builder_data_context_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the
    batch kwargs when it is called without the optional arguments that specify
    the batch.

    First, we call the "suite new" command to create the expectation suite our
    test will edit - this step is just a setup.

    We call the "suite edit" command without any optional arguments. This means
    that the command will help us specify the batch kwargs interactively.

    The data context has two datasources - we choose one of them. It has a
    generator configured. We choose to use the generator and select a generator
    asset from the list.

    The command should:
    - NOT open Data Docs
    - open jupyter
    """
    root_dir = site_builder_data_context_with_html_store_titanic_random.root_directory
    os.chdir(root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "demo", "-d", root_dir, "--suite", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert result.exit_code == 0
    assert mock_webbrowser.call_count == 1
    assert mock_subprocess.call_count == 0
    mock_webbrowser.reset_mock()
    mock_subprocess.reset_mock()

    # remove the citations from the suite
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    # Actual testing really starts here
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli,
        ["suite", "edit", "foo_suite", "-d", root_dir],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "A batch of data is required to edit the suite" in stdout
    assert "Select a datasource" in stdout
    assert "Which data would you like to use" in stdout

    expected_notebook_path = os.path.join(root_dir, "uncommitted", "edit_foo_suite.ipynb")
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_datasource_new(
    mock_subprocess, caplog, monkeypatch, empty_data_context, filesystem_csv_2
):
    context = empty_data_context
    root_dir = context.root_directory
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(root_dir))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input=f"1\n1\n{filesystem_csv_2}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert context.list_datasources() == []
    assert "What data would you like Great Expectations to connect to?" in stdout
    assert "What are you processing your files with?" in stdout

    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(["jupyter", "notebook", expected_notebook])

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)

    assert len(context.list_datasources()) == 1
    assert context.list_datasources() == [
        {
            "name": "my_datasource",
            "class_name": "Datasource",
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "class_name": "PandasExecutionEngine",
            },
            "data_connectors": {
                "my_datasource_example_data_connector": {
                    "default_regex": {
                        "group_names": "data_asset_name",
                        "pattern": "(.*)",
                    },
                    "module_name": "great_expectations.datasource.data_connector",
                    "base_directory": "../../filesystem_csv_2",
                    "class_name": "InferredAssetFilesystemDataConnector",
                }
            },
        }
    ]
    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_datasource_new_connection_string(
    mock_subprocess, mock_emit, empty_data_context, empty_sqlite_db, caplog, monkeypatch
):
    monkeypatch.delenv(
        "GE_USAGE_STATS", raising=False
    )  # Undo the project-wide test default
    root_dir = empty_data_context.root_directory
    context: DataContext = empty_data_context
    assert context.list_datasources() == []

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        "--v3-api datasource new",
        input="2\n6\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert "What data would you like Great Expectations to connect to?" in stdout
    assert result.exit_code == 0

    uncommitted_dir = os.path.join(root_dir, context.GE_UNCOMMITTED_DIR)
    expected_notebook = os.path.join(uncommitted_dir, "datasource_new.ipynb")
    assert os.path.isfile(expected_notebook)
    mock_subprocess.assert_called_once_with(["jupyter", "notebook", expected_notebook])

    expected_call_args_list = [
        mock.call(
            {"event_payload": {}, "event": "data_context.__init__", "success": True}
        ),
        mock.call(
            {
                "event": "cli.datasource.new.begin",
                "event_payload": {"api_version": "v3"},
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "cli.new_ds_choice",
                "event_payload": {
                    "type": "sqlalchemy",
                    "db": "other",
                    "api_version": "v3",
                },
                "success": True,
            }
        ),
        mock.call(
            {
                "event": "cli.datasource.new.end",
                "event_payload": {"api_version": "v3"},
                "success": True,
            }
        ),
    ]
    assert mock_emit.call_args_list == expected_call_args_list
    assert mock_emit.call_count == len(expected_call_args_list)

    # Run notebook
    with open(expected_notebook) as f:
        nb = nbformat.read(f, as_version=4)

    # mock the user adding a connection string into the notebook by overwriting the right cell
    assert "connection_string" in nb["cells"][5]["source"]
    nb["cells"][5]["source"] = 'connection_string = "sqlite://"'

    ep = ExecutePreprocessor(timeout=60, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    del context
    context = DataContext(root_dir)

    assert context.list_datasources() == [
        {
            "module_name": "great_expectations.datasource",
            "execution_engine": {
                "module_name": "great_expectations.execution_engine",
                "connection_string": "sqlite://",
                "class_name": "SqlAlchemyExecutionEngine",
            },
            "class_name": "Datasource",
            "data_connectors": {
                "default_runtime_data_connector_name": {
                    "batch_identifiers": ["default_identifier_name"],
                    "class_name": "RuntimeDataConnector",
                    "module_name": "great_expectations.datasource.data_connector",
                },
                "default_inferred_data_connector_name": {
                    "class_name": "InferredAssetSqlDataConnector",
                    "module_name": "great_expectations.datasource.data_connector",
                    "include_schema_name": True,
                },
            },
            "name": "my_datasource",
        }
    ]
    assert_no_logging_messages_or_tracebacks(caplog, result)
def __init__(self, data_context=None, context_root_dir=None):
    assert (
        data_context or context_root_dir
    ), "Please provide a data_context object or a context_root_dir."

    self.data_context = data_context or DataContext(
        context_root_dir=context_root_dir
    )

    self.upgrade_log = {
        "skipped_validations_stores": {
            "database_store_backends": [],
            "unsupported": [],
        },
        "skipped_docs_validations_stores": {"unsupported": []},
        "skipped_metrics_stores": {
            "database_store_backends": [],
            "unsupported": [],
        },
        "exceptions": [
            # {
            #     "validation_store_name": store_name
            #     "src": src_url,
            #     "dest": dest_url,
            #     "exception_message": exception_message,
            # },
            # {
            #     "site_name": site_name,
            #     "src": src_url,
            #     "dest": dest_url,
            #     "exception_message": exception_message,
            # }
        ],
        "upgraded_validations_stores": {
            # STORE_NAME: {
            #     "validations_updated": [{
            #         "src": src_url,
            #         "dest": dest_url
            #     }],
            #     "exceptions": BOOL
            # }
        },
        "upgraded_docs_site_validations_stores": {
            # SITE_NAME: {
            #     "validation_result_pages_updated": [{
            #         "src": src_url,
            #         "dest": dest_url
            #     }],
            #     "exceptions": BOOL
            # }
        },
    }

    self.upgrade_checklist = {
        "validations_store_backends": {},
        "docs_validations_store_backends": {},
    }

    self.validation_run_times = {}

    self.run_time_setters_by_backend_type = {
        TupleFilesystemStoreBackend: self._get_tuple_filesystem_store_backend_run_time,
        TupleS3StoreBackend: self._get_tuple_s3_store_backend_run_time,
        TupleGCSStoreBackend: self._get_tuple_gcs_store_backend_run_time,
    }

    self._generate_upgrade_checklist()
def test_suite_edit_multiple_datasources_with_generator_with_no_additional_args_with_suite_containing_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    site_builder_data_context_v013_with_html_store_titanic_random,
):
    """
    Here we verify that the "suite edit" command uses the batch kwargs found in
    citations in the existing suite when it is called without the optional
    arguments that specify the batch.

    First, we call the "suite new" command to create the expectation suite our
    test will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments.

    The command should:
    - NOT open Data Docs
    - NOT open jupyter
    """
    context = site_builder_data_context_v013_with_html_store_titanic_random
    root_dir = context.root_directory

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "suite",
            "new",
            "--suite",
            "foo_suite",
            "--no-jupyter",
        ],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    context = DataContext(root_dir)
    suite = context.get_expectation_suite("foo_suite")
    assert isinstance(suite, ExpectationSuite)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "edit", "foo_suite"],
        input="2\n1\n1\n\n",
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_foo_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "foo_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
        allowed_deprecation_message=VALIDATION_OPERATORS_DEPRECATION_MESSAGE,
    )
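# Hedged sketch of the citation lookup the docstring above describes: "suite
# edit" can reuse the batch kwargs recorded in the suite's citations instead of
# prompting. get_citations() is the real ExpectationSuite API; the selection
# logic here is illustrative.
def batch_kwargs_from_citations(suite):
    """Return the most recent citation's batch_kwargs, or None if none exist."""
    for citation in reversed(suite.get_citations()):
        if citation.get("batch_kwargs"):
            return citation["batch_kwargs"]
    return None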
def validation_operator_run(name, run_name, validation_config_file, suite, directory) -> None:
    # Note: though the long lines here aren't pythonic, they look best if Click does the line wraps.
    """
    Run a validation operator against some data.

    There are two modes to run this command:

    1. Interactive (good for development):

        Specify the name of the validation operator using the --name argument and
        the name of the expectation suite using the --suite argument. The cli will
        help you specify the batch of data that you want to validate interactively.

    2. Non-interactive (good for production):

        Use the `--validation_config_file` argument to specify the path of the
        validation configuration JSON file. This file can be used to instruct a
        validation operator to validate multiple batches of data and use multiple
        expectation suites to validate each batch.

        Learn how to create a validation config file here:
        https://great-expectations.readthedocs.io/en/latest/command_line.html#great-expectations-validation-operator-run-validation-config-file-validation-config-file-path

    This command exits with 0 if the validation operator ran and the "success"
    attribute in its return object is True. Otherwise, the command exits with 1.

    To learn more about validation operators, go here:
    https://great-expectations.readthedocs.io/en/latest/features/validation.html#validation-operators
    """
    try:
        context = DataContext(directory)
    except ge_exceptions.ConfigNotFoundError as err:
        cli_message(f"<red>{err.message}</red>")
        sys.exit(1)

    try:
        if validation_config_file is not None:
            try:
                with open(validation_config_file) as f:
                    validation_config = json.load(f)
            except (OSError, json_parse_exception) as e:
                cli_message(
                    f"Failed to process the --validation_config_file argument: <red>{e}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            validation_config_error_message = _validate_valdiation_config(
                validation_config
            )
            if validation_config_error_message is not None:
                cli_message(
                    f"<red>The validation config in {validation_config_file} is misconfigured: {validation_config_error_message}</red>"
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

        else:
            if suite is None:
                cli_message(
                    """
Please use the --suite argument to specify the name of the expectation suite.
Call the `great_expectations suite list` command to list the expectation suites in your project.
"""
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            suite = toolkit.load_expectation_suite(
                context, suite, "cli.validation_operator.run"
            )

            if name is None:
                cli_message(
                    """
Please use the --name argument to specify the name of the validation operator.
Call the `great_expectations validation-operator list` command to list the operators in your project.
"""
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)
            elif name not in context.list_validation_operator_names():
                cli_message(
                    f"""
Could not find a validation operator {name}.
Call the `great_expectations validation-operator list` command to list the operators in your project.
"""
                )
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            cli_message(
                """
Let us help you specify the batch of data you want the validation operator to validate."""
            )

            try:
                data_source = toolkit.select_datasource(context)
            except ValueError as ve:
                cli_message(f"<red>{ve}</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            if not data_source:
                cli_message("<red>No datasources found in the context.</red>")
                send_usage_message(
                    data_context=context,
                    event="cli.validation_operator.run",
                    api_version="v2",
                    success=False,
                )
                sys.exit(1)

            (
                datasource_name,
                batch_kwargs_generator,
                data_asset,
                batch_kwargs,
            ) = get_batch_kwargs(
                context,
                datasource_name=data_source.name,
                batch_kwargs_generator_name=None,
                data_asset_name=None,
                additional_batch_kwargs=None,
            )

            validation_config = {
                "validation_operator_name": name,
                "batches": [
                    {
                        "batch_kwargs": batch_kwargs,
                        "expectation_suite_names": [suite.expectation_suite_name],
                    }
                ],
            }

        try:
            validation_operator_name = validation_config["validation_operator_name"]
            batches_to_validate = []
            for entry in validation_config["batches"]:
                for expectation_suite_name in entry["expectation_suite_names"]:
                    batch = context.get_batch(
                        entry["batch_kwargs"], expectation_suite_name
                    )
                    batches_to_validate.append(batch)

            if run_name is None:
                run_name = datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )

            run_id = RunIdentifier(run_name=run_name)

            # Pass evaluation parameters along only when a suite that defines
            # them was loaded (i.e., in interactive mode).
            if suite is None or suite.evaluation_parameters is None:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                )
            else:
                results = context.run_validation_operator(
                    validation_operator_name,
                    assets_to_validate=batches_to_validate,
                    run_id=run_id,
                    evaluation_parameters=suite.evaluation_parameters,
                )
        except (ge_exceptions.DataContextError, OSError, SQLAlchemyError) as e:
            cli_message(f"<red>{e}</red>")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=False,
            )
            sys.exit(1)

        if not results["success"]:
            cli_message("Validation failed!")
            # success=True here refers to the CLI command itself running to
            # completion, not to the validation outcome.
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=True,
            )
            sys.exit(1)
        else:
            cli_message("Validation succeeded!")
            send_usage_message(
                data_context=context,
                event="cli.validation_operator.run",
                api_version="v2",
                success=True,
            )
            sys.exit(0)
    except Exception:
        send_usage_message(
            data_context=context,
            event="cli.validation_operator.run",
            api_version="v2",
            success=False,
        )
        raise
def test_suite_new_creates_empty_suite(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    data_context_parameterized_expectation_suite,
    filesystem_csv_2,
):
    """
    Running "suite new" should:
    - make an empty suite
    - open jupyter
    - NOT open data docs
    """
    context = data_context_parameterized_expectation_suite
    project_root_dir = context.root_directory
    os.mkdir(os.path.join(project_root_dir, "uncommitted"))
    root_dir = project_root_dir
    runner = CliRunner(mix_stderr=False)
    csv = os.path.join(filesystem_csv_2, "f1.csv")
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "new", "--suite", "foo"],
        input=f"{csv}\n",
        catch_exceptions=False,
    )
    stdout = result.stdout

    assert result.exit_code == 0
    assert "Enter the path" in stdout
    assert "Name the new expectation suite" not in stdout
    assert (
        "Great Expectations will choose a couple of columns and generate expectations"
        not in stdout
    )
    assert "Generating example Expectation Suite..." not in stdout
    assert "The following Data Docs sites were built" not in stdout
    assert (
        "Great Expectations will create a new Expectation Suite 'foo' and store it here"
        in stdout
    )
    assert (
        "Because you requested an empty suite, we'll open a notebook for you now to edit it!"
        in stdout
    )

    expected_suite_path = os.path.join(root_dir, "expectations", "foo.json")
    assert os.path.isfile(expected_suite_path)

    expected_notebook = os.path.join(root_dir, "uncommitted", "edit_foo.ipynb")
    assert os.path.isfile(expected_notebook)

    context = DataContext(root_dir)
    assert "foo" in context.list_expectation_suite_names()
    suite = context.get_expectation_suite("foo")
    assert suite.expectations == []
    citations = suite.get_citations()
    citations[0].pop("citation_date")
    assert citations[0] == {
        "batch_kwargs": {
            "data_asset_name": "f1",
            "datasource": "mydatasource",
            "path": csv,
            "reader_method": "read_csv",
        },
        "batch_markers": None,
        "batch_parameters": None,
        "comment": "New suite added via CLI",
    }

    assert mock_subprocess.call_count == 1
    call_args = mock_subprocess.call_args[0][0]
    assert call_args[0] == "jupyter"
    assert call_args[1] == "notebook"
    assert expected_notebook in call_args[2]

    assert mock_webbrowser.call_count == 0

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
        allowed_deprecation_message=VALIDATION_OPERATORS_DEPRECATION_MESSAGE,
    )
def test_init_on_existing_project_with_no_datasources_should_continue_init_flow_and_add_one( mock_webbrowser, caplog, initialized_sqlite_project, titanic_sqlite_db_file, ): project_dir = initialized_sqlite_project ge_dir = os.path.join(project_dir, DataContext.GE_DIR) _remove_all_datasources(ge_dir) os.remove(os.path.join(ge_dir, "expectations", "warning.json")) context = DataContext(ge_dir) assert not context.list_expectation_suites() runner = CliRunner(mix_stderr=False) url = "sqlite:///{}".format(titanic_sqlite_db_file) with pytest.warns( UserWarning, match="Warning. An existing `great_expectations.yml` was found" ): result = runner.invoke( cli, ["init", "-d", project_dir], input="\n\n2\n6\nsqlite\n{}\n\n\n1\nmy_suite\n\n\n\n".format(url), catch_exceptions=False, ) stdout = result.stdout assert result.exit_code == 0 assert mock_webbrowser.call_count == 1 assert ( "{}/great_expectations/uncommitted/data_docs/local_site/validations/my_suite/".format( project_dir ) in mock_webbrowser.call_args[0][0] ) assert "Error: invalid input" not in stdout assert "Always know what to expect from your data" in stdout assert "What data would you like Great Expectations to connect to" in stdout assert ( "Next, we will configure database credentials and store them in the `sqlite` section" in stdout ) assert "What is the url/connection string for the sqlalchemy connection?" in stdout assert "Which table would you like to use?" in stdout assert "Great Expectations connected to your database" in stdout assert "This looks like an existing project that" not in stdout config = _load_config_file(os.path.join(ge_dir, DataContext.GE_YML)) assert "sqlite" in config["datasources"].keys() context = DataContext(ge_dir) assert context.list_datasources() == [ { "class_name": "SqlAlchemyDatasource", "name": "sqlite", "module_name": "great_expectations.datasource", "credentials": {"url": url}, "data_asset_type": { "class_name": "SqlAlchemyDataset", "module_name": "great_expectations.dataset", }, } ] assert context.list_expectation_suites()[0].expectation_suite_name == "my_suite" assert len(context.list_expectation_suites()) == 1 assert_no_logging_messages_or_tracebacks(caplog, result)
def test_suite_edit_one_datasource_no_generator_with_no_additional_args_and_no_citations(
    mock_webbrowser,
    mock_subprocess,
    caplog,
    monkeypatch,
    empty_data_context,
    filesystem_csv_2,
):
    """
    Here we verify that the "suite edit" command helps the user to specify the
    batch kwargs when it is called without the optional arguments that specify
    the batch.

    First, we call the "suite new" command to create the expectation suite our
    test will edit - this step is just setup.

    We call the "suite edit" command without any optional arguments. This means
    that the command will help us specify the batch kwargs interactively.

    The data context has one datasource. The datasource has no generators
    configured. The command prompts us to enter the file path.
    """
    empty_data_context.add_datasource(
        "my_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )
    context = empty_data_context
    project_root_dir = context.root_directory
    root_dir = project_root_dir

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        ["--v3-api", "suite", "new", "--no-jupyter"],
        input="{:s}\nmy_new_suite\n\n".format(os.path.join(filesystem_csv_2, "f1.csv")),
        catch_exceptions=False,
    )
    stdout = result.stdout
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0
    mock_subprocess.reset_mock()
    mock_webbrowser.reset_mock()
    assert result.exit_code == 0
    assert (
        "Great Expectations will create a new Expectation Suite 'my_new_suite' and store it here:"
        in stdout
    )

    # remove the citations from the suite
    context = DataContext(project_root_dir)
    suite = context.get_expectation_suite("my_new_suite")
    suite.meta.pop("citations")
    context.save_expectation_suite(suite)

    runner = CliRunner(mix_stderr=False)
    monkeypatch.chdir(os.path.dirname(context.root_directory))
    result = runner.invoke(
        cli,
        [
            "--v3-api",
            "suite",
            "edit",
            "my_new_suite",
        ],
        input="{:s}\n\n".format(os.path.join(filesystem_csv_2, "f1.csv")),
        catch_exceptions=False,
    )

    assert result.exit_code == 0
    stdout = result.stdout
    assert "Select a datasource" not in stdout
    assert "Which data would you like to use" not in stdout
    assert "Enter the path" in stdout

    expected_notebook_path = os.path.join(
        root_dir, "uncommitted", "edit_my_new_suite.ipynb"
    )
    assert os.path.isfile(expected_notebook_path)

    expected_suite_path = os.path.join(root_dir, "expectations", "my_new_suite.json")
    assert os.path.isfile(expected_suite_path)

    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 1

    assert_no_logging_messages_or_tracebacks(
        my_caplog=caplog,
        click_result=result,
    )
def test_notebook_execution_with_pandas_backend(
    titanic_data_context_no_data_docs_no_checkpoint_store,
):
    """
    This tests that the notebook is written to disk and executes without error.

    To set this test up we:
    - create a scaffold notebook
    - verify that no validations have happened

    We then:
    - execute that notebook (note that this will raise errors such as
      CellExecutionError if any cell in the notebook fails)
    - create a new context from disk
    - verify that a validation has been run with our expectation suite
    """
    # Since we'll run the notebook, we use a context with no data docs to avoid
    # the renderer's default behavior of building and opening docs, which is not
    # part of this test.
    context = titanic_data_context_no_data_docs_no_checkpoint_store
    root_dir = context.root_directory
    uncommitted_dir = os.path.join(root_dir, "uncommitted")
    suite_name = "my_suite"
    suite = context.create_expectation_suite(suite_name)

    csv_path = os.path.join(root_dir, "..", "data", "Titanic.csv")
    batch_kwargs = {"datasource": "mydatasource", "path": csv_path}

    # Sanity check test setup
    assert context.list_expectation_suite_names() == [suite_name]
    assert context.list_datasources() == [
        {
            "module_name": "great_expectations.datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "module_name": "great_expectations.dataset",
                "class_name": "PandasDataset",
            },
            "batch_kwargs_generators": {
                "mygenerator": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": "../data",
                }
            },
            "name": "mydatasource",
        }
    ]
    assert context.get_validation_result(suite_name) == {}
    notebook_path = os.path.join(uncommitted_dir, f"{suite_name}.ipynb")
    assert not os.path.isfile(notebook_path)

    # Create notebook
    renderer = SuiteScaffoldNotebookRenderer(context, suite, batch_kwargs)
    renderer.render_to_disk(notebook_path)
    assert os.path.isfile(notebook_path)

    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    # Run notebook
    ep = ExecutePreprocessor(timeout=600, kernel_name="python3")
    ep.preprocess(nb, {"metadata": {"path": uncommitted_dir}})

    # Useful to inspect executed notebook
    output_notebook = os.path.join(uncommitted_dir, "output.ipynb")
    with open(output_notebook, "w") as f:
        nbformat.write(nb, f)

    # Assertions about output
    context = DataContext(root_dir)
    obs_validation_result = context.get_validation_result(suite_name)
    assert obs_validation_result.statistics == {
        "evaluated_expectations": 2,
        "successful_expectations": 2,
        "unsuccessful_expectations": 0,
        "success_percent": 100,
    }
    suite = context.get_expectation_suite(suite_name)
    assert suite.expectations

    (
        columns_with_expectations,
        expectations_from_suite,
    ) = get_set_of_columns_and_expectations_from_suite(suite)

    expected_expectations = {
        "expect_table_columns_to_match_ordered_list",
        "expect_table_row_count_to_be_between",
    }
    assert columns_with_expectations == set()
    assert expectations_from_suite == expected_expectations
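# Hedged sketch: reading back the executed notebook the test writes above and
# printing its stream outputs for debugging. Cell access follows the nbformat
# v4 schema; the "output.ipynb" path matches the convention used in the test.
import os

import nbformat


def print_stream_outputs(uncommitted_dir):
    executed = nbformat.read(
        os.path.join(uncommitted_dir, "output.ipynb"), as_version=4
    )
    for cell in executed.cells:
        for out in cell.get("outputs", []):
            if out.get("output_type") == "stream":
                print(out.get("text", ""))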
def test_build_data_docs_skipping_index_does_not_build_index(
    tmp_path_factory,
):
    # TODO: replace this manual config manipulation with whatever the currently
    # recommended configuration API is.
    empty_directory = str(tmp_path_factory.mktemp("empty"))
    DataContext.create(empty_directory)
    ge_dir = os.path.join(empty_directory, DataContext.GE_DIR)
    context = DataContext(ge_dir)
    config = context.get_config()
    config.data_docs_sites = {
        "local_site": {
            "class_name": "SiteBuilder",
            "store_backend": {
                "class_name": "TupleFilesystemStoreBackend",
                "base_directory": os.path.join("uncommitted", "data_docs"),
            },
        },
    }
    context._project_config = config

    # TODO: work around statefulness issues in programmatic project config
    # manipulation by saving the config to disk and re-creating the context.
    context._save_project_config()
    del context
    context = DataContext(ge_dir)

    data_docs_dir = os.path.join(ge_dir, "uncommitted", "data_docs")
    index_path = os.path.join(data_docs_dir, "index.html")
    assert not os.path.isfile(index_path)

    context.build_data_docs(build_index=False)
    assert os.path.isdir(os.path.join(data_docs_dir, "static"))
    assert not os.path.isfile(index_path)
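# Hedged helper sketch of the save-and-reload workaround the TODOs above point
# at: persist programmatic config changes, then rebuild the context from disk
# so they take effect. _save_project_config is the same private method the test
# calls; the helper wrapper itself is illustrative, not a documented API.
def reload_context_with_config(context, new_config):
    context._project_config = new_config
    context._save_project_config()
    return DataContext(context.root_directory)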