def sanitize_yaml_and_save_datasource(
    context: DataContext, datasource_yaml: str, overwrite_existing: bool = False
) -> None:
    """A convenience function used in notebooks to help users save secrets.

    Parses the given datasource yaml, moves any ``credentials`` entry into the
    context's config-variables store (replacing it with a ``${...}`` reference
    so secrets never land in the main config file), and registers the
    datasource on the context.

    Args:
        context: The DataContext to save the datasource into.
        datasource_yaml: Yaml string describing the datasource; must contain
            a top-level ``name`` key.
        overwrite_existing: If False (default), refuse to overwrite a
            datasource that already exists under the same name.

    Raises:
        ValueError: If the yaml is empty/falsy or missing a ``name`` key.
        TypeError: If ``datasource_yaml`` is not a string.
    """
    if not datasource_yaml:
        raise ValueError("Please verify the yaml and try again.")
    if not isinstance(datasource_yaml, str):
        raise TypeError("Please pass in a valid yaml string.")
    config = yaml.load(datasource_yaml)
    try:
        datasource_name = config.pop("name")
    except KeyError:
        # Suppress the KeyError chain — it adds no information for the user.
        raise ValueError(
            "The datasource yaml is missing a `name` attribute."
        ) from None
    if not overwrite_existing and check_if_datasource_name_exists(
        context=context, datasource_name=datasource_name
    ):
        print(
            f'**WARNING** A Datasource named "{datasource_name}" already exists in this Data Context. The Datasource has *not* been saved. Please use a different name or set overwrite_existing=True if you want to overwrite!'
        )
        return
    if "credentials" in config:  # direct membership test; .keys() is redundant
        credentials = config["credentials"]
        # Store the real secrets as a config variable and leave only a
        # ${variable} reference behind in the datasource config.
        config["credentials"] = "${" + datasource_name + "}"
        context.save_config_variable(datasource_name, credentials)
    context.add_datasource(name=datasource_name, **config)
def test_suite_edit_with_batch_kwargs_unable_to_load_a_batch_raises_helpful_error(
    mock_webbrowser, mock_subprocess, caplog, empty_data_context
):
    """
    The command should:
    - exit with a clear error message
    - NOT open Data Docs
    - NOT open jupyter
    """
    # Build a project with a suite and a datasource, then point the CLI at
    # batch kwargs that reference a table which cannot be loaded.
    root_dir = empty_data_context.root_directory
    context = DataContext(root_dir)
    context.create_expectation_suite("foo")
    context.add_datasource("source", class_name="PandasDatasource")

    bad_batch_kwargs = '{"table": "fake", "datasource": "source"}'
    cli_runner = CliRunner(mix_stderr=False)
    cli_result = cli_runner.invoke(
        cli,
        [
            "suite",
            "edit",
            "foo",
            "-d",
            root_dir,
            "--batch-kwargs",
            bad_batch_kwargs,
        ],
        catch_exceptions=False,
    )
    output = cli_result.output

    # Non-zero exit with a helpful hint, and no suite-editing follow-up text.
    assert cli_result.exit_code == 1
    assert "To continue editing this suite" not in output
    assert "Please check that your batch_kwargs are able to load a batch." in output

    # Neither Data Docs nor jupyter should have been launched.
    assert mock_webbrowser.call_count == 0
    assert mock_subprocess.call_count == 0

    assert_no_logging_messages_or_tracebacks(caplog, cli_result)
def test_init_on_existing_project_with_multiple_datasources_exist_do_nothing(
    mock_webbrowser, caplog, initialized_project, filesystem_csv_2
):
    """Running ``init`` on a complete project that already has two
    datasources should recognize it as complete and change nothing."""
    project_dir = initialized_project
    ge_dir = os.path.join(project_dir, DataContext.GE_DIR)

    # Add a second datasource so the project has more than one.
    context = DataContext(ge_dir)
    context.add_datasource(
        "another_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
    )
    assert len(context.list_datasources()) == 2

    cli_runner = CliRunner(mix_stderr=False)
    # The CLI warns when an existing great_expectations.yml is found.
    with pytest.warns(
        UserWarning, match="Warning. An existing `great_expectations.yml` was found"
    ):
        cli_result = cli_runner.invoke(
            cli,
            ["init", "-d", project_dir],
            input="n\n",
            catch_exceptions=False,
        )

    output = cli_result.stdout
    assert cli_result.exit_code == 0
    assert mock_webbrowser.call_count == 0

    assert "Error: invalid input" not in output
    assert "Always know what to expect from your data" in output
    assert "This looks like an existing project that" in output
    assert "appears complete" in output
    assert "Would you like to build & view this project's Data Docs" in output
    assert_no_logging_messages_or_tracebacks(caplog, cli_result)
def test_get_batch_kwargs_for_specific_dataasset(empty_data_context, filesystem_csv):
    """get_batch_kwargs should resolve a lone data asset to full batch kwargs
    even when no datasource or generator name is supplied."""
    root_dir = empty_data_context.root_directory
    context = DataContext(root_dir)
    csv_dir = str(filesystem_csv)

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": csv_dir,
            }
        },
    )

    # Only the data asset name is given; datasource/generator are inferred.
    actual = get_batch_kwargs(
        context,
        datasource_name=None,
        batch_kwargs_generator_name=None,
        data_asset_name="f1",
        additional_batch_kwargs={},
    )

    assert actual == {
        "data_asset_name": "f1",
        "datasource": "wow_a_datasource",
        "path": os.path.join(filesystem_csv, "f1.csv"),
    }
def _add_spark_datasource(
    datasource_name: str, dataset: AbstractDataSet, ge_context: DataContext
) -> str:
    """Register a SparkDFDatasource on the context for the given dataset.

    Builds a SubdirReader-based configuration rooted at the dataset's parent
    directory (relative to the GE directory), validates it, and adds it to
    the context.

    Args:
        datasource_name: Name under which to register the datasource.
        dataset: Dataset whose ``_filepath`` parent becomes the base directory.
        ge_context: The DataContext to register the datasource on.

    Returns:
        The ``datasource_name`` that was registered.

    Raises:
        ge_exceptions.GreatExpectationsError: If the built configuration
            fails schema validation.
    """
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)
    # Normalize away a leading "./" so the generated base_directory is clean.
    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        }
    )
    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if errors:
        # BUG FIX: "{0:s}".format(errors) raised TypeError because `errors`
        # is a dict and object.__format__ rejects a non-empty ("s") format
        # spec — so the intended message was never shown. Plain str
        # conversion produces the same text safely.
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {}".format(errors)
        )
    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
def test_cli_datasorce_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource.

    NOTE(review): the function name contains a typo ("datasorce"); it is
    kept as-is because renaming would change the collected test id.
    """
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()
    # A fresh project reports an empty datasource list.
    assert "[]" in stdout
    assert context.list_datasources() == []

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": str(filesystem_csv_2),
            }
        },
    )

    assert context.list_datasources() == [
        {"name": "wow_a_datasource", "class_name": "PandasDatasource"}
    ]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()
    if PY2:
        # deal with legacy python dictionary sorting
        # BUG FIX: the original `assert (a and b in stdout)` only tested the
        # second substring — the first operand was merely evaluated for
        # truthiness. Check both substrings explicitly.
        assert "'name': 'wow_a_datasource'" in stdout
        assert "'class_name': 'PandasDatasource'" in stdout
        assert 60 <= len(stdout) <= 70
    else:
        assert (
            "[{'name': 'wow_a_datasource', 'class_name': 'PandasDatasource'}]"
            in stdout
        )
    assert_no_logging_messages_or_tracebacks(caplog, result)
def test_cli_datasource_list(caplog, empty_data_context, filesystem_csv_2):
    """Test an empty project and after adding a single datasource."""
    # An empty project should report no datasources via both CLI and API.
    project_root_dir = empty_data_context.root_directory
    context = DataContext(project_root_dir)
    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    stdout = result.output.strip()
    assert "No Datasources found" in stdout
    assert context.list_datasources() == []

    base_directory = str(filesystem_csv_2)

    context.add_datasource(
        "wow_a_datasource",
        module_name="great_expectations.datasource",
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": base_directory,
            }
        },
    )

    # The API should now expose the full resolved datasource configuration,
    # including defaults for data_asset_type and module_name.
    datasources = context.list_datasources()

    assert datasources == [
        {
            "name": "wow_a_datasource",
            "class_name": "PandasDatasource",
            "data_asset_type": {
                "class_name": "PandasDataset",
                "module_name": "great_expectations.dataset",
            },
            "batch_kwargs_generators": {
                "subdir_reader": {
                    "base_directory": base_directory,
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                }
            },
            "module_name": "great_expectations.datasource",
        }
    ]

    runner = CliRunner(mix_stderr=False)
    result = runner.invoke(
        cli, ["datasource", "list", "-d", project_root_dir], catch_exceptions=False
    )
    # NOTE(review): the "[0m"/"[36m" fragments below look like ANSI color
    # escape sequences whose leading ESC character may have been lost in a
    # copy/paste — confirm the raw escape bytes are intact in the repo file.
    expected_output = """
1 Datasource found:[0m
[0m
 - [36mname:[0m wow_a_datasource[0m
   [36mmodule_name:[0m great_expectations.datasource[0m
   [36mclass_name:[0m PandasDatasource[0m
   [36mbatch_kwargs_generators:[0m[0m
     [36msubdir_reader:[0m[0m
       [36mclass_name:[0m SubdirReaderBatchKwargsGenerator[0m
       [36mbase_directory:[0m {}[0m
   [36mdata_asset_type:[0m[0m
     [36mmodule_name:[0m great_expectations.dataset[0m
     [36mclass_name:[0m PandasDataset[0m""".format(base_directory).strip()
    stdout = result.output.strip()
    assert stdout == expected_output
    assert_no_logging_messages_or_tracebacks(caplog, result)