Example No. 1
def _add_spark_datasource(datasource_name: str, dataset: AbstractDataSet,
                          ge_context: DataContext) -> str:
    from great_expectations.datasource import SparkDFDatasource

    path = str(dataset._filepath.parent)

    if path.startswith("./"):
        path = path[2:]

    configuration = SparkDFDatasource.build_configuration(
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": os.path.join("..", path),
            }
        })

    configuration["class_name"] = "SparkDFDatasource"
    errors = DatasourceConfigSchema().validate(configuration)
    if len(errors) != 0:
        raise ge_exceptions.GreatExpectationsError(
            "Invalid Datasource configuration: {0:s}".format(errors))

    ge_context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
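
The pattern shared by these CLI helpers and the tests below is the same: build a configuration dict, run it through DatasourceConfigSchema().validate(), and raise if the returned error dict is non-empty. A minimal standalone sketch of just that validation step (the configuration values are illustrative):

# Hedged sketch: validating a datasource configuration dict.
# DatasourceConfigSchema usually lives in great_expectations.data_context.types.base.
from great_expectations.data_context.types.base import DatasourceConfigSchema

candidate_config = {
    "class_name": "PandasDatasource",
    "module_name": "great_expectations.datasource",
    "batch_kwargs_generators": {
        "subdir_reader": {
            "class_name": "SubdirReaderBatchKwargsGenerator",
            "base_directory": "../data/",
        }
    },
}

errors = DatasourceConfigSchema().validate(candidate_config)
print(errors)  # an empty dict means the configuration is valid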
Example No. 2
def test_PandasDatasource_config(default_pandas_datasource_config):

    datasource_config = DatasourceConfig(
        class_name="PandasDatasource",
        batch_kwargs_generators={
            "subdir_reader": {
                "class_name": "SubdirReaderBatchKwargsGenerator",
                "base_directory": "../data/",
            }
        },
    )

    desired_config = default_pandas_datasource_config["my_pandas_datasource"]

    datasource_config_schema = DatasourceConfigSchema()
    assert datasource_config_schema.dump(datasource_config) == desired_config
Example No. 3
def test_SqlAlchemyDatasource_config(default_sql_alchemy_datasource_config):

    datasource_config = DatasourceConfig(
        class_name="SqlAlchemyDatasource",
        credentials={
            "drivername": "custom_drivername",
            "host": "custom_host",
            "port": "custom_port",
            "username": "******",
            "password": "******",
            "database": "custom_database",
        },
    )

    desired_config = default_sql_alchemy_datasource_config[
        "my_sql_alchemy_datasource"]

    datasource_config_schema = DatasourceConfigSchema()
    assert datasource_config_schema.dump(datasource_config) == desired_config
Example No. 4

    def __init__(
        self,
        store_name: Optional[str] = None,
        store_backend: Optional[dict] = None,
        runtime_environment: Optional[dict] = None,
    ) -> None:
        self._schema = DatasourceConfigSchema()
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )

        # Gather the call arguments of the present function (adding "module_name" and "class_name"), filter
        # out the falsy values, and set the instance variable "_config" to the resulting dictionary.
        self._config = {
            "store_backend": store_backend,
            "runtime_environment": runtime_environment,
            "store_name": store_name,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        filter_properties_dict(properties=self._config, clean_falsy=True, inplace=True)
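
The comment above refers to filter_properties_dict, the Great Expectations utility that drops falsy entries from the gathered constructor arguments. A hedged sketch of its effect (the dictionary contents are illustrative):

from great_expectations.util import filter_properties_dict

config = {
    "store_backend": None,          # falsy, will be removed
    "runtime_environment": None,    # falsy, will be removed
    "store_name": "datasource_store",
    "class_name": "DatasourceStore",
}
filter_properties_dict(properties=config, clean_falsy=True, inplace=True)
print(config)  # only the truthy entries remain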
Example No. 5

def test_create_data_context_and_config_vars_in_code(tmp_path_factory,
                                                     monkeypatch):
    """
    What does this test and why?
    Creating a DataContext via .create(), then using .save_config_variable() to save a variable that will eventually be substituted (e.g. ${SOME_VAR}) should result in the proper escaping of $.
    This is in response to issue #2196
    """

    project_path = str(tmp_path_factory.mktemp("data_context"))
    context = ge.DataContext.create(
        project_root_dir=project_path,
        usage_statistics_enabled=False,
    )

    CONFIG_VARS = {
        "DB_HOST": "${DB_HOST_FROM_ENV_VAR}",
        "DB_NAME": "DB_NAME",
        "DB_USER": "******",
        "DB_PWD": "pas$word",
    }
    for k, v in CONFIG_VARS.items():
        context.save_config_variable(k, v)

    config_vars_file_contents = context._load_config_variables_file()

    # Add escaping for DB_PWD since it is not of the form ${SOMEVAR} or $SOMEVAR
    CONFIG_VARS_WITH_ESCAPING = CONFIG_VARS.copy()
    CONFIG_VARS_WITH_ESCAPING["DB_PWD"] = r"pas\$word"

    # Ensure all config vars saved are in the config_variables.yml file
    # and that escaping was added for "pas$word" -> "pas\$word"
    assert all(item in config_vars_file_contents.items()
               for item in CONFIG_VARS_WITH_ESCAPING.items())
    assert not all(item in config_vars_file_contents.items()
                   for item in CONFIG_VARS.items())

    # Add env var for substitution
    monkeypatch.setenv("DB_HOST_FROM_ENV_VAR", "DB_HOST_FROM_ENV_VAR_VALUE")

    datasource_config = DatasourceConfig(
        class_name="SqlAlchemyDatasource",
        credentials={
            "drivername": "postgresql",
            "host": "$DB_HOST",
            "port": "65432",
            "database": "${DB_NAME}",
            "username": "******",
            "password": "******",
        },
    )
    datasource_config_schema = DatasourceConfigSchema()

    # use context.add_datasource to test this by adding a datasource with values to substitute.
    context.add_datasource(initialize=False,
                           name="test_datasource",
                           **datasource_config_schema.dump(datasource_config))

    assert context.list_datasources()[0]["credentials"] == {
        "drivername": "postgresql",
        "host": "DB_HOST_FROM_ENV_VAR_VALUE",
        "port": "65432",
        "database": "DB_NAME",
        "username": "******",
        # Note masking of "password" field
        "password": "******",
    }

    # Check context substitutes escaped variables appropriately
    data_context_config_schema = DataContextConfigSchema()
    context_with_variables_substituted_dict = data_context_config_schema.dump(
        context.get_config_with_variables_substituted())

    test_datasource_credentials = context_with_variables_substituted_dict[
        "datasources"]["test_datasource"]["credentials"]

    assert test_datasource_credentials["host"] == "DB_HOST_FROM_ENV_VAR_VALUE"
    assert test_datasource_credentials["username"] == "DB_USER"
    assert test_datasource_credentials["password"] == "pas$word"
    assert test_datasource_credentials["database"] == "DB_NAME"

    # Ensure skip_if_substitution_variable=False works as documented
    context.save_config_variable("escaped",
                                 "$SOME_VAR",
                                 skip_if_substitution_variable=False)
    context.save_config_variable("escaped_curly",
                                 "${SOME_VAR}",
                                 skip_if_substitution_variable=False)

    config_vars_file_contents = context._load_config_variables_file()

    assert config_vars_file_contents["escaped"] == r"\$SOME_VAR"
    assert config_vars_file_contents["escaped_curly"] == r"\${SOME_VAR}"
Example No. 6
class DatasourceStore(Store):
    """
    A DatasourceStore manages Datasources for the DataContext.
    """

    _key_class = DataContextVariableKey

    def __init__(
        self,
        store_name: Optional[str] = None,
        store_backend: Optional[dict] = None,
        runtime_environment: Optional[dict] = None,
    ) -> None:
        self._schema = DatasourceConfigSchema()
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )

        # Gather the call arguments of the present function (adding "module_name" and "class_name"), filter
        # out the falsy values, and set the instance variable "_config" to the resulting dictionary.
        self._config = {
            "store_backend": store_backend,
            "runtime_environment": runtime_environment,
            "store_name": store_name,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        filter_properties_dict(properties=self._config,
                               clean_falsy=True,
                               inplace=True)

    def list_keys(self) -> List[str]:
        """
        See parent 'Store.list_keys()' for more information
        """
        from great_expectations.data_context.data_context_variables import (
            DataContextVariableSchema, )

        datasource_key: Tuple[DataContextVariableSchema] = (
            DataContextVariableSchema.DATASOURCES, )

        keys_without_store_backend_id: List[str] = [
            key for key in self._store_backend.list_keys(prefix=datasource_key)
        ]
        return [key for key in keys_without_store_backend_id]

    def remove_key(self, key: DataContextVariableKey) -> None:
        """
        See parent `Store.remove_key()` for more information
        """
        return self._store_backend.remove_key(key.to_tuple())

    def serialize(self,
                  value: DatasourceConfig) -> Union[str, DatasourceConfig]:
        """
        See parent 'Store.serialize()' for more information
        """
        if self.ge_cloud_mode:
            # GeCloudStoreBackend expects a json str
            return self._schema.dump(value)
        return value

    def deserialize(self, value: Union[dict,
                                       DatasourceConfig]) -> DatasourceConfig:
        """
        See parent 'Store.deserialize()' for more information
        """
        # When using the InlineStoreBackend, objects are already converted to their respective config types.
        if isinstance(value, DatasourceConfig):
            return value
        elif isinstance(value, dict):
            return self._schema.load(value)
        else:
            return self._schema.loads(value)

    def retrieve_by_name(self, datasource_name: str) -> DatasourceConfig:
        """Retrieves a DatasourceConfig persisted in the store by it's given name.

        Args:
            datasource_name: The name of the Datasource to retrieve.

        Returns:
            The DatasourceConfig persisted in the store that is associated with the given
            input datasource_name.

        Raises:
            ValueError if a DatasourceConfig is not found.
        """
        datasource_key: DataContextVariableKey = self._determine_datasource_key(
            datasource_name=datasource_name)
        if not self.has_key(datasource_key):
            raise ValueError(
                f"Unable to load datasource `{datasource_name}` -- no configuration found or invalid configuration."
            )

        datasource_config: DatasourceConfig = copy.deepcopy(
            self.get(datasource_key))
        return datasource_config

    def delete_by_name(self, datasource_name: str) -> None:
        """Deletes a DatasourceConfig persisted in the store by it's given name.

        Args:
            datasource_name: The name of the Datasource to delete.
        """
        datasource_key: DataContextVariableKey = self._determine_datasource_key(
            datasource_name=datasource_name)
        self.remove_key(datasource_key)

    def set_by_name(self, datasource_name: str,
                    datasource_config: DatasourceConfig) -> None:
        """Persists a DatasourceConfig in the store by a given name.

        Args:
            datasource_name: The name of the Datasource to update.
            datasource_config: The config object to persist using the StoreBackend.
        """
        datasource_key: DataContextVariableKey = self._determine_datasource_key(
            datasource_name=datasource_name)
        self.set(datasource_key, datasource_config)

    def update_by_name(self, datasource_name: str,
                       datasource_config: DatasourceConfig) -> None:
        """Updates a DatasourceConfig that already exists in the store.

        Args:
            datasource_name: The name of the Datasource to update.
            datasource_config: The config object to persist using the StoreBackend.

        Raises:
            ValueError if a DatasourceConfig is not found.
        """
        datasource_key: DataContextVariableKey = self._determine_datasource_key(
            datasource_name=datasource_name)
        if not self.has_key(datasource_key):
            raise ValueError(
                f"Unable to load datasource `{datasource_name}` -- no configuration found or invalid configuration."
            )

        self.set_by_name(datasource_name=datasource_name,
                         datasource_config=datasource_config)

    def _determine_datasource_key(
            self, datasource_name: str) -> DataContextVariableKey:
        datasource_key: DataContextVariableKey = DataContextVariableKey(
            resource_type=DataContextVariableSchema.DATASOURCES,
            resource_name=datasource_name,
        )
        return datasource_key
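
serialize() and deserialize() above delegate to DatasourceConfigSchema, so the round trip between DatasourceConfig objects and plain dicts can be sketched in isolation (a hedged sketch, not DatasourceStore itself):

from great_expectations.data_context.types.base import (
    DatasourceConfig,
    DatasourceConfigSchema,
)

schema = DatasourceConfigSchema()
config = DatasourceConfig(class_name="PandasDatasource")

as_dict = schema.dump(config)    # the direction serialize() takes in cloud mode
restored = schema.load(as_dict)  # the direction deserialize() takes for plain dicts
print(type(restored).__name__)   # expected to be DatasourceConfig again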
Example No. 7
def _add_spark_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "spark"},
        success=True,
    )

    if not _verify_pyspark_dependent_modules():
        return None

    if passthrough_generator_only:
        datasource_name = "files_spark_datasource"

        # configuration = SparkDFDatasource.build_configuration(batch_kwargs_generators={
        #     "default": {
        #         "class_name": "PassthroughGenerator",
        #     }
        # }
        # )
        configuration = SparkDFDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        ).strip()
        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = SparkDFDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )
        configuration["class_name"] = "SparkDFDatasource"
        configuration["module_name"] = "great_expectations.datasource"
        errors = DatasourceConfigSchema().validate(configuration)
        if len(errors) != 0:
            raise ge_exceptions.GreatExpectationsError(
                "Invalid Datasource configuration: {:s}".format(errors)
            )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )
    toolkit.confirm_proceed_or_exit()

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name
Example No. 8
def _add_sqlalchemy_datasource(context, prompt_for_datasource_name=True):

    msg_success_database = (
        "\n<green>Great Expectations connected to your database!</green>"
    )

    if not _verify_sqlalchemy_dependent_modules():
        return None

    db_choices = [str(x) for x in list(range(1, 1 + len(SupportedDatabases)))]
    selected_database = (
        int(
            click.prompt(
                msg_prompt_choose_database,
                type=click.Choice(db_choices),
                show_choices=False,
            )
        )
        - 1
    )  # don't show user a zero index list :)

    selected_database = list(SupportedDatabases)[selected_database]

    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "sqlalchemy", "db": selected_database.name},
        success=True,
    )

    datasource_name = "my_{}_db".format(selected_database.value.lower())
    if selected_database == SupportedDatabases.OTHER:
        datasource_name = "my_database"
    if prompt_for_datasource_name:
        datasource_name = click.prompt(
            msg_prompt_datasource_name, default=datasource_name
        )

    credentials = {}
    # Since we don't want to save the database credentials in the config file that will be
    # committed in the repo, we will use our Variable Substitution feature to store the credentials
    # in the credentials file (that will not be committed, since it is in the uncommitted directory)
    # with the datasource's name as the variable name.
    # The value of the datasource's "credentials" key in the config file (great_expectations.yml) will
    # be ${datasource name}.
    # Great Expectations will replace the ${datasource name} with the value from the credentials file at runtime.

    while True:
        cli_message(msg_db_config.format(datasource_name))

        if selected_database == SupportedDatabases.MYSQL:
            if not _verify_mysql_dependent_modules():
                return None

            credentials = _collect_mysql_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.POSTGRES:
            if not _verify_postgresql_dependent_modules():
                return None

            credentials = _collect_postgres_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.REDSHIFT:
            if not _verify_redshift_dependent_modules():
                return None

            credentials = _collect_redshift_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.SNOWFLAKE:
            if not _verify_snowflake_dependent_modules():
                return None

            credentials = _collect_snowflake_credentials(
                default_credentials=credentials
            )
        elif selected_database == SupportedDatabases.BIGQUERY:
            if not _verify_bigquery_dependent_modules():
                return None

            credentials = _collect_bigquery_credentials(default_credentials=credentials)
        elif selected_database == SupportedDatabases.OTHER:
            sqlalchemy_url = click.prompt(
                """What is the url/connection string for the sqlalchemy connection?
(reference: https://docs.sqlalchemy.org/en/latest/core/engines.html#database-urls)
""",
                show_default=False,
            ).strip()
            credentials = {"url": sqlalchemy_url}

        context.save_config_variable(datasource_name, credentials)

        message = """
<red>Cannot connect to the database.</red>
  - Please check your environment and the configuration you provided.
  - Database Error: {0:s}"""
        try:
            cli_message(
                "<cyan>Attempting to connect to your database. This may take a moment...</cyan>"
            )

            configuration = SqlAlchemyDatasource.build_configuration(
                credentials="${" + datasource_name + "}"
            )

            configuration["class_name"] = "SqlAlchemyDatasource"
            configuration["module_name"] = "great_expectations.datasource"
            errors = DatasourceConfigSchema().validate(configuration)
            if len(errors) != 0:
                raise ge_exceptions.GreatExpectationsError(
                    "Invalid Datasource configuration: {:s}".format(errors)
                )

            cli_message(
                """
Great Expectations will now add a new Datasource '{0:s}' to your deployment, by adding this entry to your great_expectations.yml:

{1:s}
The credentials will be saved in uncommitted/config_variables.yml under the key '{0:s}'
""".format(
                    datasource_name,
                    textwrap.indent(
                        toolkit.yaml.dump({datasource_name: configuration}), "  "
                    ),
                )
            )

            toolkit.confirm_proceed_or_exit()
            context.add_datasource(name=datasource_name, **configuration)
            cli_message(msg_success_database)
            break
        except ModuleNotFoundError as de:
            cli_message(message.format(str(de)))
            return None

        except DatasourceInitializationError as de:
            cli_message(message.format(str(de)))
            if not click.confirm("Enter the credentials again?", default=True):
                context.add_datasource(
                    datasource_name,
                    initialize=False,
                    module_name="great_expectations.datasource",
                    class_name="SqlAlchemyDatasource",
                    data_asset_type={"class_name": "SqlAlchemyDataset"},
                    credentials="${" + datasource_name + "}",
                )
                # TODO this message about continuing may not be accurate
                cli_message(
                    """
We saved datasource {:s} in {:s} and the credentials you entered in {:s}.
Since we could not connect to the database, you can complete troubleshooting in the configuration files documented in the how-to guides here:
<blue>https://docs.greatexpectations.io/en/latest/guides/how_to_guides/configuring_datasources.html?utm_source=cli&utm_medium=init&utm_campaign={:s}#{:s}</blue> .

After you connect to the datasource, run great_expectations init to continue.

""".format(
                        datasource_name,
                        DataContext.GE_YML,
                        context.get_config()["config_variables_file_path"],
                        rtd_url_ge_version,
                        selected_database.value.lower(),
                    )
                )
                return None

    return datasource_name
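
The long comment in this example describes the credential-handling pattern: the credentials dict is saved under the datasource's name with context.save_config_variable() (so it lands in the uncommitted config_variables.yml), and the datasource configuration only carries the reference "${<datasource name>}", which Great Expectations substitutes at runtime. A condensed, hedged sketch of just that pattern, with placeholder credential values and an assumed already-initialized project:

import great_expectations as ge
from great_expectations.datasource import SqlAlchemyDatasource

context = ge.DataContext()  # assumes `great_expectations init` has been run

datasource_name = "my_postgres_db"
credentials = {
    "drivername": "postgresql",
    "host": "localhost",
    "port": "5432",
    "username": "user",
    "password": "secret",  # stays out of great_expectations.yml
    "database": "mydb",
}

# Secrets go to uncommitted/config_variables.yml under the datasource's name ...
context.save_config_variable(datasource_name, credentials)

# ... and the datasource config only carries the substitution reference.
configuration = SqlAlchemyDatasource.build_configuration(
    credentials="${" + datasource_name + "}"
)
configuration["class_name"] = "SqlAlchemyDatasource"
configuration["module_name"] = "great_expectations.datasource"
context.add_datasource(name=datasource_name, **configuration)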
Example No. 9
def _add_pandas_datasource(
    context, passthrough_generator_only=True, prompt_for_datasource_name=True
):
    toolkit.send_usage_message(
        data_context=context,
        event="cli.new_ds_choice",
        event_payload={"type": "pandas"},
        success=True,
    )

    if passthrough_generator_only:
        datasource_name = "files_datasource"
        configuration = PandasDatasource.build_configuration()

    else:
        path = click.prompt(
            msg_prompt_filesys_enter_base_path,
            type=click.Path(exists=True, file_okay=False),
        )

        if path.startswith("./"):
            path = path[2:]

        if path.endswith("/"):
            basenamepath = path[:-1]
        else:
            basenamepath = path

        datasource_name = os.path.basename(basenamepath) + "__dir"
        if prompt_for_datasource_name:
            datasource_name = click.prompt(
                msg_prompt_datasource_name, default=datasource_name
            )

        configuration = PandasDatasource.build_configuration(
            batch_kwargs_generators={
                "subdir_reader": {
                    "class_name": "SubdirReaderBatchKwargsGenerator",
                    "base_directory": os.path.join("..", path),
                }
            }
        )

        configuration["class_name"] = "PandasDatasource"
        configuration["module_name"] = "great_expectations.datasource"
        errors = DatasourceConfigSchema().validate(configuration)
        if len(errors) != 0:
            raise ge_exceptions.GreatExpectationsError(
                "Invalid Datasource configuration: {:s}".format(errors)
            )

    cli_message(
        """
Great Expectations will now add a new Datasource '{:s}' to your deployment, by adding this entry to your great_expectations.yml:

{:s}
""".format(
            datasource_name,
            textwrap.indent(toolkit.yaml.dump({datasource_name: configuration}), "  "),
        )
    )

    toolkit.confirm_proceed_or_exit(
        continuation_message="Okay, exiting now. To learn more about adding datasources, run great_expectations "
        "datasource --help or visit https://docs.greatexpectations.io/"
    )

    context.add_datasource(name=datasource_name, **configuration)
    return datasource_name