コード例 #1
0
def _raise_profiling_errors(profiling_results):
    """Translate a failed profiling result into a DataContextError.

    Args:
        profiling_results: dict whose "error" entry contains a "code" and,
            for the assets-not-found case, a "not_found_data_assets" list.

    Raises:
        ge_exceptions.DataContextError: always; the message depends on the
            error code found in ``profiling_results``.
    """
    if (profiling_results["error"]["code"] ==
            DataContext.PROFILING_ERROR_CODE_SPECIFIED_DATA_ASSETS_NOT_FOUND):
        raise ge_exceptions.DataContextError(
            """Some of the data assets you specified were not found: {:s}
            """.format(",".join(
                profiling_results["error"]["not_found_data_assets"])))
    # BUG FIX: build the message with format() instead of "+" so a non-string
    # error code (codes are compared against numeric class constants) cannot
    # raise TypeError here and mask the real error.
    raise ge_exceptions.DataContextError(
        "Unknown profiling error code: {}".format(
            profiling_results["error"]["code"]))
コード例 #2
0
    def save_expectation_suite(
        self,
        expectation_suite: ExpectationSuite,
        expectation_suite_name: Optional[str] = None,
        overwrite_existing: bool = True,
        **kwargs: Dict[str, Any],
    ):
        """Save the provided expectation suite into the DataContext.

        Args:
            expectation_suite: the suite to save
            expectation_suite_name: the name of this expectation suite. If no name is provided the name will \
                be read from the suite

            overwrite_existing: bool setting whether to overwrite existing ExpectationSuite

            **kwargs: forwarded to the expectations store ``set`` call

        Returns:
            The return value of the expectations store ``set`` call.

        Raises:
            ge_exceptions.DataContextError: if a suite with the resolved name
                already exists and ``overwrite_existing`` is False.
        """
        # Resolve the effective name once; an explicitly supplied name also
        # renames the suite object so name and key stay consistent.
        if expectation_suite_name is None:
            expectation_suite_name = expectation_suite.expectation_suite_name
        else:
            expectation_suite.expectation_suite_name = expectation_suite_name
        key: ExpectationSuiteIdentifier = ExpectationSuiteIdentifier(
            expectation_suite_name=expectation_suite_name
        )
        if self.expectations_store.has_key(key) and not overwrite_existing:
            # BUG FIX: report the name actually used for the lookup; the
            # previous message printed the raw argument, which could be None.
            raise ge_exceptions.DataContextError(
                "expectation_suite with name {} already exists. If you would like to overwrite this "
                "expectation_suite, set overwrite_existing=True.".format(
                    expectation_suite_name))
        self._evaluation_parameter_dependencies_compiled = False
        return self.expectations_store.set(key, expectation_suite, **kwargs)
コード例 #3
0
ファイル: util.py プロジェクト: yjlee215/great_expectations
def build_configuration_store(
    class_name: str,
    store_name: str,
    store_backend: Union[StoreBackend, dict],
    *,
    module_name: str = "great_expectations.data_context.store",
    overwrite_existing: bool = False,
    **kwargs,
) -> ConfigurationStore:
    """Instantiate a ConfigurationStore from a class name and a backend config.

    Args:
        class_name: name of the ConfigurationStore subclass to build.
        store_name: name the resulting store registers under.
        store_backend: a StoreBackend instance (its ``config`` is used) or a
            backend config dict.
        module_name: module from which to load the store class.
        overwrite_existing: forwarded into the store config.
        **kwargs: merged into the backend config.

    Returns:
        The constructed ConfigurationStore.

    Raises:
        ge_exceptions.DataContextError: if ``store_backend`` is neither a
            StoreBackend nor a dict.
    """
    logger.debug(
        f"Starting data_context/store/util.py#build_configuration_store for store_name {store_name}"
    )

    # isinstance() is the idiomatic, equivalent form of the previous
    # "store_backend is not None and issubclass(type(store_backend), ...)".
    if isinstance(store_backend, StoreBackend):
        store_backend = store_backend.config
    elif not isinstance(store_backend, dict):
        raise ge_exceptions.DataContextError(
            "Invalid configuration: A store_backend needs to be a dictionary or inherit from the StoreBackend class."
        )

    store_backend.update(**kwargs)

    store_config: dict = {
        "store_name": store_name,
        "module_name": module_name,
        "class_name": class_name,
        "overwrite_existing": overwrite_existing,
        "store_backend": store_backend,
    }
    configuration_store: ConfigurationStore = build_store_from_config(
        store_config=store_config,
        module_name=module_name,
        runtime_environment=None,
    )
    return configuration_store
コード例 #4
0
    def __init__(self,
                 credentials,
                 table_name,
                 key_columns,
                 fixed_length_key=True):
        """Create the backend, build its engine, and materialize its table."""
        super().__init__(fixed_length_key=fixed_length_key)
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "ModuleNotFoundError: No module named 'sqlalchemy'")

        if not self.fixed_length_key:
            raise ValueError(
                "DatabaseStoreBackend requires use of a fixed-length-key")

        self.key_columns = key_columns
        # "value" is reserved for the payload column and may not be a key.
        for key_column in key_columns:
            if key_column == "value":
                raise ValueError(
                    "'value' cannot be used as a key_element name")

        # Dynamically build the SQLAlchemy table: one primary-key column per
        # key element, plus the "value" payload column.
        metadata = MetaData()
        table_columns = [
            Column(key_column, String, primary_key=True)
            for key_column in key_columns
        ]
        table_columns.append(Column("value", String))
        self._table = Table(table_name, metadata, *table_columns)

        # Construct the engine from drivername-style credentials and create
        # the schema objects.
        drivername = credentials.pop("drivername")
        self.engine = create_engine(URL(drivername, **credentials))
        metadata.create_all(self.engine)
コード例 #5
0
def _get_batch_spec_passthrough(
    datasource: BaseDatasource, ) -> Dict[str, Union[str, Dict[str, Any]]]:
    """Collect backend-specific batch_spec_passthrough parameters.

    For SimpleSqlalchemyDatasource backends running on BigQuery the user is
    prompted for a named temporary table; other datasource types currently
    contribute nothing.
    """
    passthrough_params: Dict[str, Union[str, Dict[str, Any]]] = {}

    if isinstance(datasource, Datasource):
        pass  # TODO: <Alex>Add parameters for Pandas, Spark, and other SQL as CLI functionality expands.</Alex>
    elif isinstance(datasource, SimpleSqlalchemyDatasource):
        # Some backends require named temporary table parameters.  We specifically elicit those and add them
        # where appropriate.
        execution_engine: SqlAlchemyExecutionEngine = cast(
            SqlAlchemyExecutionEngine, datasource.execution_engine)
        dialect_name = execution_engine.engine.dialect.name.lower()
        if dialect_name == "bigquery":
            # bigquery also requires special handling
            default_temp_table = ("SOME_PROJECT.SOME_DATASET.ge_tmp_" +
                                  str(uuid.uuid4())[:8])
            bigquery_temp_table: str = click.prompt(
                "Great Expectations will create a table to use for "
                "validation." + os.linesep +
                "Please enter a name for this table: ",
                default=default_temp_table,
            )
            if bigquery_temp_table:
                passthrough_params["bigquery_temp_table"] = bigquery_temp_table
    else:
        raise ge_exceptions.DataContextError(
            "Datasource {:s} of unsupported type {:s} was encountered.".format(
                datasource.name, str(type(datasource))))

    return passthrough_params
コード例 #6
0
    def __init__(
        self,
        credentials,
        queries=None,
        store_backend=None,
        runtime_environment=None,
        store_name=None,
    ) -> None:
        """Initialize the query store: load configured queries, build an engine.

        The engine is taken directly from ``credentials["engine"]`` when
        present, otherwise constructed from a "url", a "connection_string",
        or drivername-style credential fields, in that order.
        """
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "sqlalchemy module not found, but is required for "
                "SqlAlchemyQueryStore"
            )
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )
        if queries:
            # Queries supplied in configuration are validated and then loaded
            # into the (in-memory) store backend.
            try:
                assert isinstance(
                    queries, dict
                ), "SqlAlchemyQueryStore queries must be defined as a dictionary"
                assert (
                    store_backend is None
                    or store_backend["class_name"] == "InMemoryStoreBackend"
                ), (
                    "If queries are provided in configuration, then store_backend must be empty or an "
                    "InMemoryStoreBackend"
                )
                for query_name, query in queries.items():
                    self._store_backend.set((query_name,), query)

            except (AssertionError, KeyError) as e:
                raise ge_exceptions.InvalidConfigError(str(e))

        # Engine resolution precedence: explicit engine, url,
        # connection_string, then drivername-style credentials.
        if "engine" in credentials:
            self.engine = credentials["engine"]
        elif "url" in credentials:
            self.engine = create_engine(credentials["url"])
        elif "connection_string" in credentials:
            self.engine = create_engine(credentials["connection_string"])
        else:
            drivername = credentials.pop("drivername")
            self.engine = create_engine(URL(drivername, **credentials))

        # Capture the constructor arguments (plus module/class names), drop
        # Falsy values, and keep the result as this instance's "_config".
        self._config = {
            "credentials": credentials,
            "queries": queries,
            "store_backend": store_backend,
            "runtime_environment": runtime_environment,
            "store_name": store_name,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        filter_properties_dict(properties=self._config, clean_falsy=True, inplace=True)
コード例 #7
0
    def create_expectation_suite(
        self,
        expectation_suite_name: str,
        overwrite_existing: bool = False,
        ge_cloud_id: Optional[str] = None,
        **kwargs: Optional[dict],
    ) -> ExpectationSuite:
        """Create a new (empty) expectation suite and persist it to the store.

        Args:
            expectation_suite_name: name of the expectation suite to create.
            overwrite_existing: whether an existing suite stored under the
                same GE Cloud ID may be replaced.
            ge_cloud_id: GE Cloud ID used to build the storage key.
            **kwargs: forwarded to the expectations store ``set`` call.

        Returns:
            The newly created (empty) expectation suite.

        Raises:
            ValueError: if ``overwrite_existing`` is not a bool.
            ge_exceptions.DataContextError: if the suite already exists and
                ``overwrite_existing`` is False.
        """
        if not isinstance(overwrite_existing, bool):
            raise ValueError(
                "Parameter overwrite_existing must be of type BOOL")

        expectation_suite: ExpectationSuite = ExpectationSuite(
            expectation_suite_name=expectation_suite_name, data_context=self)
        key = GeCloudIdentifier(
            resource_type=GeCloudRESTResource.EXPECTATION_SUITE,
            ge_cloud_id=ge_cloud_id,
        )
        if self.expectations_store.has_key(key) and not overwrite_existing:
            raise ge_exceptions.DataContextError(
                "expectation_suite with GE Cloud ID {} already exists. If you would like to overwrite this "
                "expectation_suite, set overwrite_existing=True.".format(ge_cloud_id))

        self.expectations_store.set(key, expectation_suite, **kwargs)
        return expectation_suite
コード例 #8
0
    def save_expectation_suite(
        self,
        expectation_suite: ExpectationSuite,
        expectation_suite_name: Optional[str] = None,
        overwrite_existing: bool = True,
        ge_cloud_id: Optional[str] = None,
        **kwargs: Optional[dict],
    ) -> None:
        """Save the provided expectation suite into the DataContext.

        Args:
            expectation_suite: the suite to save
            expectation_suite_name: the name of this expectation suite. If no name is provided the name will \
                be read from the suite
            ge_cloud_id: cloud id for saving expectation suite
            overwrite_existing: should I over-write the Suite if it already exists?
            **kwargs: forwarded to the expectations store ``set`` call
        Returns:
            None

        Raises:
            ge_exceptions.DataContextError: if a suite already exists under
                the resolved GE Cloud ID and ``overwrite_existing`` is False.
        """
        # Resolve the effective GE Cloud ID once so the storage key and any
        # error message agree on the value actually used.
        resolved_ge_cloud_id: str = (
            ge_cloud_id
            if ge_cloud_id is not None else str(expectation_suite.ge_cloud_id))
        key: GeCloudIdentifier = GeCloudIdentifier(
            resource_type=GeCloudRESTResource.EXPECTATION_SUITE,
            ge_cloud_id=resolved_ge_cloud_id,
        )
        if self.expectations_store.has_key(key) and not overwrite_existing:
            # BUG FIX: previously the message printed the raw ge_cloud_id
            # argument, which is None when the ID comes from the suite itself.
            raise ge_exceptions.DataContextError(
                f"expectation_suite with GE Cloud ID {resolved_ge_cloud_id} already exists. "
                f"If you would like to overwrite this expectation_suite, set overwrite_existing=True."
            )
        self._evaluation_parameter_dependencies_compiled = False
        self.expectations_store.set(key, expectation_suite, **kwargs)
コード例 #9
0
    def get_expectation_suite(
        self,
        expectation_suite_name: Optional[str] = None,
        ge_cloud_id: Optional[str] = None,
    ) -> ExpectationSuite:
        """Get an Expectation Suite by name or GE Cloud ID.

        Args:
            expectation_suite_name (str): the name for the Expectation Suite
                (used in the error message; the lookup key is the cloud ID)
            ge_cloud_id (str): the GE Cloud ID for the Expectation Suite

        Returns:
            The stored expectation suite.

        Raises:
            ge_exceptions.DataContextError: if no suite exists for the key.
        """
        key = GeCloudIdentifier(
            resource_type=GeCloudRESTResource.EXPECTATION_SUITE,
            ge_cloud_id=ge_cloud_id,
        )
        # Guard clause: fail fast when the suite is absent.
        if not self.expectations_store.has_key(key):
            raise ge_exceptions.DataContextError(
                f"expectation_suite {expectation_suite_name} not found")

        expectations_schema_dict: dict = cast(
            dict, self.expectations_store.get(key))
        # Rebuild the ExpectationSuite from its stored schema dictionary.
        return ExpectationSuite(**expectations_schema_dict, data_context=self)
コード例 #10
0
    def create(
        cls,
        project_root_dir: Optional[str] = None,
        usage_statistics_enabled: bool = True,
        runtime_environment: Optional[dict] = None,
    ) -> "DataContext":
        """
        Build a new great_expectations directory and DataContext object in the provided project_root_dir.

        A new "great_expectations" directory is scaffolded inside the given
        folder (unless one already exists), the project templates are written,
        and a DataContext rooted there is returned.

        --Public API--

        --Documentation--
            https://docs.greatexpectations.io/docs/terms/data_context

        Args:
            project_root_dir: path to the root directory in which to create a new great_expectations directory
            usage_statistics_enabled: boolean directive specifying whether or not to gather usage statistics
            runtime_environment: a dictionary of config variables that override both those set in
                config_variables.yml and the environment

        Returns:
            DataContext
        """
        if not os.path.isdir(project_root_dir):
            raise ge_exceptions.DataContextError(
                "The project_root_dir must be an existing directory in which "
                "to initialize a new DataContext"
            )

        ge_dir = os.path.join(project_root_dir, cls.GE_DIR)
        os.makedirs(ge_dir, exist_ok=True)
        cls.scaffold_directories(ge_dir)

        # Never clobber an existing project config; warn and move on instead.
        if os.path.isfile(os.path.join(ge_dir, cls.GE_YML)):
            warnings.warn(
                f"""Warning. An existing `{cls.GE_YML}` was found here: {ge_dir}.
    - No action was taken."""
            )
        else:
            cls.write_project_template_to_disk(ge_dir, usage_statistics_enabled)

        # Same idea for config_variables.yml inside the uncommitted directory.
        uncommitted_dir = os.path.join(ge_dir, cls.GE_UNCOMMITTED_DIR)
        if os.path.isfile(os.path.join(uncommitted_dir, "config_variables.yml")):
            warnings.warn(
                f"""Warning. An existing `config_variables.yml` was found here: {uncommitted_dir}.
    - No action was taken."""
            )
        else:
            cls.write_config_variables_template_to_disk(uncommitted_dir)

        return cls(context_root_dir=ge_dir, runtime_environment=runtime_environment)
コード例 #11
0
    def __init__(
        self,
        store_name: str,
        store_backend: Optional[dict] = None,
        overwrite_existing: bool = False,
        runtime_environment: Optional[dict] = None,
    ) -> None:
        """Validate the configuration class, prepare the backend config, and
        initialize the underlying store.

        Args:
            store_name: name the store registers under.
            store_backend: optional backend config dict; Tuple-style backends
                receive a default ".yml" filepath_suffix.
            overwrite_existing: whether stored configs may be overwritten.
            runtime_environment: runtime overrides passed to the backend.
        """
        if not issubclass(self._configuration_class, BaseYamlConfig):
            raise ge_exceptions.DataContextError(
                "Invalid configuration: A configuration_class needs to inherit from the BaseYamlConfig class."
            )

        if store_backend is not None:
            backend_module_name = store_backend.get(
                "module_name", "great_expectations.data_context.store")
            backend_class_name = store_backend.get(
                "class_name", "InMemoryStoreBackend")
            verify_dynamic_loading_support(module_name=backend_module_name)
            backend_class = load_class(backend_class_name, backend_module_name)

            # Backend class loaded successfully; for the common Tuple-style
            # backends, default the filepath suffix to ".yml".
            if issubclass(backend_class, TupleStoreBackend):
                store_backend.setdefault("filepath_suffix", ".yml")

        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )

        # Capture the constructor arguments (plus module/class names), drop
        # Falsy values, and keep the result as this instance's "_config".
        self._config = {
            "store_name": store_name,
            "store_backend": store_backend,
            "overwrite_existing": overwrite_existing,
            "runtime_environment": runtime_environment,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        filter_properties_dict(properties=self._config,
                               clean_falsy=True,
                               inplace=True)

        self._overwrite_existing = overwrite_existing
コード例 #12
0
    def __init__(self,
                 credentials,
                 table_name,
                 key_columns,
                 fixed_length_key=True):
        """Connect to the database and bind (or create) the backing table."""
        super().__init__(fixed_length_key=fixed_length_key)
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "ModuleNotFoundError: No module named 'sqlalchemy'")

        if not self.fixed_length_key:
            raise ge_exceptions.InvalidConfigError(
                "DatabaseStoreBackend requires use of a fixed-length-key")

        # Build the engine first so table reflection below can use it.
        drivername = credentials.pop("drivername")
        self.engine = create_engine(URL(drivername, **credentials))

        metadata = MetaData()
        self.key_columns = key_columns
        # Dynamically build the SQLAlchemy table: one primary-key column per
        # key element plus the reserved "value" payload column.
        table_columns = []
        for key_column in key_columns:
            if key_column == "value":
                raise ge_exceptions.InvalidConfigError(
                    "'value' cannot be used as a key_element name")
            table_columns.append(Column(key_column, String, primary_key=True))
        table_columns.append(Column("value", String))
        try:
            table = Table(table_name,
                          metadata,
                          autoload=True,
                          autoload_with=self.engine)
            # "Light" schema check: only proceed if the existing table has
            # exactly the expected column names.
            expected_columns = set(key_columns) | {"value"}
            actual_columns = {str(col.name).lower() for col in table.columns}
            if actual_columns != expected_columns:
                raise ge_exceptions.StoreBackendError(
                    f"Unable to use table {table_name}: it exists, but does not have the expected schema."
                )
        except NoSuchTableError:
            # Table does not exist yet: create it.
            table = Table(table_name, metadata, *table_columns)
            try:
                metadata.create_all(self.engine)
            except SQLAlchemyError as e:
                raise ge_exceptions.StoreBackendError(
                    f"Unable to connect to table {table_name} because of an error. It is possible your table needs to be migrated to a new schema.  SqlAlchemyError: {str(e)}"
                )
        self._table = table
コード例 #13
0
    def __init__(
        self,
        credentials,
        queries=None,
        store_backend=None,
        runtime_environment=None,
        store_name=None,
    ):
        """Initialize the query store and build its SQLAlchemy engine.

        Args:
            credentials: dict providing either a ready "engine", a "url",
                a "connection_string", or drivername-style fields.
            queries: optional dict of named queries loaded into the backend.
            store_backend: backend config; must be empty or an
                InMemoryStoreBackend when queries are supplied.
            runtime_environment: runtime overrides passed to the backend.
            store_name: name the store registers under.

        Raises:
            ge_exceptions.DataContextError: if sqlalchemy is unavailable.
            ge_exceptions.InvalidConfigError: on invalid queries or backend
                configuration.
        """
        if not sqlalchemy:
            raise ge_exceptions.DataContextError(
                "sqlalchemy module not found, but is required for "
                "SqlAlchemyQueryStore")
        super().__init__(
            store_backend=store_backend,
            runtime_environment=runtime_environment,
            store_name=store_name,
        )
        if queries:
            # If queries are defined in configuration, then we load them into an InMemoryStoreBackend
            try:
                assert isinstance(
                    queries, dict
                ), "SqlAlchemyQueryStore queries must be defined as a dictionary"
                assert (
                    store_backend is None
                    or store_backend["class_name"] == "InMemoryStoreBackend"
                ), ("If queries are provided in configuration, then store_backend must be empty or an "
                    "InMemoryStoreBackend")
                for k, v in queries.items():
                    self._store_backend.set(tuple([k]), v)

            except (AssertionError, KeyError) as e:
                raise ge_exceptions.InvalidConfigError(str(e))

        if "engine" in credentials:
            self.engine = credentials["engine"]
        elif "url" in credentials:
            self.engine = create_engine(credentials["url"])
        elif "connection_string" in credentials:
            # Consistency fix: accept "connection_string" credentials the same
            # way the sibling SqlAlchemyQueryStore initializer does; previously
            # this fell through and raised KeyError on pop("drivername").
            self.engine = create_engine(credentials["connection_string"])
        else:
            drivername = credentials.pop("drivername")
            options = URL(drivername, **credentials)
            self.engine = create_engine(options)
コード例 #14
0
def _get_batch_spec_passthrough(
    datasource: BaseDatasource,
) -> Dict[str, Union[str, Dict[str, Any]]]:
    """Collect backend-specific batch_spec_passthrough parameters.

    No supported datasource type currently contributes any entries;
    unsupported datasource types are rejected.

    Args:
        datasource: the datasource a batch_request is being built for.

    Returns:
        A (currently empty) batch_spec_passthrough dictionary.

    Raises:
        ge_exceptions.DataContextError: for unsupported datasource types.
    """
    batch_spec_passthrough: Dict[str, Union[str, Dict[str, Any]]] = {}

    if isinstance(datasource, Datasource):
        pass  # TODO: <Alex>Add parameters for Pandas, Spark, and other SQL as CLI functionality expands.</Alex>
    elif isinstance(datasource, SimpleSqlalchemyDatasource):
        # Some backends require named temporary table parameters, but none are
        # elicited here yet.  The previous unused cast of
        # datasource.execution_engine (dead local variable) was removed.
        pass
    else:
        raise ge_exceptions.DataContextError(
            "Datasource {:s} of unsupported type {:s} was encountered.".format(
                datasource.name, str(type(datasource))
            )
        )

    return batch_spec_passthrough
コード例 #15
0
    def delete_expectation_suite(
        self,
        expectation_suite_name: Optional[str] = None,
        ge_cloud_id: Optional[str] = None,
    ):
        """Delete the specified expectation suite from the expectation store.

        Args:
            expectation_suite_name: name of the suite (used in the error
                message only; the lookup key is built from the GE Cloud ID).
            ge_cloud_id: GE Cloud ID of the suite to delete.

        Returns:
            True for Success and False for Failure.

        Raises:
            ge_exceptions.DataContextError: if no suite exists for the key.
        """
        key = GeCloudIdentifier(
            resource_type=GeCloudRESTResource.EXPECTATION_SUITE,
            ge_cloud_id=ge_cloud_id,
        )
        if not self.expectations_store.has_key(key):
            # BUG FIX: the original message never called .format(), so users
            # saw a literal "{}" placeholder instead of the suite name.
            raise ge_exceptions.DataContextError(
                "expectation_suite with name {} does not exist.".format(
                    expectation_suite_name))
        self.expectations_store.remove_key(key)
        return True
コード例 #16
0
    def __init__(
        self,
        table_name,
        key_columns,
        fixed_length_key=True,
        credentials=None,
        url=None,
        connection_string=None,
        engine=None,
        store_name=None,
        suppress_store_backend_id=False,
        manually_initialize_store_backend_id: str = "",
        **kwargs,
    ):
        """Initialize a database-backed store backend.

        Builds a SQLAlchemy engine from one of (in order of precedence)
        ``engine``, ``credentials``, ``connection_string``, or ``url``; then
        reflects the backing table if it already exists (verifying its column
        names) or creates it; finally records the constructor arguments in
        ``self._config``.

        Args:
            table_name: name of the database table backing this store.
            key_columns: column names that together form the store key;
                "value" is reserved and may not be used.
            fixed_length_key: must be truthy; this backend requires
                fixed-length keys.
            credentials: connection settings passed to ``self._build_engine``.
            url: database URL; its scheme is recorded as ``self.drivername``.
            connection_string: SQLAlchemy connection string.
            engine: pre-built SQLAlchemy engine (takes precedence; a warning
                is logged if credentials are also supplied).
            store_name: name the store registers under.
            suppress_store_backend_id: forwarded to the base class.
            manually_initialize_store_backend_id: forwarded to the base class.
            **kwargs: forwarded to ``create_engine`` / ``_build_engine`` and
                merged into ``self._config``.

        Raises:
            ge_exceptions.DataContextError: if sqlalchemy is unavailable.
            ge_exceptions.InvalidConfigError: on a non-fixed-length key, a
                reserved key column name, or missing connection settings.
            ge_exceptions.StoreBackendError: if an existing table has an
                unexpected schema or table creation fails.
        """
        super().__init__(
            fixed_length_key=fixed_length_key,
            suppress_store_backend_id=suppress_store_backend_id,
            manually_initialize_store_backend_id=
            manually_initialize_store_backend_id,
            store_name=store_name,
        )
        if not sa:
            raise ge_exceptions.DataContextError(
                "ModuleNotFoundError: No module named 'sqlalchemy'")

        if not self.fixed_length_key:
            raise ge_exceptions.InvalidConfigError(
                "DatabaseStoreBackend requires use of a fixed-length-key")

        self._schema_name = None
        self._credentials = credentials
        self._connection_string = connection_string
        self._url = url

        # Engine precedence: explicit engine > credentials > connection_string > url.
        if engine is not None:
            if credentials is not None:
                logger.warning(
                    "Both credentials and engine were provided during initialization of SqlAlchemyExecutionEngine. "
                    "Ignoring credentials.")
            self.engine = engine
        elif credentials is not None:
            self.engine = self._build_engine(credentials=credentials, **kwargs)
        elif connection_string is not None:
            self.engine = sa.create_engine(connection_string, **kwargs)
        elif url is not None:
            self.drivername = urlparse(url).scheme
            self.engine = sa.create_engine(url, **kwargs)
        else:
            raise ge_exceptions.InvalidConfigError(
                "Credentials, url, connection_string, or an engine are required for a DatabaseStoreBackend."
            )

        meta = MetaData(schema=self._schema_name)
        self.key_columns = key_columns
        # Dynamically construct a SQLAlchemy table with the name and column names we'll use
        cols = []
        for column in key_columns:
            if column == "value":
                raise ge_exceptions.InvalidConfigError(
                    "'value' cannot be used as a key_element name")
            cols.append(Column(column, String, primary_key=True))
        cols.append(Column("value", String))
        try:
            # Try to reflect an existing table from the database first.
            table = Table(table_name,
                          meta,
                          autoload=True,
                          autoload_with=self.engine)
            # We do a "light" check: if the columns' names match, we will proceed, otherwise, create the table
            if {str(col.name).lower()
                    for col in table.columns
                } != (set(key_columns) | {"value"}):
                raise ge_exceptions.StoreBackendError(
                    f"Unable to use table {table_name}: it exists, but does not have the expected schema."
                )
        except NoSuchTableError:
            # Table does not exist yet: create it (and its schema if needed).
            table = Table(table_name, meta, *cols)
            try:
                if self._schema_name:
                    self.engine.execute(
                        f"CREATE SCHEMA IF NOT EXISTS {self._schema_name};")
                meta.create_all(self.engine)
            except SQLAlchemyError as e:
                raise ge_exceptions.StoreBackendError(
                    f"Unable to connect to table {table_name} because of an error. It is possible your table needs to be migrated to a new schema.  SqlAlchemyError: {str(e)}"
                )
        self._table = table
        # Initialize with store_backend_id
        self._store_backend_id = None
        self._store_backend_id = self.store_backend_id

        # Gather the call arguments of the present function (include the "module_name" and add the "class_name"), filter
        # out the Falsy values, and set the instance "_config" variable equal to the resulting dictionary.
        self._config = {
            "table_name": table_name,
            "key_columns": key_columns,
            "fixed_length_key": fixed_length_key,
            "credentials": credentials,
            "url": url,
            "connection_string": connection_string,
            "engine": engine,
            "store_name": store_name,
            "suppress_store_backend_id": suppress_store_backend_id,
            "manually_initialize_store_backend_id":
            manually_initialize_store_backend_id,
            "module_name": self.__class__.__module__,
            "class_name": self.__class__.__name__,
        }
        self._config.update(kwargs)
        filter_properties_dict(properties=self._config,
                               clean_falsy=True,
                               inplace=True)
コード例 #17
0
def get_batch_request(
    datasource: BaseDatasource,
    additional_batch_request_args: Optional[Dict[str, Any]] = None,
) -> Dict[str, Union[str, Dict[str, Any]]]:
    """Interactively assemble a batch_request for one of the datasource's data assets.

    The user picks a data connector (when more than one is available) and then
    a data asset; these names are combined — together with any
    ``additional_batch_request_args`` and backend-specific
    ``batch_spec_passthrough`` parameters — into a batch_request dictionary.
    Falsy entries are filtered out of the result.

    # :param datasource:
    # :param additional_batch_request_args:
    # :return: batch_request
    """
    asset_names_by_connector: Dict[
        str, List[str]] = datasource.get_available_data_asset_names()
    data_connector_name: Optional[str] = select_data_connector_name(
        available_data_asset_names_by_data_connector_dict=asset_names_by_connector,
    )

    batch_request: Dict[str, Union[str, int, Dict[str, Any]]] = {
        "datasource_name": datasource.name,
        "data_connector_name": data_connector_name,
    }

    # Choose the data asset via the prompt appropriate to the datasource type.
    data_asset_name: str
    if isinstance(datasource, Datasource):
        data_asset_name = _get_data_asset_name_from_data_connector(
            datasource=datasource,
            data_connector_name=data_connector_name,
            msg_prompt_enter_data_asset_name=(
                f'\nWhich data asset (accessible by data connector "{data_connector_name}") would you like to use?\n'
            ),
        )
    elif isinstance(datasource, SimpleSqlalchemyDatasource):
        data_asset_name = _get_data_asset_name_for_simple_sqlalchemy_datasource(
            datasource=datasource,
            data_connector_name=data_connector_name,
            msg_prompt_enter_data_asset_name=(
                "\nWhich table would you like to use? (Choose one)\n"),
        )
    else:
        raise ge_exceptions.DataContextError(
            "Datasource {:s} of unsupported type {:s} was encountered.".format(
                datasource.name, str(type(datasource))))

    batch_request["data_asset_name"] = data_asset_name

    if additional_batch_request_args and isinstance(
            additional_batch_request_args, dict):
        batch_request.update(additional_batch_request_args)

    # Merge backend-specific passthrough parameters into any passthrough
    # entries the caller already supplied.
    batch_spec_passthrough: Dict[str, Union[str, Dict[
        str, Any]]] = batch_request.get("batch_spec_passthrough") or {}
    batch_spec_passthrough.update(
        _get_batch_spec_passthrough(datasource=datasource))
    batch_request["batch_spec_passthrough"] = batch_spec_passthrough

    filter_properties_dict(properties=batch_request,
                           clean_falsy=True,
                           inplace=True)

    return batch_request
コード例 #18
0
def get_batch_kwargs(
    context,
    datasource_name=None,
    batch_kwargs_generator_name=None,
    data_asset_name=None,
    additional_batch_kwargs=None,
):
    """
    This method manages the interaction with user necessary to obtain batch_kwargs for a batch of a data asset.

    In order to get batch_kwargs this method needs datasource_name, batch_kwargs_generator_name and data_asset_name
    to combine them into a fully qualified data asset identifier(datasource_name/batch_kwargs_generator_name/data_asset_name).
    All three arguments are optional. If they are present, the method uses their values. Otherwise, the method
    prompts user to enter them interactively. Since it is possible for any of these three components to be
    passed to this method as empty values and to get their values after interacting with user, this method
    returns these components' values in case they changed.

    If the datasource has batch_kwargs_generators that can list available data asset names, the method lets user choose a name
    from that list (note: if there are multiple batch_kwargs_generators, user has to choose one first). If a name known to
    the chosen batch_kwargs_generator is selected, the batch_kwargs_generators will be able to yield batch_kwargs. The method also gives user
    an alternative to selecting the data asset name from the batch_kwargs_generators's list - user can type in a name for their
    data asset. In this case a passthrough batch kwargs batch_kwargs_generators will be used to construct a fully qualified data asset
    identifier (note: if the datasource has no passthrough batch_kwargs_generators configured, the method will exist with a failure).
    Since no batch_kwargs_generators can yield batch_kwargs for this data asset name, the method prompts user to specify batch_kwargs
    by choosing a file (if the datasource is pandas or spark) or by writing a SQL query (if the datasource points
    to a database).

    :param context: DataContext used to look up datasources and data asset names
    :param datasource_name: optional name of the datasource; prompted interactively when omitted
    :param batch_kwargs_generator_name: optional name of the batch kwargs generator; prompted when omitted
    :param data_asset_name: optional name of the data asset
    :param additional_batch_kwargs: optional dict of extra batch kwargs merged into the generated ones
    :return: a tuple: (datasource_name, batch_kwargs_generator_name, data_asset_name, batch_kwargs). The components
                of the tuple were passed into the methods as optional arguments, but their values might
                have changed after this method's execution. If the returned batch_kwargs is None, it means
                that the batch_kwargs_generator will know to yield batch_kwargs when called.
    """
    try:
        available_data_assets_dict = context.get_available_data_asset_names(
            datasource_names=datasource_name)
    except ValueError:
        # the datasource has no batch_kwargs_generators
        available_data_assets_dict = {datasource_name: {}}

    data_source = toolkit.select_datasource(context,
                                            datasource_name=datasource_name)
    datasource_name = data_source.name

    if batch_kwargs_generator_name is None:
        batch_kwargs_generator_name = select_batch_kwargs_generator(
            context,
            datasource_name,
            available_data_assets_dict=available_data_assets_dict,
        )

    # if the user provided us with the batch kwargs generator name and the data asset, we have everything we need -
    # let's ask the generator to build batch kwargs for this asset - we are done.
    if batch_kwargs_generator_name is not None and data_asset_name is not None:
        generator = data_source.get_batch_kwargs_generator(
            batch_kwargs_generator_name)
        # BUGFIX: additional_batch_kwargs defaults to None; unpacking **None
        # raises TypeError, so fall back to an empty dict here.
        batch_kwargs = generator.build_batch_kwargs(
            data_asset_name, **(additional_batch_kwargs or {}))
        # NOTE(review): this returns bare batch_kwargs although the docstring
        # documents a 4-tuple; callers of this early path may rely on the
        # current shape, so it is preserved -- confirm against call sites.
        return batch_kwargs

    if isinstance(context.get_datasource(datasource_name),
                  (PandasDatasource, SparkDFDatasource)):
        (
            data_asset_name,
            batch_kwargs,
        ) = _get_batch_kwargs_from_generator_or_from_file_path(
            context,
            datasource_name,
            batch_kwargs_generator_name=batch_kwargs_generator_name,
        )

    elif isinstance(context.get_datasource(datasource_name),
                    SqlAlchemyDatasource):
        data_asset_name, batch_kwargs = _get_batch_kwargs_for_sqlalchemy_datasource(
            context,
            datasource_name,
            additional_batch_kwargs=additional_batch_kwargs)

    else:
        # BUGFIX: the previous message omitted SqlAlchemyDatasource even though
        # the branch above explicitly supports it.
        raise ge_exceptions.DataContextError(
            "Datasource {:s} is expected to be a PandasDatasource, SparkDFDatasource, or "
            "SqlAlchemyDatasource, but is {:s}".format(
                datasource_name,
                str(type(context.get_datasource(datasource_name)))))

    return (datasource_name, batch_kwargs_generator_name, data_asset_name,
            batch_kwargs)
コード例 #19
0
    def _run_default_validation_operator(
        self,
        assets_to_validate: List,
        run_id: Optional[Union[str, RunIdentifier]] = None,
        evaluation_parameters: Optional[dict] = None,
        run_name: Optional[str] = None,
        run_time: Optional[Union[str, datetime.datetime]] = None,
        result_format: Optional[Union[str, dict]] = None,
    ):
        """Validate the given batches with an ad-hoc ActionListValidationOperator.

        The operator stores validation results and evaluation parameters and
        updates data docs via its default action list.

        Args:
            assets_to_validate: non-empty list of batches (tuple, DataAsset, or Validator).
            run_id: optional run identifier; mutually redundant with run_name/run_time.
            evaluation_parameters: optional evaluation parameters forwarded to the run;
                omitted from the run() call entirely when None.
            run_name: optional run name; defaulted to a UTC timestamp when both
                run_id and run_name are None.
            run_time: optional run time.
            result_format: result format dict or string; defaults to
                {"result_format": "SUMMARY"}.

        Returns:
            The result of ActionListValidationOperator.run().

        Raises:
            ge_exceptions.DataContextError: if assets_to_validate is empty or
                contains an element of an unsupported type.
        """
        # BUGFIX: the default used to be a mutable dict in the signature -- a
        # shared object across calls. Apply the default here instead.
        result_format = result_format or {"result_format": "SUMMARY"}

        if not assets_to_validate:
            raise ge_exceptions.DataContextError(
                "No batches of data were passed in. These are required")

        for batch in assets_to_validate:
            if not isinstance(batch, (tuple, DataAsset, Validator)):
                raise ge_exceptions.DataContextError(
                    "Batches are required to be of type DataAsset or Validator"
                )

        if run_id is None and run_name is None:
            # Timezone-aware UTC timestamp keeps run names unambiguous.
            run_name = datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
            logger.info("Setting run_name to: {}".format(run_name))

        default_validation_operator = ActionListValidationOperator(
            data_context=self.data_context,
            action_list=[
                {
                    "name": "store_validation_result",
                    "action": {
                        "class_name": "StoreValidationResultAction"
                    },
                },
                {
                    "name": "store_evaluation_params",
                    "action": {
                        "class_name": "StoreEvaluationParametersAction"
                    },
                },
                {
                    "name": "update_data_docs",
                    "action": {
                        "class_name": "UpdateDataDocsAction",
                        "site_names": []
                    },
                },
            ],
            result_format=result_format,
            name="default-action-list-validation-operator",
        )

        # Only pass evaluation_parameters through when explicitly provided, so
        # run()'s own default applies otherwise (matches the prior two-branch
        # call structure without duplicating the argument list).
        run_kwargs: dict = {
            "assets_to_validate": assets_to_validate,
            "run_id": run_id,
            "run_name": run_name,
            "run_time": run_time,
            "result_format": result_format,
        }
        if evaluation_parameters is not None:
            run_kwargs["evaluation_parameters"] = evaluation_parameters
        return default_validation_operator.run(**run_kwargs)