Example #1
    def __init__(
        self,
        name: str,
        datasource_name: str,
        execution_engine: ExecutionEngine,
        batch_spec_passthrough: Optional[dict] = None,
    ):
        """
        Base class for DataConnectors

        Args:
            name (str): required name for DataConnector
            datasource_name (str): required name for datasource
            execution_engine (ExecutionEngine): reference to ExecutionEngine
            batch_spec_passthrough (dict): dictionary with keys that will be added directly to batch_spec
        """
        if execution_engine is None:
            raise ge_exceptions.DataConnectorError(
                "A non-existent/unknown ExecutionEngine instance was referenced."
            )

        self._name = name
        self._datasource_name = datasource_name
        self._execution_engine = execution_engine

        # This is a dictionary which maps data_references onto batch_requests.
        self._data_references_cache = {}

        self._data_context_root_directory = None
        self._batch_spec_passthrough = batch_spec_passthrough or {}
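
Note on batch_spec_passthrough: keys stored here are later merged verbatim into the batch_spec that the connector builds. A minimal sketch of that merge semantics, with illustrative dict contents (the real batch_spec is assembled inside the connector's build_batch_spec machinery):

# Illustrative values only; the real batch_spec is built internally.
batch_spec_passthrough = {"reader_method": "read_csv", "reader_options": {"sep": ","}}
batch_spec = {"path": "/data/example.csv"}

# Passthrough keys are added directly on top of the computed batch_spec.
batch_spec.update(batch_spec_passthrough)
assert batch_spec["reader_method"] == "read_csv"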
Example #2
    def _validate_runtime_keys_configuration(self, runtime_keys: List[str]):
        if runtime_keys:
            if not (self._runtime_keys
                    and set(runtime_keys) <= set(self._runtime_keys)):
                raise ge_exceptions.DataConnectorError(
                    f"""RuntimeDataConnector "{self.name}" was invoked with one or more runtime keys that do not
appear among the configured runtime keys.
                    """)
Example #3
# Imports this snippet needs (module paths as of Great Expectations v0.13+):
import pandas as pd

import great_expectations.exceptions as ge_exceptions
from great_expectations.execution_engine.sqlalchemy_batch_data import SqlAlchemyBatchData

try:  # pyspark is an optional dependency
    import pyspark.sql as pyspark_sql
except ImportError:
    pyspark_sql = None

def fetch_batch_data_as_pandas_df(batch_data):
    if isinstance(batch_data, pd.DataFrame):
        return batch_data
    if pyspark_sql and isinstance(batch_data, pyspark_sql.DataFrame):
        return batch_data.toPandas()
    if isinstance(batch_data, SqlAlchemyBatchData):
        return batch_data.head(fetch_all=True)
    raise ge_exceptions.DataConnectorError(
        "Unknown batch_data type encountered.")
Example #4
    def _validate_batch_identifiers_configuration(
            self, batch_identifiers: List[str]):
        if batch_identifiers:
            if not (self._batch_identifiers and
                    set(batch_identifiers) <= set(self._batch_identifiers)):
                raise ge_exceptions.DataConnectorError(
                    f"""RuntimeDataConnector "{self.name}" was invoked with one or more batch identifiers that do not
appear among the configured batch identifiers.
                    """)
Example #5
    def _validate_sorters_configuration(self, data_asset_name: Optional[str] = None):
        if self.sorters:
            regex_config: dict = self._get_regex_config(data_asset_name=data_asset_name)
            group_names: List[str] = regex_config["group_names"]
            if any(
                sorter_name not in group_names for sorter_name in self.sorters
            ):
                raise ge_exceptions.DataConnectorError(
                    f"""DataConnector "{self.name}" specifies one or more sort keys that do not appear among the
configured group_names.
                    """
                )
            if len(group_names) < len(self.sorters):
                raise ge_exceptions.DataConnectorError(
                    f"""DataConnector "{self.name}" is configured with {len(group_names)} group names;
this is fewer than the number of sorters specified, which is {len(self.sorters)}.
                    """
                )
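
The first check boils down to: every sorter name must be one of the regex group names configured for the asset. A standalone sketch with hypothetical group_names and sorters:

# Hypothetical configuration: groups captured by the data asset regex,
# and the sort keys a user requested.
group_names = ["year", "month"]
sorters = {"year": "asc", "month": "desc"}

# Mirrors the check above: no sorter key may fall outside group_names.
assert not any(name not in group_names for name in sorters)

# A sorter on an uncaptured group ("day") would raise DataConnectorError.
assert any(name not in group_names for name in {"day": "asc"})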
Example #6

    def _validate_asset_level_batch_identifiers(
            self, data_asset_name: str, batch_identifiers: dict) -> None:
        """
        Check that the batch_identifiers passed in are an exact match to the ones configured at the Asset-level.
        """
        asset: Asset = self.assets[data_asset_name]
        batch_identifiers_keys: List[str] = list(batch_identifiers.keys())
        if set(batch_identifiers_keys) != set(asset.batch_identifiers):
            raise ge_exceptions.DataConnectorError(f"""
                Data Asset {data_asset_name} was invoked with one or more batch_identifiers that were not configured for the asset.
                """)
Example #7

    def _validate_data_connector_level_batch_identifiers(
            self, batch_identifiers: dict) -> None:
        """
        Check that the batch_identifiers passed in are a subset of the ones configured at the DataConnector-level.
        """
        batch_identifiers_keys: List[str] = list(batch_identifiers.keys())
        if not set(batch_identifiers_keys) <= set(
                self._batch_identifiers[self.name]):
            raise ge_exceptions.DataConnectorError(
                f"""RuntimeDataConnector "{self.name}" was invoked with one or more batch identifiers that do not
appear among the configured batch identifiers.
                """)
Example #8

    def _add_batch_identifiers(
        self,
        batch_identifiers: List[str],
        data_asset_name: Optional[str] = None,
    ) -> None:
        """
        Handles batch_identifiers that are configured at the DataConnector or Asset-level.
        batch_identifiers are added to the `self._batch_identifiers` cache.

            - Asset-level batch_identifiers are keyed by data_asset_name
            - DataConnector-level batch_identifiers are keyed by DataConnector-name

        Using DataConnector-level batch_identifiers is deprecated and emits a DeprecationWarning.

        Args:
            batch_identifiers:  batch_identifiers from either DataConnector or Asset-level
            data_asset_name: if this value is not None, then we know the batch_identifiers are Asset-level
        """
        if data_asset_name:
            if not batch_identifiers:
                raise ge_exceptions.DataConnectorError(
                    f"""RuntimeDataConnector "{self.name}" requires batch_identifiers to be configured when specifying Assets."""
                )
            self._batch_identifiers[data_asset_name] = batch_identifiers
        else:
            if not batch_identifiers and len(self.assets) == 0:
                raise ge_exceptions.DataConnectorError(
                    f"""RuntimeDataConnector "{self.name}" requires batch_identifiers to be configured, either at the DataConnector or Asset-level."""
                )
            if batch_identifiers:
                # deprecated-v0.15.1
                warnings.warn(
                    "Specifying batch_identifiers as part of the RuntimeDataConnector config is deprecated as of v0.15.1 and will be removed by v0.18. Please configure batch_identifiers as part of Assets instead.",
                    DeprecationWarning,
                )
                self._batch_identifiers[self.name] = batch_identifiers
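
After _add_batch_identifiers runs, self._batch_identifiers is a plain dict keyed either by asset name or, in the deprecated case, by the connector's own name. A sketch of the two shapes with illustrative names:

# Asset-level configuration: keyed by data_asset_name.
{"taxi_data": ["airflow_run_id"]}

# Deprecated DataConnector-level configuration: keyed by the connector name.
{"my_runtime_data_connector": ["airflow_run_id", "pipeline_stage"]}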
Example #9
    def _validate_batch_request(self, batch_request: BatchRequestBase):
        super()._validate_batch_request(batch_request=batch_request)

        runtime_parameters = batch_request.runtime_parameters
        batch_identifiers = batch_request.batch_identifiers
        if not ((not runtime_parameters and not batch_identifiers) or
                (runtime_parameters and batch_identifiers)):
            raise ge_exceptions.DataConnectorError(
                f"""RuntimeDataConnector "{self.name}" requires runtime_parameters and batch_identifiers to be both
                present and non-empty or both absent in the batch_request parameter.
                """)
        if runtime_parameters:
            self._validate_runtime_parameters(
                runtime_parameters=runtime_parameters)
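
The condition encodes a "both or neither" rule for runtime_parameters and batch_identifiers. The same predicate spelled out as a standalone sketch:

def both_or_neither(runtime_parameters, batch_identifiers):
    # Valid when both are falsy or both are truthy, as in the check above.
    return bool(
        (not runtime_parameters and not batch_identifiers)
        or (runtime_parameters and batch_identifiers)
    )

assert both_or_neither(None, None)                            # neither: OK
assert both_or_neither({"batch_data": "df"}, {"run_id": "1"}) # both: OK
assert not both_or_neither({"batch_data": "df"}, None)        # one only: error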
Example #10

    def _validate_batch_request(self, batch_request: BatchRequestBase):
        super()._validate_batch_request(batch_request=batch_request)

        # Ensure that batch_data and partition_request satisfy the "if and only if" condition.
        if not (
            (batch_request.batch_data is None and
             (batch_request.partition_request is None or
              not batch_request.partition_request.get("batch_identifiers"))) or
            (batch_request.batch_data is not None
             and batch_request.partition_request
             and batch_request.partition_request.get("batch_identifiers"))):
            raise ge_exceptions.DataConnectorError(
                f"""RuntimeDataConnector "{self.name}" requires batch_data and partition_request to be both present or
                both absent in the batch_request parameter.
                """)
Example #11

    def _get_batch_definition_list_from_batch_request(
        self,
        batch_request: RuntimeBatchRequest,
    ) -> List[BatchDefinition]:
        """
        <Will> 202103. The following behavior of the _data_references_cache follows a pattern that we are using for
        other data_connectors, including variations of FilePathDataConnector. When BatchRequest contains batch_data
        that is passed in as an in-memory dataframe, the cache will contain the names of all data_assets
        (and data_references) that have been passed into the RuntimeDataConnector in this session, even though technically
        only the most recent batch_data is available. This can be misleading. However, allowing the RuntimeDataConnector
        to keep a record of all data_assets (and data_references) that have been passed in will allow for the proposed
        behavior of RuntimeBatchRequest which will allow for paths and queries to be passed in as part of the BatchRequest.
        Therefore, this behavior will be revisited when the design of RuntimeBatchRequest and related classes is complete.
        """
        self._validate_batch_request(batch_request=batch_request)

        batch_identifiers: Optional[dict] = None
        if batch_request.batch_identifiers:
            self._validate_batch_identifiers(
                data_asset_name=batch_request.data_asset_name,
                batch_identifiers=batch_request.batch_identifiers,
            )
            batch_identifiers = batch_request.batch_identifiers

        if not batch_identifiers:
            raise ge_exceptions.DataConnectorError(
                "Passed in a RuntimeBatchRequest with no batch_identifiers")

        batch_definition_list: List[BatchDefinition]
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
            batch_spec_passthrough=batch_request.batch_spec_passthrough,
        )
        batch_definition_list = [batch_definition]
        self._update_data_references_cache(
            batch_request.data_asset_name,
            batch_definition_list,
            IDDict(batch_identifiers),
        )
        return batch_definition_list
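
For orientation, this is roughly how a request reaches the method above from user code; the datasource and connector names here are illustrative, not a verbatim Great Expectations recipe:

import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest

df = pd.DataFrame({"a": [1, 2, 3]})

batch_request = RuntimeBatchRequest(
    datasource_name="my_datasource",                  # illustrative names
    data_connector_name="my_runtime_data_connector",
    data_asset_name="taxi_data",
    runtime_parameters={"batch_data": df},            # in-memory dataframe
    batch_identifiers={"airflow_run_id": "run_20210301"},
)
# The connector turns this request into exactly one BatchDefinition,
# keyed by the batch_identifiers above.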
Example #12
    def _update_data_asset_name_from_config(self, data_asset_name: str,
                                            data_asset_config: dict) -> str:

        data_asset_name_prefix: str = data_asset_config.get(
            "data_asset_name_prefix", "")
        data_asset_name_suffix: str = data_asset_config.get(
            "data_asset_name_suffix", "")
        schema_name: str = data_asset_config.get("schema_name", "")
        include_schema_name: bool = data_asset_config.get(
            "include_schema_name", True)
        if schema_name and include_schema_name is False:
            raise ge_exceptions.DataConnectorError(
                message=
                f"{self.__class__.__name__} ran into an error while initializing Asset names. Schema {schema_name} was specified, but 'include_schema_name' flag was set to False."
            )

        if schema_name:
            data_asset_name = f"{schema_name}.{data_asset_name}"

        data_asset_name = (
            f"{data_asset_name_prefix}{data_asset_name}{data_asset_name_suffix}"
        )

        return data_asset_name
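
A worked example of the transformation: with schema "public", prefix "prod_", and suffix "_v1", asset "users" becomes "prod_public.users_v1". The helper below re-implements the same string logic standalone for illustration:

def rename_asset(data_asset_name: str, data_asset_config: dict) -> str:
    # Standalone re-implementation of the logic above.
    prefix = data_asset_config.get("data_asset_name_prefix", "")
    suffix = data_asset_config.get("data_asset_name_suffix", "")
    schema_name = data_asset_config.get("schema_name", "")
    if schema_name:
        data_asset_name = f"{schema_name}.{data_asset_name}"
    return f"{prefix}{data_asset_name}{suffix}"

assert rename_asset(
    "users",
    {"data_asset_name_prefix": "prod_", "data_asset_name_suffix": "_v1", "schema_name": "public"},
) == "prod_public.users_v1"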