Example #1
@pytest.fixture
def batch_fixture() -> Batch:
    """
    Fixture for a Batch object that contains data, a BatchRequest, a BatchDefinition,
    as well as a BatchSpec and BatchMarkers. To be used in unit testing.
    """
    df: pd.DataFrame = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]}
    )
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
    )
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
        batch_identifiers=IDDict({"id": "A"}),
    )
    batch_spec: BatchSpec = BatchSpec(path="/some/path/some.file")
    batch_markers: BatchMarkers = BatchMarkers(ge_load_time="FAKE_LOAD_TIME")
    batch: Batch = Batch(
        data=df,
        batch_request=batch_request,
        batch_definition=batch_definition,
        batch_spec=batch_spec,
        batch_markers=batch_markers,
    )
    return batch
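A minimal sketch of a test consuming this fixture, assuming the @pytest.fixture registration above and that Batch exposes its constructor arguments as same-named, dict-like attributes (the test name is hypothetical):

def test_batch_fixture_contents(batch_fixture: Batch):
    batch = batch_fixture
    # Identifying metadata matches what the fixture constructed.
    assert batch.batch_definition.datasource_name == "my_datasource"
    assert batch.batch_definition.batch_identifiers == {"id": "A"}
    assert batch.batch_spec.get("path") == "/some/path/some.file"
    assert batch.batch_markers.get("ge_load_time") == "FAKE_LOAD_TIME"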
Example #2
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (batch.__str__() == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}""")
Example #3
    def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        selectable = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec)

        if "bigquery_temp_table" in batch_spec:
            temp_table_name = batch_spec.get("bigquery_temp_table")
        else:
            temp_table_name = None

        source_table_name = batch_spec.get("table_name", None)
        source_schema_name = batch_spec.get("schema_name", None)

        batch_data = SqlAlchemyBatchData(
            execution_engine=self,
            selectable=selectable,
            temp_table_name=temp_table_name,
            create_temp_table=batch_spec.get("create_temp_table",
                                             self._create_temp_table),
            source_table_name=source_table_name,
            source_schema_name=source_schema_name,
        )
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        return batch_data, batch_markers
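The "ge_load_time" marker built here (and in every example below) is simply a UTC wall-clock timestamp rendered with strftime; a standalone sketch:

import datetime

# The trailing "Z" is a literal character appended by the format string.
load_time = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
print(load_time)  # e.g. 20240102T153045.123456Z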
Example #4
    def get_batch_data_and_markers(
            self, batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
                batch_spec,
            (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """)

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        source_schema_name: str = batch_spec.get("schema_name", None)
        source_table_name: str = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get(
            "temp_table_schema_name")
        temp_table_name: Optional[str] = batch_spec.get("bigquery_temp_table")

        create_temp_table: bool = batch_spec.get("create_temp_table",
                                                 self._create_temp_table)

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec)

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                temp_table_name=temp_table_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
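The control flow above is a type-based dispatch over the incoming batch_spec. A self-contained sketch of the same pattern, using hypothetical stand-in classes rather than the Great Expectations spec types:

# Hypothetical stand-in spec types, used only to illustrate the dispatch.
class QuerySpec(dict):
    pass


class TableSpec(dict):
    pass


def build_batch_data(spec) -> str:
    if not isinstance(spec, (QuerySpec, TableSpec)):
        raise TypeError(f'illegal type "{type(spec)}" was received')
    if isinstance(spec, QuerySpec):
        return f"batch built from query: {spec.get('query')}"
    # Otherwise the spec describes a table.
    return f"batch built from table: {spec.get('table_name')}"


print(build_batch_data(QuerySpec(query="SELECT 1")))
print(build_batch_data(TableSpec(table_name="events")))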
Example #5
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            batch_spec.batch_data = "PandasDataFrame"
        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                             Key=s3_url.key)
            logger.debug("Fetching s3 object. Bucket: {} Key: {}".format(
                s3_url.bucket, s3_url.key))
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            batch_data = reader_fn(
                StringIO(s3_object["Body"].read().decode(
                    s3_object.get("ContentEncoding", "utf-8"))),
                **reader_options,
            )
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            batch_data = reader_fn(path, **reader_options)
        else:
            raise BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        if batch_data.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                batch_data)

        typed_batch_data = self._get_typed_batch_data(batch_data)

        return typed_batch_data, batch_markers
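In the S3 branch above, the object body is decoded with the object's ContentEncoding (falling back to utf-8) before being handed to the reader. A standalone sketch with a hypothetical dict standing in for the boto3 get_object response:

from io import BytesIO, StringIO

import pandas as pd

# Hypothetical stand-in for a boto3 get_object() response: "Body" is anything
# exposing .read(), and "ContentEncoding" may be absent.
s3_object = {"Body": BytesIO(b"a,b\n1,2\n3,4\n")}

text = s3_object["Body"].read().decode(s3_object.get("ContentEncoding", "utf-8"))
df = pd.read_csv(StringIO(text))
print(df.shape)  # (2, 2)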
Example #6
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config.""")
            batch_spec.batch_data = "SparkDataFrame"
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            try:
                reader_options = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader_options,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError("""
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """)
        else:
            raise BatchSpecError("""
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """)

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(execution_engine=self,
                                            dataframe=batch_data)

        return typed_batch_data, batch_markers
Example #7
    def _run_suite(
        self,
        dataset_name: str,
        dataset_path: Optional[str],
        df: Any,
        target_expectation_suite_name: str,
        run_id: str,
    ):
        target_suite = self.expectation_context.get_expectation_suite(
            target_expectation_suite_name)
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_kwargs = {"datasource": generate_datasource_name(dataset_name)}

        if dataset_path:
            dataasset_name, _ = os.path.splitext(
                os.path.basename(dataset_path))
            batch_kwargs["path"] = str(dataset_path)
            batch_kwargs["data_asset_name"] = dataasset_name

        batch = Batch(
            "kedro",
            batch_kwargs=BatchKwargs(batch_kwargs),
            data=df,
            batch_parameters=None,
            batch_markers=batch_markers,
            data_context=self.expectation_context,
        )

        try:
            v = Validator(
                batch=batch,
                expectation_suite=target_suite,
            )
        except ValueError:
            raise UnsupportedDataSet

        validator_dataset_batch = v.get_dataset()
        return self.expectation_context.run_validation_operator(
            "action_list_operator", [validator_dataset_batch], run_id=run_id)
Example #8
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        batch_data: DataFrame

        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            batch_spec.batch_data = "SparkDataFrame"
        elif isinstance(batch_spec, (PathBatchSpec, S3BatchSpec)):
            reader_method: str = batch_spec.get("reader_method")
            reader_options: dict = batch_spec.get("reader_options") or {}
            path: str = batch_spec.get("path") or batch_spec.get("s3")
            try:
                reader_options = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader_options,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError("""
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """)
        else:
            raise BatchSpecError("""
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """)

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(batch_data)

        return typed_batch_data, batch_markers
Example #9
    def get_batch_data_and_markers(
            self, batch_spec) -> Tuple[SqlAlchemyBatchData, BatchMarkers]:

        selectable = self._build_selectable_from_batch_spec(
            batch_spec=batch_spec)
        if "bigquery_temp_table" in batch_spec:
            temp_table_name = batch_spec.get("bigquery_temp_table")
        else:
            temp_table_name = None
        batch_data = SqlAlchemyBatchData(engine=self.engine,
                                         selectable=selectable,
                                         temp_table_name=temp_table_name)

        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        return batch_data, batch_markers
Example #10
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We need to build a batch_id to be used in the dataframe
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "bigquery_temp_table" in batch_kwargs:
            query_support_table_name = batch_kwargs.get("bigquery_temp_table")
        elif "snowflake_transient_table" in batch_kwargs:
            # Snowflake can use either a transient or temp table, so we allow a table_name to be provided
            query_support_table_name = batch_kwargs.get(
                "snowflake_transient_table")
        else:
            query_support_table_name = None

        if "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
                logger.warning(
                    "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                    "adding limit and offset directly to the generated query.")
            if "query_parameters" in batch_kwargs:
                query = Template(batch_kwargs["query"]).safe_substitute(
                    batch_kwargs["query_parameters"])
            else:
                query = batch_kwargs["query"]
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"),
            )
        elif "table" in batch_kwargs:
            table = batch_kwargs["table"]
            if batch_kwargs.get("use_quoted_name"):
                table = quoted_name(table, quote=True)

            limit = batch_kwargs.get("limit")
            offset = batch_kwargs.get("offset")
            if limit is not None or offset is not None:
                # AWS Athena does not support offset
                if (offset is not None
                        and self.engine.dialect.name.lower() == "awsathena"):
                    raise NotImplementedError(
                        "AWS Athena does not support OFFSET.")
                logger.info(
                    "Generating query from table batch_kwargs based on limit and offset"
                )

                # In BigQuery the table name is already qualified with its schema name
                if self.engine.dialect.name.lower() == "bigquery":
                    schema = None

                else:
                    schema = batch_kwargs.get("schema")
                raw_query = (sqlalchemy.select(
                    [sqlalchemy.text("*")]).select_from(
                        sqlalchemy.schema.Table(
                            table, sqlalchemy.MetaData(),
                            schema=schema)).offset(offset).limit(limit))
                query = str(
                    raw_query.compile(self.engine,
                                      compile_kwargs={"literal_binds": True}))
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    query=query,
                    table_name=query_support_table_name,
                    schema=batch_kwargs.get("schema"),
                )
            else:
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    table_name=table,
                    schema=batch_kwargs.get("schema"),
                )
        else:
            raise ValueError(
                "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
            )

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=batch_reference,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
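When query_parameters are supplied, the query is expanded with string.Template.safe_substitute, which substitutes known placeholders and leaves unknown ones untouched. A small sketch with hypothetical values:

from string import Template

batch_kwargs = {
    "query": "SELECT * FROM $table WHERE created_at >= '$start_date'",
    "query_parameters": {"table": "events", "start_date": "2021-01-01"},
}

query = Template(batch_kwargs["query"]).safe_substitute(batch_kwargs["query_parameters"])
print(query)  # SELECT * FROM events WHERE created_at >= '2021-01-01'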
Example #11
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        batch_data: PandasBatchData
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"
        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            if "compression" not in reader_options.keys():
                reader_options["compression"] = sniff_s3_compression(s3_url)
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
            logger.debug(
                "Fetching s3 object. Bucket: {} Key: {}".format(
                    s3_url.bucket, s3_url.key
                )
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)
        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
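The fingerprinting step only runs for DataFrames smaller than HASH_THRESHOLD bytes. hash_pandas_dataframe is a Great Expectations helper; a comparable fingerprint can be sketched with pandas' own hashing utility (stand-in threshold and implementation, not the library's):

import hashlib

import pandas as pd

HASH_THRESHOLD = 1_000_000_000  # hypothetical byte threshold


def fingerprint_dataframe(df: pd.DataFrame) -> str:
    # Hash every row deterministically, then digest the resulting array.
    row_hashes = pd.util.hash_pandas_object(df, index=True).values
    return hashlib.md5(row_hashes.tobytes()).hexdigest()


df = pd.DataFrame({"a": [1, 5, 22], "b": [1, 2, 3]})
if df.memory_usage().sum() < HASH_THRESHOLD:
    print(fingerprint_dataframe(df))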
Example #12
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:
        if not isinstance(
            batch_spec, (SqlAlchemyDatasourceBatchSpec, RuntimeQueryBatchSpec)
        ):
            raise InvalidBatchSpecError(
                f"""SqlAlchemyExecutionEngine accepts batch_spec only of type SqlAlchemyDatasourceBatchSpec or
        RuntimeQueryBatchSpec (illegal type "{str(type(batch_spec))}" was received).
                        """
            )

        batch_data: Optional[SqlAlchemyBatchData] = None
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        source_schema_name: str = batch_spec.get("schema_name", None)
        source_table_name: str = batch_spec.get("table_name", None)

        temp_table_schema_name: Optional[str] = batch_spec.get("temp_table_schema_name")

        if batch_spec.get("bigquery_temp_table"):
            # deprecated-v0.15.3
            warnings.warn(
                "BigQuery tables that are created as the result of a query are no longer created as "
                "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
                "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
                "v0.15.3 and will be removed in v0.18.",
                DeprecationWarning,
            )

        create_temp_table: bool = batch_spec.get(
            "create_temp_table", self._create_temp_table
        )

        if isinstance(batch_spec, RuntimeQueryBatchSpec):
            # query != None is already checked when RuntimeQueryBatchSpec is instantiated
            query: str = batch_spec.query

            batch_spec.query = "SQLQuery"
            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                query=query,
                temp_table_schema_name=temp_table_schema_name,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )
        elif isinstance(batch_spec, SqlAlchemyDatasourceBatchSpec):
            if self.engine.dialect.name.lower() == "oracle":
                selectable: str = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )
            else:
                selectable: Selectable = self._build_selectable_from_batch_spec(
                    batch_spec=batch_spec
                )

            batch_data = SqlAlchemyBatchData(
                execution_engine=self,
                selectable=selectable,
                create_temp_table=create_temp_table,
                source_table_name=source_table_name,
                source_schema_name=source_schema_name,
            )

        return batch_data, batch_markers
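The bigquery_temp_table branch above only emits a deprecation warning rather than creating a table. A minimal sketch of the same pattern, with a hypothetical checker function:

import warnings


def check_batch_spec(batch_spec: dict) -> None:
    if batch_spec.get("bigquery_temp_table"):
        # deprecated-v0.15.3
        warnings.warn(
            "The `bigquery_temp_table` parameter is deprecated as of v0.15.3 "
            "and will be removed in v0.18.",
            DeprecationWarning,
        )


warnings.simplefilter("always", DeprecationWarning)
check_batch_spec({"bigquery_temp_table": "my_temp_table"})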
Example #13
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config.""")
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"

        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                self._instantiate_s3_client()
            # if we were not able to instantiate S3 client, then raise error
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            try:
                reader_method: str = batch_spec.reader_method
                reader_options: dict = batch_spec.reader_options or {}
                path: str = batch_spec.path
                s3_url = S3Url(path)
                if "compression" not in reader_options.keys():
                    inferred_compression_param = sniff_s3_compression(s3_url)
                    if inferred_compression_param is not None:
                        reader_options[
                            "compression"] = inferred_compression_param
                s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                                 Key=s3_url.key)
            except (ParamValidationError, ClientError) as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
                )
            logger.debug(
                f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}"
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, AzureBatchSpec):
            if self._azure is None:
                self._instantiate_azure_client()
            # if we were not able to instantiate Azure client, then raise error
            if self._azure is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a AzureBatchSpec,
                        but the ExecutionEngine does not have an Azure client configured. Please check your config."""
                )
            azure_engine = self._azure
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            azure_url = AzureUrl(path)
            blob_client = azure_engine.get_blob_client(
                container=azure_url.container, blob=azure_url.blob)
            azure_object = blob_client.download_blob()
            logger.debug(
                f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
            )
            reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
            buf = BytesIO(azure_object.readall())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, GCSBatchSpec):
            if self._gcs is None:
                self._instantiate_gcs_client()
            # if we were not able to instantiate GCS client, then raise error
            if self._gcs is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a GCSBatchSpec,
                        but the ExecutionEngine does not have an GCS client configured. Please check your config."""
                )
            gcs_engine = self._gcs
            gcs_url = GCSUrl(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            try:
                gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
                gcs_blob = gcs_bucket.blob(gcs_url.blob)
                logger.debug(
                    f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
                )
            except GoogleAPIError as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
                )
            reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
            buf = BytesIO(gcs_blob.download_as_bytes())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, or AzureBatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
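The S3 branch above fills in a pandas "compression" reader option via sniff_s3_compression. A hedged stand-in that infers the option from the key's extension (not the library's helper):

from typing import Optional


def sniff_compression_from_key(key: str) -> Optional[str]:
    # Map common extensions to the pandas "compression" reader option.
    for extension, compression in (
        (".gz", "gzip"),
        (".bz2", "bz2"),
        (".zip", "zip"),
        (".xz", "xz"),
    ):
        if key.endswith(extension):
            return compression
    return None


print(sniff_compression_from_key("raw/events.csv.gz"))  # gzip
print(sniff_compression_from_key("raw/events.csv"))     # None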
Example #14
    def get_batch(self, batch_kwargs, batch_parameters=None):
        """class-private implementation of get_data_asset"""
        if self.spark is None:
            logger.error("No spark session available")
            return None

        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build batch_markers to be used with the DataFrame
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "path" in batch_kwargs or "s3" in batch_kwargs:
            if "s3" in batch_kwargs:
                warnings.warn(
                    "Direct GE Support for the s3 BatchKwarg will be removed in a future release. Please use a path "
                    "including the s3a:// protocol instead.",
                    DeprecationWarning,
                )

            # If both are present, let s3 override
            path = batch_kwargs.get("path")
            path = batch_kwargs.get("s3", path)
            reader_method = batch_kwargs.get("reader_method")
            reader = self.spark.read

            for option in reader_options.items():
                reader = reader.option(*option)
            reader_fn = self._get_reader_fn(reader, reader_method, path)
            df = reader_fn(path)

        elif "query" in batch_kwargs:
            df = self.spark.sql(batch_kwargs["query"])

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (DataFrame, SparkDFDataset)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {
                k: batch_kwargs[k]
                for k in batch_kwargs if k != "dataset"
            }
            if isinstance(df, SparkDFDataset):
                # Grab just the spark_df reference, since we want to override everything else
                df = df.spark_df
            # Record this in the kwargs *and* the id
            batch_kwargs["SparkDFRef"] = True
            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

        else:
            raise BatchKwargsError(
                "Unrecognized batch_kwargs for spark_source", batch_kwargs)

        if "limit" in batch_kwargs:
            df = df.limit(batch_kwargs["limit"])

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=df,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
Example #15
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        """
        As documented in Azure DataConnector implementations, Pandas and Spark execution engines utilize separate path
        formats for accessing Azure Blob Storage service.  However, Pandas and Spark execution engines utilize identical
        path formats for accessing all other supported cloud storage services (AWS S3 and Google Cloud Storage).
        Moreover, these formats (encapsulated in S3BatchSpec and GCSBatchSpec) extend PathBatchSpec (common to them).
        Therefore, at the present time, all cases, with the exception of Azure Blob Storage, are handled generically.
        """

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
                )
            batch_spec.batch_data = "SparkDataFrame"

        elif isinstance(batch_spec, AzureBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            azure_url = AzureUrl(path)
            try:
                credential = self._azure_options.get("credential")
                storage_account_url = azure_url.account_url
                if credential:
                    self.spark.conf.set(
                        "fs.wasb.impl",
                        "org.apache.hadoop.fs.azure.NativeAzureFileSystem",
                    )
                    self.spark.conf.set(
                        f"fs.azure.account.key.{storage_account_url}", credential
                    )
                reader: DataFrameReader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError(
                    """
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """
                )

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            try:
                reader: DataFrameReader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError(
                    """
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """
                )
            # pyspark will raise an AnalysisException error if path is incorrect
            except pyspark.sql.utils.AnalysisException:
                raise ExecutionEngineError(
                    f"""Unable to read in batch from the following path: {path}. Please check your configuration."""
                )

        else:
            raise BatchSpecError(
                """
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """
            )

        batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(execution_engine=self, dataframe=batch_data)

        return typed_batch_data, batch_markers
Example #16
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We will use and manipulate reader_options along the way
        reader_options = batch_kwargs.get("reader_options", {})

        # We need to build a batch_markers to be used in the dataframe
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "path" in batch_kwargs:
            path = batch_kwargs["path"]
            reader_method = batch_kwargs.get("reader_method")
            reader_fn = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        elif "s3" in batch_kwargs:
            warnings.warn(
                "Direct GE Support for the s3 BatchKwarg will be removed in a future release. Please use a path including the s3a:// protocol instead.",
                DeprecationWarning,
            )
            try:
                import boto3

                s3 = boto3.client("s3", **self._boto3_options)
            except ImportError:
                raise BatchKwargsError(
                    "Unable to load boto3 client to read s3 asset.",
                    batch_kwargs)
            raw_url = batch_kwargs["s3"]
            reader_method = batch_kwargs.get("reader_method")
            url = S3Url(raw_url)
            logger.debug(
                f"Fetching s3 object. Bucket: {url.bucket} Key: {url.key}")
            s3_object = s3.get_object(Bucket=url.bucket, Key=url.key)
            reader_fn = self._get_reader_fn(reader_method, url.key)
            default_reader_options = self._infer_default_options(
                reader_fn, reader_options)
            if not reader_options.get(
                    "encoding") and default_reader_options.get("encoding"):
                reader_options["encoding"] = s3_object.get(
                    "ContentEncoding", default_reader_options.get("encoding"))
            df = reader_fn(BytesIO(s3_object["Body"].read()), **reader_options)

        elif "dataset" in batch_kwargs and isinstance(
                batch_kwargs["dataset"], (pd.DataFrame, pd.Series)):
            df = batch_kwargs.get("dataset")
            # We don't want to store the actual dataframe in kwargs; copy the remaining batch_kwargs
            batch_kwargs = {
                k: batch_kwargs[k]
                for k in batch_kwargs if k != "dataset"
            }
            batch_kwargs["PandasInMemoryDF"] = True
            batch_kwargs["ge_batch_id"] = str(uuid.uuid1())

        else:
            raise BatchKwargsError(
                "Invalid batch_kwargs: path, s3, or dataset is required for a PandasDatasource",
                batch_kwargs,
            )

        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=df,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )
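Both the pandas and Spark get_batch implementations above avoid storing the in-memory dataframe inside batch_kwargs by rebuilding the dict without the "dataset" key. A standalone illustration with hypothetical kwargs:

import uuid

import pandas as pd

batch_kwargs = {"dataset": pd.DataFrame({"a": [1, 2]}), "datasource": "my_pandas_source"}

# Keep everything except the actual dataframe, then record a marker and a batch id.
batch_kwargs = {k: batch_kwargs[k] for k in batch_kwargs if k != "dataset"}
batch_kwargs["PandasInMemoryDF"] = True
batch_kwargs["ge_batch_id"] = str(uuid.uuid1())
print(batch_kwargs)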
Example #17
    def get_batch(self, batch_kwargs, batch_parameters=None):
        # We need to build a batch_id to be used in the dataframe
        batch_markers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if "bigquery_temp_table" in batch_kwargs:
            # deprecated-v0.15.3
            warnings.warn(
                "BigQuery tables that are created as the result of a query are no longer created as "
                "permanent tables. Thus, a named permanent table through the `bigquery_temp_table`"
                "parameter is not required. The `bigquery_temp_table` parameter is deprecated as of"
                "v0.15.3 and will be removed in v0.18.",
                DeprecationWarning,
            )

        if "snowflake_transient_table" in batch_kwargs:
            # Snowflake can use either a transient or temp table, so we allow a table_name to be provided
            query_support_table_name = batch_kwargs.get(
                "snowflake_transient_table")
        else:
            query_support_table_name = None

        if "query" in batch_kwargs:
            if "limit" in batch_kwargs or "offset" in batch_kwargs:
                logger.warning(
                    "Limit and offset parameters are ignored when using query-based batch_kwargs; consider "
                    "adding limit and offset directly to the generated query.")
            if "query_parameters" in batch_kwargs:
                query = Template(batch_kwargs["query"]).safe_substitute(
                    batch_kwargs["query_parameters"])
            else:
                query = batch_kwargs["query"]
            batch_reference = SqlAlchemyBatchReference(
                engine=self.engine,
                query=query,
                table_name=query_support_table_name,
                schema=batch_kwargs.get("schema"),
            )
        elif "table" in batch_kwargs:
            table = batch_kwargs["table"]
            if batch_kwargs.get("use_quoted_name"):
                table = quoted_name(table, quote=True)

            limit = batch_kwargs.get("limit")
            offset = batch_kwargs.get("offset")
            if limit is not None or offset is not None:
                logger.info(
                    "Generating query from table batch_kwargs based on limit and offset"
                )

                # In BigQuery the table name is already qualified with its schema name
                if self.engine.dialect.name.lower() == "bigquery":
                    schema = None

                else:
                    schema = batch_kwargs.get("schema")
                # limit doesn't compile properly for oracle so we will append rownum to query string later
                if self.engine.dialect.name.lower() == "oracle":
                    raw_query = sqlalchemy.select(
                        [sqlalchemy.text("*")]).select_from(
                            sqlalchemy.schema.Table(table,
                                                    sqlalchemy.MetaData(),
                                                    schema=schema))
                else:
                    raw_query = (sqlalchemy.select(
                        [sqlalchemy.text("*")]).select_from(
                            sqlalchemy.schema.Table(
                                table, sqlalchemy.MetaData(),
                                schema=schema)).offset(offset).limit(limit))
                query = str(
                    raw_query.compile(self.engine,
                                      compile_kwargs={"literal_binds": True}))
                # use rownum instead of limit in oracle
                if self.engine.dialect.name.lower() == "oracle":
                    query += "\nWHERE ROWNUM <= %d" % limit
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    query=query,
                    table_name=query_support_table_name,
                    schema=batch_kwargs.get("schema"),
                )
            else:
                batch_reference = SqlAlchemyBatchReference(
                    engine=self.engine,
                    table_name=table,
                    schema=batch_kwargs.get("schema"),
                )
        else:
            raise ValueError(
                "Invalid batch_kwargs: exactly one of 'table' or 'query' must be specified"
            )

        return Batch(
            datasource_name=self.name,
            batch_kwargs=batch_kwargs,
            data=batch_reference,
            batch_parameters=batch_parameters,
            batch_markers=batch_markers,
            data_context=self._data_context,
        )