def test_azure_spark_url_with_nested_blob():
    url = AzureUrl(
        "my_container@my_account.blob.core.windows.net/a/b/c/d/e/my_blob"
    )
    assert url.account_name == "my_account"
    assert url.account_url == "my_account.blob.core.windows.net"
    assert url.container == "my_container"
    assert url.blob == "a/b/c/d/e/my_blob"
def test_azure_spark_url_with_wasbs():
    url = AzureUrl(
        "wasbs://my_container@my_account.blob.core.windows.net/my_blob"
    )
    assert url.account_name == "my_account"
    assert url.account_url == "my_account.blob.core.windows.net"
    assert url.container == "my_container"
    assert url.blob == "my_blob"
def test_azure_pandas_url_with_special_chars():
    # Note that `url` conforms with the naming restrictions set by the Azure API
    # Azure naming restrictions: https://docs.microsoft.com/en-us/rest/api/storageservices/naming-and-referencing-containers--blobs--and-metadata
    url = AzureUrl(
        "my_account.blob.core.windows.net/my-container_1.0/my-blob_`~!@#$%^&*()=+"
    )
    assert url.account_name == "my_account"
    assert url.account_url == "my_account.blob.core.windows.net"
    assert url.container == "my-container_1.0"
    assert url.blob == "my-blob_`~!@#$%^&*()=+"
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    # batch_data
    # We need to build batch_markers to be used in the dataframe
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
            )
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            self._instantiate_s3_client()
        # if we were not able to instantiate the S3 client, then raise an error
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an S3BatchSpec,
but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        try:
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            s3_url = S3Url(path)
            if "compression" not in reader_options.keys():
                inferred_compression_param = sniff_s3_compression(s3_url)
                if inferred_compression_param is not None:
                    reader_options["compression"] = inferred_compression_param
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        except (ParamValidationError, ClientError) as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
            )
        logger.debug(
            f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}"
        )
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, AzureBatchSpec):
        if self._azure is None:
            self._instantiate_azure_client()
        # if we were not able to instantiate the Azure client, then raise an error
        if self._azure is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an AzureBatchSpec,
but the ExecutionEngine does not have an Azure client configured. Please check your config."""
            )
        azure_engine = self._azure
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        azure_url = AzureUrl(path)
        blob_client = azure_engine.get_blob_client(
            container=azure_url.container, blob=azure_url.blob
        )
        azure_object = blob_client.download_blob()
        logger.debug(
            f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
        )
        reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
        buf = BytesIO(azure_object.readall())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, GCSBatchSpec):
        if self._gcs is None:
            self._instantiate_gcs_client()
        # if we were not able to instantiate the GCS client, then raise an error
        if self._gcs is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed a GCSBatchSpec,
but the ExecutionEngine does not have a GCS client configured. Please check your config."""
            )
        gcs_engine = self._gcs
        gcs_url = GCSUrl(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        try:
            gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
            gcs_blob = gcs_bucket.blob(gcs_url.blob)
            logger.debug(
                f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
            )
        except GoogleAPIError as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
            )
        reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
        buf = BytesIO(gcs_blob.download_as_bytes())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise ge_exceptions.BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, AzureBatchSpec, or GCSBatchSpec, not {batch_spec.__class__.__name__}"
        )

    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

    return typed_batch_data, batch_markers
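# A minimal usage sketch, assumed for illustration (not part of the original suite): it exercises the
# RuntimeDataBatchSpec branch of PandasExecutionEngine.get_batch_data_and_markers above with an
# in-memory pandas DataFrame. Import paths reflect the public great_expectations API and may vary by version.
def test_pandas_engine_runtime_batch_spec_usage_sketch():
    import pandas as pd

    from great_expectations.core.batch_spec import RuntimeDataBatchSpec
    from great_expectations.execution_engine import PandasExecutionEngine

    engine = PandasExecutionEngine()
    batch_spec = RuntimeDataBatchSpec(batch_data=pd.DataFrame({"a": [1, 2, 3]}))

    # Returns a PandasBatchData wrapper plus batch_markers (which include "ge_load_time").
    typed_batch_data, batch_markers = engine.get_batch_data_and_markers(batch_spec=batch_spec)
    assert typed_batch_data.dataframe.shape == (3, 1)
    assert "ge_load_time" in batch_markers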
def test_azure_pandas_url_with_https():
    url = AzureUrl(
        "https://my_account.blob.core.windows.net/my_container/my_blob"
    )
    assert url.account_name == "my_account"
    assert url.container == "my_container"
    assert url.blob == "my_blob"
def test_azure_url_with_invalid_url():
    with pytest.raises(AssertionError):
        AzureUrl("my_bucket/my_blob")
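# The AzureUrl helper exercised by the tests above is not reproduced here. The class below is a
# hypothetical sketch (assumed, not the library's actual implementation) of parsing behavior that
# would satisfy those assertions for both the Pandas-style (https) and Spark-style (wasbs) paths.
import re


class AzureUrlSketch:
    """Hypothetical stand-in for AzureUrl; illustrates the parsing the tests above assert."""

    AZURE_BLOB_STORAGE_HOST_SUFFIX = ".blob.core.windows.net"

    def __init__(self, url: str) -> None:
        # Spark style: [wasbs://]<container>@<account>.blob.core.windows.net/<blob>
        match = re.search(
            r"^(wasbs?://)?([^/]+)@([^/]+)\.blob\.core\.windows\.net/(.+)$", url
        )
        if match is not None:
            self._container = match.group(2)
            self._account_name = match.group(3)
            self._blob = match.group(4)
        else:
            # Pandas style: [https://]<account>.blob.core.windows.net/<container>/<blob>
            match = re.search(
                r"^(https?://)?([^/]+)\.blob\.core\.windows\.net/([^/]+)/(.+)$", url
            )
            # Mirrors test_azure_url_with_invalid_url: unparseable paths raise AssertionError.
            assert match is not None, f"Invalid Azure Blob Storage URL: {url}"
            self._account_name = match.group(2)
            self._container = match.group(3)
            self._blob = match.group(4)

    @property
    def account_name(self) -> str:
        return self._account_name

    @property
    def account_url(self) -> str:
        return f"{self._account_name}{self.AZURE_BLOB_STORAGE_HOST_SUFFIX}"

    @property
    def container(self) -> str:
        return self._container

    @property
    def blob(self) -> str:
        return self._blob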
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:
    # batch_data
    # We need to build batch_markers to be used in the dataframe
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    """
    As documented in the Azure DataConnector implementations, the Pandas and Spark execution engines
    use different path formats for accessing the Azure Blob Storage service. However, they use identical
    path formats for all other supported cloud storage services (AWS S3 and Google Cloud Storage), and
    those formats (encapsulated in S3BatchSpec and GCSBatchSpec) extend the common PathBatchSpec.
    Therefore, at the present time, all cases, with the exception of Azure Blob Storage, are handled
    generically.
    """
    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
            )
        batch_spec.batch_data = "SparkDataFrame"
    elif isinstance(batch_spec, AzureBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        azure_url = AzureUrl(path)
        try:
            credential = self._azure_options.get("credential")
            storage_account_url = azure_url.account_url
            if credential:
                self.spark.conf.set(
                    "fs.wasb.impl",
                    "org.apache.hadoop.fs.azure.NativeAzureFileSystem",
                )
                self.spark.conf.set(
                    f"fs.azure.account.key.{storage_account_url}", credential
                )
            reader: DataFrameReader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        try:
            reader: DataFrameReader = self.spark.read.options(**reader_options)
            reader_fn: Callable = self._get_reader_fn(
                reader=reader,
                reader_method=reader_method,
                path=path,
            )
            batch_data = reader_fn(path)
        except AttributeError:
            raise ExecutionEngineError(
                """
                Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                """
            )
        # pyspark will raise an AnalysisException error if the path is incorrect
        except pyspark.sql.utils.AnalysisException:
            raise ExecutionEngineError(
                f"""Unable to read in batch from the following path: {path}. Please check your configuration."""
            )
    else:
        raise BatchSpecError(
            """
            Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
            """
        )

    batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
    typed_batch_data = SparkDFBatchData(execution_engine=self, dataframe=batch_data)

    return typed_batch_data, batch_markers
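# To make the path-format note in the Spark method above concrete, the two Azure Blob Storage path
# styles are shown side by side. The account/container/blob names are placeholders mirroring the test
# fixtures above, not real storage resources.
PANDAS_STYLE_AZURE_PATH = (
    "https://my_account.blob.core.windows.net/my_container/my_blob.csv"  # PandasExecutionEngine
)
SPARK_STYLE_AZURE_PATH = (
    "wasbs://my_container@my_account.blob.core.windows.net/my_blob.csv"  # SparkDFExecutionEngine
)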