Code Example #1
    def _sample_using_hash(
        df,
        column_name: str,
        hash_digits: int = 1,
        hash_value: str = "f",
        hash_function_name: str = "md5",
    ):
        try:
            getattr(hashlib, str(hash_function_name))
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The sampling method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        def _encrypt_value(to_encode):
            to_encode_str = str(to_encode)
            hash_func = getattr(hashlib, hash_function_name)
            hashed_value = hash_func(
                to_encode_str.encode()).hexdigest()[-1 * hash_digits:]
            return hashed_value

        encrypt_udf = F.udf(_encrypt_value, StringType())
        res = (df.withColumn(
            "encrypted_value", encrypt_udf(column_name)).filter(
                F.col("encrypted_value") == hash_value).drop("encrypted_value")
               )
        return res
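
A minimal usage sketch of the same UDF-based hash sampling in plain PySpark follows; the Spark session, sample data, column name, and parameter values are illustrative assumptions, not taken from the snippet above.

    import hashlib

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import StringType

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1000).withColumnRenamed("id", "user_id")  # hypothetical data

    hash_digits, hash_value = 1, "f"

    @F.udf(returnType=StringType())
    def last_hex_digits(value):
        # Same idea as _encrypt_value above: hash the stringified value and keep the trailing hex digits.
        return hashlib.md5(str(value).encode()).hexdigest()[-hash_digits:]

    sampled = (
        df.withColumn("encrypted_value", last_hex_digits("user_id"))
        .filter(F.col("encrypted_value") == hash_value)
        .drop("encrypted_value")
    )
    print(sampled.count())  # roughly 1/16 of the rows for a single hex digit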
Code Example #2
    def _split_on_hashed_column(
        df,
        column_name: str,
        hash_digits: int,
        partition_definition: dict,
        hash_function_name: str = "sha256",
    ):
        """Split on the hashed value of the named column"""
        try:
            getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The splitting method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        def _encrypt_value(to_encode):
            hash_func = getattr(hashlib, hash_function_name)
            hashed_value = hash_func(
                to_encode.encode()).hexdigest()[-1 * hash_digits:]
            return hashed_value

        encrypt_udf = F.udf(_encrypt_value, StringType())
        res = (df.withColumn(
            "encrypted_value", encrypt_udf(column_name)).filter(
                F.col("encrypted_value") ==
                partition_definition["hash_value"]).drop("encrypted_value"))
        return res
Code Example #3
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.get("reader_method")
            reader_options: dict = batch_spec.get("reader_options") or {}

            path: str = batch_spec["path"]
            reader_fn: Callable = self._get_reader_fn(reader_method, path)

            batch_data = reader_fn(path, **reader_options)

        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.get("s3"))
            reader_method: str = batch_spec.get("reader_method")
            reader_options: dict = batch_spec.get("reader_options") or {}

            s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                             Key=s3_url.key)

            logger.debug("Fetching s3 object. Bucket: {} Key: {}".format(
                s3_url.bucket, s3_url.key))
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            batch_data = reader_fn(
                StringIO(s3_object["Body"].read().decode(
                    s3_object.get("ContentEncoding", "utf-8"))),
                **reader_options,
            )
        else:
            raise BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        if batch_data.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                batch_data)

        typed_batch_data = self._get_typed_batch_data(batch_data)

        return typed_batch_data, batch_markers
Code Example #4
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config.""")
            batch_spec.batch_data = "SparkDataFrame"
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            try:
                reader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError("""
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """)
        else:
            raise BatchSpecError("""
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """)

        batch_data = self._apply_splitting_and_sampling_methods(
            batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(execution_engine=self,
                                            dataframe=batch_data)

        return typed_batch_data, batch_markers
Code Example #5
    def _sample_using_hash(
        df,
        column_name: str,
        hash_digits: int = 1,
        hash_value: str = "f",
        hash_function_name: str = "md5",
    ):
        """Hash the values in the named column, and split on that"""
        try:
            hash_func = getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")

        matches = df[column_name].map(
            lambda x: hash_func(str(x).encode()).hexdigest()[-1 * hash_digits:]
            == hash_value)
        return df[matches]
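
For reference, a minimal self-contained sketch of the same hash-based sampling in plain pandas; the DataFrame, column name, and parameter values are made up for illustration.

    import hashlib

    import pandas as pd

    df = pd.DataFrame({"id": range(1000)})
    hash_digits, hash_value = 1, "f"

    # Keep only rows whose md5 hash ends in the requested hex digit(s).
    matches = df["id"].map(
        lambda x: hashlib.md5(str(x).encode()).hexdigest()[-hash_digits:] == hash_value
    )
    sample = df[matches]
    print(len(sample))  # roughly 1/16 of the rows for a single hex digit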
Code Example #6
    def _split_on_hashed_column(
        df,
        column_name: str,
        hash_digits: int,
        partition_definition: dict,
        hash_function_name: str = "md5",
    ):
        """Split on the hashed value of the named column"""
        try:
            hash_method = getattr(hashlib, hash_function_name)
        except (TypeError, AttributeError):
            raise ge_exceptions.ExecutionEngineError(
                f"""The splitting method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
                    Reference to {hash_function_name} cannot be found.""")
        matching_rows = df[column_name].map(
            lambda x: hash_method(str(x).encode()).hexdigest()[-1 * hash_digits:]
            == partition_definition["hash_value"])
        return df[matching_rows]
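
To see what this splitting yields end to end, here is a minimal plain-pandas sketch that enumerates every partition keyed by the trailing hex digit of an md5 hash; the data and column name are hypothetical.

    import hashlib

    import pandas as pd

    df = pd.DataFrame({"user_id": range(1000), "value": range(1000)})
    hash_digits = 1

    def partition_key(value):
        # Trailing hex digit(s) of the hash, computed the same way as in _split_on_hashed_column.
        return hashlib.md5(str(value).encode()).hexdigest()[-hash_digits:]

    keys = df["user_id"].map(partition_key)
    partitions = {key: df[keys == key] for key in sorted(keys.unique())}
    print({key: len(part) for key, part in partitions.items()})  # 16 roughly even partitions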
Code Example #7
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        """
        As documented in Azure DataConnector implementations, Pandas and Spark execution engines utilize separate path
        formats for accessing Azure Blob Storage service.  However, Pandas and Spark execution engines utilize identical
        path formats for accessing all other supported cloud storage services (AWS S3 and Google Cloud Storage).
        Moreover, these formats (encapsulated in S3BatchSpec and GCSBatchSpec) extend PathBatchSpec (common to them).
        Therefore, at the present time, all cases, with the exception of Azure Blob Storage, are handled generically.
        """

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""SparkDFExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
                )
            batch_spec.batch_data = "SparkDataFrame"

        elif isinstance(batch_spec, AzureBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            azure_url = AzureUrl(path)
            try:
                credential = self._azure_options.get("credential")
                storage_account_url = azure_url.account_url
                if credential:
                    self.spark.conf.set(
                        "fs.wasb.impl",
                        "org.apache.hadoop.fs.azure.NativeAzureFileSystem",
                    )
                    self.spark.conf.set(
                        f"fs.azure.account.key.{storage_account_url}", credential
                    )
                reader: DataFrameReader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError(
                    """
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """
                )

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            try:
                reader: DataFrameReader = self.spark.read.options(**reader_options)
                reader_fn: Callable = self._get_reader_fn(
                    reader=reader,
                    reader_method=reader_method,
                    path=path,
                )
                batch_data = reader_fn(path)
            except AttributeError:
                raise ExecutionEngineError(
                    """
                    Unable to load pyspark. Pyspark is required for SparkDFExecutionEngine.
                    """
                )
            # pyspark will raise an AnalysisException error if path is incorrect
            except pyspark.sql.utils.AnalysisException:
                raise ExecutionEngineError(
                    f"""Unable to read in batch from the following path: {path}. Please check your configuration."""
                )

        else:
            raise BatchSpecError(
                """
                Invalid batch_spec: batch_data is required for a SparkDFExecutionEngine to operate.
                """
            )

        batch_data = self._apply_splitting_and_sampling_methods(batch_spec, batch_data)
        typed_batch_data = SparkDFBatchData(execution_engine=self, dataframe=batch_data)

        return typed_batch_data, batch_markers
Code Example #8
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: PandasBatchData
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"
        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            if "compression" not in reader_options.keys():
                reader_options["compression"] = sniff_s3_compression(s3_url)
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                             Key=s3_url.key)
            logger.debug("Fetching s3 object. Bucket: {} Key: {}".format(
                s3_url.bucket, s3_url.key))
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)
        else:
            raise BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
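
As a hedged usage sketch (not taken from the snippet above), this is roughly how an in-memory DataFrame could be handed to get_batch_data_and_markers on a Great Expectations 0.13-era PandasExecutionEngine; the import paths and keyword names below are assumptions about that version and may differ in yours.

    import pandas as pd

    from great_expectations.core.batch_spec import RuntimeDataBatchSpec
    from great_expectations.execution_engine import PandasExecutionEngine

    engine = PandasExecutionEngine()
    df = pd.DataFrame({"a": [1, 2, 3]})  # hypothetical in-memory batch

    # RuntimeDataBatchSpec wraps data that is already loaded, so the engine skips its
    # path/S3 readers and only applies splitting/sampling plus the batch markers.
    typed_batch_data, batch_markers = engine.get_batch_data_and_markers(
        batch_spec=RuntimeDataBatchSpec(batch_data=df)
    )
    print(batch_markers["ge_load_time"])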