def _get_reader_fn(self, reader_method=None, path=None):
    """Static helper for parsing reader types. If reader_method is not provided, path will be used to
    guess the correct reader_method.

    Args:
        reader_method (str): the name of the reader method to use, if available.
        path (str): the path used to guess the reader method.

    Returns:
        ReaderMethod to use for the filepath
    """
    if reader_method is None and path is None:
        raise ge_exceptions.ExecutionEngineError(
            "Unable to determine pandas reader function without reader_method or path."
        )

    reader_options = {}
    if reader_method is None:
        path_guess = self.guess_reader_method_from_path(path)
        reader_method = path_guess["reader_method"]
        reader_options = path_guess.get(
            "reader_options"
        )  # This key may not be present; use None in that case.

    try:
        reader_fn = getattr(pd, reader_method)
        if reader_options:
            reader_fn = partial(reader_fn, **reader_options)
        return reader_fn
    except AttributeError:
        raise ge_exceptions.ExecutionEngineError(
            f'Unable to find reader_method "{reader_method}" in pandas.'
        )
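# A minimal, self-contained sketch (not part of the source above) of the
# dynamic reader resolution that _get_reader_fn performs: pandas readers are
# looked up by name with getattr, and any reader_options are pre-bound with
# functools.partial. The path and options here are hypothetical.
from functools import partial
import pandas as pd

reader_fn = getattr(pd, "read_csv")                 # lookup by reader_method name
reader_fn = partial(reader_fn, compression="gzip")  # bind guessed reader_options
# df = reader_fn("data.csv.gz")  # hypothetical file; read as a gzipped CSV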
def __init__(
    self,
    name: str,
    execution_engine=None,
    data_context_root_directory: Optional[str] = None,
):
    """
    Build a new Datasource.

    Args:
        name: the name for the datasource
        execution_engine (ClassConfig): the type of compute engine to produce
        data_context_root_directory: Installation directory path (if installed on a filesystem).
    """
    self._name = name
    self._data_context_root_directory = data_context_root_directory

    try:
        self._execution_engine = instantiate_class_from_config(
            config=execution_engine,
            runtime_environment={},
            config_defaults={"module_name": "great_expectations.execution_engine"},
        )
        self._datasource_config = {
            "execution_engine": execution_engine,
        }
    except Exception as e:
        raise ge_exceptions.ExecutionEngineError(message=str(e))

    self._data_connectors = {}
def guess_reader_method_from_path(path):
    """Helper method for deciding which reader to use to read in a certain path.

    Args:
        path (str): the path used to guess the reader method.

    Returns:
        ReaderMethod to use for the filepath
    """
    if path.endswith(".csv") or path.endswith(".tsv"):
        return {"reader_method": "read_csv"}
    elif path.endswith(".parquet"):
        return {"reader_method": "read_parquet"}
    elif path.endswith(".xlsx") or path.endswith(".xls"):
        return {"reader_method": "read_excel"}
    elif path.endswith(".json"):
        return {"reader_method": "read_json"}
    elif path.endswith(".pkl"):
        return {"reader_method": "read_pickle"}
    elif path.endswith(".feather"):
        return {"reader_method": "read_feather"}
    elif path.endswith(".csv.gz") or path.endswith(".tsv.gz"):
        return {
            "reader_method": "read_csv",
            "reader_options": {"compression": "gzip"},
        }
    elif path.endswith(".sas7bdat") or path.endswith(".xpt"):
        return {"reader_method": "read_sas"}
    else:
        raise ge_exceptions.ExecutionEngineError(
            f'Unable to determine reader method from path: "{path}".'
        )
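# Hedged usage sketch: the mapping is purely extension-based, so these results
# follow directly from the branches of guess_reader_method_from_path.
assert guess_reader_method_from_path("events.parquet") == {"reader_method": "read_parquet"}
assert guess_reader_method_from_path("events.csv.gz") == {
    "reader_method": "read_csv",
    "reader_options": {"compression": "gzip"},
}
# Any unrecognized extension (e.g. ".txt") raises ExecutionEngineError.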
def inner_func(
    cls,
    execution_engine: PandasExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    filter_column_isnull = kwargs.get(
        "filter_column_isnull", getattr(cls, "filter_column_isnull", False)
    )

    df, _, accessor_domain_kwargs = execution_engine.get_compute_domain(
        domain_kwargs=metric_domain_kwargs, domain_type=domain_type
    )

    column_name = accessor_domain_kwargs["column"]

    if column_name not in metrics["table.columns"]:
        raise ge_exceptions.ExecutionEngineError(
            message=f'Error: The column "{column_name}" in BatchData does not exist.'
        )

    if filter_column_isnull:
        df = df[df[column_name].notnull()]

    return metric_fn(
        cls,
        column=df[column_name],
        **metric_value_kwargs,
        _metrics=metrics,
    )
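# A hedged sketch (hypothetical names) of the contract the pandas decorator
# above establishes: metric_fn receives the selected column as a pandas Series
# (with nulls already filtered when filter_column_isnull is set), so a metric
# implementation reduces to a Series reduction.
import pandas as pd

def column_max(cls, column: pd.Series, **kwargs):  # hypothetical metric_fn
    return column.max()

print(column_max(None, pd.Series([1, 5, 3])))  # 5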
def active_batch_data_id(self, batch_id) -> None:
    if batch_id in self.loaded_batch_data_dict.keys():
        self._active_batch_data_id = batch_id
    else:
        raise ge_exceptions.ExecutionEngineError(
            f"Unable to set active_batch_data_id to {batch_id}. The data may not be loaded."
        )
def sample_using_hash(
    self,
    df: DataFrame,
    batch_spec: BatchSpec,
) -> DataFrame:
    """Hash the values in the named column, and only keep rows that match the given hash_value.

    Args:
        df: dataframe to sample
        batch_spec: should contain keys `column_name` and optionally `hash_digits`
            (default is 1 if not provided), `hash_value` (default is "f" if not provided),
            and `hash_function_name` (default is "md5" if not provided)

    Returns:
        Sampled dataframe

    Raises:
        SamplerError
    """
    self.verify_batch_spec_sampling_kwargs_exists(batch_spec)
    self.verify_batch_spec_sampling_kwargs_key_exists("column_name", batch_spec)
    column_name: str = self.get_sampling_kwargs_value_or_default(
        batch_spec, "column_name"
    )
    hash_digits: int = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec, sampling_kwargs_key="hash_digits", default_value=1
    )
    hash_value: str = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec, sampling_kwargs_key="hash_value", default_value="f"
    )
    hash_function_name: str = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec,
        sampling_kwargs_key="hash_function_name",
        default_value="md5",
    )

    try:
        getattr(hashlib, str(hash_function_name))
    except (TypeError, AttributeError):
        raise ge_exceptions.ExecutionEngineError(
            f"""The sampling method used with SparkDFExecutionEngine has a reference to an invalid hash_function_name.
Reference to {hash_function_name} cannot be found."""
        )

    def _encrypt_value(to_encode):
        to_encode_str = str(to_encode)
        hash_func = getattr(hashlib, hash_function_name)
        hashed_value = hash_func(to_encode_str.encode()).hexdigest()[-1 * hash_digits :]
        return hashed_value

    encrypt_udf = F.udf(_encrypt_value, sparktypes.StringType())
    res = (
        df.withColumn("encrypted_value", encrypt_udf(column_name))
        .filter(F.col("encrypted_value") == hash_value)
        .drop("encrypted_value")
    )
    return res
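# A runnable sketch (assuming a local pyspark installation) of the UDF-based
# hash sampling performed above: rows are kept when the last hex digit of the
# MD5 hash of the column value equals "f", i.e. roughly 1/16 of the rows for
# well-distributed values. The data and session setup are illustrative only.
import hashlib
from pyspark.sql import SparkSession, functions as F, types as sparktypes

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(i,) for i in range(1000)], ["id"])

encrypt_udf = F.udf(
    lambda v: hashlib.md5(str(v).encode()).hexdigest()[-1:], sparktypes.StringType()
)
sampled = (
    df.withColumn("encrypted_value", encrypt_udf("id"))
    .filter(F.col("encrypted_value") == "f")
    .drop("encrypted_value")
)
print(sampled.count())  # roughly 1000 / 16 rows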
def sample_using_hash(
    self,
    df: pd.DataFrame,
    batch_spec: BatchSpec,
) -> pd.DataFrame:
    """Hash the values in the named column, and only keep rows that match the given hash_value.

    Args:
        df: dataframe to sample
        batch_spec: should contain keys `column_name` and optionally `hash_digits`
            (default is 1 if not provided), `hash_value` (default is "f" if not provided),
            and `hash_function_name` (default is "md5" if not provided)

    Returns:
        Sampled dataframe

    Raises:
        SamplerError
    """
    self.verify_batch_spec_sampling_kwargs_exists(batch_spec)
    self.verify_batch_spec_sampling_kwargs_key_exists("column_name", batch_spec)
    column_name: str = self.get_sampling_kwargs_value_or_default(
        batch_spec, "column_name"
    )
    hash_digits: int = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec, sampling_kwargs_key="hash_digits", default_value=1
    )
    hash_value: str = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec, sampling_kwargs_key="hash_value", default_value="f"
    )
    hash_function_name: str = self.get_sampling_kwargs_value_or_default(
        batch_spec=batch_spec,
        sampling_kwargs_key="hash_function_name",
        default_value="md5",
    )

    try:
        hash_func = getattr(hashlib, hash_function_name)
    except (TypeError, AttributeError):
        raise ge_exceptions.ExecutionEngineError(
            f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
Reference to {hash_function_name} cannot be found."""
        )

    matches = df[column_name].map(
        lambda x: hash_func(str(x).encode()).hexdigest()[-1 * hash_digits :] == hash_value
    )
    return df[matches]
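# The same hashing rule in plain pandas, runnable as-is: with hash_digits=1 and
# hash_value="f", roughly 1/16 of well-distributed values survive the filter.
import hashlib
import pandas as pd

df = pd.DataFrame({"id": range(1000)})
matches = df["id"].map(lambda x: hashlib.md5(str(x).encode()).hexdigest()[-1:] == "f")
print(len(df[matches]))  # roughly 1000 / 16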
def inner_func(
    cls,
    execution_engine: SqlAlchemyExecutionEngine,
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[str, Any],
    runtime_configuration: Dict,
):
    filter_column_isnull = kwargs.get(
        "filter_column_isnull", getattr(cls, "filter_column_isnull", False)
    )

    if filter_column_isnull:
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs
        )
    else:
        # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
        compute_domain_kwargs = metric_domain_kwargs

    (
        selectable,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        compute_domain_kwargs, domain_type=domain_type
    )

    column_name: str = accessor_domain_kwargs["column"]
    sqlalchemy_engine: sa.engine.Engine = execution_engine.engine

    if column_name not in metrics["table.columns"]:
        raise ge_exceptions.ExecutionEngineError(
            message=f'Error: The column "{column_name}" in BatchData does not exist.'
        )

    dialect = sqlalchemy_engine.dialect
    metric_aggregate = metric_fn(
        cls,
        column=sa.column(column_name),
        **metric_value_kwargs,
        _dialect=dialect,
        _table=selectable,
        _column_name=column_name,
        _sqlalchemy_engine=sqlalchemy_engine,
        _metrics=metrics,
    )
    return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
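# A minimal sketch of what this SQLAlchemy decorator expects from metric_fn: an
# unexecuted aggregate expression built from sa.column, which the engine later
# wraps in a select against the computed selectable. The table and data here
# are hypothetical; this runs against an in-memory SQLite engine.
import sqlalchemy as sa

engine = sa.create_engine("sqlite://")
with engine.connect() as conn:
    conn.execute(sa.text("CREATE TABLE t (x INTEGER)"))
    conn.execute(sa.text("INSERT INTO t VALUES (1), (5), (3)"))
    metric_aggregate = sa.func.max(sa.column("x"))  # what a metric_fn returns
    value = conn.execute(
        sa.select(metric_aggregate).select_from(sa.table("t"))
    ).scalar()
    print(value)  # 5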
def _sample_using_hash(
    df,
    column_name: str,
    hash_digits: int = 1,
    hash_value: str = "f",
    hash_function_name: str = "md5",
):
    """Hash the values in the named column, and keep only rows whose hash matches the given hash_value."""
    try:
        hash_func = getattr(hashlib, hash_function_name)
    except (TypeError, AttributeError):
        raise ge_exceptions.ExecutionEngineError(
            f"""The sampling method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
Reference to {hash_function_name} cannot be found."""
        )
    matches = df[column_name].map(
        lambda x: hash_func(str(x).encode()).hexdigest()[-1 * hash_digits :] == hash_value
    )
    return df[matches]
def _split_on_hashed_column(
    df,
    column_name: str,
    hash_digits: int,
    batch_identifiers: dict,
    hash_function_name: str = "md5",
):
    """Split on the hashed value of the named column"""
    try:
        hash_method = getattr(hashlib, hash_function_name)
    except (TypeError, AttributeError):
        raise ge_exceptions.ExecutionEngineError(
            f"""The splitting method used with PandasExecutionEngine has a reference to an invalid hash_function_name.
Reference to {hash_function_name} cannot be found."""
        )
    matching_rows = df[column_name].map(
        lambda x: hash_method(str(x).encode()).hexdigest()[-1 * hash_digits :]
        == batch_identifiers["hash_value"]
    )
    return df[matching_rows]
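# Hedged sketch of the splitting idea above, runnable with plain pandas and
# hashlib: with one hash digit, each of the 16 possible trailing hex digits
# names one batch, and _split_on_hashed_column keeps the partition requested
# via batch_identifiers["hash_value"].
import hashlib
import pandas as pd

df = pd.DataFrame({"id": range(100)})
partition = df["id"].map(lambda x: hashlib.md5(str(x).encode()).hexdigest()[-1:])
print(partition.value_counts().sort_index())  # up to 16 hash-defined batches
print(df[partition == "a"].head())            # the batch for hash_value "a"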
def inner_func(
    cls,
    execution_engine: "SparkDFExecutionEngine",
    metric_domain_kwargs: Dict,
    metric_value_kwargs: Dict,
    metrics: Dict[Tuple, Any],
    runtime_configuration: Dict,
):
    filter_column_isnull = kwargs.get(
        "filter_column_isnull", getattr(cls, "filter_column_isnull", False)
    )

    if filter_column_isnull:
        compute_domain_kwargs = execution_engine.add_column_row_condition(
            metric_domain_kwargs
        )
    else:
        # We do not copy here because if compute domain is different, it will be copied by get_compute_domain
        compute_domain_kwargs = metric_domain_kwargs

    (
        data,
        compute_domain_kwargs,
        accessor_domain_kwargs,
    ) = execution_engine.get_compute_domain(
        domain_kwargs=compute_domain_kwargs, domain_type=domain_type
    )

    column_name = accessor_domain_kwargs["column"]

    if column_name not in data.columns:
        raise ge_exceptions.ExecutionEngineError(
            message=f'Error: The column "{column_name}" in BatchData does not exist.'
        )

    column = data[column_name]
    metric_aggregate = metric_fn(
        cls,
        column=column,
        **metric_value_kwargs,
        _table=data,
        _column_name=column_name,
        _metrics=metrics,
    )
    return metric_aggregate, compute_domain_kwargs, accessor_domain_kwargs
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:  # batch_data
    # We need to build batch_markers to be used in the dataframe.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_data: PandasBatchData
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an S3BatchSpec,
but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        s3_url = S3Url(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        if "compression" not in reader_options.keys():
            reader_options["compression"] = sniff_s3_compression(s3_url)
        s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        logger.debug(f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}")
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise ge_exceptions.BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
        )

    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

    return typed_batch_data, batch_markers
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:  # batch_data
    # We need to build batch_markers to be used in the dataframe.
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config."""
            )
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            self._instantiate_s3_client()
        # If we were not able to instantiate the S3 client, raise an error.
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an S3BatchSpec,
but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        try:
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            s3_url = S3Url(path)
            if "compression" not in reader_options.keys():
                inferred_compression_param = sniff_s3_compression(s3_url)
                if inferred_compression_param is not None:
                    reader_options["compression"] = inferred_compression_param
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        except (ParamValidationError, ClientError) as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
            )
        logger.debug(f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}")
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, AzureBatchSpec):
        if self._azure is None:
            self._instantiate_azure_client()
        # If we were not able to instantiate the Azure client, raise an error.
        if self._azure is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an AzureBatchSpec,
but the ExecutionEngine does not have an Azure client configured. Please check your config."""
            )
        azure_engine = self._azure
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        azure_url = AzureUrl(path)
        blob_client = azure_engine.get_blob_client(
            container=azure_url.container, blob=azure_url.blob
        )
        azure_object = blob_client.download_blob()
        logger.debug(
            f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
        )
        reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
        buf = BytesIO(azure_object.readall())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, GCSBatchSpec):
        if self._gcs is None:
            self._instantiate_gcs_client()
        # If we were not able to instantiate the GCS client, raise an error.
        if self._gcs is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed a GCSBatchSpec,
but the ExecutionEngine does not have a GCS client configured. Please check your config."""
            )
        gcs_engine = self._gcs
        gcs_url = GCSUrl(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        try:
            gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
            gcs_blob = gcs_bucket.blob(gcs_url.blob)
            logger.debug(
                f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
            )
        except GoogleAPIError as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
            )
        reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
        buf = BytesIO(gcs_blob.download_as_bytes())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise ge_exceptions.BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, AzureBatchSpec, or GCSBatchSpec, not {batch_spec.__class__.__name__}"
        )

    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

    return typed_batch_data, batch_markers
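# A small, self-contained sketch of the buffer-based read path shared by the
# S3, Azure, and GCS branches above: the object's bytes are wrapped in BytesIO
# and handed to the resolved pandas reader as if they were a local file. The
# payload here is inline dummy data, not a real cloud object.
from io import BytesIO
import pandas as pd

raw_bytes = b"a,b\n1,2\n3,4\n"  # stands in for s3_object["Body"].read(), azure_object.readall(), etc.
buf = BytesIO(raw_bytes)
buf.seek(0)
reader_fn = getattr(pd, "read_csv")  # the kind of callable _get_reader_fn returns
df = reader_fn(buf)
print(df.shape)  # (2, 2)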