def _get_reader_fn(self, reader_method=None, path=None):
    """Static helper for parsing reader types. If reader_method is not provided, path will be used to
    guess the correct reader_method.

    Args:
        reader_method (str): the name of the reader method to use, if available.
        path (str): the path used to guess the reader_method

    Returns:
        ReaderMethod to use for the filepath
    """
    if reader_method is None and path is None:
        raise ge_exceptions.BatchSpecError(
            "Unable to determine pandas reader function without reader_method or path."
        )

    reader_options = dict()
    if reader_method is None:
        path_guess = self.guess_reader_method_from_path(path)
        reader_method = path_guess["reader_method"]
        reader_options = path_guess.get(
            "reader_options"
        )  # This may not be there; use None in that case

    try:
        reader_fn = getattr(pd, reader_method)
        if reader_options:
            reader_fn = partial(reader_fn, **reader_options)
        return reader_fn
    except AttributeError:
        raise ge_exceptions.BatchSpecError(
            f'Unable to find reader_method "{reader_method}" in pandas.'
        )
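# Illustrative usage sketch (not part of the engine): resolving a pandas reader
# from a path alone. `engine` is assumed to be an already-configured
# PandasExecutionEngine instance; the path is a placeholder.
def _example_resolve_reader(engine, path="data/events.csv.gz"):
    # For a ".csv.gz" path the helper above returns pd.read_csv wrapped with
    # compression="gzip" via functools.partial; calling it yields a DataFrame.
    reader_fn = engine._get_reader_fn(path=path)
    return reader_fn(path)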
def guess_reader_method_from_path(path):
    """Helper method for deciding which reader to use to read in a certain path.

    Args:
        path (str): the path used to guess the reader_method

    Returns:
        ReaderMethod to use for the filepath
    """
    if path.endswith(".csv") or path.endswith(".tsv"):
        return {"reader_method": "read_csv"}
    elif path.endswith(".parquet"):
        return {"reader_method": "read_parquet"}
    elif path.endswith(".xlsx") or path.endswith(".xls"):
        return {"reader_method": "read_excel"}
    elif path.endswith(".json"):
        return {"reader_method": "read_json"}
    elif path.endswith(".pkl"):
        return {"reader_method": "read_pickle"}
    elif path.endswith(".feather"):
        return {"reader_method": "read_feather"}
    elif path.endswith(".csv.gz") or path.endswith(".tsv.gz"):
        return {
            "reader_method": "read_csv",
            "reader_options": {"compression": "gzip"},
        }

    raise ge_exceptions.BatchSpecError(
        f'Unable to determine reader method from path: "{path}".'
    )
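# Illustrative usage sketch (not part of the engine): the extension-to-reader
# mapping above in action. Assumes guess_reader_method_from_path is callable as
# shown (e.g. a plain function or staticmethod); the file names are placeholders.
def _example_guess_readers():
    assert guess_reader_method_from_path("events.parquet") == {
        "reader_method": "read_parquet"
    }
    assert guess_reader_method_from_path("events.csv.gz") == {
        "reader_method": "read_csv",
        "reader_options": {"compression": "gzip"},
    }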
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:  # batch_data
    # We need to build a batch_markers to be used in the dataframe
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_data: PandasBatchData
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a S3BatchSpec,
                but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        s3_url = S3Url(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        if "compression" not in reader_options.keys():
            reader_options["compression"] = sniff_s3_compression(s3_url)
        s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        logger.debug(
            "Fetching s3 object. Bucket: {} Key: {}".format(
                s3_url.bucket, s3_url.key
            )
        )
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise ge_exceptions.BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
        )

    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

    return typed_batch_data, batch_markers
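# Illustrative usage sketch (not part of the engine): passing in-memory data
# through the engine. `engine` is assumed to be a configured
# PandasExecutionEngine, and RuntimeDataBatchSpec is assumed to accept the
# DataFrame via its batch_data argument, as the branch above implies.
def _example_runtime_batch(engine):
    spec = RuntimeDataBatchSpec(batch_data=pd.DataFrame({"a": [1, 2, 3]}))
    typed_batch_data, markers = engine.get_batch_data_and_markers(batch_spec=spec)
    # typed_batch_data.dataframe holds the frame after splitting/sampling;
    # markers["ge_load_time"] is the UTC load timestamp built above.
    return typed_batch_data, markers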
def get_batch_data_and_markers(
    self, batch_spec: BatchSpec
) -> Tuple[Any, BatchMarkers]:  # batch_data
    # We need to build a batch_markers to be used in the dataframe
    batch_markers: BatchMarkers = BatchMarkers(
        {
            "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                "%Y%m%dT%H%M%S.%fZ"
            )
        }
    )

    batch_data: Any
    if isinstance(batch_spec, RuntimeDataBatchSpec):
        # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
        batch_data = batch_spec.batch_data
        if isinstance(batch_data, str):
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
                Please check your config."""
            )
        if isinstance(batch_spec.batch_data, pd.DataFrame):
            df = batch_spec.batch_data
        elif isinstance(batch_spec.batch_data, PandasBatchData):
            df = batch_spec.batch_data.dataframe
        else:
            raise ValueError(
                "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
            )
        batch_spec.batch_data = "PandasDataFrame"
    elif isinstance(batch_spec, S3BatchSpec):
        if self._s3 is None:
            self._instantiate_s3_client()
        # if we were not able to instantiate S3 client, then raise error
        if self._s3 is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed a S3BatchSpec,
                but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
            )
        s3_engine = self._s3
        try:
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            s3_url = S3Url(path)
            if "compression" not in reader_options.keys():
                inferred_compression_param = sniff_s3_compression(s3_url)
                if inferred_compression_param is not None:
                    reader_options["compression"] = inferred_compression_param
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
        except (ParamValidationError, ClientError) as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
            )
        logger.debug(
            f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}"
        )
        reader_fn = self._get_reader_fn(reader_method, s3_url.key)
        buf = BytesIO(s3_object["Body"].read())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, AzureBatchSpec):
        if self._azure is None:
            self._instantiate_azure_client()
        # if we were not able to instantiate Azure client, then raise error
        if self._azure is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed an AzureBatchSpec,
                but the ExecutionEngine does not have an Azure client configured. Please check your config."""
            )
        azure_engine = self._azure
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        path: str = batch_spec.path
        azure_url = AzureUrl(path)
        blob_client = azure_engine.get_blob_client(
            container=azure_url.container, blob=azure_url.blob
        )
        azure_object = blob_client.download_blob()
        logger.debug(
            f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
        )
        reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
        buf = BytesIO(azure_object.readall())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, GCSBatchSpec):
        if self._gcs is None:
            self._instantiate_gcs_client()
        # if we were not able to instantiate GCS client, then raise error
        if self._gcs is None:
            raise ge_exceptions.ExecutionEngineError(
                """PandasExecutionEngine has been passed a GCSBatchSpec,
                but the ExecutionEngine does not have a GCS client configured. Please check your config."""
            )
        gcs_engine = self._gcs
        gcs_url = GCSUrl(batch_spec.path)
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options or {}
        try:
            gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
            gcs_blob = gcs_bucket.blob(gcs_url.blob)
            logger.debug(
                f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
            )
        except GoogleAPIError as error:
            raise ge_exceptions.ExecutionEngineError(
                f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
            )
        reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
        buf = BytesIO(gcs_blob.download_as_bytes())
        buf.seek(0)
        df = reader_fn(buf, **reader_options)
    elif isinstance(batch_spec, PathBatchSpec):
        reader_method: str = batch_spec.reader_method
        reader_options: dict = batch_spec.reader_options
        path: str = batch_spec.path
        reader_fn: Callable = self._get_reader_fn(reader_method, path)
        df = reader_fn(path, **reader_options)
    else:
        raise ge_exceptions.BatchSpecError(
            f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, AzureBatchSpec, or GCSBatchSpec, not {batch_spec.__class__.__name__}"
        )

    df = self._apply_splitting_and_sampling_methods(batch_spec, df)
    if df.memory_usage().sum() < HASH_THRESHOLD:
        batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

    typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

    return typed_batch_data, batch_markers
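# Illustrative usage sketch (not part of the engine): reading a gzipped CSV
# from S3. Assumes `engine` has (or can instantiate) a boto3 client and that
# S3BatchSpec accepts path/reader_method keyword arguments, as the branch
# above implies; bucket and key are placeholders.
def _example_s3_batch(engine):
    spec = S3BatchSpec(
        path="s3a://my-bucket/events.csv.gz",
        reader_method="read_csv",
    )
    typed_batch_data, markers = engine.get_batch_data_and_markers(batch_spec=spec)
    return typed_batch_data, markers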