def _get_reader_fn(self, reader_method=None, path=None):
        """Static helper for parsing reader types. If reader_method is not provided, path will be used to guess the
        correct reader_method.

        Args:
            reader_method (str): the name of the reader method to use, if available.
            path (str): the path used to guess

        Returns:
            ReaderMethod to use for the filepath

        """
        if reader_method is None and path is None:
            raise ge_exceptions.BatchSpecError(
                "Unable to determine pandas reader function without reader_method or path."
            )

        reader_options = dict()
        if reader_method is None:
            path_guess = self.guess_reader_method_from_path(path)
            reader_method = path_guess["reader_method"]
            reader_options = path_guess.get(
                "reader_options"
            )  # This may not be there; use None in that case

        try:
            reader_fn = getattr(pd, reader_method)
            if reader_options:
                reader_fn = partial(reader_fn, **reader_options)
            return reader_fn
        except AttributeError:
            raise ge_exceptions.BatchSpecError(
                f'Unable to find reader_method "{reader_method}" in pandas.'
            )
    def guess_reader_method_from_path(path):
        """Helper method for deciding which reader to use to read in a certain path.

        Args:
            path (str): the to use to guess

        Returns:
            ReaderMethod to use for the filepath

        """
        if path.endswith(".csv") or path.endswith(".tsv"):
            return {"reader_method": "read_csv"}
        elif path.endswith(".parquet"):
            return {"reader_method": "read_parquet"}
        elif path.endswith(".xlsx") or path.endswith(".xls"):
            return {"reader_method": "read_excel"}
        elif path.endswith(".json"):
            return {"reader_method": "read_json"}
        elif path.endswith(".pkl"):
            return {"reader_method": "read_pickle"}
        elif path.endswith(".feather"):
            return {"reader_method": "read_feather"}
        elif path.endswith(".csv.gz") or path.endswith(".tsv.gz"):
            return {
                "reader_method": "read_csv",
                "reader_options": {"compression": "gzip"},
            }

        raise ge_exceptions.BatchSpecError(
            f'Unable to determine reader method from path: "{path}".'
        )
    def get_batch_data_and_markers(
        self, batch_spec: BatchSpec
    ) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers(
            {
                "ge_load_time": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y%m%dT%H%M%S.%fZ"
                )
            }
        )

        batch_data: PandasBatchData
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"
        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            s3_url = S3Url(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            if "compression" not in reader_options.keys():
                reader_options["compression"] = sniff_s3_compression(s3_url)
            s3_object = s3_engine.get_object(Bucket=s3_url.bucket, Key=s3_url.key)
            logger.debug(
                "Fetching s3 object. Bucket: {} Key: {}".format(
                    s3_url.bucket, s3_url.key
                )
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)
        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)
        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, or S3BatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers
    def get_batch_data_and_markers(
            self,
            batch_spec: BatchSpec) -> Tuple[Any, BatchMarkers]:  # batch_data
        # We need to build a batch_markers to be used in the dataframe
        batch_markers: BatchMarkers = BatchMarkers({
            "ge_load_time":
            datetime.datetime.now(
                datetime.timezone.utc).strftime("%Y%m%dT%H%M%S.%fZ")
        })

        batch_data: Any
        if isinstance(batch_spec, RuntimeDataBatchSpec):
            # batch_data != None is already checked when RuntimeDataBatchSpec is instantiated
            batch_data = batch_spec.batch_data
            if isinstance(batch_data, str):
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine has been passed a string type batch_data, "{batch_data}", which is illegal.
Please check your config.""")
            if isinstance(batch_spec.batch_data, pd.DataFrame):
                df = batch_spec.batch_data
            elif isinstance(batch_spec.batch_data, PandasBatchData):
                df = batch_spec.batch_data.dataframe
            else:
                raise ValueError(
                    "RuntimeDataBatchSpec must provide a Pandas DataFrame or PandasBatchData object."
                )
            batch_spec.batch_data = "PandasDataFrame"

        elif isinstance(batch_spec, S3BatchSpec):
            if self._s3 is None:
                self._instantiate_s3_client()
            # if we were not able to instantiate S3 client, then raise error
            if self._s3 is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a S3BatchSpec,
                        but the ExecutionEngine does not have a boto3 client configured. Please check your config."""
                )
            s3_engine = self._s3
            try:
                reader_method: str = batch_spec.reader_method
                reader_options: dict = batch_spec.reader_options or {}
                path: str = batch_spec.path
                s3_url = S3Url(path)
                if "compression" not in reader_options.keys():
                    inferred_compression_param = sniff_s3_compression(s3_url)
                    if inferred_compression_param is not None:
                        reader_options[
                            "compression"] = inferred_compression_param
                s3_object = s3_engine.get_object(Bucket=s3_url.bucket,
                                                 Key=s3_url.key)
            except (ParamValidationError, ClientError) as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from S3 Bucket: {error}"""
                )
            logger.debug(
                f"Fetching s3 object. Bucket: {s3_url.bucket} Key: {s3_url.key}"
            )
            reader_fn = self._get_reader_fn(reader_method, s3_url.key)
            buf = BytesIO(s3_object["Body"].read())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, AzureBatchSpec):
            if self._azure is None:
                self._instantiate_azure_client()
            # if we were not able to instantiate Azure client, then raise error
            if self._azure is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a AzureBatchSpec,
                        but the ExecutionEngine does not have an Azure client configured. Please check your config."""
                )
            azure_engine = self._azure
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            path: str = batch_spec.path
            azure_url = AzureUrl(path)
            blob_client = azure_engine.get_blob_client(
                container=azure_url.container, blob=azure_url.blob)
            azure_object = blob_client.download_blob()
            logger.debug(
                f"Fetching Azure blob. Container: {azure_url.container} Blob: {azure_url.blob}"
            )
            reader_fn = self._get_reader_fn(reader_method, azure_url.blob)
            buf = BytesIO(azure_object.readall())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, GCSBatchSpec):
            if self._gcs is None:
                self._instantiate_gcs_client()
            # if we were not able to instantiate GCS client, then raise error
            if self._gcs is None:
                raise ge_exceptions.ExecutionEngineError(
                    """PandasExecutionEngine has been passed a GCSBatchSpec,
                        but the ExecutionEngine does not have an GCS client configured. Please check your config."""
                )
            gcs_engine = self._gcs
            gcs_url = GCSUrl(batch_spec.path)
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options or {}
            try:
                gcs_bucket = gcs_engine.get_bucket(gcs_url.bucket)
                gcs_blob = gcs_bucket.blob(gcs_url.blob)
                logger.debug(
                    f"Fetching GCS blob. Bucket: {gcs_url.bucket} Blob: {gcs_url.blob}"
                )
            except GoogleAPIError as error:
                raise ge_exceptions.ExecutionEngineError(
                    f"""PandasExecutionEngine encountered the following error while trying to read data from GCS Bucket: {error}"""
                )
            reader_fn = self._get_reader_fn(reader_method, gcs_url.blob)
            buf = BytesIO(gcs_blob.download_as_bytes())
            buf.seek(0)
            df = reader_fn(buf, **reader_options)

        elif isinstance(batch_spec, PathBatchSpec):
            reader_method: str = batch_spec.reader_method
            reader_options: dict = batch_spec.reader_options
            path: str = batch_spec.path
            reader_fn: Callable = self._get_reader_fn(reader_method, path)
            df = reader_fn(path, **reader_options)

        else:
            raise ge_exceptions.BatchSpecError(
                f"batch_spec must be of type RuntimeDataBatchSpec, PathBatchSpec, S3BatchSpec, or AzureBatchSpec, not {batch_spec.__class__.__name__}"
            )

        df = self._apply_splitting_and_sampling_methods(batch_spec, df)
        if df.memory_usage().sum() < HASH_THRESHOLD:
            batch_markers["pandas_data_fingerprint"] = hash_pandas_dataframe(
                df)

        typed_batch_data = PandasBatchData(execution_engine=self, dataframe=df)

        return typed_batch_data, batch_markers