def test_get_batch_with_split_on_whole_table_s3(
    spark_session, basic_spark_df_execution_engine
):
    # noinspection PyUnusedLocal
    def mocked_get_reader_function(*args, **kwargs):
        # noinspection PyUnusedLocal,PyShadowingNames
        def mocked_reader_function(*args, **kwargs):
            pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            df = spark_session.createDataFrame(
                [
                    tuple(
                        None if isinstance(x, (float, int)) and np.isnan(x) else x
                        for x in record.tolist()
                    )
                    for record in pd_df.to_records(index=False)
                ],
                pd_df.columns.tolist(),
            )
            return df

        return mocked_reader_function

    spark_engine = basic_spark_df_execution_engine
    spark_engine._get_reader_fn = mocked_get_reader_function
    test_sparkdf = spark_engine.get_batch_data(
        S3BatchSpec(
            path="s3://bucket/test/test.csv",
            reader_method="csv",
            reader_options={"header": True},
            splitter_method="_split_on_whole_table",
        )
    ).dataframe
    assert test_sparkdf.count() == 4
    assert len(test_sparkdf.columns) == 2

def test_get_batch_s3_parquet(test_s3_files_parquet, test_df_small):
    bucket, keys = test_s3_files_parquet
    path = [key for key in keys if key.endswith(".parquet")][0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(path=full_path, reader_method="read_parquet")
    df = PandasExecutionEngine().get_batch_data(batch_spec=batch_spec)
    assert df.dataframe.shape == test_df_small.shape

def test_get_batch_s3_compressed_files(test_s3_files_compressed, test_df_small):
    bucket, keys = test_s3_files_compressed
    path = keys[0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(path=full_path, reader_method="read_csv")
    df = PandasExecutionEngine().get_batch_data(batch_spec=batch_spec)
    assert df.dataframe.shape == test_df_small.shape

def batch_with_split_on_whole_table_s3(test_s3_files) -> S3BatchSpec:
    bucket, keys = test_s3_files
    path = keys[0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    return batch_spec

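# A hedged usage sketch for the helper above, assuming a `test_s3_files`
# fixture that yields (bucket, keys) for CSV objects in a mocked bucket.
# The test name below is hypothetical and not part of the original suite.
def test_batch_spec_built_by_helper(test_s3_files):
    batch_spec = batch_with_split_on_whole_table_s3(test_s3_files)
    # _split_on_whole_table loads the entire file as a single batch.
    batch_data = PandasExecutionEngine().get_batch_data(batch_spec=batch_spec)
    assert batch_data.dataframe is not None
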
def test_get_batch_with_no_s3_configured():
    batch_spec = S3BatchSpec(
        path="s3a://i_dont_exist",
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    # If S3 is not configured, fetching batch data should raise an
    # ExecutionEngineError.
    execution_engine_no_s3 = PandasExecutionEngine()
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(batch_spec=batch_spec)

def build_batch_spec(self, batch_definition: BatchDefinition) -> S3BatchSpec:
    """
    Build an S3BatchSpec from a batch_definition by delegating to the parent
    DataConnector's build_batch_spec method.

    Args:
        batch_definition (BatchDefinition): used to build the batch_spec

    Returns:
        S3BatchSpec built from the batch_definition
    """
    batch_spec: PathBatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    return S3BatchSpec(batch_spec)

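# A usage sketch for the method above (assumptions: it lives on a
# ConfiguredAssetS3DataConnector instance, and the datasource/asset names
# and batch identifiers below are hypothetical; IDDict comes from
# great_expectations.core.id_dict):
#
#   batch_definition = BatchDefinition(
#       datasource_name="my_datasource",
#       data_connector_name="my_data_connector",
#       data_asset_name="alpha",
#       batch_identifiers=IDDict({"index": "100"}),
#   )
#   batch_spec = my_data_connector.build_batch_spec(
#       batch_definition=batch_definition
#   )
#   assert isinstance(batch_spec, S3BatchSpec)
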
def test_get_batch_with_split_on_whole_table_s3():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"

    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_df.dataframe.shape == (2, 2)

    # If S3 is not configured, fetching batch data should raise an
    # ExecutionEngineError.
    execution_engine_no_s3 = PandasExecutionEngine()
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(
            batch_spec=S3BatchSpec(
                path=full_path,
                reader_method="read_csv",
                splitter_method="_split_on_whole_table",
            )
        )

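# A hedged note on the harness: creating "test_bucket" with boto3 as above
# only succeeds against a mocked S3 endpoint. One common setup (an
# assumption about the surrounding test harness, not shown in this excerpt)
# is moto's mock_s3 decorator:
#
#   from moto import mock_s3
#
#   @mock_s3
#   def test_get_batch_with_split_on_whole_table_s3():
#       ...
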
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )

    test_df = PandasExecutionEngine().get_batch_data(
        batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        )
    )
    assert test_df.dataframe.shape == (2, 2)

def build_batch_spec(
    self,
    batch_definition: BatchDefinition,
    runtime_parameters: dict,
) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
    self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
    batch_spec: BatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    if runtime_parameters.get("batch_data") is not None:
        batch_spec["batch_data"] = runtime_parameters.get("batch_data")
        return RuntimeDataBatchSpec(batch_spec)
    elif runtime_parameters.get("query"):
        batch_spec["query"] = runtime_parameters.get("query")
        return RuntimeQueryBatchSpec(batch_spec)
    elif runtime_parameters.get("path"):
        path = runtime_parameters.get("path")
        batch_spec["path"] = path
        parsed_url = urlparse(path)
        if "s3" in parsed_url.scheme:
            return S3BatchSpec(batch_spec)
        else:
            return PathBatchSpec(batch_spec)

def build_batch_spec(
    self,
    batch_definition: BatchDefinition,
    runtime_parameters: dict,
) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
    self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
    batch_spec: BatchSpec = super().build_batch_spec(
        batch_definition=batch_definition
    )
    if "batch_data" in runtime_parameters:
        batch_spec["batch_data"] = runtime_parameters.get("batch_data")
        return RuntimeDataBatchSpec(batch_spec)
    elif "query" in runtime_parameters:
        batch_spec["query"] = runtime_parameters.get("query")
        return RuntimeQueryBatchSpec(batch_spec)
    elif "path" in runtime_parameters:
        path: str = runtime_parameters["path"]
        batch_spec["path"] = path
        if "s3" in path:
            return S3BatchSpec(batch_spec)
        elif "blob.core.windows.net" in path:
            return AzureBatchSpec(batch_spec)
        else:
            return PathBatchSpec(batch_spec)

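# A standalone sketch (it mirrors, but is not imported from, the dispatch
# above) showing which BatchSpec subclass a given runtime "path" selects.
# Note that "s3" is matched as a plain substring, so any path containing
# "s3" (e.g. a local directory named "results3") would also route to
# S3BatchSpec.
def _select_batch_spec_class(path: str) -> type:
    if "s3" in path:
        return S3BatchSpec
    elif "blob.core.windows.net" in path:
        return AzureBatchSpec
    return PathBatchSpec


assert _select_batch_spec_class("s3a://bucket/file.csv") is S3BatchSpec
assert (
    _select_batch_spec_class("https://acct.blob.core.windows.net/c/f.csv")
    is AzureBatchSpec
)
assert _select_batch_spec_class("/tmp/data/file.csv") is PathBatchSpec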