Example 1
def test_get_batch_with_split_on_whole_table_s3(
    spark_session, basic_spark_df_execution_engine
):
    # noinspection PyUnusedLocal
    def mocked_get_reader_function(*args, **kwargs):
        # noinspection PyUnusedLocal,PyShadowingNames
        def mocked_reader_function(*args, **kwargs):
            pd_df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2, 3, 4, None]})
            df = spark_session.createDataFrame(
                [
                    tuple(
                        None if isinstance(x, (float, int)) and np.isnan(x) else x
                        for x in record.tolist()
                    )
                    for record in pd_df.to_records(index=False)
                ],
                pd_df.columns.tolist(),
            )
            return df

        return mocked_reader_function

    spark_engine = basic_spark_df_execution_engine
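    # Patch the engine's reader lookup so the S3 path below is never actually read;
    # the mocked reader simply builds and returns an in-memory Spark dataframe.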
    spark_engine._get_reader_fn = mocked_get_reader_function

    test_sparkdf = spark_engine.get_batch_data(
        S3BatchSpec(
            path="s3://bucket/test/test.csv",
            reader_method="csv",
            reader_options={"header": True},
            splitter_method="_split_on_whole_table",
        )
    ).dataframe
    assert test_sparkdf.count() == 4
    assert len(test_sparkdf.columns) == 2
Example 2
def test_get_batch_s3_parquet(test_s3_files_parquet, test_df_small):
    bucket, keys = test_s3_files_parquet
    path = [key for key in keys if key.endswith(".parquet")][0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(path=full_path, reader_method="read_parquet")
    df = PandasExecutionEngine().get_batch_data(batch_spec=batch_spec)
    assert df.dataframe.shape == test_df_small.shape
Example 3
def test_get_batch_s3_compressed_files(test_s3_files_compressed,
                                       test_df_small):
    bucket, keys = test_s3_files_compressed
    path = keys[0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(path=full_path, reader_method="read_csv")
    df = PandasExecutionEngine().get_batch_data(batch_spec=batch_spec)
    assert df.dataframe.shape == test_df_small.shape
Example 4
def batch_with_split_on_whole_table_s3(test_s3_files) -> S3BatchSpec:
    bucket, keys = test_s3_files
    path = keys[0]
    full_path = f"s3a://{os.path.join(bucket, path)}"

    batch_spec = S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    return batch_spec
Example 5
def test_get_batch_with_no_s3_configured():
    batch_spec = S3BatchSpec(
        path="s3a://i_dont_exist",
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    )
    # if S3 was not configured
    execution_engine_no_s3 = PandasExecutionEngine()

    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(batch_spec=batch_spec)
Example 6
    def build_batch_spec(self,
                         batch_definition: BatchDefinition) -> S3BatchSpec:
        """
        Build BatchSpec from batch_definition by calling DataConnector's build_batch_spec function.

        Args:
            batch_definition (BatchDefinition): to be used to build batch_spec

        Returns:
            BatchSpec built from batch_definition
        """
        batch_spec: PathBatchSpec = super().build_batch_spec(
            batch_definition=batch_definition)
        return S3BatchSpec(batch_spec)
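A minimal sketch of how this override might be exercised, assuming a ConfiguredAssetS3DataConnector instance named my_data_connector (as in a later example) configured for an "alpha" asset. The BatchDefinition argument names, in particular batch_identifiers, vary between Great Expectations releases, so treat the constructor call as illustrative rather than definitive.

from great_expectations.core.batch import BatchDefinition
from great_expectations.core.batch_spec import S3BatchSpec
from great_expectations.core.id_dict import IDDict

# Hypothetical names: they only need to match whatever the connector was configured
# with (the "batch_identifiers" keyword is version-dependent in Great Expectations).
batch_definition = BatchDefinition(
    datasource_name="FAKE_DATASOURCE_NAME",
    data_connector_name="my_data_connector",
    data_asset_name="alpha",
    batch_identifiers=IDDict({"index": "100"}),
)

batch_spec = my_data_connector.build_batch_spec(batch_definition=batch_definition)
assert isinstance(batch_spec, S3BatchSpec)  # the override upgrades PathBatchSpec to S3BatchSpec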
Example 7
def test_get_batch_with_split_on_whole_table_s3():
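    # NOTE: a test like this assumes an S3 mock is active (for example moto's
    # @mock_s3); without it the boto3 calls below would try to reach real AWS.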
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket,
                          Body=test_df.to_csv(index=False).encode("utf-8"),
                          Key=key)

    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"
    test_df = PandasExecutionEngine().get_batch_data(batch_spec=S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    ))
    assert test_df.dataframe.shape == (2, 2)

    # if S3 was not configured
    execution_engine_no_s3 = PandasExecutionEngine()
    execution_engine_no_s3._s3 = None
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine_no_s3.get_batch_data(batch_spec=S3BatchSpec(
            path=full_path,
            reader_method="read_csv",
            splitter_method="_split_on_whole_table",
        ))
Example 8
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket,
                          Body=test_df.to_csv(index=False).encode("utf-8"),
                          Key=key)
    path = "path/A-100.csv"
    full_path = f"s3a://{os.path.join(bucket, path)}"

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
        bucket=bucket,
        prefix="",
        assets={"alpha": {}},
    )
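    # The connector is only instantiated here; the batch below is still fetched
    # directly through the execution engine with an explicit S3BatchSpec.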

    test_df = PandasExecutionEngine().get_batch_data(batch_spec=S3BatchSpec(
        path=full_path,
        reader_method="read_csv",
        splitter_method="_split_on_whole_table",
    ))
    assert test_df.dataframe.shape == (2, 2)
Example 9
    def build_batch_spec(
        self,
        batch_definition: BatchDefinition,
        runtime_parameters: dict,
    ) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
        self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
        batch_spec: BatchSpec = super().build_batch_spec(
            batch_definition=batch_definition
        )
        if runtime_parameters.get("batch_data") is not None:
            batch_spec["batch_data"] = runtime_parameters.get("batch_data")
            return RuntimeDataBatchSpec(batch_spec)
        elif runtime_parameters.get("query"):
            batch_spec["query"] = runtime_parameters.get("query")
            return RuntimeQueryBatchSpec(batch_spec)
        elif runtime_parameters.get("path"):
            path = runtime_parameters.get("path")
            batch_spec["path"] = path
            parsed_url = urlparse(path)
            if "s3" in parsed_url.scheme:
                return S3BatchSpec(batch_spec)
            else:
                return PathBatchSpec(batch_spec)
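A second variant of the same method, shown below, checks for the presence of each runtime-parameter key rather than its value, and additionally recognizes Azure Blob Storage paths, returning an AzureBatchSpec for them.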
    def build_batch_spec(
        self,
        batch_definition: BatchDefinition,
        runtime_parameters: dict,
    ) -> Union[RuntimeDataBatchSpec, RuntimeQueryBatchSpec, PathBatchSpec]:
        self._validate_runtime_parameters(runtime_parameters=runtime_parameters)
        batch_spec: BatchSpec = super().build_batch_spec(
            batch_definition=batch_definition
        )
        if "batch_data" in runtime_parameters:
            batch_spec["batch_data"] = runtime_parameters.get("batch_data")
            return RuntimeDataBatchSpec(batch_spec)
        elif "query" in runtime_parameters:
            batch_spec["query"] = runtime_parameters.get("query")
            return RuntimeQueryBatchSpec(batch_spec)
        elif "path" in runtime_parameters:
            path: str = runtime_parameters["path"]
            batch_spec["path"] = path
            if "s3" in path:
                return S3BatchSpec(batch_spec)
            elif "blob.core.windows.net" in path:
                return AzureBatchSpec(batch_spec)
            else:
                return PathBatchSpec(batch_spec)
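To isolate just the dispatch step of the first variant, here is a small standalone sketch (not Great Expectations code) that mirrors the urlparse-based scheme check; only the two spec classes are imported from the library.

from urllib.parse import urlparse

from great_expectations.core.batch_spec import PathBatchSpec, S3BatchSpec


def pick_spec_class(path: str):
    # "s3" and "s3a" schemes map to S3BatchSpec; anything else (including plain
    # filesystem paths, which have an empty scheme) falls back to PathBatchSpec.
    parsed_url = urlparse(path)
    if "s3" in parsed_url.scheme:
        return S3BatchSpec
    return PathBatchSpec


assert pick_spec_class("s3a://test_bucket/path/A-100.csv") is S3BatchSpec
assert pick_spec_class("/tmp/path/A-100.csv") is PathBatchSpec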