def test_batch_data_get_validator_failed_specification_no_batch_identifier_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    context: DataContext = data_context_with_datasource_pandas_engine
    test_df: pd.DataFrame = test_df_pandas

    context.create_expectation_suite("my_expectations")

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    # batch_identifiers should not be None
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={"batch_data": test_df},
                batch_identifiers=None,
            ),
            expectation_suite_name="my_expectations",
        )

    # batch_identifiers should not be omitted
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={"batch_data": test_df},
            ),
            expectation_suite_name="my_expectations",
        )
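# Illustrative sketch (not from the original suite): the passing counterpart to the two
# failing calls above.  Supplying a non-None batch_identifiers dict (the
# "default_identifier_name" key used by the other tests in this module) lets
# RuntimeBatchRequest validate and get_validator() return a Validator.  Assumes the same
# module-level imports and pytest fixtures as the surrounding tests.
def example_get_validator_with_batch_identifiers_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    context: DataContext = data_context_with_datasource_pandas_engine
    context.create_expectation_suite("my_expectations")
    validator: Validator = context.get_validator(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="default_data_asset_name",
            runtime_parameters={"batch_data": test_df_pandas},
            batch_identifiers={"default_identifier_name": "identifier_name"},
        ),
        expectation_suite_name="my_expectations",
    )
    assert validator is not None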
def test_file_path_get_batch_successful_specification_pandas_engine_named_asset_two_batch_requests(
    data_context_with_datasource_pandas_engine, taxi_test_file
):
    context: DataContext = data_context_with_datasource_pandas_engine
    batch_identifiers: Dict[str, int] = {"day": 1, "month": 12}
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="asset_a",
            runtime_parameters={"path": taxi_test_file},
            batch_identifiers=batch_identifiers,
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)

    batch_1: Batch = batch_list[0]
    assert batch_1.batch_definition.batch_identifiers == batch_identifiers

    batch_identifiers: Dict[str, int] = {"day": 2, "month": 12}
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="asset_a",
            runtime_parameters={"path": taxi_test_file},
            batch_identifiers=batch_identifiers,
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)
    batch_2: Batch = batch_list[0]
    assert batch_2.batch_definition.batch_identifiers == batch_identifiers
def test_get_batch_failed_specification_no_runtime_parameters_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    context: DataContext = data_context_with_datasource_pandas_engine
    test_df: pd.DataFrame = test_df_pandas

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (None)
        batch: list = context.get_batch_list(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters=None,
                batch_identifiers={"default_identifier_name": "identifier_name"},
            )
        )

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        batch: list = context.get_batch_list(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                batch_identifiers={"default_identifier_name": "identifier_name"},
            )
        )
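# Illustrative sketch (not from the original suite): the passing counterpart to the two
# failing calls above.  With runtime_parameters supplied (an in-memory pandas DataFrame
# under the "batch_data" key), the same request validates and get_batch_list() returns a
# single Batch.  Assumes the same module-level imports and pytest fixtures as the
# surrounding tests.
def example_get_batch_list_with_runtime_parameters_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    context: DataContext = data_context_with_datasource_pandas_engine
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="default_data_asset_name",
            runtime_parameters={"batch_data": test_df_pandas},
            batch_identifiers={"default_identifier_name": "identifier_name"},
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)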
def test_batch_data_get_batch_successful_specification_sparkdf_engine_named_asset_two_batch_requests(
        data_context_with_datasource_spark_engine, test_df_spark):
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    batch_identifiers: Dict[str, int] = {"day": 1, "month": 12}
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="asset_a",
            runtime_parameters={"batch_data": test_df},
            batch_identifiers=batch_identifiers,
        ))
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)
    batch_1: Batch = batch_list[0]
    assert batch_1.batch_definition.batch_identifiers == batch_identifiers

    batch_identifiers: Dict[str, int] = {"day": 2, "month": 12}
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="asset_a",
            runtime_parameters={"batch_data": test_df},
            batch_identifiers=batch_identifiers,
        ))
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)
    batch_2: Batch = batch_list[0]
    assert batch_2.batch_definition.batch_identifiers == batch_identifiers
def test_batch_identifiers_and_batch_identifiers_error_illegal_keys(
    basic_datasource, ):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    batch_identifiers: dict

    batch_identifiers = {
        "pipeline_stage_name": "core_processing",
        "airflow_run_id": 1234567890,
        "custom_key_0": "custom_value_0",
        "custom_key_1": "custom_value_1",
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    # Ensure that batch_identifiers containing keys that are not among the batch_identifiers
    # declared in the data connector configuration are rejected.  In this test, all legal keys
    # plus a single illegal key are present.
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_name",
        "runtime_parameters": {
            "batch_data": test_df
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=batch_request)

    batch_identifiers = {"batch_identifiers": {"unknown_key": "some_value"}}

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"])

    # Ensure that batch_identifiers containing keys that are not among the batch_identifiers
    # declared in the data connector configuration are rejected.  In this test, a single
    # illegal key is present.
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "IN_MEMORY_DATA_ASSET",
        "runtime_parameters": {
            "batch_data": test_df
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=batch_request)
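# Illustrative sketch (not from the original suite): restricting batch_identifiers to keys
# that the "test_runtime_data_connector" configuration declares elsewhere in this module
# (pipeline_stage_name, airflow_run_id, custom_key_0) is expected to yield a batch
# definition instead of raising DataConnectorError.  Assumes the basic_datasource fixture
# and module-level imports used above.
def example_batch_identifiers_with_only_legal_keys(basic_datasource):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
        datasource_name=basic_datasource.name,
        data_connector_name=test_runtime_data_connector.name,
        data_asset_name="my_data_asset_name",
        runtime_parameters={"batch_data": test_df},
        batch_identifiers={
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
            "custom_key_0": "custom_value_0",
        },
    )
    batch_definition_list: List[
        BatchDefinition
    ] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )
    assert len(batch_definition_list) == 1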
def test_get_batch_failed_specification_no_batch_identifier_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    context = data_context_with_datasource_sqlalchemy_engine

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch_list: List[Batch] = context.get_batch_list(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
                batch_identifiers=None,
            )
        )

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (omitted)
        batch_list: List[Batch] = context.get_batch_list(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
            )
        )
def test_get_validator_failed_specification_no_runtime_parameters_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")
    with pytest.raises(TypeError):
        # runtime_parameters should not be None
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters=None,
                batch_identifiers={"default_identifier_name": "identifier_name"},
            ),
            expectation_suite_name="my_expectations",
        )

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        validator: Validator = context.get_validator(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                batch_identifiers={"default_identifier_name": "identifier_name"},
            )
        )
def test_get_validator_failed_specification_no_batch_identifier_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    # batch_identifiers should not be None
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
                batch_identifiers=None,
            ),
            expectation_suite_name="my_expectations",
        )

    # batch_identifiers should not be omitted
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
            ),
            expectation_suite_name="my_expectations",
        )
def test_batch_data_get_batch_failed_specification_no_batch_identifier_sparkdf_engine(
        data_context_with_datasource_spark_engine, spark_session,
        test_df_spark):
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch: list = context.get_batch_list(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={"batch_data": test_df},
                batch_identifiers=None,
            ))

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (omitted)
        batch: list = context.get_batch_list(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={"batch_data": test_df},
            ))
def test_batch_data_pandas_execution_engine_no_batch_identifiers(
    datasource_with_runtime_data_connector_and_pandas_execution_engine, ):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_pandas_execution_engine.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=datasource_with_runtime_data_connector_and_pandas_execution_engine.name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": test_df},
                    batch_identifiers=None,
                ))

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_pandas_execution_engine.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=datasource_with_runtime_data_connector_and_pandas_execution_engine.name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": test_df},
                ))
def test_batch_data_get_validator_failed_specification_no_runtime_parameters_sparkdf_engine(
        data_context_with_datasource_spark_engine, spark_session,
        test_df_spark):
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    context.create_expectation_suite("my_expectations")
    with pytest.raises(TypeError):
        # runtime_parameters should not be None
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters=None,
                batch_identifiers={
                    "default_identifier_name": "identifier_name"
                },
            ),
            expectation_suite_name="my_expectations",
        )

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        validator: Validator = context.get_validator(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                batch_identifiers={
                    "default_identifier_name": "identifier_name"
                },
            ))
def test_get_available_data_asset_names_updating_after_batch_request(basic_datasource):
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # empty if data_connector has not been used
    assert test_runtime_data_connector.get_available_data_asset_names() == []

    batch_identifiers = {
        "airflow_run_id": 1234567890,
    }

    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_1",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with my_data_asset_1
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    # updated to my_data_asset_1
    assert test_runtime_data_connector.get_available_data_asset_names() == [
        "my_data_asset_1"
    ]

    batch_identifiers = {
        "airflow_run_id": 1234567890,
    }
    batch_request: dict = {
        "datasource_name": basic_datasource.name,
        "data_connector_name": test_runtime_data_connector.name,
        "data_asset_name": "my_data_asset_2",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # run with my_data_asset_2
    test_runtime_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )

    # updated to my_data_asset_1 and my_data_asset_2
    assert test_runtime_data_connector.get_available_data_asset_names() == [
        "my_data_asset_1",
        "my_data_asset_2",
    ]
def test_get_batch_list_from_batch_request_length_one_from_query_named_asset_two_batch_requests(
    datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine, sa
):
    # interacting with the database using query
    test_query: str = "SELECT * FROM table_full__I;"

    batch_identifiers: Dict[str, Union[str, int]] = {"day": 1, "month": 12}

    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "asset_a",
        "runtime_parameters": {
            "query": test_query,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch
    ] = datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine.get_batch_list_from_batch_request(
        batch_request=batch_request
    )
    # batches are a little more difficult to test because of batch_markers,
    # which uniquely identify the data
    assert len(batch_list) == 1
    my_batch_1 = batch_list[0]
    assert my_batch_1.batch_spec is not None
    assert my_batch_1.batch_definition["data_asset_name"] == "asset_a"
    assert isinstance(my_batch_1.data.selectable, sqlalchemy.Table)

    # interacting with the database using query
    test_query: str = "SELECT * FROM table_full__I;"

    batch_identifiers: Dict[str, Union[str, int]] = {"day": 2, "month": 12}

    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "asset_a",
        "runtime_parameters": {
            "query": test_query,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch
    ] = datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine.get_batch_list_from_batch_request(
        batch_request=batch_request
    )
    # batches are a little more difficult to test because of batch_markers,
    # which uniquely identify the data
    assert len(batch_list) == 1
    my_batch_2 = batch_list[0]
    assert my_batch_2.batch_spec is not None
    assert my_batch_2.batch_definition["data_asset_name"] == "asset_a"
    assert isinstance(my_batch_2.data.selectable, sqlalchemy.Table)
def test_get_batch_with_query_in_runtime_parameters_using_runtime_data_connector(
    sa,
    data_context_with_runtime_sql_datasource_for_testing_get_batch,
    sqlite_view_engine,
):
    context: DataContext = (
        data_context_with_runtime_sql_datasource_for_testing_get_batch
    )

    batch: Batch

    batch = context.get_batch(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_runtime_sql_datasource",
            data_connector_name="my_runtime_data_connector",
            data_asset_name="IN_MEMORY_DATA_ASSET",
            runtime_parameters={
                "query": "SELECT * FROM table_partitioned_by_date_column__A"
            },
            batch_identifiers={
                "pipeline_stage_name": "core_processing",
                "airflow_run_id": 1234567890,
            },
        ),
    )

    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == "IN_MEMORY_DATA_ASSET"
    assert isinstance(batch.data, SqlAlchemyBatchData)

    selectable_table_name = batch.data.selectable.name
    selectable_count_sql_str = f"select count(*) from {selectable_table_name}"
    sa_engine = batch.data.execution_engine.engine

    assert sa_engine.execute(selectable_count_sql_str).scalar() == 120
    assert batch.batch_markers.get("ge_load_time") is not None
    # since create_temp_table defaults to True, there should be 1 temp table
    assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1

    # if create_temp_table in batch_spec_passthrough is set to False, no new temp tables should be created
    batch = context.get_batch(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_runtime_sql_datasource",
            data_connector_name="my_runtime_data_connector",
            data_asset_name="IN_MEMORY_DATA_ASSET",
            runtime_parameters={
                "query": "SELECT * FROM table_partitioned_by_date_column__A"
            },
            batch_identifiers={
                "pipeline_stage_name": "core_processing",
                "airflow_run_id": 1234567890,
            },
            batch_spec_passthrough={"create_temp_table": False},
        ),
    )
    assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1
def test_basic_datasource_runtime_data_connector_error_checking(
    basic_datasource_with_runtime_data_connector, ):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name="non_existent_datasource",
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                ))

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector.name,
                    data_connector_name="non_existent_data_connector",
                    data_asset_name="my_data_asset",
                ))

    # Test for illegal absence of batch_identifiers when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector.name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": test_df},
                    batch_identifiers=None,
                ))

    # Test for illegal falsiness of batch_identifiers when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_datasource_with_runtime_data_connector.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=basic_datasource_with_runtime_data_connector.name,
                    data_connector_name="test_runtime_data_connector",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": test_df},
                    batch_identifiers=dict(),
                ))
def test_file_path_sparkedf_execution_engine_get_batch_list_with_named_asset(
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
        taxi_test_file):
    batch_identifiers: Dict[str, int] = {"day": 1, "month": 12}
    # Verify that all keys in batch_identifiers are acceptable as batch_identifiers (using batch count).
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "asset_a",
        "runtime_parameters": {
            "path": taxi_test_file,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request)
    assert len(batch_list) == 1
    my_batch_1 = batch_list[0]

    assert my_batch_1.batch_spec is not None
    assert my_batch_1.batch_definition["data_asset_name"] == "asset_a"
    assert isinstance(my_batch_1.data, SparkDFBatchData)
    assert my_batch_1.data.dataframe.count() == 10001
    assert len(my_batch_1.data.dataframe.columns) == 18
    assert my_batch_1.batch_definition.batch_identifiers == batch_identifiers
def test_get_validator_wrong_type_sparkdf_engine(
        data_context_with_datasource_spark_engine, spark_session,
        test_df_spark):
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    context.create_expectation_suite("my_expectations")

    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    # data_connector_name should be a str, not an int
    with pytest.raises(TypeError):
        context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name=1,
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query":
                    "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
                batch_identifiers={
                    "default_identifier_name": "identifier_name"
                },
            ),
            expectation_suite_name="my_expectations",
        )
def test_batch_data_get_validator_ambiguous_parameter_sparkdf_engine(
        data_context_with_datasource_spark_engine, spark_session,
        test_df_spark):
    """
    What does this test and why?

    get_batch_list() requires batch_request to be passed as a named parameter. This test passes the
    batch_request as an unnamed (positional) parameter, which raises a GreatExpectationsTypeError.
    """
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    context.create_expectation_suite("my_expectations")
    # raised by get_batch_list() in DataContext
    with pytest.raises(ge_exceptions.GreatExpectationsTypeError):
        validator: Validator = context.get_validator(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={"batch_data": test_df},
                batch_identifiers={
                    "default_identifier_name": "identifier_name"
                },
            ),
            expectation_suite_name="my_expectations",
        )
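# Illustrative sketch (not from the original suite): the same request is accepted when the
# batch_request is passed as a named (keyword) argument, which is how the other
# get_validator() calls in this module are written.  Assumes the same fixtures and
# module-level imports as the test above.
def example_get_validator_named_batch_request_sparkdf_engine(
    data_context_with_datasource_spark_engine, test_df_spark
):
    context: DataContext = data_context_with_datasource_spark_engine
    context.create_expectation_suite("my_expectations")
    validator: Validator = context.get_validator(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="default_data_asset_name",
            runtime_parameters={"batch_data": test_df_spark},
            batch_identifiers={"default_identifier_name": "identifier_name"},
        ),
        expectation_suite_name="my_expectations",
    )
    assert validator is not None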
def test_batch_data_sparkdf_execution_engine_unknown_data_connector(
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
        spark_session):
    test_df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(data=pd.DataFrame(data={
            "col1": [1, 2],
            "col2": [3, 4]
        })))
    # raised by _validate_batch_request() in Datasource
    with pytest.raises(ValueError):
        # Test for an unknown data_connector
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
                batch_request=RuntimeBatchRequest(
                    datasource_name=datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
                    data_connector_name="non_existent_data_connector",
                    data_asset_name="my_data_asset",
                    runtime_parameters={"batch_data": test_df},
                    batch_identifiers={
                        "default_identifier_name": "identifier_name"
                    },
                ))
def test_batch_data_sparkdf_execution_engine_all_keys_present_for_batch_identifiers(
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
        spark_session):
    test_df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(data=pd.DataFrame(data={
            "col1": [1, 2],
            "col2": [3, 4]
        })))
    batch_identifiers: Dict[str, int] = {
        "pipeline_stage_name": "core_processing",
        "airflow_run_id": 1234567890,
        "custom_key_0": "custom_value_0",
    }

    # Verify that all keys in batch_identifiers are acceptable as batch_identifiers (using batch count).
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "IN_MEMORY_DATA_ASSET",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request)
    assert len(batch_list) == 1
def test_batch_data_sparkdf_execution_engine_batch_identifiers_error_one_illegal_key(
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
        spark_session):
    test_df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(data=pd.DataFrame(data={
            "col1": [1, 2],
            "col2": [3, 4]
        })))

    batch_identifiers: Dict[str, str] = {"unknown_key": "some_value"}
    # Ensure that keys in batch_identifiers that are not among batch_identifiers declared in
    # configuration are not accepted.  In this test, a single illegal key is present.
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "IN_MEMORY_DATA_ASSET",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
                batch_request=batch_request)
def test_get_validator_ambiguous_parameter_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    """
    What does this test and why?

    get_batch_list() requires batch_request to be passed as a named parameter. This test passes the
    batch_request as an unnamed (positional) parameter, which raises a GreatExpectationsTypeError.
    """
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")
    # raised by get_batch_list() in DataContext
    with pytest.raises(ge_exceptions.GreatExpectationsTypeError):
        validator: Validator = context.get_validator(
            RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
                batch_identifiers={"default_identifier_name": "identifier_name"},
            ),
            expectation_suite_name="my_expectations",
        )
def test_batch_data_sparkdf_execution_engine_set_data_asset_name_for_runtime_data(
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
        spark_session):
    test_df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(data=pd.DataFrame(data={
            "col1": [1, 2],
            "col2": [3, 4]
        })))
    batch_identifiers: Dict[str, int] = {
        "pipeline_stage_name": "core_processing",
        "airflow_run_id": 1234567890,
        "custom_key_0": "custom_value_0",
    }

    # set : my_runtime_data_asset
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_runtime_data_asset",
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request)
    assert batch_list[0].batch_definition.data_asset_name == "my_runtime_data_asset"
def test_get_batch_with_pipeline_style_batch_request_incompatible_batch_data_and_spark_df_execution_engine_error(
    basic_spark_datasource, ):
    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "IN_MEMORY_DATA_ASSET"

    batch_request: Union[dict, BatchRequest]

    batch_request = {
        "datasource_name": basic_spark_datasource.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": "SELECT * FROM my_table",
        },
        "batch_identifiers": {
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
        },
    }
    batch_request = RuntimeBatchRequest(**batch_request)
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        # noinspection PyUnusedLocal
        batch_list: List[
            Batch] = basic_spark_datasource.get_batch_list_from_batch_request(
                batch_request=batch_request)
def test_get_batch_with_pipeline_style_batch_request(
        basic_pandas_datasource_v013):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "IN_MEMORY_DATA_ASSET"

    batch_request: dict = {
        "datasource_name": basic_pandas_datasource_v013.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        },
    }
    batch_request: BatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
            batch_request=batch_request)

    assert len(batch_list) == 1

    batch: Batch = batch_list[0]

    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == data_asset_name
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.data.dataframe.shape == (2, 2)
    assert batch.data.dataframe["col2"].values[1] == 4
    assert (
        batch.batch_markers["pandas_data_fingerprint"]
        == "1e461a0df5fe0a6db2c3bc4ef88ef1f0"
    )
def test_file_path_get_batch_successful_specification_spark_directory_no_header(
        data_context_with_datasource_spark_engine, taxi_test_file_directory,
        spark_session):
    context: DataContext = data_context_with_datasource_spark_engine
    batch_list: List[Batch] = context.get_batch_list(
        batch_request=RuntimeBatchRequest(
            datasource_name="my_datasource",
            data_connector_name="default_runtime_data_connector_name",
            data_asset_name="my_data_asset",
            runtime_parameters={
                "path": taxi_test_file_directory,
            },
            batch_identifiers={"default_identifier_name": 1234567890},
            batch_spec_passthrough={
                "reader_method": "csv",
            },
        ))
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)
    batch = batch_list[0]
    assert isinstance(batch.batch_spec, BatchSpec)
    assert batch.batch_definition["data_asset_name"] == "my_data_asset"
    assert isinstance(batch, Batch)
    assert isinstance(batch.data, SparkDFBatchData)
    assert batch.data.dataframe.count() == 30003  # 3 files read in as 1
    assert len(batch.data.dataframe.columns) == 18
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_fail_directory_but_no_reader_method(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file_directory,
    spark_session,
):
    # The SparkDFExecutionEngine can only read in multiple files from a directory if the reader_method is specified
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": taxi_test_file_directory,
        },
        "batch_identifiers": batch_identifiers,
        "batch_spec_passthrough": {"reader_options": {"header": True}},
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # raised by guess_reader_method_from_path() in SparkDFExecutionEngine
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
                batch_request=batch_request)
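# Illustrative sketch (not from the original suite): adding a reader_method ("csv", as in
# the directory test earlier in this module) to batch_spec_passthrough tells the
# SparkDFExecutionEngine how to read a directory of files, so the same request is expected
# to return a single Batch instead of raising ExecutionEngineError.  Fixture names and
# module-level imports are the same assumptions as above.
def example_directory_read_with_reader_method_sparkdf(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file_directory,
    spark_session,
):
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
        datasource_name=datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="my_data_asset",
        runtime_parameters={"path": taxi_test_file_directory},
        batch_identifiers={"airflow_run_id": 1234567890},
        batch_spec_passthrough={
            "reader_method": "csv",
            "reader_options": {"header": True},
        },
    )
    batch_list: List[
        Batch
    ] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
        batch_request=batch_request
    )
    assert len(batch_list) == 1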
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_success_file_path_no_headers(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file,
    spark_session,
):
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": taxi_test_file,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)
    batch_list: List[
        Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request)
    assert len(batch_list) == 1
    my_batch_1 = batch_list[0]
    assert my_batch_1.batch_spec is not None
    assert my_batch_1.batch_definition["data_asset_name"] == "my_data_asset"
    assert isinstance(my_batch_1.data, SparkDFBatchData)
    assert my_batch_1.data.dataframe.count() == 10001  # headers are not read in by default
    assert len(my_batch_1.data.dataframe.columns) == 18
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_failed_wrong_file_path(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, ):
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": "I_dont_exist",
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    # raised by guess_reader_method_from_path() in ExecutionEngine
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        batch_list: List[
            Batch] = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
                batch_request=batch_request)
def get_spark_runtime_validator(context, df):
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        })
    df = spark.createDataFrame(df)
    batch_request = RuntimeBatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers={
            "an_example_key": "a",
            "another_example_key": "b",
        },
    )

    expectation_suite = context.create_expectation_suite(
        "my_suite", overwrite_existing=True)

    validator = context.get_validator(batch_request=batch_request,
                                      expectation_suite=expectation_suite)

    return validator
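# Illustrative usage sketch (not from the original source): the helper above expects a
# Great Expectations DataContext and a pandas DataFrame, converts the frame to Spark,
# wraps it in a RuntimeBatchRequest, and returns a Validator bound to a freshly created
# "my_suite" expectation suite.  The caller-supplied context, DataFrame, and "col1"
# column below are assumptions for illustration only.
def example_use_spark_runtime_validator(context, pandas_df: pd.DataFrame):
    validator = get_spark_runtime_validator(context, pandas_df)
    # Expectations can then be evaluated directly against the in-memory Spark data.
    result = validator.expect_column_values_to_not_be_null(column="col1")
    return result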