def test_batch_data_get_validator_failed_specification_no_batch_identifier_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    """get_validator() must reject a batch_data RuntimeBatchRequest whose
    batch_identifiers are None or omitted entirely."""
    context: DataContext = data_context_with_datasource_pandas_engine
    df: pd.DataFrame = test_df_pandas
    context.create_expectation_suite("my_expectations")

    # fields shared by both failing requests
    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "runtime_parameters": {"batch_data": df},
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__(): batch_identifiers should not be None
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(batch_identifiers=None, **shared_kwargs),
            expectation_suite_name="my_expectations",
        )

    # batch_identifiers should not be omitted
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(**shared_kwargs),
            expectation_suite_name="my_expectations",
        )
def test_file_path_get_batch_successful_specification_pandas_engine_named_asset_two_batch_requests(
    data_context_with_datasource_pandas_engine, taxi_test_file
):
    """Two successive path-based runtime requests against the same named asset
    each yield exactly one Batch carrying its own batch_identifiers."""
    context: DataContext = data_context_with_datasource_pandas_engine

    for identifiers in ({"day": 1, "month": 12}, {"day": 2, "month": 12}):
        batches: List[Batch] = context.get_batch_list(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="asset_a",
                runtime_parameters={"path": taxi_test_file},
                batch_identifiers=identifiers,
            )
        )
        assert len(batches) == 1
        assert isinstance(batches[0], Batch)
        # the identifiers supplied on the request round-trip to the definition
        assert batches[0].batch_definition.batch_identifiers == identifiers
def test_get_batch_failed_specification_no_runtime_parameters_pandas_engine(
    data_context_with_datasource_pandas_engine, test_df_pandas
):
    """get_batch_list() must reject a RuntimeBatchRequest whose runtime_parameters
    are None or omitted entirely."""
    context: DataContext = data_context_with_datasource_pandas_engine
    test_df: pd.DataFrame = test_df_pandas  # kept for parity with sibling tests

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "batch_identifiers": {"default_identifier_name": "identifier_name"},
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (None)
        batch: list = context.get_batch_list(
            batch_request=RuntimeBatchRequest(runtime_parameters=None, **shared_kwargs)
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        batch: list = context.get_batch_list(RuntimeBatchRequest(**shared_kwargs))
def test_batch_data_get_batch_successful_specification_sparkdf_engine_named_asset_two_batch_requests(
    data_context_with_datasource_spark_engine, test_df_spark
):
    """Two in-memory Spark DataFrame requests against the same named asset each
    yield exactly one Batch carrying its own batch_identifiers."""
    context: DataContext = data_context_with_datasource_spark_engine
    df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    for identifiers in ({"day": 1, "month": 12}, {"day": 2, "month": 12}):
        batches: List[Batch] = context.get_batch_list(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name="default_runtime_data_connector_name",
                data_asset_name="asset_a",
                runtime_parameters={"batch_data": df},
                batch_identifiers=identifiers,
            )
        )
        assert len(batches) == 1
        assert isinstance(batches[0], Batch)
        # the identifiers supplied on the request round-trip to the definition
        assert batches[0].batch_definition.batch_identifiers == identifiers
def test_batch_identifiers_and_batch_identifiers_error_illegal_keys(
    basic_datasource,
):
    """Batch identifier keys not declared in the data connector's configuration
    must be rejected with a DataConnectorError.

    Two cases are exercised: all legal keys plus a single illegal key, and a
    single illegal key on its own.
    """
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Look the connector up once; the original fetched it twice redundantly.
    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    # Ensure that keys in batch_identifiers that are not among the batch_identifiers
    # declared in configuration are not accepted.
    # In this test, all legal keys plus a single illegal key are present.
    batch_identifiers: dict = {
        "pipeline_stage_name": "core_processing",
        "airflow_run_id": 1234567890,
        "custom_key_0": "custom_value_0",
        "custom_key_1": "custom_value_1",
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(
        datasource_name=basic_datasource.name,
        data_connector_name=test_runtime_data_connector.name,
        data_asset_name="my_data_asset_name",
        runtime_parameters={"batch_data": test_df},
        batch_identifiers=batch_identifiers,
    )
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = test_runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=batch_request
        )

    # Ensure that keys in batch_identifiers that are not among the batch_identifiers
    # declared in configuration are not accepted.
    # In this test, a single illegal key is present.
    # NOTE(review): the nested "batch_identifiers" wrapper mirrors the original
    # test; here the top-level "batch_identifiers" key itself is the illegal one.
    batch_identifiers = {"batch_identifiers": {"unknown_key": "some_value"}}
    batch_request = RuntimeBatchRequest(
        datasource_name=basic_datasource.name,
        data_connector_name=test_runtime_data_connector.name,
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": test_df},
        batch_identifiers=batch_identifiers,
    )
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_definition_list = (
            test_runtime_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=batch_request
            )
        )
def test_get_batch_failed_specification_no_batch_identifier_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    """get_batch_list() must reject a query-based RuntimeBatchRequest whose
    batch_identifiers are None or omitted."""
    context = data_context_with_datasource_sqlalchemy_engine

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "runtime_parameters": {
            "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
        },
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch_list: List[Batch] = context.get_batch_list(
            RuntimeBatchRequest(batch_identifiers=None, **shared_kwargs)
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (omitted)
        batch_list: List[Batch] = context.get_batch_list(
            RuntimeBatchRequest(**shared_kwargs)
        )
def test_get_validator_failed_specification_no_runtime_parameters_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    """get_validator() must reject a RuntimeBatchRequest whose runtime_parameters
    are None or omitted."""
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "batch_identifiers": {"default_identifier_name": "identifier_name"},
    }

    with pytest.raises(TypeError):
        # runtime_parameters should not be None
        batch_list: List[Batch] = context.get_validator(
            batch_request=RuntimeBatchRequest(runtime_parameters=None, **shared_kwargs),
            expectation_suite_name="my_expectations",
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        batch_list: List[Batch] = context.get_validator(
            RuntimeBatchRequest(**shared_kwargs)
        )
def test_get_validator_failed_specification_no_batch_identifier_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    """get_validator() must reject a query-based RuntimeBatchRequest whose
    batch_identifiers are None or omitted."""
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "runtime_parameters": {
            "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
        },
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__(): batch_identifiers should not be None
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(batch_identifiers=None, **shared_kwargs),
            expectation_suite_name="my_expectations",
        )

    # batch_identifiers should not be omitted
    with pytest.raises(TypeError):
        validator: Validator = context.get_validator(
            batch_request=RuntimeBatchRequest(**shared_kwargs),
            expectation_suite_name="my_expectations",
        )
def test_batch_data_get_batch_failed_specification_no_batch_identifier_sparkdf_engine(
    data_context_with_datasource_spark_engine, spark_session, test_df_spark
):
    """get_batch_list() must reject a Spark batch_data request whose
    batch_identifiers are None or omitted."""
    context: DataContext = data_context_with_datasource_spark_engine
    df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "runtime_parameters": {"batch_data": df},
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch: list = context.get_batch_list(
            RuntimeBatchRequest(batch_identifiers=None, **shared_kwargs)
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (omitted)
        batch: list = context.get_batch_list(RuntimeBatchRequest(**shared_kwargs))
def test_batch_data_pandas_execution_engine_no_batch_identifiers(
    datasource_with_runtime_data_connector_and_pandas_execution_engine,
):
    """The pandas runtime datasource must reject batch requests whose
    batch_identifiers are None or omitted."""
    datasource = datasource_with_runtime_data_connector_and_pandas_execution_engine
    df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    shared_kwargs: dict = {
        "datasource_name": datasource.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {"batch_data": df},
    }

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing (set to None)
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(batch_identifiers=None, **shared_kwargs)
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # batch_identifiers missing
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(**shared_kwargs)
        )
def test_batch_data_get_validator_failed_specification_no_runtime_parameters_sparkdf_engine(
    data_context_with_datasource_spark_engine, spark_session, test_df_spark
):
    """get_validator() must reject a RuntimeBatchRequest whose runtime_parameters
    are None or omitted (Spark engine)."""
    context: DataContext = data_context_with_datasource_spark_engine
    df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821
    context.create_expectation_suite("my_expectations")

    shared_kwargs: dict = {
        "datasource_name": "my_datasource",
        "data_connector_name": "default_runtime_data_connector_name",
        "data_asset_name": "default_data_asset_name",
        "batch_identifiers": {"default_identifier_name": "identifier_name"},
    }

    with pytest.raises(TypeError):
        # runtime_parameters should not be None
        batch: list = context.get_validator(
            batch_request=RuntimeBatchRequest(runtime_parameters=None, **shared_kwargs),
            expectation_suite_name="my_expectations",
        )

    # TypeError raised by _validate_runtime_batch_request_specific_init_parameters()
    # in RuntimeBatchRequest.__init__()
    with pytest.raises(TypeError):
        # runtime_parameters missing (omitted)
        batch: list = context.get_validator(RuntimeBatchRequest(**shared_kwargs))
def test_get_available_data_asset_names_updating_after_batch_request(basic_datasource):
    """get_available_data_asset_names() starts empty and accumulates each asset
    name as batch requests are processed."""
    connector: RuntimeDataConnector = basic_datasource.data_connectors[
        "test_runtime_data_connector"
    ]
    df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # empty if data_connector has not been used
    assert connector.get_available_data_asset_names() == []

    seen_assets: List[str] = []
    for asset_name in ("my_data_asset_1", "my_data_asset_2"):
        request = RuntimeBatchRequest(
            datasource_name=basic_datasource.name,
            data_connector_name=connector.name,
            data_asset_name=asset_name,
            runtime_parameters={"batch_data": df},
            batch_identifiers={"airflow_run_id": 1234567890},
        )
        connector.get_batch_definition_list_from_batch_request(batch_request=request)
        seen_assets.append(asset_name)
        # every processed request adds its asset name to the available list
        assert connector.get_available_data_asset_names() == seen_assets
def test_get_batch_list_from_batch_request_length_one_from_query_named_asset_two_batch_requests(
    datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine, sa
):
    """Two query-based runtime batch requests against the same named asset each
    produce exactly one SQL-backed Batch."""
    datasource = datasource_with_runtime_data_connector_and_sqlalchemy_execution_engine
    # interacting with the database using query
    test_query: str = "SELECT * FROM table_full__I;"

    for identifiers in ({"day": 1, "month": 12}, {"day": 2, "month": 12}):
        request = RuntimeBatchRequest(
            datasource_name=datasource.name,
            data_connector_name="test_runtime_data_connector",
            data_asset_name="asset_a",
            runtime_parameters={"query": test_query},
            batch_identifiers=identifiers,
        )
        batches: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=request
        )
        # batches are a little bit more difficult to test because of
        # batch_markers: they are the ones that uniquely identify the data
        assert len(batches) == 1
        my_batch = batches[0]
        assert my_batch.batch_spec is not None
        assert my_batch.batch_definition["data_asset_name"] == "asset_a"
        assert isinstance(my_batch.data.selectable, sqlalchemy.Table)
def test_get_batch_with_query_in_runtime_parameters_using_runtime_data_connector(
    sa,
    data_context_with_runtime_sql_datasource_for_testing_get_batch,
    sqlite_view_engine,
):
    """A query-based runtime request yields a SqlAlchemyBatchData backed by a temp
    table by default; create_temp_table=False suppresses new temp tables."""
    context: DataContext = (
        data_context_with_runtime_sql_datasource_for_testing_get_batch
    )

    def _make_request(batch_spec_passthrough=None) -> RuntimeBatchRequest:
        # request fields shared by both calls; only batch_spec_passthrough varies
        kwargs: dict = {
            "datasource_name": "my_runtime_sql_datasource",
            "data_connector_name": "my_runtime_data_connector",
            "data_asset_name": "IN_MEMORY_DATA_ASSET",
            "runtime_parameters": {
                "query": "SELECT * FROM table_partitioned_by_date_column__A"
            },
            "batch_identifiers": {
                "pipeline_stage_name": "core_processing",
                "airflow_run_id": 1234567890,
            },
        }
        if batch_spec_passthrough is not None:
            kwargs["batch_spec_passthrough"] = batch_spec_passthrough
        return RuntimeBatchRequest(**kwargs)

    batch: Batch = context.get_batch(batch_request=_make_request())

    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == "IN_MEMORY_DATA_ASSET"
    assert isinstance(batch.data, SqlAlchemyBatchData)
    selectable_table_name = batch.data.selectable.name
    selectable_count_sql_str = f"select count(*) from {selectable_table_name}"
    sa_engine = batch.data.execution_engine.engine
    assert sa_engine.execute(selectable_count_sql_str).scalar() == 120
    assert batch.batch_markers.get("ge_load_time") is not None
    # since create_temp_table defaults to True, there should be 1 temp table
    assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1

    # if create_temp_table in batch_spec_passthrough is set to False, no new temp
    # tables should be created
    batch = context.get_batch(
        batch_request=_make_request(batch_spec_passthrough={"create_temp_table": False})
    )
    assert len(get_sqlite_temp_table_names(batch.data.execution_engine.engine)) == 1
def test_basic_datasource_runtime_data_connector_error_checking(
    basic_datasource_with_runtime_data_connector,
):
    """Error handling for runtime requests: unknown datasource / data connector
    names raise ValueError; missing or empty batch_identifiers alongside
    batch_data raise DataConnectorError."""
    datasource = basic_datasource_with_runtime_data_connector
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=datasource.name,
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for illegal absence of batch_identifiers when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=datasource.name,
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
                runtime_parameters={"batch_data": test_df},
                batch_identifiers=None,
            )
        )

    # Test for illegal falsiness of batch_identifiers when batch_data is specified
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=datasource.name,
                data_connector_name="test_runtime_data_connector",
                data_asset_name="my_data_asset",
                runtime_parameters={"batch_data": test_df},
                batch_identifiers=dict(),
            )
        )
def test_file_path_sparkedf_execution_engine_get_batch_list_with_named_asset(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file,
):
    """A path-based runtime request against a named Spark asset returns a single
    SparkDFBatchData batch whose identifiers round-trip to the definition."""
    datasource = datasource_with_runtime_data_connector_and_sparkdf_execution_engine
    identifiers: Dict[str, int] = {"day": 1, "month": 12}

    # Verify that all keys in batch_identifiers are acceptable as
    # batch_identifiers (using batch count).
    request = RuntimeBatchRequest(
        datasource_name=datasource.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="asset_a",
        runtime_parameters={"path": taxi_test_file},
        batch_identifiers=identifiers,
    )
    batches: List[Batch] = datasource.get_batch_list_from_batch_request(
        batch_request=request
    )

    assert len(batches) == 1
    first_batch = batches[0]
    assert first_batch.batch_spec is not None
    assert first_batch.batch_definition["data_asset_name"] == "asset_a"
    assert isinstance(first_batch.data, SparkDFBatchData)
    assert first_batch.data.dataframe.count() == 10001
    assert len(first_batch.data.dataframe.columns) == 18
    assert first_batch.batch_definition.batch_identifiers == identifiers
def test_get_validator_wrong_type_sparkdf_engine(
    data_context_with_datasource_spark_engine, spark_session, test_df_spark
):
    """get_validator() must propagate the TypeError raised when a
    RuntimeBatchRequest field has the wrong type."""
    context: DataContext = data_context_with_datasource_spark_engine
    test_df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821
    context.create_expectation_suite("my_expectations")
    # raised by _validate_runtime_batch_request_specific_init_parameters() in RuntimeBatchRequest.__init__()
    # data_connector_name should be a str, not an int
    with pytest.raises(TypeError):
        context.get_validator(
            batch_request=RuntimeBatchRequest(
                datasource_name="my_datasource",
                data_connector_name=1,
                data_asset_name="default_data_asset_name",
                runtime_parameters={
                    "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
                },
                batch_identifiers={"default_identifier_name": "identifier_name"},
            ),
            expectation_suite_name="my_expectations",
        )
def test_batch_data_get_validator_ambiguous_parameter_sparkdf_engine(
    data_context_with_datasource_spark_engine, spark_session, test_df_spark
):
    """
    What does this test and why?
    get_batch_list() requires batch_request to be passed in a named parameter.
    This test passes in a batch_request as an unnamed parameter, which will raise
    a GreatExpectationsTypeError
    """
    context: DataContext = data_context_with_datasource_spark_engine
    df: "pyspark.sql.dataframe.DataFrame" = test_df_spark  # noqa: F821
    context.create_expectation_suite("my_expectations")

    positional_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name="default_data_asset_name",
        runtime_parameters={"batch_data": df},
        batch_identifiers={"default_identifier_name": "identifier_name"},
    )
    # raised by get_batch_list() in DataContext
    with pytest.raises(ge_exceptions.GreatExpectationsTypeError):
        batch_list: List[Batch] = context.get_validator(
            positional_request,
            expectation_suite_name="my_expectations",
        )
def test_batch_data_sparkdf_execution_engine_unknown_data_connector(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, spark_session
):
    """A batch request naming a data connector the datasource does not have must
    raise ValueError."""
    datasource = datasource_with_runtime_data_connector_and_sparkdf_execution_engine
    df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(
            data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        )
    )

    # raised by _validate_batch_request() in Datasource
    with pytest.raises(ValueError):
        # Test for an unknown data_connector
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=datasource.name,
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
                runtime_parameters={"batch_data": df},
                batch_identifiers={"default_identifier_name": "identifier_name"},
            )
        )
def test_batch_data_sparkdf_execution_engine_all_keys_present_for_batch_identifiers(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, spark_session
):
    """A request using only batch_identifier keys declared in configuration is
    accepted and yields exactly one batch."""
    datasource = datasource_with_runtime_data_connector_and_sparkdf_execution_engine
    df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(
            data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        )
    )
    identifiers: Dict[str, int] = {
        "pipeline_stage_name": "core_processing",
        "airflow_run_id": 1234567890,
        "custom_key_0": "custom_value_0",
    }

    # Verify that all keys in batch_identifiers are acceptable as
    # batch_identifiers (using batch count).
    request = RuntimeBatchRequest(
        datasource_name=datasource.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers=identifiers,
    )
    batches: List[Batch] = datasource.get_batch_list_from_batch_request(
        batch_request=request
    )
    assert len(batches) == 1
def test_batch_data_sparkdf_execution_engine_batch_identifiers_error_one_illegal_key(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, spark_session
):
    """A single undeclared batch_identifier key must be rejected with a
    DataConnectorError."""
    datasource = datasource_with_runtime_data_connector_and_sparkdf_execution_engine
    df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(
            data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        )
    )

    # Ensure that keys in batch_identifiers that are not among batch_identifiers
    # declared in configuration are not accepted. In this test, a single illegal
    # key is present.
    request = RuntimeBatchRequest(
        datasource_name=datasource.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": df},
        batch_identifiers={"unknown_key": "some_value"},
    )
    with pytest.raises(ge_exceptions.DataConnectorError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = datasource.get_batch_list_from_batch_request(
            batch_request=request
        )
def test_get_validator_ambiguous_parameter_sqlalchemy_engine(
    data_context_with_datasource_sqlalchemy_engine, sa
):
    """
    What does this test and why?
    get_batch_list() requires batch_request to be passed in a named parameter.
    This test passes in a batch_request as an unnamed parameter, which will raise
    a GreatExpectationsTypeError
    """
    context: DataContext = data_context_with_datasource_sqlalchemy_engine
    context.create_expectation_suite("my_expectations")

    positional_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name="default_data_asset_name",
        runtime_parameters={
            "query": "SELECT * from table_partitioned_by_date_column__A LIMIT 10"
        },
        batch_identifiers={"default_identifier_name": "identifier_name"},
    )
    # raised by get_batch_list() in DataContext
    with pytest.raises(ge_exceptions.GreatExpectationsTypeError):
        batch_list: List[Batch] = context.get_validator(
            positional_request,
            expectation_suite_name="my_expectations",
        )
def test_batch_data_sparkdf_execution_engine_set_data_asset_name_for_runtime_data(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, spark_session
):
    """The data_asset_name supplied on a runtime request is carried through to the
    resulting batch definition."""
    datasource = datasource_with_runtime_data_connector_and_sparkdf_execution_engine
    df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(
            data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        )
    )

    # set : my_runtime_data_asset
    request = RuntimeBatchRequest(
        datasource_name=datasource.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="my_runtime_data_asset",
        runtime_parameters={"batch_data": df},
        batch_identifiers={
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
            "custom_key_0": "custom_value_0",
        },
    )
    batches: List[Batch] = datasource.get_batch_list_from_batch_request(
        batch_request=request
    )
    assert batches[0].batch_definition.data_asset_name == "my_runtime_data_asset"
def test_get_batch_with_pipeline_style_batch_request_incompatible_batch_data_and_spark_df_execution_engine_error(
    basic_spark_datasource,
):
    """Passing a plain string as batch_data to a Spark execution engine must raise
    ExecutionEngineError."""
    request: Union[dict, BatchRequest] = RuntimeBatchRequest(
        datasource_name=basic_spark_datasource.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": "SELECT * FROM my_table"},
        batch_identifiers={
            "pipeline_stage_name": "core_processing",
            "airflow_run_id": 1234567890,
        },
    )

    with pytest.raises(ge_exceptions.ExecutionEngineError):
        # noinspection PyUnusedLocal
        batch_list: List[Batch] = basic_spark_datasource.get_batch_list_from_batch_request(
            batch_request=request
        )
def test_get_batch_with_pipeline_style_batch_request(basic_pandas_datasource_v013):
    """A pipeline-style (in-memory batch_data) request returns one pandas-backed
    batch with the expected spec, shape, and fingerprint marker."""
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    data_asset_name: str = "IN_MEMORY_DATA_ASSET"

    request: BatchRequest = RuntimeBatchRequest(
        datasource_name=basic_pandas_datasource_v013.name,
        data_connector_name="test_runtime_data_connector",
        data_asset_name=data_asset_name,
        runtime_parameters={"batch_data": test_df},
        batch_identifiers={"airflow_run_id": 1234567890},
    )
    batch_list: List[Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=request
    )

    assert len(batch_list) == 1
    batch: Batch = batch_list[0]
    assert batch.batch_spec is not None
    assert batch.batch_definition["data_asset_name"] == data_asset_name
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.data.dataframe.shape == (2, 2)
    assert batch.data.dataframe["col2"].values[1] == 4
    assert (
        batch.batch_markers["pandas_data_fingerprint"]
        == "1e461a0df5fe0a6db2c3bc4ef88ef1f0"
    )
def test_file_path_get_batch_successful_specification_spark_directory_no_header(
    data_context_with_datasource_spark_engine, taxi_test_file_directory, spark_session
):
    """Reading a directory of CSV files via a runtime path yields one combined Batch."""
    context: DataContext = data_context_with_datasource_spark_engine
    runtime_request = RuntimeBatchRequest(
        datasource_name="my_datasource",
        data_connector_name="default_runtime_data_connector_name",
        data_asset_name="my_data_asset",
        runtime_parameters={
            "path": taxi_test_file_directory,
        },
        batch_identifiers={"default_identifier_name": 1234567890},
        batch_spec_passthrough={
            "reader_method": "csv",
        },
    )
    batch_list: List[Batch] = context.get_batch_list(batch_request=runtime_request)
    assert len(batch_list) == 1
    assert isinstance(batch_list[0], Batch)
    batch = batch_list[0]
    assert isinstance(batch.batch_spec, BatchSpec)
    assert batch.batch_definition["data_asset_name"] == "my_data_asset"
    assert isinstance(batch, Batch)
    assert isinstance(batch.data, SparkDFBatchData)
    assert batch.data.dataframe.count() == 30003  # 3 files read in as 1
    assert len(batch.data.dataframe.columns) == 18
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_fail_directory_but_no_reader_method(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file_directory,
    spark_session,
):
    """Reading a directory without an explicit reader_method raises ExecutionEngineError.

    The SparkDFExecutionEngine can only read in multiple files from a directory if the
    reader_method is specified.
    """
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    # Fix: keep the kwargs dict and the constructed request in separate names so
    # `batch_request` is not re-annotated with an incompatible type.
    batch_request_args: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": taxi_test_file_directory,
        },
        "batch_identifiers": batch_identifiers,
        "batch_spec_passthrough": {"reader_options": {"header": True}},
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request_args)
    # raised by guess_reader_method_from_path() in SparkDFExecutionEngine
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request
        )
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_success_file_path_no_headers(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    taxi_test_file,
    spark_session,
):
    """A single CSV file read without header options yields one Batch; the header row
    is counted as data because headers are not read in by default."""
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    # Fix: keep the kwargs dict and the constructed request in separate names so
    # `batch_request` is not re-annotated with an incompatible type.
    batch_request_args: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": taxi_test_file,
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request_args)
    batch_list: List[Batch] = (
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request
        )
    )
    assert len(batch_list) == 1
    my_batch_1 = batch_list[0]
    assert my_batch_1.batch_spec is not None
    assert my_batch_1.batch_definition["data_asset_name"] == "my_data_asset"
    assert isinstance(my_batch_1.data, SparkDFBatchData)
    assert (
        my_batch_1.data.dataframe.count() == 10001
    )  # headers are not read-in by default
    assert len(my_batch_1.data.dataframe.columns) == 18
def test_file_path_sparkdf_execution_engine_batch_list_from_batch_request_failed_wrong_file_path(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
):
    """A nonexistent runtime path raises ExecutionEngineError when the batch is fetched."""
    batch_identifiers: Dict[str, int] = {
        "airflow_run_id": 1234567890,
    }
    # Fix: keep the kwargs dict and the constructed request in separate names so
    # `batch_request` is not re-annotated with an incompatible type.
    batch_request_args: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": "test_runtime_data_connector",
        "data_asset_name": "my_data_asset",
        "runtime_parameters": {
            "path": "I_dont_exist",
        },
        "batch_identifiers": batch_identifiers,
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request_args)
    # raised by guess_reader_method_from_path() in ExecutionEngine
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_list_from_batch_request(
            batch_request=batch_request
        )
def get_spark_runtime_validator(context, df):
    """Wrap *df* as a Spark DataFrame and return a Validator bound to a fresh 'my_suite'."""
    spark = get_or_create_spark_application(
        spark_config={
            "spark.sql.catalogImplementation": "hive",
            "spark.executor.memory": "450m",
            # "spark.driver.allowMultipleContexts": "true",  # This directive does not appear to have any effect.
        }
    )
    spark_df = spark.createDataFrame(df)
    runtime_request = RuntimeBatchRequest(
        datasource_name="my_spark_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="IN_MEMORY_DATA_ASSET",
        runtime_parameters={"batch_data": spark_df},
        batch_identifiers={
            "an_example_key": "a",
            "another_example_key": "b",
        },
    )
    suite = context.create_expectation_suite("my_suite", overwrite_existing=True)
    return context.get_validator(
        batch_request=runtime_request, expectation_suite=suite
    )