def test_custom_list(periodic_table_of_elements):
    Hydrogen = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    Helium = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"element": "Helium"}),
    )
    Lithium = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"element": "Lithium"}),
    )
    batch_list = [Hydrogen, Helium, Lithium]

    my_sorter = CustomListSorter(
        name="element", orderby="desc", reference_list=periodic_table_of_elements
    )
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Lithium, Helium, Hydrogen]

    my_sorter = CustomListSorter(
        name="element", orderby="asc", reference_list=periodic_table_of_elements
    )
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Hydrogen, Helium, Lithium]
def test_convert_data_reference_string_to_batch_identifiers_using_regex_with_named_groups(
    caplog,
):
    data_reference = "alex_20200809_1000.csv"
    pattern = r"^(?P<name>.+)_(?P<timestamp>\d+)_(?P<price>\d+)\.csv$"

    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000",
            }
        ),
    )

    group_names = ["name", "timestamp", "cost"]  # Mismatch between "price" and "cost"!
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "alex",
                "timestamp": "20200809",
            }
        ),
    )
    assert "The named group 'price' must explicitly be stated" in caplog.text
def test_create_three_batch_definitions_sort_lexicographically():
    a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": "A"}),
    )
    b = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": "B"}),
    )
    c = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "C"}),
    )
    batch_list = [a, b, c]

    # sorting by "id" reverse alphabetically (descending)
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [c, b, a]

    # sorting by "id" alphabetically (ascending)
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [a, b, c]
def test_convert_batch_identifiers_to_data_reference_string_using_regex():
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_1000.csv"
    )

    # Test an example where the "price" group is uncaptured (should return a
    # WildcardDataReference)
    pattern = r"^(.+)_(\d+)_\d+\.csv$"
    group_names = ["name", "timestamp"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_*.csv"
    )

    # Test an example where the "name" group is uncaptured (should return a
    # WildcardDataReference)
    pattern = r"^.+_(\d+)_(\d+)\.csv$"
    group_names = ["timestamp", "price"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "*_20200809_1000.csv"
    )
def test_data_connector_query_sorted_filtered_by_custom_filter_with_index_as_slice_via_string_no_left_right_no_step(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector

    # Note: both a named function and a lambda are acceptable as the custom filter Callable.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (
            batch_identifiers["name"] in ["abe", "james", "eugene"]
            and datetime.datetime.strptime(
                batch_identifiers["timestamp"], "%Y%m%d"
            ).date()
            > datetime.datetime(2020, 7, 15).date()
        )

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": ":3",
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 3

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch_definition_id():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))
    print(B.id)

    assert A.id != B.id
def test_convert_data_reference_string_to_batch_identifiers_using_regex():
    data_reference = "alex_20200809_1000.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000",
            }
        ),
    )

    data_reference = "eugene_20200810_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "eugene",
                "timestamp": "20200810",
                "price": "1500",
            }
        ),
    )

    data_reference = "DOESNT_MATCH_CAPTURING_GROUPS.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_batch_identifiers_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )

    data_reference = "eugene_DOESNT_MATCH_ALL_CAPTURING_GROUPS_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_batch_identifiers_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )
def test_map_batch_definition_to_data_reference_string_using_regex():
    # not a BatchDefinition
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        # noinspection PyUnusedLocal
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            {"name": "eugene", "timestamp": "20200809", "price": "1500"}
        ),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
def test_date_time():
    first = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"date": "20210101"}),
    )
    second = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"date": "20210102"}),
    )
    third = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": "20210103"}),
    )
    batch_list = [first, second, third]

    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [third, second, first]

    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [first, second, third]

    with pytest.raises(ge_exceptions.SorterError):
        # a numeric datetime_format is invalid
        i_dont_work = DateTimeSorter(name="date", datetime_format=12345, orderby="desc")

    my_date_is_not_a_string = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": 20210103}),
    )
    batch_list = [first, second, third, my_date_is_not_a_string]
    my_sorter = DateTimeSorter(name="date", datetime_format="%Y%m%d", orderby="desc")

    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
def test_data_connector_query_sorted_filtered_by_custom_filter_with_slice_obj(
    create_files_and_instantiate_data_connector,
):
    # TODO: is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note: both a named function and a lambda are acceptable as the custom filter Callable.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (
            batch_identifiers["name"] in ["abe", "james", "eugene"]
            and datetime.datetime.strptime(
                batch_identifiers["timestamp"], "%Y%m%d"
            ).date()
            > datetime.datetime(2020, 7, 15).date()
        )

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": slice(3, 5, None),
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_OpsgenieRenderer_checkpoint_validation_results_success():
    batch_definition = BatchDefinition(
        datasource_name="test_datasource",
        data_connector_name="test_dataconnector",
        data_asset_name="test_data_asset",
        batch_identifiers=IDDict({"id": "my_id"}),
    )
    validation_result_suite = ExpectationSuiteValidationResult(
        results=[],
        success=True,
        statistics={
            "evaluated_expectations": 0,
            "successful_expectations": 0,
            "unsuccessful_expectations": 0,
            "success_percent": None,
        },
        meta={
            "great_expectations_version": "v0.12.2__develop",
            "active_batch_definition": batch_definition,
            "expectation_suite_name": "default",
            "run_id": "2021-01-01T000000.000000Z",
        },
    )

    rendered_output = OpsgenieRenderer().render(validation_result_suite)

    expected_output = "Batch Validation Status: Success 🎉\nExpectation suite name: default\nData asset name: test_data_asset\nRun ID: 2021-01-01T000000.000000Z\nBatch ID: ()\nSummary: 0 of 0 expectations were met"
    assert rendered_output == expected_output
def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
    self._validate_batch_request(batch_request=batch_request)

    if len(self._data_references_cache) == 0:
        self._refresh_data_references_cache()

    batch_definition_list: List[BatchDefinition] = []
    try:
        sub_cache = self._data_references_cache[batch_request.data_asset_name]
    except KeyError as e:
        raise KeyError(
            f"data_asset_name {batch_request.data_asset_name} is not recognized."
        ) from e

    for batch_identifiers in sub_cache:
        batch_definition: BatchDefinition = BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=batch_request.data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
            batch_spec_passthrough=batch_request.batch_spec_passthrough,
        )
        if batch_definition_matches_batch_request(batch_definition, batch_request):
            batch_definition_list.append(batch_definition)
    return batch_definition_list
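# Illustrative sketch only (not part of the source above): one way a caller might
# exercise get_batch_definition_list_from_batch_request. The datasource, connector,
# and asset names are hypothetical placeholders; substitute your own configuration.
def _example_list_batch_definitions(my_data_connector):
    batch_request = BatchRequest(
        datasource_name="example_datasource",  # hypothetical
        data_connector_name="example_connector",  # hypothetical
        data_asset_name="example_asset",  # hypothetical
    )
    # Only BatchDefinitions whose identifiers satisfy the request are returned.
    return my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=batch_request
    )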
def test_data_connector_query_data_connector_query_batch_identifiers_2_key_name_timestamp(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector

    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "batch_filter_parameters": {
                        "timestamp": "20200809",
                        "name": "will",
                    },
                },
            )
        )
    )
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch_definition_equality():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))
    assert A != B

    # noinspection PyUnusedLocal,PyPep8Naming
    A2 = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    assert A == A2
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (
        batch.__str__()
        == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}"""
    )
def test_get_validator_expectation_suite_options(
    data_context_with_simple_sql_datasource_for_testing_get_batch,
):
    context = data_context_with_simple_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("some_expectations")

    # Successful specification with an existing expectation_suite_name
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite_name="some_expectations",
        date="2020-01-15",
    )

    # Successful specification with a fetched ExpectationSuite object
    some_expectations = context.get_expectation_suite("some_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite=some_expectations,
        date="2020-01-15",
    )

    # Successful specification with a fresh ExpectationSuite object
    some_more_expectations = context.create_expectation_suite(
        expectation_suite_name="some_more_expectations"
    )
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite=some_more_expectations,
        date="2020-01-15",
    )

    # Successful specification using create_expectation_suite_with_name
    context.get_validator(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            data_connector_query=IDDict(
                batch_filter_parameters={"date": "2020-01-15"}
            ),
        ),
        create_expectation_suite_with_name="yet_more_expectations",
    )

    # Failed specification: incorrectly typed expectation suite
    with pytest.raises(TypeError):
        context.get_validator(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            expectation_suite={
                "im": "a",
                "dictionary": "not a",
                "ExpectationSuite": False,
            },
            date="2020-01-15",
        )
def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker,PyUnusedLocal,PyPep8Naming
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)
def test_EmailRenderer_checkpoint_validation_results_with_datadocs():
    batch_definition = BatchDefinition(
        datasource_name="test_datasource",
        data_connector_name="test_dataconnector",
        data_asset_name="test_data_asset",
        batch_identifiers=IDDict({"id": "my_id"}),
    )
    validation_result_suite = ExpectationSuiteValidationResult(
        results=[],
        success=True,
        statistics={
            "evaluated_expectations": 0,
            "successful_expectations": 0,
            "unsuccessful_expectations": 0,
            "success_percent": None,
        },
        meta={
            "great_expectations_version": "v0.8.0__develop",
            "active_batch_definition": batch_definition,
            "expectation_suite_name": "default",
            "run_id": "2019-09-25T060538.829112Z",
        },
    )

    rendered_output = EmailRenderer().render(validation_result_suite)
    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )
    assert rendered_output == expected_output

    data_docs_pages = {"local_site": "file:///localsite/index.html"}
    notify_with = ["local_site"]
    rendered_output = EmailRenderer().render(
        validation_result_suite, data_docs_pages, notify_with
    )
    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><p><strong>DataDocs</strong> can be found here: <a href="file:///localsite/index.html">file:///localsite/index.html</a>.</br>(Please copy and paste link into a browser to view)</p><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )
    assert rendered_output == expected_output

    # not configured
    notify_with = ["fake_site"]
    rendered_output = EmailRenderer().render(
        validation_result_suite, data_docs_pages, notify_with
    )
    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><strong>ERROR</strong>: The email is trying to provide a link to the following DataDocs: `fake_site`, but it is not configured under data_docs_sites in the great_expectations.yml</br><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )
    assert rendered_output == expected_output
def test_complex_regex_example_with_implicit_data_asset_names(
    mock_gcs_conn, mock_list_keys, mock_emit
):
    my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    my_data_connector._refresh_data_references_cache()

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )
    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            data_connector_query={
                "batch_filter_parameters": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            batch_identifiers=IDDict(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]
def test_create_three_batch_definitions_sort_numerically():
    one = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": 1}),
    )
    two = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": 2}),
    )
    three = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": 3}),
    )
    batch_list = [one, two, three]

    my_sorter = NumericSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [three, two, one]

    my_sorter = NumericSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [one, two, three]

    # sorting a non-numeric identifier should raise an error
    i_should_not_work = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "aaa"}),
    )
    batch_list = [one, two, three, i_should_not_work]
    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
def _map_data_reference_to_batch_definition_list(
    self,
    data_reference,
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    # Note: this is a bit hacky, but it works. In SQL data connectors, data
    # references *are* dictionaries, allowing us to invoke `IDDict(data_reference)`.
    return [
        BatchDefinition(
            datasource_name=self.datasource_name,
            data_connector_name=self.name,
            data_asset_name=data_asset_name,
            batch_identifiers=IDDict(data_reference),
        )
    ]
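# Minimal standalone sketch (an illustration, not part of the connector API above).
# It mirrors the mapping the method performs: a dict-style data reference becomes the
# batch_identifiers of a single BatchDefinition. All names here are hypothetical.
def _example_map_dict_reference(data_reference: dict) -> List[BatchDefinition]:
    return [
        BatchDefinition(
            datasource_name="example_datasource",  # hypothetical
            data_connector_name="example_connector",  # hypothetical
            data_asset_name="example_asset",  # hypothetical
            batch_identifiers=IDDict(data_reference),
        )
    ]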
def test_batch_data_sparkdf_execution_engine_get_batch_definitions_and_get_batch_basics(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine, spark_session
):
    test_df: "pyspark.sql.dataframe.DataFrame" = (  # noqa: F821
        spark_session.createDataFrame(
            data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
        )
    )
    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "test_asset_1"

    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        },
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    assert (
        len(
            datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_available_batch_definitions(
                batch_request=batch_request
            )
        )
        == 1
    )

    my_df: "pyspark.sql.dataframe.DataFrame" = spark_session.createDataFrame(  # noqa: F821
        pd.DataFrame({"x": range(10), "y": range(10)})
    )
    batch: Batch = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    assert batch.batch_request == {}
def test_get_validator_bad_batch_request(
    data_context_with_simple_sql_datasource_for_testing_get_batch,
):
    context: "DataContext" = (
        data_context_with_simple_sql_datasource_for_testing_get_batch
    )
    context.create_expectation_suite("my_expectations")
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="i_dont_exist",
        data_connector_query=IDDict(batch_filter_parameters={"date": "2020-01-15"}),
    )
    with pytest.raises(KeyError):
        # As a result of introspection, the data assets are already loaded into the
        # cache; an incorrect data_asset_name therefore results in a KeyError.
        context.get_validator(
            batch_request=batch_request, expectation_suite_name="my_expectations"
        )
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)

    graph = ValidationGraph()
    engine = PandasExecutionEngine()

    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            # capture the raised exception so the assertion below can inspect it
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
def test_get_batch_definitions_and_get_batch_basics(
    basic_datasource_with_runtime_data_connector,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "test_asset_1"

    batch_request: dict = {
        "datasource_name": basic_datasource_with_runtime_data_connector.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        },
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    assert (
        len(
            basic_datasource_with_runtime_data_connector.get_available_batch_definitions(
                batch_request=batch_request
            )
        )
        == 1
    )

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_datasource_with_runtime_data_connector.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    assert batch.batch_request == {}
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            batch_identifiers=IDDict(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        batch_identifiers=IDDict(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )

    batch_list: List[Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: BASE
base_directory: {base_directory}/test_dir_alpha
assets:
  A:
    glob_directive: "*.csv"
default_regex:
  pattern: (.+)\\.csv
  group_names:
    - part_1
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=IDDict(**{"batch_filter_parameters": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
  TestFiles:
default_regex:
  pattern: (.+)_(.+)_(.+)\\.csv
  group_names:
    - name
    - timestamp
    - price
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="",
            )
        )

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="",
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
class_name: ConfiguredAssetFilesystemDataConnector
datasource_name: test_environment
base_directory: {base_directory}
glob_directive: "*.csv"
assets:
  TestFiles:
default_regex:
  pattern: (.+)_(.+)_(.+)\\.csv
  group_names:
    - name
    - timestamp
    - price
sorters:
  - orderby: asc
    class_name: LexicographicSorter
    name: name
  - datetime_format: "%Y%m%d"
    orderby: desc
    class_name: DateTimeSorter
    name: timestamp
  - orderby: desc
    class_name: NumericSorter
    name: price
""",
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()
    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=IDDict(
            **{
                "batch_filter_parameters": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without data_connector_query, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
def test_alpha():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "test_dir_alpha/A.csv",
        "test_dir_alpha/B.csv",
        "test_dir_alpha/C.csv",
        "test_dir_alpha/D.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
module_name: great_expectations.datasource.data_connector
class_name: ConfiguredAssetS3DataConnector
datasource_name: BASE
bucket: {bucket}
prefix: test_dir_alpha
assets:
  A:
default_regex:
  pattern: .*(.+)\\.csv
  group_names:
    - part_1
""",
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="B",
        data_connector_query=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="A",
        data_connector_query=IDDict(**{"batch_filter_parameters": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1