# Consolidated imports for the examples below (added so this listing is
# self-contained; the module paths follow the Great Expectations 0.13.x layout
# and may differ in other versions -- treat the exact paths as an assumption):
import datetime
import hashlib
import json
import re
from typing import Any, Dict, List, Optional

import boto3
import pandas as pd
import pytest

import great_expectations.exceptions as ge_exceptions
from great_expectations.core.batch import (
    Batch,
    BatchDefinition,
    BatchMarkers,
    BatchRequest,
    BatchRequestBase,
    RuntimeBatchRequest,
)
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.core.expectation_validation_result import (
    ExpectationSuiteValidationResult,
)
from great_expectations.core.id_dict import BatchSpec, IDDict
from great_expectations.data_context.util import instantiate_class_from_config
from great_expectations.datasource.data_connector import (
    ConfiguredAssetFilesystemDataConnector,
    ConfiguredAssetS3DataConnector,
    InferredAssetGCSDataConnector,
)
from great_expectations.datasource.data_connector.sorter import (
    CustomListSorter,
    DateTimeSorter,
    LexicographicSorter,
    NumericSorter,
)
from great_expectations.datasource.data_connector.util import (
    batch_definition_matches_batch_request,
    convert_batch_identifiers_to_data_reference_string_using_regex,
    convert_data_reference_string_to_batch_identifiers_using_regex,
    map_batch_definition_to_data_reference_string_using_regex,
)
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.core import ExpectColumnValueZScoresToBeLessThan
from great_expectations.expectations.registry import get_expectation_impl
from great_expectations.render.renderer import EmailRenderer, OpsgenieRenderer
from great_expectations.validator.validation_graph import (
    MetricConfiguration,
    ValidationGraph,
)
from great_expectations.validator.validator import Validator

# Fixtures (periodic_table_of_elements, create_files_and_instantiate_data_connector,
# spark_session, basic_pandas_datasource_v013, ...) and helpers such as
# create_files_in_directory and the shared `yaml` handler come from the Great
# Expectations test suite and are not reproduced here.


def test_custom_list(periodic_table_of_elements):
    Hydrogen = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    Helium = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"element": "Helium"}),
    )
    Lithium = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"element": "Lithium"}),
    )

    batch_list = [Hydrogen, Helium, Lithium]
    my_sorter = CustomListSorter(name="element",
                                 orderby="desc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Lithium, Helium, Hydrogen]

    my_sorter = CustomListSorter(name="element",
                                 orderby="asc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Hydrogen, Helium, Lithium]
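# The periodic_table_of_elements fixture is not shown in this listing; a minimal
# sketch of what it plausibly provides (the contents below are an assumption) --
# CustomListSorter only needs an ordered reference list of allowed values:
@pytest.fixture
def periodic_table_of_elements():
    # Ordered by atomic number; the test above only relies on the first three.
    return ["Hydrogen", "Helium", "Lithium", "Beryllium", "Boron", "Carbon"]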
def test_convert_data_reference_string_to_batch_identifiers_using_regex_with_named_groups(
    caplog,
):
    data_reference = "alex_20200809_1000.csv"
    pattern = r"^(?P<name>.+)_(?P<timestamp>\d+)_(?P<price>\d+)\.csv$"

    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference,
        regex_pattern=pattern,
        group_names=group_names) == (
            "DEFAULT_ASSET_NAME",
            IDDict({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000",
            }),
        )

    group_names = ["name", "timestamp",
                   "cost"]  # Mismatch between "price" and "cost"!
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference,
        regex_pattern=pattern,
        group_names=group_names) == (
            "DEFAULT_ASSET_NAME",
            IDDict({
                "name": "alex",
                "timestamp": "20200809",
            }),
        )
    assert "The named group 'price' must explicitly be stated" in caplog.text
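def test_named_group_mechanics_sketch():
    # For reference, the stdlib mechanics the helper above builds on: re.match
    # plus groupdict() yields the same name -> value mapping. (Added sketch,
    # independent of Great Expectations internals.)
    match = re.match(
        r"^(?P<name>.+)_(?P<timestamp>\d+)_(?P<price>\d+)\.csv$",
        "alex_20200809_1000.csv",
    )
    assert match.groupdict() == {
        "name": "alex",
        "timestamp": "20200809",
        "price": "1000",
    }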
def test_create_three_batch_definitions_sort_lexicographically():
    a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": "A"}),
    )
    b = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": "B"}),
    )
    c = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "C"}),
    )

    batch_list = [a, b, c]

    # sorting by "id" reverse alphabetically (descending)
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [c, b, a]

    # sorting by "id" alphabetically (ascending)
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [a, b, c]
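def test_lexicographic_sort_key_sketch():
    # LexicographicSorter orders batches by plain string comparison on the named
    # identifier; the equivalent stdlib sort, shown here as an added sketch:
    records = [{"id": "B"}, {"id": "C"}, {"id": "A"}]
    assert sorted(records, key=lambda r: r["id"], reverse=True) == [
        {"id": "C"},
        {"id": "B"},
        {"id": "A"},
    ]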
def test_convert_batch_identifiers_to_data_reference_string_using_regex():
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_1000.csv"
    )

    # Test an example with an uncaptured regex group (should return a WildcardDataReference)
    pattern = r"^(.+)_(\d+)_\d+\.csv$"
    group_names = ["name", "timestamp"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_*.csv"
    )

    # Test another example with an uncaptured regex group, this time at the start (should also return a WildcardDataReference)
    pattern = r"^.+_(\d+)_(\d+)\.csv$"
    group_names = ["timestamp", "price"]
    batch_identifiers = IDDict(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_batch_identifiers_to_data_reference_string_using_regex(
            batch_identifiers=batch_identifiers,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "*_20200809_1000.csv"
    )
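def sketch_identifiers_to_reference(batch_identifiers, regex_pattern, group_names):
    # A rough sketch of the substitution idea exercised above -- NOT the
    # library's implementation, only the mechanics: fill each capturing group,
    # left to right, with the identifier for the corresponding group name, then
    # degrade anything the pattern matches but does not capture into "*".
    filled = regex_pattern.strip("^$")
    for group_name in group_names:
        # Replace the left-most remaining "(...)" group with the identifier value.
        filled = re.sub(r"\([^()]*\)", batch_identifiers[group_name], filled, count=1)
    filled = re.sub(r"\\d\+", "*", filled)  # uncaptured \d+ runs become wildcards
    return filled.replace(".+", "*").replace(r"\.", ".")


def test_sketch_identifiers_to_reference():
    assert (
        sketch_identifiers_to_reference(
            IDDict({"name": "alex", "timestamp": "20200809", "price": "1000"}),
            r"^(.+)_(\d+)_\d+\.csv$",
            ["name", "timestamp"],
        )
        == "alex_20200809_*.csv"
    )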
def test_data_connector_query_sorted_filtered_by_custom_filter_with_index_as_slice_via_string_no_left_right_no_step(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both named functions and lambdas are acceptable Callable types for defining a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (batch_identifiers["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(batch_identifiers["timestamp"],
                                               "%Y%m%d").date() >
                datetime.datetime(2020, 7, 15).date())

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": ":3",
                },
            )))
    assert len(returned_batch_definition_list) == 3
    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch_definition_id():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))
    print(B.id)

    assert A.id != B.id
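def test_batch_definition_id_sketch():
    # The id property is a deterministic fingerprint of a definition's fields,
    # so definitions differing in any field get different ids. A sketch of the
    # general idea (a hash over the serialized fields; GE's exact serialization
    # may differ):
    def fingerprint(datasource, connector, asset, identifiers):
        payload = json.dumps(
            [datasource, connector, asset, dict(identifiers)], sort_keys=True
        )
        return hashlib.md5(payload.encode("utf-8")).hexdigest()

    assert fingerprint("A", "a", "aaa", {"id": "A"}) != fingerprint(
        "B", "b", "bbb", {"id": "B"}
    )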
def test_convert_data_reference_string_to_batch_identifiers_using_regex():
    data_reference = "alex_20200809_1000.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000",
            }
        ),
    )

    data_reference = "eugene_20200810_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        IDDict(
            {
                "name": "eugene",
                "timestamp": "20200810",
                "price": "1500",
            }
        ),
    )
    data_reference = "DOESNT_MATCH_CAPTURING_GROUPS.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_batch_identifiers_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )

    data_reference = "eugene_DOESNT_MATCH_ALL_CAPTURING_GROUPS_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_batch_identifiers_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )
def test_map_batch_definition_to_data_reference_string_using_regex():
    # not BatchDefinition
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        # noinspection PyUnusedLocal
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
def test_date_time():
    first = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"date": "20210101"}),
    )
    second = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"date": "20210102"}),
    )
    third = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": "20210103"}),
    )

    batch_list = [first, second, third]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [third, second, first]

    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [first, second, third]

    with pytest.raises(ge_exceptions.SorterError):
        # a non-string datetime_format is rejected
        i_dont_work = DateTimeSorter(name="date",
                                     datetime_format=12345,
                                     orderby="desc")

    my_date_is_not_a_string = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": 20210103}),
    )

    batch_list = [first, second, third, my_date_is_not_a_string]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")

    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
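def test_date_time_sort_key_sketch():
    # DateTimeSorter's ordering reduces to sorting on datetime.strptime of the
    # named identifier with the configured format; the stdlib analogue, added
    # here as a sketch:
    dates = ["20210103", "20210101", "20210102"]
    assert sorted(
        dates, key=lambda d: datetime.datetime.strptime(d, "%Y%m%d"), reverse=True
    ) == ["20210103", "20210102", "20210101"]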
def test_data_connector_query_sorted_filtered_by_custom_filter_with_slice_obj(
    create_files_and_instantiate_data_connector,
):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both named functions and lambdas are acceptable Callable types for defining a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (batch_identifiers["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(batch_identifiers["timestamp"],
                                               "%Y%m%d").date() >
                datetime.datetime(2020, 7, 15).date())

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": slice(3, 5, None),
                },
            )))
    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_OpsgenieRenderer_checkpoint_validation_results_success():
    batch_definition = BatchDefinition(
        datasource_name="test_datasource",
        data_connector_name="test_dataconnector",
        data_asset_name="test_data_asset",
        batch_identifiers=IDDict({"id": "my_id"}),
    )
    validation_result_suite = ExpectationSuiteValidationResult(
        results=[],
        success=True,
        statistics={
            "evaluated_expectations": 0,
            "successful_expectations": 0,
            "unsuccessful_expectations": 0,
            "success_percent": None,
        },
        meta={
            "great_expectations_version": "v0.12.2__develop",
            "active_batch_definition": batch_definition,
            "expectation_suite_name": "default",
            "run_id": "2021-01-01T000000.000000Z",
        },
    )

    rendered_output = OpsgenieRenderer().render(validation_result_suite)

    expected_output = "Batch Validation Status: Success 🎉\nExpectation suite name: default\nData asset name: test_data_asset\nRun ID: 2021-01-01T000000.000000Z\nBatch ID: ()\nSummary: 0 of 0 expectations were met"

    assert rendered_output == expected_output
    def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
        self._validate_batch_request(batch_request=batch_request)

        if len(self._data_references_cache) == 0:
            self._refresh_data_references_cache()

        batch_definition_list: List[BatchDefinition] = []
        try:
            sub_cache = self._data_references_cache[batch_request.data_asset_name]
        except KeyError as e:
            raise KeyError(
                f"data_asset_name {batch_request.data_asset_name} is not recognized."
            ) from e

        for batch_identifiers in sub_cache:
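            # Build a BatchDefinition for each cached identifier set of this
            # asset and keep only those that satisfy the batch_request.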
            batch_definition: BatchDefinition = BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=batch_request.data_asset_name,
                batch_identifiers=IDDict(batch_identifiers),
                batch_spec_passthrough=batch_request.batch_spec_passthrough,
            )
            if batch_definition_matches_batch_request(batch_definition, batch_request):
                batch_definition_list.append(batch_definition)

        return batch_definition_list
def test_data_connector_query_data_connector_query_batch_identifiers_2_key_name_timestamp(
    create_files_and_instantiate_data_connector,
):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "batch_filter_parameters": {
                        "timestamp": "20200809",
                        "name": "will",
                    },
                },
            )))
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch_definition_equality():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))

    assert A != B

    # noinspection PyUnusedLocal,PyPep8Naming
    A2 = BatchDefinition("A",
                         "a",
                         "aaa",
                         batch_identifiers=IDDict({"id": "A"}))

    assert A == A2
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (batch.__str__() == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}""")
def test_get_validator_expectation_suite_options(
    data_context_with_simple_sql_datasource_for_testing_get_batch,
):
    context = data_context_with_simple_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("some_expectations")

    # Successful specification with an existing expectation_suite_name
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite_name="some_expectations",
        date="2020-01-15",
    )

    # Successful specification with a fetched ExpectationSuite object
    some_expectations = context.get_expectation_suite("some_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite=some_expectations,
        date="2020-01-15",
    )

    # Successful specification with a fresh ExpectationSuite object
    some_more_expectations = context.create_expectation_suite(
        expectation_suite_name="some_more_expectations")
    context.get_validator(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="table_partitioned_by_date_column__A",
        expectation_suite=some_more_expectations,
        date="2020-01-15",
    )

    # Successful specification using overwrite_existing_expectation_suite
    context.get_validator(
        batch_request=BatchRequest(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            data_connector_query=IDDict(
                batch_filter_parameters={"date": "2020-01-15"}),
        ),
        create_expectation_suite_with_name="yet_more_expectations",
    )

    # Failed specification: incorrectly typed expectation suite
    with pytest.raises(TypeError):
        context.get_validator(
            datasource_name="my_sqlite_db",
            data_connector_name="daily",
            data_asset_name="table_partitioned_by_date_column__A",
            expectation_suite={
                "im": "a",
                "dictionary": "not a",
                "ExpectationSuite": False,
            },
            date="2020-01-15",
        )
def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker,PyUnusedLocal,PyPep8Naming
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))

    print(A.id)
def test_EmailRenderer_checkpoint_validation_results_with_datadocs():
    batch_definition = BatchDefinition(
        datasource_name="test_datasource",
        data_connector_name="test_dataconnector",
        data_asset_name="test_data_asset",
        batch_identifiers=IDDict({"id": "my_id"}),
    )

    validation_result_suite = ExpectationSuiteValidationResult(
        results=[],
        success=True,
        statistics={
            "evaluated_expectations": 0,
            "successful_expectations": 0,
            "unsuccessful_expectations": 0,
            "success_percent": None,
        },
        meta={
            "great_expectations_version": "v0.8.0__develop",
            "active_batch_definition": batch_definition,
            "expectation_suite_name": "default",
            "run_id": "2019-09-25T060538.829112Z",
        },
    )

    rendered_output = EmailRenderer().render(validation_result_suite)

    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )
    assert rendered_output == expected_output

    data_docs_pages = {"local_site": "file:///localsite/index.html"}
    notify_with = ["local_site"]
    rendered_output = EmailRenderer().render(validation_result_suite,
                                             data_docs_pages, notify_with)

    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><p><strong>DataDocs</strong> can be found here: <a href="file:///localsite/index.html">file:///localsite/index.html</a>.</br>(Please copy and paste link into a browser to view)</p><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )
    assert rendered_output == expected_output

    # not configured
    notify_with = ["fake_site"]
    rendered_output = EmailRenderer().render(validation_result_suite,
                                             data_docs_pages, notify_with)

    expected_output = (
        "default: Success 🎉",
        '<p><strong>Batch Validation Status</strong>: Success 🎉</p>\n<p><strong>Expectation suite name</strong>: default</p>\n<p><strong>Data asset name</strong>: test_data_asset</p>\n<p><strong>Run ID</strong>: 2019-09-25T060538.829112Z</p>\n<p><strong>Batch ID</strong>: ()</p>\n<p><strong>Summary</strong>: <strong>0</strong> of <strong>0</strong> expectations were met</p><strong>ERROR</strong>: The email is trying to provide a link to the following DataDocs: `fake_site`, but it is not configured under data_docs_sites in the great_expectations.yml</br><p>Learn <a href="https://docs.greatexpectations.io/en/latest/guides/tutorials/getting_started/set_up_data_docs.html">here</a> how to review validation results in Data Docs</p>',
    )

    assert rendered_output == expected_output
def test_complex_regex_example_with_implicit_data_asset_names(
    mock_gcs_conn, mock_list_keys, mock_emit
):
    my_data_connector: InferredAssetGCSDataConnector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    my_data_connector._refresh_data_references_cache()

    assert (len(
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="alpha",
            ))) == 3)

    assert (len(
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="beta",
            ))) == 4)

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            data_connector_query={
                "batch_filter_parameters": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )) == [
            BatchDefinition(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="alpha",
                batch_identifiers=IDDict(
                    year_dir="2020",
                    month_dir="03",
                ),
            )
        ]
def test_create_three_batch_definitions_sort_numerically():
    one = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": 1}),
    )
    two = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": 2}),
    )
    three = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": 3}),
    )

    batch_list = [one, two, three]
    my_sorter = NumericSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [three, two, one]

    my_sorter = NumericSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [one, two, three]

    # testing a non-numeric value, which should raise a SorterError
    i_should_not_work = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "aaa"}),
    )

    batch_list = [one, two, three, i_should_not_work]
    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
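def test_numeric_sort_key_sketch():
    # NumericSorter compares the named identifier numerically; a non-numeric
    # value breaks the sort key, which is broadly what the SorterError above
    # surfaces. A stdlib analogue, added here as a sketch:
    assert sorted(["2", "10", "1"], key=int) == ["1", "2", "10"]
    with pytest.raises(ValueError):
        sorted(["2", "10", "aaa"], key=int)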
    def _map_data_reference_to_batch_definition_list(
        self, data_reference: Any, data_asset_name: Optional[str] = None
    ) -> Optional[List[BatchDefinition]]:
        # Note: This is a bit hacky, but it works. In sql_data_connectors, data references *are* dictionaries,
        # allowing us to invoke `IDDict(data_reference)`
        return [
            BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=data_asset_name,
                batch_identifiers=IDDict(data_reference),
            )
        ]
def test_batch_data_sparkdf_execution_engine_get_batch_definitions_and_get_batch_basics(
    datasource_with_runtime_data_connector_and_sparkdf_execution_engine,
    spark_session,
):
    test_df: "pyspark.sql.dataframe.DataFrame" = spark_session.createDataFrame(  # noqa: F821
        data=pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    )

    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "test_asset_1"

    batch_request: Dict[str, Any] = {
        "datasource_name": datasource_with_runtime_data_connector_and_sparkdf_execution_engine.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        },
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    assert (
        len(
            datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_available_batch_definitions(
                batch_request=batch_request
            )
        )
        == 1
    )

    my_df: "pyspark.sql.dataframe.DataFrame" = spark_session.createDataFrame(  # noqa: F821
        pd.DataFrame({"x": range(10), "y": range(10)})
    )
    batch: Batch = datasource_with_runtime_data_connector_and_sparkdf_execution_engine.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    assert batch.batch_request == {}
def test_get_validator_bad_batch_request(
    data_context_with_simple_sql_datasource_for_testing_get_batch,
):
    context: "DataContext" = data_context_with_simple_sql_datasource_for_testing_get_batch
    context.create_expectation_suite("my_expectations")
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_sqlite_db",
        data_connector_name="daily",
        data_asset_name="i_dont_exist",
        data_connector_query=IDDict(
            batch_filter_parameters={"date": "2020-01-15"}),
    )
    with pytest.raises(KeyError):
        # As a result of introspection, the data assets will already be loaded into the cache;
        # an incorrect data_asset_name therefore results in a KeyError.
        context.get_validator(batch_request=batch_request,
                              expectation_suite_name="my_expectations")
def test_populate_dependencies_with_incorrect_metric_name():
    df = pd.DataFrame({"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]})
    expectation_configuration = ExpectationConfiguration(
        expectation_type="expect_column_value_z_scores_to_be_less_than",
        kwargs={
            "column": "a",
            "mostly": 0.9,
            "threshold": 4,
            "double_sided": True,
        },
    )
    # noinspection PyUnusedLocal
    expectation = ExpectColumnValueZScoresToBeLessThan(expectation_configuration)
    # noinspection PyUnusedLocal
    batch = Batch(data=df)
    graph = ValidationGraph()
    engine = PandasExecutionEngine()
    for configuration in [expectation_configuration]:
        expectation_impl = get_expectation_impl(
            "expect_column_value_z_scores_to_be_less_than"
        )
        validation_dependencies = expectation_impl(
            configuration
        ).get_validation_dependencies(
            configuration,
            engine,
        )

        try:
            Validator(execution_engine=engine).build_metric_dependency_graph(
                graph=graph,
                execution_engine=engine,
                metric_configuration=MetricConfiguration(
                    "column_values.not_a_metric", IDDict()
                ),
                configuration=configuration,
            )
        except ge_exceptions.MetricProviderError as e:
            graph = e

    assert isinstance(graph, ge_exceptions.MetricProviderError)
def test_get_batch_definitions_and_get_batch_basics(
    basic_datasource_with_runtime_data_connector,
):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "test_asset_1"

    batch_request: dict = {
        "datasource_name": basic_datasource_with_runtime_data_connector.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "runtime_parameters": {
            "batch_data": test_df,
        },
        "batch_identifiers": {
            "airflow_run_id": 1234567890,
        },
    }
    batch_request: RuntimeBatchRequest = RuntimeBatchRequest(**batch_request)

    assert (
        len(
            basic_datasource_with_runtime_data_connector.get_available_batch_definitions(
                batch_request=batch_request
            )
        )
        == 1
    )

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_datasource_with_runtime_data_connector.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    assert batch.batch_request == {}
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (len(
        basic_pandas_datasource_v013.get_available_batch_definitions(
            batch_request=BatchRequest(
                datasource_name="my_datasource",
                data_connector_name="my_filesystem_data_connector",
                data_asset_name="Titanic",
            ))) == 6)

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            batch_identifiers=IDDict({
                "letter": "B",
                "number": "1",
            }),
        ))

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        batch_identifiers=IDDict({
            "letter": "B",
            "number": "1",
        }),
    )

    # "B1" is not a configured data asset on this connector, so the request
    # below matches nothing.
    batch_list: List[Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[Batch] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            data_connector_query={
                "batch_filter_parameters": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            batch_identifiers=IDDict({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
def test_alpha(tmp_path_factory):
    base_directory = str(tmp_path_factory.mktemp("test_alpha"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "test_dir_alpha/A.csv",
            "test_dir_alpha/B.csv",
            "test_dir_alpha/C.csv",
            "test_dir_alpha/D.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
                module_name: great_expectations.datasource.data_connector
                class_name: ConfiguredAssetFilesystemDataConnector
                datasource_name: BASE
                base_directory: {base_directory}/test_dir_alpha
                assets:
                  A:
                    glob_directive: "*.csv"
                default_regex:
                    pattern: (.+)\\.csv
                    group_names:
                    - part_1
            """, )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        ))
    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="B",
        data_connector_query=None,
    )

    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="A",
        data_connector_query=IDDict(**{"batch_filter_parameters": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))
    assert len(my_batch_definition_list) == 1
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetFilesystemDataConnector
            datasource_name: test_environment
            base_directory: {base_directory}
            glob_directive: "*.csv"
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """, )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        ))

    with pytest.raises(TypeError):
        # noinspection PyArgumentList
        my_data_connector.get_batch_definition_list_from_batch_request()

    # an empty data_asset_name raises a TypeError on the public method
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="",
            ))

    # the private method (called with BatchRequestBase) tolerates an empty data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="",
            )))
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200819",
                "price": "1300"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200810",
                "price": "1001"
            }),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )))
    assert expected == unsorted_batch_definition_list
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """, )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "execution_engine": PandasExecutionEngine(),
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        ))

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )))

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200819",
                "price": "1300"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200810",
                "price": "1001"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=IDDict(
            **{
                "batch_filter_parameters": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict(**{
            "name": "james",
            "timestamp": "20200713",
            "price": "1567",
        }),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without data_connector_query, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        data_connector_query=None,
    )
    # should return 10
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request))
    assert len(my_batch_definition_list) == 10
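def test_composite_sorter_sketch():
    # The three sorters configured above compose like a stable multi-key sort:
    # apply the least significant sorter first and the most significant last.
    # A stdlib sketch of that ordering (name asc, then timestamp desc, then
    # price desc) over a few of the rows used above:
    rows = [
        ("alex", "20200809", 1000),
        ("alex", "20200819", 1300),
        ("james", "20200713", 1567),
        ("james", "20200811", 1009),
    ]
    for key, reverse in [
        (lambda r: r[2], True),  # price desc (least significant)
        (lambda r: r[1], True),  # timestamp desc
        (lambda r: r[0], False),  # name asc (most significant)
    ]:
        rows.sort(key=key, reverse=reverse)
    assert rows == [
        ("alex", "20200819", 1300),
        ("alex", "20200809", 1000),
        ("james", "20200811", 1009),
        ("james", "20200713", 1567),
    ]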
def test_alpha():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "test_dir_alpha/A.csv",
        "test_dir_alpha/B.csv",
        "test_dir_alpha/C.csv",
        "test_dir_alpha/D.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
                module_name: great_expectations.datasource.data_connector
                class_name: ConfiguredAssetS3DataConnector
                datasource_name: BASE
                bucket: {bucket}
                prefix: test_dir_alpha
                assets:
                  A:
                default_regex:
                    pattern: .*(.+)\\.csv
                    group_names:
                    - part_1
            """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "execution_engine": PandasExecutionEngine(),
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )
    self_check_report = my_data_connector.self_check()
    print(json.dumps(self_check_report, indent=2))

    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert set(list(self_check_report["data_assets"].keys())) == {"A"}
    assert self_check_report["unmatched_data_reference_count"] == 0

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # Try to fetch a batch from a nonexistent asset
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="B",
        data_connector_query=None,
    )

    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 0

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="BASE",
        data_connector_name="general_s3_data_connector",
        data_asset_name="A",
        data_connector_query=IDDict(**{"batch_filter_parameters": {"part_1": "B"}}),
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 1
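def test_s3_alpha_regex_sketch():
    # The default_regex pattern above, .*(.+)\.csv, relies on greedy
    # backtracking: ".*" consumes the prefix, leaving "(.+)" only the single
    # character before ".csv" -- the asset letter captured as part_1.
    # A stdlib check, added here as a sketch:
    assert re.match(r".*(.+)\.csv", "test_dir_alpha/B.csv").group(1) == "B"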