Code example #1
def test_partition_request_partition_request_partition_identifiers_1_key(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_request={
                "partition_identifiers": {
                    "timestamp": "20200809"
                },
            },
        ))
    assert len(returned_batch_definition_list) == 4

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
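
For context, `partition_identifiers` acts as a subset filter: a batch definition matches when every requested identifier appears in its `partition_definition` with the same value, which is why all four files stamped `20200809` come back. A minimal sketch of that matching rule (the helper name is illustrative, not a Great Expectations API):

def matches_identifiers(partition_definition: dict, partition_identifiers: dict) -> bool:
    # A batch matches when every requested identifier is present with the same value.
    return all(
        partition_definition.get(key) == value
        for key, value in partition_identifiers.items()
    )

assert matches_identifiers(
    {"name": "abe", "timestamp": "20200809", "price": "1040"},
    {"timestamp": "20200809"},
)
assert not matches_identifiers(
    {"name": "james", "timestamp": "20200713", "price": "1567"},
    {"timestamp": "20200809"},
)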
Code example #2
def test_custom_list(periodic_table_of_elements):
    Hydrogen = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({"element": "Hydrogen"}),
    )
    Helium = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        partition_definition=PartitionDefinition({"element": "Helium"}),
    )
    Lithium = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"element": "Lithium"}),
    )

    batch_list = [Hydrogen, Helium, Lithium]
    my_sorter = CustomListSorter(name="element",
                                 orderby="desc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Lithium, Helium, Hydrogen]

    my_sorter = CustomListSorter(name="element",
                                 orderby="asc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Hydrogen, Helium, Lithium]
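
A reference-list sort like the one above reduces to sorting by each value's position in the caller-supplied list. A standalone sketch under that assumption (plain strings instead of batch definitions; not the library's implementation):

# Sort values by their position in a reference list (a tiny periodic table here).
periodic_table = ["Hydrogen", "Helium", "Lithium"]

def sort_by_reference_list(values, reference_list, descending=False):
    return sorted(values, key=reference_list.index, reverse=descending)

assert sort_by_reference_list(
    ["Lithium", "Hydrogen", "Helium"], periodic_table
) == ["Hydrogen", "Helium", "Lithium"]
assert sort_by_reference_list(
    ["Lithium", "Hydrogen", "Helium"], periodic_table, descending=True
) == ["Lithium", "Helium", "Hydrogen"]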
Code example #3
def test_create_three_batch_definitions_sort_lexicographically():
    a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({"id": "A"}),
    )
    b = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        partition_definition=PartitionDefinition({"id": "B"}),
    )
    c = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"id": "C"}),
    )

    batch_list = [a, b, c]

    # sorting by "id" reverse alphabetically (descending)
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list, )
    assert sorted_batch_list == [c, b, a]

    # sorting by "id" reverse alphabetically (ascending)
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list, )
    assert sorted_batch_list == [a, b, c]
Code example #4
def test_batch_definition_id():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)

    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    print(B.id)

    assert A.id != B.id
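
The test only requires that distinct definitions produce distinct ids; a stable content hash over the identifying fields gives exactly that property. A sketch under that assumption (illustrative only; not necessarily how `BatchDefinition.id` is computed internally):

import hashlib
import json

# A deterministic id derived from the identifying fields. This mirrors the
# property the test checks (distinct inputs yield distinct ids).
def batch_definition_id(datasource: str, connector: str, asset: str, partition: dict) -> str:
    payload = json.dumps(
        {
            "datasource_name": datasource,
            "data_connector_name": connector,
            "data_asset_name": asset,
            "partition_definition": partition,
        },
        sort_keys=True,  # stable key order makes the hash deterministic
    )
    return hashlib.md5(payload.encode("utf-8")).hexdigest()

assert batch_definition_id("A", "a", "aaa", {"id": "A"}) != batch_definition_id(
    "B", "b", "bbb", {"id": "B"}
)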
Code example #5
def test_map_data_reference_string_to_batch_definition_list_using_regex():
    # regex_pattern does not match --> None
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_____________\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list is None

    # no data_asset_name configured --> DEFAULT_ASSET_NAME
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="DEFAULT_ASSET_NAME",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000",}
            ),
        )
    ]

    # data_asset_name configured
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name="test_data_asset",
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="test_data_asset",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000",}
            ),
        )
    ]
Code example #6
def test_convert_partition_definition_to_data_reference_string_using_regex():
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    partition_definition = PartitionDefinition(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_partition_definition_to_data_reference_string_using_regex(
            partition_definition=partition_definition,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_1000.csv"
    )

    # Test an example with an uncaptured regex group (should return a WildcardDataReference)
    pattern = r"^(.+)_(\d+)_\d+\.csv$"
    group_names = ["name", "timestamp"]
    partition_definition = PartitionDefinition(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_partition_definition_to_data_reference_string_using_regex(
            partition_definition=partition_definition,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "alex_20200809_*.csv"
    )

    # Test an example with an uncaptured regex group (should return a WildcardDataReference)
    pattern = r"^.+_(\d+)_(\d+)\.csv$"
    group_names = ["timestamp", "price"]
    partition_definition = PartitionDefinition(
        **{
            "name": "alex",
            "timestamp": "20200809",
            "price": "1000",
        }
    )
    assert (
        convert_partition_definition_to_data_reference_string_using_regex(
            partition_definition=partition_definition,
            regex_pattern=pattern,
            group_names=group_names,
        )
        == "*_20200809_1000.csv"
    )
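
Conceptually, the conversion runs the regex in reverse: each capturing group is filled from the partition definition, and any group (or uncaptured token) without a corresponding name degrades to a `*` wildcard. A simplified sketch of that substitution, tailored to the flat filename patterns above (hypothetical helper; assumes no nested groups):

import re

# Fill each capturing group left-to-right from group_names, fall back to "*"
# for groups without a name, and degrade the remaining regex tokens to wildcards.
def partition_to_reference(pattern: str, group_names: list, partition: dict) -> str:
    names = iter(group_names)

    def fill(match):
        name = next(names, None)
        return partition.get(name, "*") if name is not None else "*"

    filled = re.sub(r"\([^()]*\)", fill, pattern)            # captured groups -> values
    filled = filled.replace(r"\d+", "*").replace(".+", "*")  # uncaptured tokens -> "*"
    return filled.strip("^$").replace(r"\.", ".")            # drop anchors, unescape dots

assert partition_to_reference(
    r"^(.+)_(\d+)_\d+\.csv$",
    ["name", "timestamp"],
    {"name": "alex", "timestamp": "20200809", "price": "1000"},
) == "alex_20200809_*.csv"
assert partition_to_reference(
    r"^.+_(\d+)_(\d+)\.csv$",
    ["timestamp", "price"],
    {"name": "alex", "timestamp": "20200809", "price": "1000"},
) == "*_20200809_1000.csv"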
Code example #7
def test_batch_definition_equality():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))

    assert A != B

    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    assert A == A2
Code example #8
def test_batch_definition_id():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    print(B.id)

    assert A.id != B.id
Code example #9
def test_convert_data_reference_string_to_partition_definition_using_regex():
    data_reference = "alex_20200809_1000.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        PartitionDefinition(
            {
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000",
            }
        ),
    )

    data_reference = "eugene_20200810_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference, regex_pattern=pattern, group_names=group_names
    ) == (
        "DEFAULT_ASSET_NAME",
        PartitionDefinition(
            {
                "name": "eugene",
                "timestamp": "20200810",
                "price": "1500",
            }
        ),
    )
    data_reference = "DOESNT_MATCH_CAPTURING_GROUPS.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_partition_definition_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )

    data_reference = "eugene_DOESNT_MATCH_ALL_CAPTURING_GROUPS_1500.csv"
    pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    assert (
        convert_data_reference_string_to_partition_definition_using_regex(
            data_reference=data_reference,
            regex_pattern=pattern,
            group_names=group_names,
        )
        is None
    )
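
The forward direction is a plain `re.match` plus a zip of group names onto the captured values, with `None` falling out naturally on a failed match. A minimal standalone sketch (a plain dict stands in for `PartitionDefinition`):

import re
from typing import List, Optional

# Match the reference against the pattern and zip group names onto the
# captured values; a failed match returns None, as asserted above.
def reference_to_partition(
    data_reference: str, pattern: str, group_names: List[str]
) -> Optional[dict]:
    match = re.match(pattern, data_reference)
    if match is None:
        return None
    return dict(zip(group_names, match.groups()))

assert reference_to_partition(
    "alex_20200809_1000.csv", r"^(.+)_(\d+)_(\d+)\.csv$", ["name", "timestamp", "price"]
) == {"name": "alex", "timestamp": "20200809", "price": "1000"}
assert reference_to_partition(
    "DOESNT_MATCH_CAPTURING_GROUPS.csv",
    r"^(.+)_(\d+)_(\d+)\.csv$",
    ["name", "timestamp", "price"],
) is None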
Code example #10
def test_map_batch_definition_to_data_reference_string_using_regex():
    # not BatchDefinition
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
Code example #11
def test_date_time():
    first = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({"date": "20210101"}),
    )
    second = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        partition_definition=PartitionDefinition({"date": "20210102"}),
    )
    third = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"date": "20210103"}),
    )

    batch_list = [first, second, third]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [third, second, first]

    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [first, second, third]

    with pytest.raises(ge_exceptions.SorterError):
        # a numeric datetime_format raises a SorterError
        i_dont_work = DateTimeSorter(name="date",
                                     datetime_format=12345,
                                     orderby="desc")

    my_date_is_not_a_string = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"date": 20210103}),
    )

    batch_list = [first, second, third, my_date_is_not_a_string]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")

    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
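
Under the hood, a date-time sort key can be as simple as `datetime.strptime` over the named partition value, plus a type check that rejects non-string dates, the failure mode exercised at the end of this test. A sketch of such a key function (hypothetical; `ValueError` stands in for `ge_exceptions.SorterError`):

import datetime

# A strptime-based sort key with the type check this test relies on.
def date_time_key(partition_definition: dict, name: str, datetime_format: str):
    value = partition_definition[name]
    if not isinstance(value, str):
        raise ValueError(f"Expected a string for '{name}', got {type(value).__name__}")
    return datetime.datetime.strptime(value, datetime_format)

dates = [{"date": "20210103"}, {"date": "20210101"}, {"date": "20210102"}]
ordered = sorted(dates, key=lambda p: date_time_key(p, "date", "%Y%m%d"))
assert [p["date"] for p in ordered] == ["20210101", "20210102", "20210103"]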
Code example #12
def test_batch_definition_equality():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))

    assert A != B

    # noinspection PyUnusedLocal,PyPep8Naming
    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    assert A == A2
Code example #13
def test_partition_request_sorted_filtered_by_custom_filter_with_slice_as_list(
    create_files_and_instantiate_data_connector, ):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both functions and lambdas are acceptable Callable types for defining a custom filter.

    def my_custom_partition_selector(partition_definition: dict) -> bool:
        return (partition_definition["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(
                    partition_definition["timestamp"],
                    "%Y%m%d").date() > datetime.datetime(2020, 7, 15).date())

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "custom_filter_function": my_custom_partition_selector,
                    "index": [1, 3],
                },
            )))

    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
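
The order of operations matters here: the custom filter runs first over every candidate's partition definition, and the two-element `index` list is then applied as a slice over the filtered results. A small illustration of that filter-then-slice behavior (plain dicts stand in for batch definitions):

import datetime

# Apply the custom predicate first, then the [1, 3] index as a slice.
partitions = [
    {"name": "abe", "timestamp": "20200809", "price": "1040"},
    {"name": "eugene", "timestamp": "20201129", "price": "1900"},
    {"name": "eugene", "timestamp": "20200809", "price": "1500"},
    {"name": "james", "timestamp": "20200713", "price": "1567"},  # filtered out: before 2020-07-15
]

def my_custom_partition_selector(partition_definition: dict) -> bool:
    return (
        partition_definition["name"] in ["abe", "james", "eugene"]
        and datetime.datetime.strptime(
            partition_definition["timestamp"], "%Y%m%d"
        ).date()
        > datetime.datetime(2020, 7, 15).date()
    )

filtered = [p for p in partitions if my_custom_partition_selector(p)]
assert filtered[slice(1, 3)] == [
    {"name": "eugene", "timestamp": "20201129", "price": "1900"},
    {"name": "eugene", "timestamp": "20200809", "price": "1500"},
]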
Code example #14
def test_partition_request_partition_request_partition_identifiers_1_key_and_index(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {
                        "name": "james"
                    },
                    "index": 0,
                },
            )))
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
Code example #15
    def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
        self._validate_batch_request(batch_request=batch_request)

        if self._data_references_cache is None:
            self._refresh_data_references_cache()

        batch_definition_list: List[BatchDefinition] = []

        try:
            sub_cache = self._data_references_cache[batch_request.data_asset_name]
        except KeyError as e:
            raise KeyError(
                f"data_asset_name {batch_request.data_asset_name} is not recognized."
            ) from e

        for partition_definition in sub_cache:
            batch_definition: BatchDefinition = BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=batch_request.data_asset_name,
                partition_definition=PartitionDefinition(partition_definition),
            )
            if batch_definition_matches_batch_request(batch_definition, batch_request):
                batch_definition_list.append(batch_definition)

        return batch_definition_list
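
Two details in the method above are worth noting: the data-reference cache is populated lazily on first use, and the `KeyError` is re-raised with a friendlier message. A stripped-down sketch of that lazy-cache pattern (a toy class with a caller-supplied loader, not the real data connector):

from typing import Optional

class TinyConnector:
    def __init__(self, loader):
        self._loader = loader
        self._data_references_cache: Optional[dict] = None

    def get(self, data_asset_name: str):
        if self._data_references_cache is None:  # refresh only on first access
            self._data_references_cache = self._loader()
        try:
            return self._data_references_cache[data_asset_name]
        except KeyError as e:
            raise KeyError(
                f"data_asset_name {data_asset_name} is not recognized."
            ) from e

connector = TinyConnector(lambda: {"TestFiles": [{"name": "abe"}]})
assert connector.get("TestFiles") == [{"name": "abe"}]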
Code example #16
def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    print(A.id)
Code example #17
def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker,PyUnusedLocal,PyPep8Naming
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    print(A.id)
Code example #18
def test_create_three_batch_definitions_sort_numerically():
    one = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        partition_definition=PartitionDefinition({"id": 1}),
    )
    two = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        partition_definition=PartitionDefinition({"id": 2}),
    )
    three = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"id": 3}),
    )

    batch_list = [one, two, three]
    my_sorter = NumericSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [three, two, one]

    my_sorter = NumericSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [one, two, three]

    # testing a non-numeric, which should throw an error
    i_should_not_work = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        partition_definition=PartitionDefinition({"id": "aaa"}),
    )

    batch_list = [one, two, three, i_should_not_work]
    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
Code example #19
    def _map_data_reference_to_batch_definition_list(
        self, data_reference: Any, data_asset_name: Optional[str] = None
    ) -> Optional[List[BatchDefinition]]:
        # Note: This is a bit hacky, but it works. In sql_data_connectors, data references *are* dictionaries,
        # allowing us to invoke `PartitionDefinition(data_reference)`
        return [
            BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=data_asset_name,
                partition_definition=PartitionDefinition(data_reference),
            )
        ]
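
Since SQL-style data references are already dictionaries, the mapping above amounts to wrapping the reference directly in a `PartitionDefinition`. A tiny illustration of that idea (plain dicts and hypothetical names, not the library classes):

# A dict-shaped data reference can seed the partition directly.
data_reference = {"date": "2020-08-09", "batch_id": 17}
batch_definition = {
    "datasource_name": "my_sql_datasource",
    "data_connector_name": "my_sql_data_connector",
    "data_asset_name": "events",
    "partition_definition": dict(data_reference),
}
assert batch_definition["partition_definition"]["batch_id"] == 17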
Code example #20
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            partition_definition=PartitionDefinition({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (
        batch.__str__()
        == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "partition_request": null
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "partition_definition": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}"""
    )
Code example #21
def test_get_batch_definitions_and_get_batch_basics(
    basic_datasource_with_runtime_data_connector, ):
    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    data_connector_name: str = "test_runtime_data_connector"
    data_asset_name: str = "test_asset_1"

    batch_request: dict = {
        "datasource_name": basic_datasource_with_runtime_data_connector.name,
        "data_connector_name": data_connector_name,
        "data_asset_name": data_asset_name,
        "batch_data": test_df,
        "partition_request": {
            "batch_identifiers": {
                "airflow_run_id": 1234567890,
            }
        },
        "limit": None,
    }
    batch_request: BatchRequest = BatchRequest(**batch_request)

    assert (
        len(
            basic_datasource_with_runtime_data_connector.get_available_batch_definitions(
                batch_request=batch_request
            )
        )
        == 1
    )

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_datasource_with_runtime_data_connector.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            partition_definition=PartitionDefinition({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    assert batch.batch_request == {}
Code example #22
def test_get_batch_definitions_and_get_batch_basics(basic_pandas_datasource_v013):
    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        basic_pandas_datasource_v013.data_connectors["my_filesystem_data_connector"]
    )
    create_files_in_directory(
        my_data_connector.base_directory,
        ["A_1.csv", "A_2.csv", "A_3.csv", "B_1.csv", "B_2.csv", "B_3.csv"],
    )

    assert (
        len(
            basic_pandas_datasource_v013.get_available_batch_definitions(
                batch_request=BatchRequest(
                    datasource_name="my_datasource",
                    data_connector_name="my_filesystem_data_connector",
                    data_asset_name="Titanic",
                )
            )
        )
        == 6
    )

    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_definition=PartitionDefinition(
                {
                    "letter": "B",
                    "number": "1",
                }
            ),
        )
    )

    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
    assert isinstance(batch.data.dataframe, pd.DataFrame)
    assert batch.batch_definition == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_filesystem_data_connector",
        data_asset_name="B1",
        partition_definition=PartitionDefinition(
            {
                "letter": "B",
                "number": "1",
            }
        ),
    )

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="B1",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 0

    batch_list: List[
        Batch
    ] = basic_pandas_datasource_v013.get_batch_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_filesystem_data_connector",
            data_asset_name="Titanic",
            partition_request={
                "batch_identifiers": {
                    "letter": "B",
                    "number": "1",
                }
            },
        )
    )
    assert len(batch_list) == 1
    assert isinstance(batch_list[0].data.dataframe, pd.DataFrame)

    my_df: pd.DataFrame = pd.DataFrame({"x": range(10), "y": range(10)})
    batch: Batch = basic_pandas_datasource_v013.get_batch_from_batch_definition(
        batch_definition=BatchDefinition(
            "my_datasource",
            "_pipeline",
            "_pipeline",
            partition_definition=PartitionDefinition({"some_random_id": 1}),
        ),
        batch_data=my_df,
    )
    # TODO Abe 20201104: Make sure this is what we truly want to do.
    assert batch.batch_request == {}
Code example #23
def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
                "group_names": ["year_dir", "month_dir", "data_asset_name"],
            },
            glob_directive="*/*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition
        ] = my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="alpha",
                )
            )
        )
        == 3
    )

    assert (
        len(
            my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="my_data_connector",
                    data_asset_name="beta",
                )
            )
        )
        == 4
    )

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "batch_identifiers": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]
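
The asset name here is inferred from the path itself: the capture group named `data_asset_name` is peeled off, and the remaining groups become the partition. A minimal illustration of that split (plain dicts, hypothetical helper):

import re

# Infer the asset name from the "data_asset_name" capture group and treat
# the remaining groups as the partition.
def infer_asset_and_partition(path: str, pattern: str, group_names: list):
    match = re.match(pattern, path)
    if match is None:
        return None
    groups = dict(zip(group_names, match.groups()))
    asset_name = groups.pop("data_asset_name")
    return asset_name, groups

assert infer_asset_and_partition(
    "2020/03/alpha-1005.csv",
    r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
    ["year_dir", "month_dir", "data_asset_name"],
) == ("alpha", {"year_dir": "2020", "month_dir": "03"})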
Code example #24
def test_complex_regex_example_with_implicit_data_asset_names():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "2020/01/alpha-1001.csv",
        "2020/01/beta-1002.csv",
        "2020/02/alpha-1003.csv",
        "2020/02/beta-1004.csv",
        "2020/03/alpha-1005.csv",
        "2020/03/beta-1006.csv",
        "2020/04/beta-1007.csv",
    ]
    for key in keys:
        client.put_object(Bucket=bucket,
                          Body=test_df.to_csv(index=False).encode("utf-8"),
                          Key=key)

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket=bucket,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Test for an unknown datasource
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition] = my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="non_existent_datasource",
                    data_connector_name="my_data_connector",
                    data_asset_name="my_data_asset",
                ))

    # Test for an unknown data_connector
    with pytest.raises(ValueError):
        # noinspection PyUnusedLocal
        batch_definition_list: List[
            BatchDefinition] = my_data_connector.get_batch_definition_list_from_batch_request(
                batch_request=BatchRequest(
                    datasource_name="FAKE_DATASOURCE_NAME",
                    data_connector_name="non_existent_data_connector",
                    data_asset_name="my_data_asset",
                ))

    assert (len(
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="alpha",
            ))) == 3)

    assert (len(
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="beta",
            ))) == 4)

    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_request={
                "partition_identifiers": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )) == [
            BatchDefinition(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="alpha",
                partition_definition=PartitionDefinition(
                    year_dir="2020",
                    month_dir="03",
                ),
            )
        ]
Code example #25
def test_redundant_information_in_naming_convention_bucket_sorted():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "some_bucket/2021/01/01/log_file-20210101.txt.gz",
        "some_bucket/2021/01/02/log_file-20210102.txt.gz",
        "some_bucket/2021/01/03/log_file-20210103.txt.gz",
        "some_bucket/2021/01/04/log_file-20210104.txt.gz",
        "some_bucket/2021/01/05/log_file-20210105.txt.gz",
        "some_bucket/2021/01/06/log_file-20210106.txt.gz",
        "some_bucket/2021/01/07/log_file-20210107.txt.gz",
    ]
    for key in keys:
        client.put_object(Bucket=bucket,
                          Body=test_df.to_csv(index=False).encode("utf-8"),
                          Key=key)

    my_data_connector_yaml = yaml.load(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetS3DataConnector
          datasource_name: test_environment
          name: my_inferred_asset_filesystem_data_connector
          bucket: {bucket}
          prefix: ""
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
                  - full_date
              pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
          sorters:
              - orderby: desc
                class_name: DateTimeSorter
                name: full_date

          """, )

    my_data_connector: InferredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "my_inferred_asset_filesystem_data_connector",
            "datasource_name": "test_environment",
            "execution_engine": "BASE_ENGINE",
        },
        config_defaults={
            "module_name": "great_expectations.datasource.data_connector"
        },
    )

    sorted_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
        ))

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "07",
                "full_date": "20210107"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "06",
                "full_date": "20210106"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "05",
                "full_date": "20210105"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "04",
                "full_date": "20210104"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "03",
                "full_date": "20210103"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "02",
                "full_date": "20210102"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition({
                "year": "2021",
                "month": "01",
                "day": "01",
                "full_date": "20210101"
            }),
        ),
    ]
    assert expected == sorted_batch_definition_list
Code example #26
def test_return_all_batch_definitions_sorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_sorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetFilesystemDataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        base_directory: {base_directory}
        glob_directive: "*.csv"
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetFilesystemDataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "batch_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{
                "name": "james",
                "timestamp": "20200713",
                "price": "1567",
            }
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    my_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=my_batch_request
        )
    )
    assert len(my_batch_definition_list) == 10
Code example #27
def test_return_all_batch_definitions_unsorted(tmp_path_factory):
    base_directory = str(
        tmp_path_factory.mktemp("test_return_all_batch_definitions_unsorted")
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "alex_20200809_1000.csv",
            "eugene_20200809_1500.csv",
            "james_20200811_1009.csv",
            "abe_20200809_1040.csv",
            "will_20200809_1002.csv",
            "james_20200713_1567.csv",
            "eugene_20201129_1900.csv",
            "will_20200810_1001.csv",
            "james_20200810_1003.csv",
            "alex_20200819_1300.csv",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetFilesystemDataConnector
            datasource_name: test_environment
            #execution_engine:
            #    class_name: PandasExecutionEngine
            base_directory: {base_directory}
            glob_directive: "*.csv"
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "general_filesystem_data_connector",
                "datasource_name": "test_environment",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )

    # with unnamed data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector._get_batch_definition_list_from_batch_request(
            BatchRequestBase(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name=None,
            )
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
            )
        )
    )
    assert expected == unsorted_batch_definition_list


# NOTE: assumes moto's mock_s3 is in effect; without it the boto3 calls below
# would try to reach real AWS.
@mock_s3
def test_return_all_batch_definitions_sorted():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
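    # Upload one CSV per key; each filename encodes name_timestamp_price, which
    # the connector's regex splits into partition identifiers.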
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
        class_name: ConfiguredAssetS3DataConnector
        datasource_name: test_environment
        #execution_engine:
        #    class_name: PandasExecutionEngine
        bucket: {bucket}
        prefix: ""
        assets:
            TestFiles:
        default_regex:
            pattern: (.+)_(.+)_(.+)\\.csv
            group_names:
                - name
                - timestamp
                - price
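        # Sorters are applied in order: primary sort on name (asc), then
        # timestamp (desc, parsed with %Y%m%d), then price (desc, numeric).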
        sorters:
            - orderby: asc
              class_name: LexicographicSorter
              name: name
            - datetime_format: "%Y%m%d"
              orderby: desc
              class_name: DateTimeSorter
              name: timestamp
            - orderby: desc
              class_name: NumericSorter
              name: price

    """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "test_environment",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

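    # self_check() introspects the configured connector and summarizes the data
    # assets and batch definitions it can discover.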
    self_check_report = my_data_connector.self_check()

    assert self_check_report["class_name"] == "ConfiguredAssetS3DataConnector"
    assert self_check_report["data_asset_count"] == 1
    assert self_check_report["data_assets"]["TestFiles"]["batch_definition_count"] == 10
    assert self_check_report["unmatched_data_reference_count"] == 0

    sorted_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
    ]

    # TEST 1: Sorting works
    assert expected == sorted_batch_definition_list

    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=PartitionRequest(
            **{
                "partition_identifiers": {
                    "name": "james",
                    "timestamp": "20200713",
                    "price": "1567",
                }
            }
        ),
    )

    my_batch_definition_list: List[BatchDefinition]
    my_batch_definition: BatchDefinition

    # TEST 2: Should only return the specified partition
    my_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=my_batch_request
    )

    assert len(my_batch_definition_list) == 1
    my_batch_definition = my_batch_definition_list[0]
    expected_batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_definition=PartitionDefinition(
            **{"name": "james", "timestamp": "20200713", "price": "1567",}
        ),
    )
    assert my_batch_definition == expected_batch_definition

    # TEST 3: Without partition request, should return all 10
    my_batch_request: BatchRequest = BatchRequest(
        datasource_name="test_environment",
        data_connector_name="general_s3_data_connector",
        data_asset_name="TestFiles",
        partition_request=None,
    )
    # should return 10
    my_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=my_batch_request
    )
    assert len(my_batch_definition_list) == 10


# NOTE: as above, assumes moto's mock_s3 keeps the S3 calls in-memory.
@mock_s3
def test_return_all_batch_definitions_unsorted():
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "alex_20200809_1000.csv",
        "eugene_20200809_1500.csv",
        "james_20200811_1009.csv",
        "abe_20200809_1040.csv",
        "will_20200809_1002.csv",
        "james_20200713_1567.csv",
        "eugene_20201129_1900.csv",
        "will_20200810_1001.csv",
        "james_20200810_1003.csv",
        "alex_20200819_1300.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector_yaml = yaml.load(
        f"""
            class_name: ConfiguredAssetS3DataConnector
            datasource_name: test_environment
            #execution_engine:
            #    class_name: PandasExecutionEngine
            bucket: {bucket}
            prefix: ""
            assets:
                TestFiles:
            default_regex:
                pattern: (.+)_(.+)_(.+)\\.csv
                group_names:
                    - name
                    - timestamp
                    - price
        """,
    )

    my_data_connector: ConfiguredAssetS3DataConnector = instantiate_class_from_config(
        config=my_data_connector_yaml,
        runtime_environment={
            "name": "general_s3_data_connector",
            "datasource_name": "test_environment",
        },
        config_defaults={"module_name": "great_expectations.datasource.data_connector"},
    )

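    # batch_request is required, so calling with no arguments raises TypeError.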
    with pytest.raises(TypeError):
        my_data_connector.get_batch_definition_list_from_batch_request()

    # with unnamed data_asset_name
    unsorted_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name=None,
        )
    )
    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "abe", "timestamp": "20200809", "price": "1040"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200819", "price": "1300"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20200809", "price": "1500"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "eugene", "timestamp": "20201129", "price": "1900"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200713", "price": "1567"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200810", "price": "1003"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "james", "timestamp": "20200811", "price": "1009"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200809", "price": "1002"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition(
                {"name": "will", "timestamp": "20200810", "price": "1001"}
            ),
        ),
    ]
    assert expected == unsorted_batch_definition_list

    # with named data_asset_name
    unsorted_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="general_s3_data_connector",
            data_asset_name="TestFiles",
        )
    )
    assert expected == unsorted_batch_definition_list
Code example #30
def test_redundant_information_in_naming_convention_bucket_sorted(tmp_path_factory):
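    # The date appears twice in each path: in the directories (2021/01/01) and
    # in the filename (20210101); the sorter keys on the filename's full_date.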
    base_directory = str(tmp_path_factory.mktemp("logs"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "some_bucket/2021/01/01/log_file-20210101.txt.gz",
            "some_bucket/2021/01/02/log_file-20210102.txt.gz",
            "some_bucket/2021/01/03/log_file-20210103.txt.gz",
            "some_bucket/2021/01/04/log_file-20210104.txt.gz",
            "some_bucket/2021/01/05/log_file-20210105.txt.gz",
            "some_bucket/2021/01/06/log_file-20210106.txt.gz",
            "some_bucket/2021/01/07/log_file-20210107.txt.gz",
        ],
    )

    my_data_connector_yaml = yaml.load(
        f"""
          module_name: great_expectations.datasource.data_connector
          class_name: InferredAssetFilesystemDataConnector
          datasource_name: test_environment
          name: my_inferred_asset_filesystem_data_connector
          base_directory: {base_directory}/
          glob_directive: "*/*/*/*/*.txt.gz"
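          # one wildcard per directory level (bucket/year/month/day), then the file name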
          default_regex:
              group_names:
                  - data_asset_name
                  - year
                  - month
                  - day
                  - full_date
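              # the first regex group captures the 11-character "some_bucket"
              # directory, which becomes the inferred data_asset_name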
              pattern: (\\w{{11}})/(\\d{{4}})/(\\d{{2}})/(\\d{{2}})/log_file-(.*)\\.txt\\.gz
          sorters:
              - orderby: desc
                class_name: DateTimeSorter
                name: full_date

          """,
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        instantiate_class_from_config(
            config=my_data_connector_yaml,
            runtime_environment={
                "name": "my_inferred_asset_filesystem_data_connector",
                "datasource_name": "test_environment",
                "execution_engine": "BASE_ENGINE",
            },
            config_defaults={
                "module_name": "great_expectations.datasource.data_connector"
            },
        )
    )

    sorted_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="my_inferred_asset_filesystem_data_connector",
                data_asset_name="some_bucket",
            )
        )
    )

    expected = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "07", "full_date": "20210107"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "06", "full_date": "20210106"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "05", "full_date": "20210105"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "04", "full_date": "20210104"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "03", "full_date": "20210103"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "02", "full_date": "20210102"}
            ),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="my_inferred_asset_filesystem_data_connector",
            data_asset_name="some_bucket",
            partition_definition=PartitionDefinition(
                {"year": "2021", "month": "01", "day": "01", "full_date": "20210101"}
            ),
        ),
    ]
    assert expected == sorted_batch_definition_list