def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    print(A.id)
def test_partition_request_partition_request_partition_identifiers_1_key(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = my_data_connector.get_batch_definition_list_from_batch_request(
        BatchRequest(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_request={
                "partition_identifiers": {
                    "timestamp": "20200809"
                },
            },
        ))
    assert len(returned_batch_definition_list) == 4

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "alex",
                "timestamp": "20200809",
                "price": "1000"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_custom_list(periodic_table_of_elements):
    Hydrogen = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    Helium = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"element": "Helium"}),
    )
    Lithium = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"element": "Lithium"}),
    )

    batch_list = [Hydrogen, Helium, Lithium]
    my_sorter = CustomListSorter(name="element",
                                 orderby="desc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Lithium, Helium, Hydrogen]

    my_sorter = CustomListSorter(name="element",
                                 orderby="asc",
                                 reference_list=periodic_table_of_elements)
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [Hydrogen, Helium, Lithium]
def test_create_three_batch_definitions_sort_lexicographically():
    a = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"id": "A"}),
    )
    b = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"id": "B"}),
    )
    c = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"id": "C"}),
    )

    batch_list = [a, b, c]

    # sorting by "id" reverse alphabetically (descending)
    my_sorter = LexicographicSorter(name="id", orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [c, b, a]

    # sorting by "id" reverse alphabetically (ascending)
    my_sorter = LexicographicSorter(name="id", orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [a, b, c]
def test_sorter_instantiation_custom_list_with_periodic_table(
    periodic_table_of_elements, ):
    # CustomListSorter
    sorter_params: dict = {
        "reference_list": periodic_table_of_elements,
    }
    my_custom_sorter = CustomListSorter(name="element",
                                        orderby="asc",
                                        **sorter_params)
    # noinspection PyProtectedMember
    assert my_custom_sorter._reference_list == periodic_table_of_elements
    # This element exists in the reference list: Hydrogen (index 0)
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Hydrogen"}),
    )
    returned_partition_key = my_custom_sorter.get_batch_key(test_batch_def)
    assert returned_partition_key == 0

    # This element does not exist in the reference list: Vibranium
    test_batch_def = BatchDefinition(
        datasource_name="test",
        data_connector_name="fake",
        data_asset_name="nowhere",
        batch_identifiers=IDDict({"element": "Vibranium"}),
    )
    with pytest.raises(ge_exceptions.SorterError):
        my_custom_sorter.get_batch_key(test_batch_def)
def test_batch_definition_id():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))
    print(A.id)

    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))
    print(B.id)

    assert A.id != B.id
def test_batch_definition_instantiation():
    with pytest.raises(TypeError):
        # noinspection PyTypeChecker,PyUnusedLocal,PyPep8Naming
        A = BatchDefinition("A", "a", "aaa", {"id": "A"})

    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))

    print(A.id)
def test_map_data_reference_string_to_batch_definition_list_using_regex():
    # regex_pattern does not match --> None
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_____________\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list is None

    # no data_asset_name configured --> DEFAULT_ASSET_NAME
    data_reference = "alex_20200809_1000.csv"
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    group_names = ["name", "timestamp", "price"]
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name=None,
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="DEFAULT_ASSET_NAME",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000",}
            ),
        )
    ]

    # data_asset_name configured
    returned_batch_def_list = map_data_reference_string_to_batch_definition_list_using_regex(
        datasource_name="test_datasource",
        data_connector_name="test_data_connector",
        data_asset_name="test_data_asset",
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert returned_batch_def_list == [
        BatchDefinition(
            datasource_name="test_datasource",
            data_connector_name="test_data_connector",
            data_asset_name="test_data_asset",
            partition_definition=PartitionDefinition(
                {"name": "alex", "timestamp": "20200809", "price": "1000",}
            ),
        )
    ]
def test_data_connector_query_sorted_filtered_by_custom_filter_with_index_as_slice_via_string_no_left_right_no_step(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for defining a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (batch_identifiers["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(batch_identifiers["timestamp"],
                                               "%Y%m%d").date() >
                datetime.datetime(2020, 7, 15).date())
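    # An equivalent lambda sketch (illustrative only; the test itself uses the
    # named function above):
    # my_custom_batch_selector = lambda batch_identifiers: (
    #     batch_identifiers["name"] in ["abe", "james", "eugene"]
    #     and datetime.datetime.strptime(
    #         batch_identifiers["timestamp"], "%Y%m%d").date()
    #     > datetime.datetime(2020, 7, 15).date())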

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": ":3",
                },
            )))
    assert len(returned_batch_definition_list) == 3
    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "abe",
                "timestamp": "20200809",
                "price": "1040"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch_definition_equality():
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))

    assert A != B

    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    assert A == A2
def test_batch_definition_id():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", batch_identifiers=IDDict({"id": "A"}))
    print(A.id)

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", batch_identifiers=IDDict({"id": "B"}))
    print(B.id)

    assert A.id != B.id
def test_map_batch_definition_to_data_reference_string_using_regex():
    # not a BatchDefinition --> TypeError
    my_batch_definition = "I_am_a_string"
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(TypeError):
        # noinspection PyUnusedLocal,PyTypeChecker
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # group names do not match
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["i", "wont", "match"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"
    with pytest.raises(KeyError):
        # noinspection PyUnusedLocal
        my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
            batch_definition=my_batch_definition,
            regex_pattern=regex_pattern,
            group_names=group_names,
        )

    # success
    my_batch_definition = BatchDefinition(
        datasource_name="test_environment",
        data_connector_name="general_filesystem_data_connector",
        data_asset_name="TestFiles",
        batch_identifiers=IDDict({
            "name": "eugene",
            "timestamp": "20200809",
            "price": "1500"
        }),
    )
    group_names = ["name", "timestamp", "price"]
    regex_pattern = r"^(.+)_(\d+)_(\d+)\.csv$"

    my_data_reference = map_batch_definition_to_data_reference_string_using_regex(
        batch_definition=my_batch_definition,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    assert my_data_reference == "eugene_20200809_1500.csv"
def test_date_time():
    first = BatchDefinition(
        datasource_name="A",
        data_connector_name="a",
        data_asset_name="aaa",
        batch_identifiers=IDDict({"date": "20210101"}),
    )
    second = BatchDefinition(
        datasource_name="B",
        data_connector_name="b",
        data_asset_name="bbb",
        batch_identifiers=IDDict({"date": "20210102"}),
    )
    third = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": "20210103"}),
    )

    batch_list = [first, second, third]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [third, second, first]

    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="asc")
    sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
    assert sorted_batch_list == [first, second, third]

    with pytest.raises(ge_exceptions.SorterError):
        # numeric datetime_format --> SorterError
        i_dont_work = DateTimeSorter(name="date",
                                     datetime_format=12345,
                                     orderby="desc")

    my_date_is_not_a_string = BatchDefinition(
        datasource_name="C",
        data_connector_name="c",
        data_asset_name="ccc",
        batch_identifiers=IDDict({"date": 20210103}),
    )

    batch_list = [first, second, third, my_date_is_not_a_string]
    my_sorter = DateTimeSorter(name="date",
                               datetime_format="%Y%m%d",
                               orderby="desc")

    with pytest.raises(ge_exceptions.SorterError):
        sorted_batch_list = my_sorter.get_sorted_batch_definitions(batch_list)
def test_get_batch_with_split_on_whole_table_s3_with_configured_asset_s3_data_connector(
    test_s3_files, test_df_small
):
    bucket, _keys = test_s3_files
    expected_df = test_df_small

    execution_engine: ExecutionEngine = PandasExecutionEngine()

    my_data_connector = ConfiguredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        bucket=bucket,
        execution_engine=execution_engine,
        prefix="",
        assets={"alpha": {}},
        default_regex={
            "pattern": "alpha-(.*)\\.csv",
            "group_names": ["index"],
        },
    )
    batch_def: BatchDefinition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=1),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    test_df = execution_engine.get_batch_data(
        batch_spec=my_data_connector.build_batch_spec(batch_definition=batch_def)
    )
    assert test_df.dataframe.shape == expected_df.shape

    # if key does not exist
    batch_def_no_key = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(index=9),
        batch_spec_passthrough={
            "reader_method": "read_csv",
            "splitter_method": "_split_on_whole_table",
        },
    )
    with pytest.raises(ge_exceptions.ExecutionEngineError):
        execution_engine.get_batch_data(
            batch_spec=my_data_connector.build_batch_spec(
                batch_definition=batch_def_no_key
            )
        )
def test_batch_definition_equality():
    # noinspection PyUnusedLocal,PyPep8Naming
    A = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    # noinspection PyUnusedLocal,PyPep8Naming
    B = BatchDefinition("B", "b", "bbb", PartitionDefinition({"id": "B"}))

    assert A != B

    # noinspection PyUnusedLocal,PyPep8Naming
    A2 = BatchDefinition("A", "a", "aaa", PartitionDefinition({"id": "A"}))

    assert A == A2
def test_partition_request_sorted_filtered_by_custom_filter_with_slice_as_list(
    create_files_and_instantiate_data_connector, ):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for defining a custom filter.

    def my_custom_partition_selector(partition_definition: dict) -> bool:
        return (partition_definition["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(
                    partition_definition["timestamp"],
                    "%Y%m%d").date() > datetime.datetime(2020, 7, 15).date())

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "custom_filter_function": my_custom_partition_selector,
                    "index": [1, 3],
                },
            )))

    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20201129",
                "price": "1900"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "eugene",
                "timestamp": "20200809",
                "price": "1500"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_data_connector_query_sorted_filtered_by_custom_filter_with_slice_obj(
    create_files_and_instantiate_data_connector, ):
    # <TODO> is this behavior correct?
    my_data_connector = create_files_and_instantiate_data_connector

    # Note that both a named function and a lambda are acceptable Callable types for defining a custom filter.
    def my_custom_batch_selector(batch_identifiers: dict) -> bool:
        return (batch_identifiers["name"] in ["abe", "james", "eugene"]
                and datetime.datetime.strptime(batch_identifiers["timestamp"],
                                               "%Y%m%d").date() >
                datetime.datetime(2020, 7, 15).date())

    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "custom_filter_function": my_custom_batch_selector,
                    "index": slice(3, 5, None),
                },
            )))
    assert len(returned_batch_definition_list) == 2

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200811",
                "price": "1009"
            }),
        ),
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "james",
                "timestamp": "20200810",
                "price": "1003"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test__build_batch_spec(basic_datasource):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    # noinspection PyProtectedMember
    batch_spec: BatchSpec = test_runtime_data_connector.build_batch_spec(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        ),
        batch_data=pd.DataFrame({"x": range(10)}),
    )
    assert type(batch_spec) == RuntimeDataBatchSpec
    assert set(batch_spec.keys()) == {"batch_data"}
    assert batch_spec["batch_data"].shape == (10, 1)
def test__generate_batch_spec_parameters_from_batch_definition(
    basic_datasource,
):
    partition_request: dict = {
        "batch_identifiers": {
            "custom_key_0": "staging",
            "airflow_run_id": 1234567890,
        }
    }

    test_runtime_data_connector: RuntimeDataConnector = (
        basic_datasource.data_connectors["test_runtime_data_connector"]
    )

    expected_batch_spec_parameters: dict = {}

    # noinspection PyProtectedMember
    batch_spec_parameters: dict = test_runtime_data_connector._generate_batch_spec_parameters_from_batch_definition(
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="test_runtime_data_connector",
            data_asset_name="my_data_asset",
            partition_definition=PartitionDefinition(
                partition_request["batch_identifiers"]
            ),
        )
    )

    assert batch_spec_parameters == expected_batch_spec_parameters
def test_asset_is_name_batch_identifier_correctly_used(
        basic_datasource_with_assets, test_df_pandas):
    """
    Using asset_a, which is named in the RuntimeDataConnector configuration, and using batch_identifier that is named.
    """
    runtime_data_connector: RuntimeDataConnector = (
        basic_datasource_with_assets.data_connectors["runtime"])
    res: List[BatchDefinition] = (
        runtime_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=RuntimeBatchRequest(
                datasource_name=basic_datasource_with_assets.name,
                data_connector_name="runtime",
                data_asset_name="asset_a",
                batch_identifiers={
                    "month": 4,
                    "day": 1
                },
                runtime_parameters={"batch_data": test_df_pandas},
            )
        )
    )
    assert len(res) == 1
    assert res[0] == BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="runtime",
        data_asset_name="asset_a",
        batch_identifiers=IDDict({
            "month": 4,
            "day": 1
        }),
    )
def test_data_connector_query_data_connector_query_batch_identifiers_2_key_name_timestamp(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                data_connector_query={
                    "batch_filter_parameters": {
                        "timestamp": "20200809",
                        "name": "will",
                    },
                },
            )))
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            batch_identifiers=IDDict({
                "name": "will",
                "timestamp": "20200809",
                "price": "1002"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, PartitionDefinitionSubset]
    ] = convert_data_reference_string_to_partition_definition_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_partition_definition: str = processed_data_reference[0]
    partition_definition: PartitionDefinitionSubset = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_partition_definition

    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            partition_definition=PartitionDefinition(partition_definition),
        )
    ]
def map_data_reference_string_to_batch_definition_list_using_regex(
    datasource_name: str,
    data_connector_name: str,
    data_reference: str,
    regex_pattern: str,
    group_names: List[str],
    data_asset_name: Optional[str] = None,
) -> Optional[List[BatchDefinition]]:
    processed_data_reference: Optional[
        Tuple[str, IDDict]
    ] = convert_data_reference_string_to_batch_identifiers_using_regex(
        data_reference=data_reference,
        regex_pattern=regex_pattern,
        group_names=group_names,
    )
    if processed_data_reference is None:
        return None
    data_asset_name_from_batch_identifiers: str = processed_data_reference[0]
    batch_identifiers: IDDict = processed_data_reference[1]
    if data_asset_name is None:
        data_asset_name = data_asset_name_from_batch_identifiers

    return [
        BatchDefinition(
            datasource_name=datasource_name,
            data_connector_name=data_connector_name,
            data_asset_name=data_asset_name,
            batch_identifiers=IDDict(batch_identifiers),
        )
    ]
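
# Example grounded in the tests above: mapping data_reference
# "alex_20200809_1000.csv" with regex_pattern r"^(.+)_(\d+)_(\d+)\.csv$" and
# group_names ["name", "timestamp", "price"] yields a single BatchDefinition
# whose batch_identifiers are
# {"name": "alex", "timestamp": "20200809", "price": "1000"}.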
def batch_fixture() -> Batch:
    """
    Fixture for Batch object that contains data, BatchRequest, BatchDefinition
    as well as BatchSpec and BatchMarkers. To be used in unittesting.
    """
    df: pd.DataFrame = pd.DataFrame(
        {"a": [1, 5, 22, 3, 5, 10], "b": [1, 2, 3, 4, 5, 6]}
    )
    batch_request: BatchRequest = BatchRequest(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
    )
    batch_definition: BatchDefinition = BatchDefinition(
        datasource_name="my_datasource",
        data_connector_name="my_data_connector",
        data_asset_name="my_data_asset_name",
        batch_identifiers=IDDict({"id": "A"}),
    )
    batch_spec: BatchSpec = BatchSpec(path="/some/path/some.file")
    batch_markers: BatchMarkers = BatchMarkers(ge_load_time="FAKE_LOAD_TIME")
    batch: Batch = Batch(
        data=df,
        batch_request=batch_request,
        batch_definition=batch_definition,
        batch_spec=batch_spec,
        batch_markers=batch_markers,
    )
    return batch
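# A minimal usage sketch (not part of the original suite), assuming the
# function above is registered with @pytest.fixture under the name
# "batch_fixture":
def test_batch_fixture_contents(batch_fixture: Batch):
    batch: Batch = batch_fixture
    assert batch.batch_definition.data_asset_name == "my_data_asset_name"
    assert batch.batch_spec["path"] == "/some/path/some.file"
    assert batch.data.shape == (6, 2)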
def ge_validator_sqlalchemy() -> Validator:
    validator = Validator(
        execution_engine=SqlAlchemyExecutionEngine(
            connection_string="postgresql://localhost:5432/test"),
        batches=[
            Batch(
                data=None,
                batch_request=BatchRequest(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                ),
                batch_definition=BatchDefinition(
                    datasource_name="my_postgresql_datasource",
                    data_connector_name="whole_table",
                    data_asset_name="foo2",
                    batch_identifiers=IDDict(),
                ),
                batch_spec=SqlAlchemyDatasourceBatchSpec({
                    "data_asset_name": "foo2",
                    "table_name": "foo2",
                    "batch_identifiers": {},
                    "schema_name": "public",
                    "type": "table",
                }),
            )
        ],
    )
    return validator
def test_partition_request_partition_request_partition_identifiers_1_key_and_index(
    create_files_and_instantiate_data_connector, ):
    my_data_connector = create_files_and_instantiate_data_connector
    # no limit
    returned_batch_definition_list = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            BatchRequest(
                datasource_name="test_environment",
                data_connector_name="general_filesystem_data_connector",
                data_asset_name="TestFiles",
                partition_request={
                    "partition_identifiers": {
                        "name": "james"
                    },
                    "index": 0,
                },
            )))
    assert len(returned_batch_definition_list) == 1

    expected: List[BatchDefinition] = [
        BatchDefinition(
            datasource_name="test_environment",
            data_connector_name="general_filesystem_data_connector",
            data_asset_name="TestFiles",
            partition_definition=PartitionDefinition({
                "name": "james",
                "timestamp": "20200713",
                "price": "1567"
            }),
        ),
    ]
    assert returned_batch_definition_list == expected
def test_batch__str__method():
    batch = Batch(
        data=None,
        batch_request=BatchRequest(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
        ),
        batch_definition=BatchDefinition(
            datasource_name="my_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset_name",
            batch_identifiers=IDDict({}),
        ),
        batch_spec=BatchSpec(path="/some/path/some.file"),
        batch_markers=BatchMarkers(ge_load_time="FAKE_LOAD_TIME"),
    )
    print(batch.__str__())

    assert (batch.__str__() == """{
  "data": "None",
  "batch_request": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name"
  },
  "batch_definition": {
    "datasource_name": "my_datasource",
    "data_connector_name": "my_data_connector",
    "data_asset_name": "my_data_asset_name",
    "batch_identifiers": {}
  },
  "batch_spec": "{'path': '/some/path/some.file'}",
  "batch_markers": "{'ge_load_time': 'FAKE_LOAD_TIME'}"
}""")
    def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
        self._validate_batch_request(batch_request=batch_request)

        if self._data_references_cache is None:
            self._refresh_data_references_cache()

        batch_definition_list: List[BatchDefinition] = []

        try:
            sub_cache = self._data_references_cache[batch_request.data_asset_name]
        except KeyError as e:
            raise KeyError(
                f"data_asset_name {batch_request.data_asset_name} is not recognized."
            ) from e

        for partition_definition in sub_cache:
            batch_definition: BatchDefinition = BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=batch_request.data_asset_name,
                partition_definition=PartitionDefinition(partition_definition),
            )
            if batch_definition_matches_batch_request(batch_definition, batch_request):
                batch_definition_list.append(batch_definition)

        return batch_definition_list
    def build_batch_spec(self, batch_definition: BatchDefinition) -> PathBatchSpec:
        """
        Build BatchSpec from batch_definition by calling DataConnector's build_batch_spec function.

        Args:
            batch_definition (BatchDefinition): to be used to build batch_spec

        Returns:
            BatchSpec built from batch_definition
        """

        data_asset_name: str = batch_definition.data_asset_name
        if (
            data_asset_name in self.assets
            and self.assets[data_asset_name].batch_spec_passthrough
            and isinstance(self.assets[data_asset_name].batch_spec_passthrough, dict)
        ):
            # batch_spec_passthrough from data_asset
            batch_spec_passthrough = deepcopy(
                self.assets[data_asset_name]["batch_spec_passthrough"]
            )
            batch_definition_batch_spec_passthrough = (
                deepcopy(batch_definition.batch_spec_passthrough) or {}
            )
            # batch_spec_passthrough from Batch Definition supersedes batch_spec_passthrough from data_asset
            batch_spec_passthrough.update(batch_definition_batch_spec_passthrough)
            batch_definition.batch_spec_passthrough = batch_spec_passthrough
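            # Worked example (assumed values) of the precedence above:
            #   asset:      {"reader_method": "read_csv", "reader_options": {"sep": ","}}
            #   definition: {"reader_options": {"sep": "|"}}
            #   merged:     {"reader_method": "read_csv", "reader_options": {"sep": "|"}}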

        batch_spec: PathBatchSpec = super().build_batch_spec(
            batch_definition=batch_definition
        )

        return batch_spec
    def get_batch_definition_list_from_batch_request(self, batch_request: BatchRequest):
        self._validate_batch_request(batch_request=batch_request)

        if len(self._data_references_cache) == 0:
            self._refresh_data_references_cache()

        batch_definition_list: List[BatchDefinition] = []
        try:
            sub_cache = self._data_references_cache[batch_request.data_asset_name]
        except KeyError as e:
            raise KeyError(
                f"data_asset_name {batch_request.data_asset_name} is not recognized."
            ) from e

        for batch_identifiers in sub_cache:
            batch_definition: BatchDefinition = BatchDefinition(
                datasource_name=self.datasource_name,
                data_connector_name=self.name,
                data_asset_name=batch_request.data_asset_name,
                batch_identifiers=IDDict(batch_identifiers),
                batch_spec_passthrough=batch_request.batch_spec_passthrough,
            )
            if batch_definition_matches_batch_request(batch_definition, batch_request):
                batch_definition_list.append(batch_definition)

        return batch_definition_list