def test_for_self_check_using_InferredAssetFilesystemDataConnector_SparkDFExecutionEngine(
        spark_session, tmp_path_factory):
    """Run ``self_check()`` on an inferred-asset connector built with a Spark engine.

    Three CSV files with distinct name prefixes are created in a temporary
    directory. The regex's first capture group is named ``data_asset_name``,
    and the test expects the report to count three data assets and to carry
    an example data reference with ``n_rows == 3``.

    ``spark_session`` is requested as a fixture but never referenced in the
    body -- presumably it guarantees a Spark session exists before the engine
    is created; confirm against the fixture definition.
    """
    csv_names = [
        "alex_20201010_1000.csv",
        "abe_202011111_2000.csv",
        "will_20201212_3000.csv",
    ]
    data_dir = str(
        tmp_path_factory.mktemp(
            "basic_data_connector_inferred_asset_filesystem_data_connector"))
    create_files_in_directory(directory=data_dir, file_name_list=csv_names)

    # File names are split as <data_asset_name>_<timestamp>_<size>.csv
    regex_config = {
        "pattern": "(.+)_(\\d+)_(\\d+)\\.csv",
        "group_names": ["data_asset_name", "timestamp", "size"],
    }
    connector = InferredAssetFilesystemDataConnector(
        name="my_data_connector",
        base_directory=data_dir,
        glob_directive="*.csv",
        datasource_name="FAKE_DATASOURCE",
        execution_engine=SparkDFExecutionEngine(),
        default_regex=regex_config,
    )

    report = connector.self_check()
    assert report["data_asset_count"] == 3
    assert report["example_data_reference"]["n_rows"] == 3
def test_simple_regex_example_with_implicit_data_asset_names_self_check(
    tmp_path_factory,
):
    """Compare the whole ``self_check()`` report of a Pandas-backed connector.

    Five files are created; four follow the ``<name>-<number>.csv`` shape the
    regex describes and one (``CCC.csv``) does not. The expected report lists
    assets ``A`` and ``B`` with two references each and ``CCC.csv`` as the
    single unmatched reference.
    """
    data_dir = str(
        tmp_path_factory.mktemp(
            "test_simple_regex_example_with_implicit_data_asset_names"
        )
    )
    file_names = ["A-100.csv", "A-101.csv", "B-1.csv", "B-2.csv", "CCC.csv"]
    create_files_in_directory(directory=data_dir, file_name_list=file_names)

    regex_config = {
        "pattern": r"(.+)-(\d+)\.csv",
        "group_names": ["data_asset_name", "number"],
    }
    connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex=regex_config,
            glob_directive="*",
            base_directory=data_dir,
        )
    )

    # The cache is refreshed explicitly before the report is requested.
    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    expected_report = {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }

    actual_report = connector.self_check()
    assert actual_report == expected_report
def test_self_check(tmp_path_factory):
    """Compare the whole ``self_check()`` report when every file matches the regex.

    The connector is built without an ``execution_engine`` argument. All four
    files follow the ``<name>-<number>.csv`` shape, so the expected report has
    no unmatched references and an empty ``example_data_reference``.
    """
    data_dir = str(tmp_path_factory.mktemp("test_self_check"))
    file_names = ["A-100.csv", "A-101.csv", "B-1.csv", "B-2.csv"]
    create_files_in_directory(directory=data_dir, file_name_list=file_names)

    regex_config = {
        "pattern": r"(.+)-(\d+)\.csv",
        "group_names": ["data_asset_name", "number"],
    }
    connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex=regex_config,
            glob_directive="*",
            base_directory=data_dir,
        )
    )

    # The cache is refreshed explicitly before the report is requested.
    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    expected_report = {
        "class_name": "InferredAssetFilesystemDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
        "example_data_reference": {},
    }

    actual_report = connector.self_check()
    assert actual_report == expected_report
def test_basic_instantiation(tmp_path_factory):
    """Build a connector over files one directory deep and check its counts.

    Four CSV files are created under ``path/`` and ``directory/`` and globbed
    with ``*/*.csv``. The test expects four data references, no unmatched
    references, and a ``ValueError`` when a batch request names a datasource
    other than the one the connector was constructed with.
    """
    base_directory = str(tmp_path_factory.mktemp("test_basic_instantiation"))
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "path/A-100.csv",
            "path/A-101.csv",
            "directory/B-1.csv",
            "directory/B-2.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            execution_engine=PandasExecutionEngine(),
            default_regex={
                # <data_asset_name>/<letter>-<number>.csv
                "pattern": r"(.+)/(.+)-(\d+)\.csv",
                "group_names": ["data_asset_name", "letter", "number"],
            },
            glob_directive="*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # The request names datasource "something" while the connector was built
    # with "FAKE_DATASOURCE_NAME". The result is deliberately discarded: only
    # the raised exception matters here.
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="something",
                data_connector_name="my_data_connector",
                data_asset_name="something",
            )
        )
# Example no. 5
# 0
def test_complex_regex_example_with_implicit_data_asset_names(tmp_path_factory):
    """Check batch-definition lookup when the asset name is the last regex group.

    Seven files are laid out as ``2020/<month>/<asset>-<number>.csv``; the
    regex names its groups ``year_dir``, ``month_dir`` and
    ``data_asset_name``. The test expects:

    * ``ValueError`` for a request whose ``datasource_name`` differs from the
      connector's;
    * ``ValueError`` for a request whose ``data_connector_name`` differs from
      the connector's;
    * three batch definitions for ``alpha`` and four for ``beta``;
    * exactly one ``BatchDefinition`` for ``alpha`` when filtered by
      ``year_dir="2020"`` and ``month_dir="03"``.
    """
    base_directory = str(
        tmp_path_factory.mktemp(
            "test_complex_regex_example_with_implicit_data_asset_names"
        )
    )
    create_files_in_directory(
        directory=base_directory,
        file_name_list=[
            "2020/01/alpha-1001.csv",
            "2020/01/beta-1002.csv",
            "2020/02/alpha-1003.csv",
            "2020/02/beta-1004.csv",
            "2020/03/alpha-1005.csv",
            "2020/03/beta-1006.csv",
            "2020/04/beta-1007.csv",
        ],
    )

    my_data_connector: InferredAssetFilesystemDataConnector = (
        InferredAssetFilesystemDataConnector(
            name="my_data_connector",
            datasource_name="FAKE_DATASOURCE_NAME",
            default_regex={
                "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
                "group_names": ["year_dir", "month_dir", "data_asset_name"],
            },
            glob_directive="*/*/*.csv",
            base_directory=base_directory,
        )
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    # Unknown datasource: only the raised exception matters, so the return
    # value is not bound to a name.
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="non_existent_datasource",
                data_connector_name="my_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # Unknown data_connector.
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="non_existent_data_connector",
                data_asset_name="my_data_asset",
            )
        )

    # "alpha" appears in months 01-03, "beta" in months 01-04.
    alpha_batch_definitions = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="alpha",
            )
        )
    )
    assert len(alpha_batch_definitions) == 3

    beta_batch_definitions = (
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="FAKE_DATASOURCE_NAME",
                data_connector_name="my_data_connector",
                data_asset_name="beta",
            )
        )
    )
    assert len(beta_batch_definitions) == 4

    # Filtering on both directory groups narrows "alpha" to a single batch.
    assert my_data_connector.get_batch_definition_list_from_batch_request(
        batch_request=BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            data_connector_query={
                "batch_filter_parameters": {
                    "year_dir": "2020",
                    "month_dir": "03",
                }
            },
        )
    ) == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            batch_identifiers=IDDict(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]