Example #1
0
def test_self_check(mock_gcs_conn, mock_list_keys, mock_emit):
    """Check the report produced by ``self_check()`` on an inferred-asset GCS connector.

    The connector's regex splits keys of the form ``<asset>-<digits>.csv``
    into the ``data_asset_name`` and ``number`` groups, and the report is
    expected to list two assets ("A" and "B") with two batches each and no
    unmatched references.

    NOTE(review): the mock parameters are not used in the body; presumably
    they are injected by patch decorators outside this view and provide the
    bucket listing -- confirm against the full file.
    """
    regex_config = {
        "pattern": r"(.+)-(\d+)\.csv",
        "group_names": ["data_asset_name", "number"],
    }
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex=regex_config,
        bucket_or_name="test_bucket",
        prefix="",
    )

    # Refresh the reference cache explicitly before requesting the report.
    connector._refresh_data_references_cache()
    report = connector.self_check()

    expected_assets = {
        "A": {
            "example_data_references": ["A-100.csv", "A-101.csv"],
            "batch_definition_count": 2,
        },
        "B": {
            "example_data_references": ["B-1.csv", "B-2.csv"],
            "batch_definition_count": 2,
        },
    }
    expected_report = {
        "class_name": "InferredAssetGCSDataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": expected_assets,
        "example_unmatched_data_references": [],
        "unmatched_data_reference_count": 0,
    }
    assert report == expected_report
Example #2
0
def test_complex_regex_example_with_implicit_data_asset_names(
        mock_gcs_conn, mock_list_keys, mock_emit):
    """Check batch-definition lookup when the asset name is the last regex group.

    Keys are expected to look like ``<yyyy>/<mm>/<asset>-<digits>.csv``; the
    year and month directories become batch identifiers and the asset name is
    taken from the file name. The test asserts the number of batch definitions
    for the "alpha" and "beta" assets, then narrows "alpha" to a single batch
    with ``batch_filter_parameters``.

    NOTE(review): the mock parameters are not used in the body; presumably
    they are injected by patch decorators outside this view -- confirm.
    """
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    connector._refresh_data_references_cache()

    def fetch_definitions(asset_name, **extra_request_fields):
        # Build a request against this connector; extra fields are forwarded
        # only when given, so plain lookups pass exactly three keyword args.
        request = BatchRequest(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name=asset_name,
            **extra_request_fields,
        )
        return connector.get_batch_definition_list_from_batch_request(
            batch_request=request)

    alpha_definitions = fetch_definitions("alpha")
    assert len(alpha_definitions) == 3

    beta_definitions = fetch_definitions("beta")
    assert len(beta_definitions) == 4

    # Filtering on both directory groups should leave exactly one batch.
    filtered_alpha = fetch_definitions(
        "alpha",
        data_connector_query={
            "batch_filter_parameters": {
                "year_dir": "2020",
                "month_dir": "03",
            }
        },
    )
    expected_definition = BatchDefinition(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        batch_identifiers=IDDict(
            year_dir="2020",
            month_dir="03",
        ),
    )
    assert filtered_alpha == [expected_definition]
Example #3
0
def test_get_batch_definition_list_from_batch_request_with_nonexistent_datasource_name_raises_error(
        mock_gcs_conn, mock_list_keys, mock_emit,
        empty_data_context_stats_enabled):
    """Expect ``ValueError`` when the request names a different datasource.

    The connector is registered under "FAKE_DATASOURCE_NAME" while the request
    uses "something" as its ``datasource_name``.

    NOTE(review): the mock/fixture parameters are not used in the body;
    presumably they are supplied by decorators or fixtures outside this
    view -- confirm.
    """
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    mismatched_request_fields = {
        "datasource_name": "something",
        "data_connector_name": "my_data_connector",
        "data_asset_name": "something",
    }

    # The error is raised in `DataConnector._validate_batch_request()` because
    # the request's `datasource_name` does not match the connector's
    # `datasource_name`. The request is built inside the context manager so
    # the guarded region is the same as constructing it inline.
    with pytest.raises(ValueError):
        connector.get_batch_definition_list_from_batch_request(
            BatchRequest(**mismatched_request_fields))
Example #4
0
def test_instantiation_without_args(mock_gcs_conn, mock_list_keys,
                                    expected_config_dict):
    """Check a freshly built connector's self-check report and reference counts.

    The report from ``self_check()`` must equal ``expected_config_dict``;
    after refreshing the reference cache the connector must report four data
    references and no unmatched ones.

    NOTE(review): ``expected_config_dict`` is presumably a fixture and the
    mock parameters presumably come from patch decorators outside this
    view -- confirm.
    """
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )

    report = connector.self_check()
    assert report == expected_config_dict

    connector._refresh_data_references_cache()

    reference_count = connector.get_data_reference_list_count()
    assert reference_count == 4

    unmatched_references = connector.get_unmatched_data_references()
    assert unmatched_references == []
Example #5
0
def test_get_batch_definition_list_from_batch_request_with_unknown_data_connector_raises_error(
        mock_gcs_conn, mock_list_keys, mock_emit):
    """Expect ``ValueError`` when the request names a different data connector.

    The connector is named "my_data_connector" while the request asks for
    "non_existent_data_connector".

    NOTE(review): the mock parameters are not used in the body; presumably
    they are injected by patch decorators outside this view -- confirm.
    """
    connector = InferredAssetGCSDataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket_or_name="test_bucket",
        prefix="",
    )
    connector._refresh_data_references_cache()

    unknown_connector_request_fields = {
        "datasource_name": "FAKE_DATASOURCE_NAME",
        "data_connector_name": "non_existent_data_connector",
        "data_asset_name": "my_data_asset",
    }

    # The error is raised in `DataConnector._validate_batch_request()` because
    # the request's `data_connector_name` does not match the connector's name.
    # The request is built inside the context manager so the guarded region is
    # the same as constructing it inline.
    with pytest.raises(ValueError):
        connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(**unknown_connector_request_fields))