def test_simple_regex_example_with_implicit_data_asset_names_self_check():
    """Verify the ``self_check()`` report of an inferred-asset S3 connector.

    Five CSV objects are uploaded to a bucket and the connector is configured
    with a two-group regex (``data_asset_name``, ``number``). The report is
    expected to list the assets "A" and "B" with two data references each and
    "CCC.csv" as the single unmatched data reference.

    NOTE(review): the bucket is created through boto3 directly; presumably the
    test runs under an S3 mock that is not visible in this block -- confirm.
    """
    aws_region: str = "us-east-1"
    bucket_name: str = "test_bucket"
    boto3.resource("s3", region_name=aws_region).create_bucket(Bucket=bucket_name)
    s3_client = boto3.client("s3", region_name=aws_region)

    # Every object gets the same tiny CSV body, so serialize it only once.
    frame: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_payload: bytes = frame.to_csv(index=False).encode("utf-8")

    object_keys: List[str] = [
        "A-100.csv",
        "A-101.csv",
        "B-1.csv",
        "B-2.csv",
        "CCC.csv",
    ]
    for object_key in object_keys:
        s3_client.put_object(Bucket=bucket_name, Body=csv_payload, Key=object_key)

    regex_config = {
        "pattern": r"(.+)-(\d+)\.csv",
        "group_names": ["data_asset_name", "number"],
    }
    connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex=regex_config,
        bucket=bucket_name,
        prefix="",
    )

    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    actual_report = connector.self_check()

    expected_report = {
        "class_name": "InferredAssetS3DataConnector",
        "data_asset_count": 2,
        "example_data_asset_names": ["A", "B"],
        "data_assets": {
            "A": {
                "example_data_references": ["A-100.csv", "A-101.csv"],
                "batch_definition_count": 2,
            },
            "B": {
                "example_data_references": ["B-1.csv", "B-2.csv"],
                "batch_definition_count": 2,
            },
        },
        "example_unmatched_data_references": ["CCC.csv"],
        "unmatched_data_reference_count": 1,
        # FIXME: (Sam) example_data_reference removed temporarily in PR #2590:
        # "example_data_reference": {},
    }
    assert actual_report == expected_report
def test_basic_instantiation():
    """Instantiate an inferred-asset S3 connector and check its basic counts.

    Four CSV objects are uploaded under two key prefixes ("path/" and
    "directory/"). After refreshing the data-reference cache, the connector is
    expected to report four data references and no unmatched ones. A batch
    request whose ``datasource_name`` differs from the connector's is expected
    to raise ``ValueError``.

    NOTE(review): the bucket is created through boto3 directly; presumably the
    test runs under an S3 mock that is not visible in this block -- confirm.
    """
    region_name: str = "us-east-1"
    bucket: str = "test_bucket"
    conn = boto3.resource("s3", region_name=region_name)
    conn.create_bucket(Bucket=bucket)
    client = boto3.client("s3", region_name=region_name)

    test_df: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})

    keys: List[str] = [
        "path/A-100.csv",
        "path/A-101.csv",
        "directory/B-1.csv",
        "directory/B-2.csv",
    ]
    for key in keys:
        client.put_object(
            Bucket=bucket, Body=test_df.to_csv(index=False).encode("utf-8"), Key=key
        )

    my_data_connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        execution_engine=PandasExecutionEngine(),
        default_regex={
            # The leading directory component is captured as the asset name.
            "pattern": r"(.+)/(.+)-(\d+)\.csv",
            "group_names": ["data_asset_name", "letter", "number"],
        },
        bucket=bucket,
        prefix="",
    )

    # noinspection PyProtectedMember
    my_data_connector._refresh_data_references_cache()

    assert my_data_connector.get_data_reference_list_count() == 4
    assert my_data_connector.get_unmatched_data_references() == []

    # The request names datasource "something" while the connector was built
    # with "FAKE_DATASOURCE_NAME"; the call is expected to raise ValueError.
    # The result is deliberately not printed or stored: the call must raise.
    with pytest.raises(ValueError):
        my_data_connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(
                datasource_name="something",
                data_connector_name="my_data_connector",
                data_asset_name="something",
            )
        )
def test_complex_regex_example_with_implicit_data_asset_names():
    """Exercise batch-definition lookup with a year/month/asset key layout.

    Seven CSV objects are uploaded under ``<year>/<month>/<asset>-<id>.csv``
    keys and the connector regex captures ``year_dir``, ``month_dir`` and
    ``data_asset_name``. The test expects:

    * ``ValueError`` when the request names a datasource or a data connector
      other than the ones this connector was built with;
    * three batch definitions for asset "alpha" (with or without a
      ``datasource_name`` in the request) and four for asset "beta";
    * exactly one batch definition for "alpha" when the request is narrowed
      to ``year_dir="2020"``, ``month_dir="03"``.

    NOTE(review): the bucket is created through boto3 directly; presumably the
    test runs under an S3 mock that is not visible in this block -- confirm.
    """
    aws_region: str = "us-east-1"
    bucket_name: str = "test_bucket"
    boto3.resource("s3", region_name=aws_region).create_bucket(Bucket=bucket_name)
    s3_client = boto3.client("s3", region_name=aws_region)

    # Every object gets the same tiny CSV body, so serialize it only once.
    frame: pd.DataFrame = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
    csv_payload: bytes = frame.to_csv(index=False).encode("utf-8")

    object_keys: List[str] = [
        "2020/01/alpha-1001.csv",
        "2020/01/beta-1002.csv",
        "2020/02/alpha-1003.csv",
        "2020/02/beta-1004.csv",
        "2020/03/alpha-1005.csv",
        "2020/03/beta-1006.csv",
        "2020/04/beta-1007.csv",
    ]
    for object_key in object_keys:
        s3_client.put_object(Bucket=bucket_name, Body=csv_payload, Key=object_key)

    connector: InferredAssetS3DataConnector = InferredAssetS3DataConnector(
        name="my_data_connector",
        datasource_name="FAKE_DATASOURCE_NAME",
        default_regex={
            "pattern": r"(\d{4})/(\d{2})/(.+)-\d+\.csv",
            "group_names": ["year_dir", "month_dir", "data_asset_name"],
        },
        bucket=bucket_name,
        prefix="",
    )

    # noinspection PyProtectedMember
    connector._refresh_data_references_cache()

    def batch_definitions_for(**request_fields):
        """Build a BatchRequest from the given fields and run it on the connector."""
        return connector.get_batch_definition_list_from_batch_request(
            batch_request=BatchRequest(**request_fields)
        )

    # Request naming a datasource other than the connector's own.
    with pytest.raises(ValueError):
        batch_definitions_for(
            datasource_name="non_existent_datasource",
            data_connector_name="my_data_connector",
            data_asset_name="my_data_asset",
        )

    # Request naming a data connector other than this one.
    with pytest.raises(ValueError):
        batch_definitions_for(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="non_existent_data_connector",
            data_asset_name="my_data_asset",
        )

    # "alpha" appears in three of the uploaded keys.
    alpha_batches = batch_definitions_for(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
    )
    assert len(alpha_batches) == 3

    # Same lookup with datasource_name left out of the request.
    alpha_batches_without_datasource = batch_definitions_for(
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
    )
    assert len(alpha_batches_without_datasource) == 3

    # "beta" appears in four of the uploaded keys.
    beta_batches = batch_definitions_for(
        data_connector_name="my_data_connector",
        data_asset_name="beta",
    )
    assert len(beta_batches) == 4

    # Narrowing by partition identifiers should leave a single definition.
    march_alpha_batches = batch_definitions_for(
        datasource_name="FAKE_DATASOURCE_NAME",
        data_connector_name="my_data_connector",
        data_asset_name="alpha",
        partition_request={
            "partition_identifiers": {
                "year_dir": "2020",
                "month_dir": "03",
            }
        },
    )
    assert march_alpha_batches == [
        BatchDefinition(
            datasource_name="FAKE_DATASOURCE_NAME",
            data_connector_name="my_data_connector",
            data_asset_name="alpha",
            partition_definition=PartitionDefinition(
                year_dir="2020",
                month_dir="03",
            ),
        )
    ]