def test_it_rejects_json_with_dot_in_keys(mock_get_details, mock_get_format,
                                          mock_get_location,
                                          get_existing_s3_locations):
    """Check that ``validate_mapper`` refuses ``dots.in.keys`` set to TRUE.

    The mocked format lookup returns the OpenX JSON SerDe together with a
    parameter dict containing ``"dots.in.keys": "TRUE"``, and the mocked
    existing-locations lookup returns an empty list. The test passes when
    ``handlers.validate_mapper`` raises ``ValueError`` whose first argument
    equals the expected message.
    """
    table_location = "s3://bucket/prefix/"
    serde_parameters = {"dots.in.keys": "TRUE"}
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    expected_message = ("The parameter dots.in.keys cannot be TRUE for "
                        "SerDe library org.openx.data.jsonserde.JsonSerDe")

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = table_location
    mock_get_format.return_value = (
        "org.openx.data.jsonserde.JsonSerDe",
        serde_parameters,
    )

    with pytest.raises(ValueError) as raised:
        handlers.validate_mapper(mapper)

    actual_message = raised.value.args[0]
    assert actual_message == expected_message
def test_it_rejects_json_with_column_mapping(mock_get_details, mock_get_format,
                                             mock_get_location,
                                             get_existing_s3_locations):
    """Check that ``validate_mapper`` refuses JSON SerDe column mappings.

    The mocked format lookup returns the OpenX JSON SerDe with parameters
    ``case.insensitive`` = ``"FALSE"`` and ``mapping.userid`` = ``"userId"``;
    the mocked existing-locations lookup returns an empty list. The test
    passes when ``handlers.validate_mapper`` raises ``ValueError`` whose
    first argument equals the expected message.
    """
    table_location = "s3://bucket/prefix/"
    serde_parameters = {
        "case.insensitive": "FALSE",
        "mapping.userid": "userId"
    }
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    expected_message = ("Column mappings are not supported for "
                        "SerDe library org.openx.data.jsonserde.JsonSerDe")

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = table_location
    mock_get_format.return_value = (
        "org.openx.data.jsonserde.JsonSerDe",
        serde_parameters,
    )

    with pytest.raises(ValueError) as raised:
        handlers.validate_mapper(mapper)

    actual_message = raised.value.args[0]
    assert actual_message == expected_message
def test_it_rejects_not_supported_tables(mock_get_details, mock_get_format,
                                         mock_get_location,
                                         get_existing_s3_locations):
    """Check that ``validate_mapper`` refuses a table using OpenCSVSerde.

    The mocked format lookup returns the OpenCSV SerDe with a
    ``field.delim`` parameter; the mocked existing-locations lookup returns
    an empty list. The test passes when ``handlers.validate_mapper`` raises
    ``ValueError`` whose first argument equals the expected message listing
    the accepted SerDe libraries.
    """
    table_location = "s3://bucket/prefix/"
    serde_parameters = {"field.delim": ","}
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    expected_message = (
        "The format for the specified table is not supported. "
        "The SerDe lib must be one of org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe,"
        " org.apache.hive.hcatalog.data.JsonSerDe, org.openx.data.jsonserde.JsonSerDe"
    )

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = table_location
    mock_get_format.return_value = (
        "org.apache.hadoop.hive.serde2.OpenCSVSerde",
        serde_parameters,
    )

    with pytest.raises(ValueError) as raised:
        handlers.validate_mapper(mapper)

    actual_message = raised.value.args[0]
    assert actual_message == expected_message
def test_it_rejects_overlapping_s3_paths(mock_get_details, mock_get_format,
                                         mock_get_location,
                                         get_existing_s3_locations):
    """Check that ``validate_mapper`` refuses an already-covered S3 location.

    The mocked existing-locations lookup returns the same
    ``s3://bucket/prefix/`` value that the mocked location lookup returns,
    and the mocked format lookup returns the Parquet Hive SerDe. The test
    passes when ``handlers.validate_mapper`` raises ``ValueError`` whose
    first argument equals the expected message.
    """
    # NOTE(review): a second function with this exact name is defined later
    # in this file; within a single module the later definition replaces
    # this one, so this test would not be collected. Confirm and rename.
    table_location = "s3://bucket/prefix/"
    serde_parameters = {"serialization.format": "1"}
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    expected_message = (
        "A data mapper already exists which covers this S3 location")

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    mock_get_location.return_value = table_location
    mock_get_format.return_value = (
        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
        serde_parameters,
    )

    with pytest.raises(ValueError) as raised:
        handlers.validate_mapper(mapper)

    actual_message = raised.value.args[0]
    assert actual_message == expected_message
# Example #5
# 0
def test_it_rejects_non_existent_glue_tables(mock_get_details,
                                             get_existing_s3_locations):
    """Check that a ``ClientError`` from the table lookup propagates.

    The mocked table-details lookup is configured to raise ``ClientError``
    carrying an HTTP 404 status for the ``get_table`` operation. The test
    passes when ``handlers.validate_mapper`` lets that ``ClientError``
    escape.
    """
    mapper = {
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    error_response = {"ResponseMetadata": {"HTTPStatusCode": 404}}

    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    # Simulate raising an exception for table not existing
    mock_get_details.side_effect = ClientError(error_response, "get_table")

    with pytest.raises(ClientError):
        handlers.validate_mapper(mapper)
# Example #6
# 0
def test_it_rejects_non_parquet_tables(mock_get_details, mock_get_format,
                                       mock_get_location,
                                       get_existing_s3_locations):
    """Check that ``validate_mapper`` raises ``ValueError`` for a JSON table.

    The mocked format lookup returns a three-item tuple naming the text
    input format, the Hive text output format and the OpenX JSON SerDe;
    the mocked existing-locations lookup returns an empty list. Only the
    exception type is asserted, not its message.
    """
    table_location = "s3://bucket/prefix/"
    table_format = (
        "org.apache.hadoop.mapred.TextInputFormat",
        "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "org.openx.data.jsonserde.JsonSerDe",
    )
    mapper = {
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = table_location
    mock_get_format.return_value = table_format

    with pytest.raises(ValueError):
        handlers.validate_mapper(mapper)
def test_it_rejects_overlapping_s3_paths(mock_get_details, mock_get_format,
                                         mock_get_location,
                                         get_existing_s3_locations):
    """Check that ``validate_mapper`` raises for an already-covered location.

    The mocked existing-locations lookup returns the same
    ``s3://bucket/prefix/`` value that the mocked location lookup returns,
    and the mocked format lookup returns a three-item tuple naming the
    Parquet input format, output format and SerDe. Only the exception type
    (``ValueError``) is asserted, not its message.
    """
    # NOTE(review): an earlier function in this file has this exact name;
    # within a single module this definition replaces the earlier one.
    # Confirm and rename one of them.
    table_location = "s3://bucket/prefix/"
    table_format = (
        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    )
    mapper = {
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test"
        },
    }

    # Wire up the mocked collaborators before exercising the handler.
    mock_get_details.return_value = get_table_stub(
        {"Location": table_location})
    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    mock_get_location.return_value = table_location
    mock_get_format.return_value = table_format

    with pytest.raises(ValueError):
        handlers.validate_mapper(mapper)