def test_it_rejects_json_with_dot_in_keys(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails for an OpenX JSON SerDe table with dots.in.keys enabled."""
    # Glue table resolves to a clean, non-overlapping S3 location.
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = "s3://bucket/prefix/"
    # SerDe reports the unsupported dots.in.keys=TRUE option.
    mock_get_format.return_value = (
        "org.openx.data.jsonserde.JsonSerDe",
        {"dots.in.keys": "TRUE"},
    )
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    with pytest.raises(ValueError) as exc_info:
        handlers.validate_mapper(mapper)
    expected = ("The parameter dots.in.keys cannot be TRUE for "
                "SerDe library org.openx.data.jsonserde.JsonSerDe")
    assert exc_info.value.args[0] == expected
def test_it_rejects_json_with_column_mapping(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails for an OpenX JSON SerDe table that defines column mappings."""
    # Glue table resolves to a clean, non-overlapping S3 location.
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = "s3://bucket/prefix/"
    # SerDe reports a mapping.* parameter, which is unsupported.
    mock_get_format.return_value = (
        "org.openx.data.jsonserde.JsonSerDe",
        {"case.insensitive": "FALSE", "mapping.userid": "userId"},
    )
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    with pytest.raises(ValueError) as exc_info:
        handlers.validate_mapper(mapper)
    expected = ("Column mappings are not supported for "
                "SerDe library org.openx.data.jsonserde.JsonSerDe")
    assert exc_info.value.args[0] == expected
def test_it_rejects_not_supported_tables(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails when the table uses a SerDe lib outside the supported set."""
    # Glue table resolves to a clean, non-overlapping S3 location.
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = "s3://bucket/prefix/"
    # OpenCSVSerde is not in the supported SerDe list.
    mock_get_format.return_value = (
        "org.apache.hadoop.hive.serde2.OpenCSVSerde",
        {"field.delim": ","},
    )
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    with pytest.raises(ValueError) as exc_info:
        handlers.validate_mapper(mapper)
    expected = (
        "The format for the specified table is not supported. "
        "The SerDe lib must be one of org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe,"
        " org.apache.hive.hcatalog.data.JsonSerDe, org.openx.data.jsonserde.JsonSerDe"
    )
    assert exc_info.value.args[0] == expected
def test_it_rejects_overlapping_s3_paths(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails when another data mapper already covers the same S3 location."""
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    # An existing mapper already claims the exact same prefix.
    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    mock_get_location.return_value = "s3://bucket/prefix/"
    # Format itself is valid (Parquet) — the overlap is the only problem.
    mock_get_format.return_value = (
        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
        {"serialization.format": "1"},
    )
    mapper = {
        "DataMapperId": "1234",
        "Columns": ["column"],
        "QueryExecutor": "athena",
        "QueryExecutorParameters": {
            "DataCatalogProvider": "glue",
            "Database": "test",
            "Table": "test",
        },
    }
    with pytest.raises(ValueError) as exc_info:
        handlers.validate_mapper(mapper)
    assert exc_info.value.args[0] == "A data mapper already exists which covers this S3 location"
def test_it_rejects_non_existent_glue_tables(mock_get_details, get_existing_s3_locations):
    """The ClientError raised for a missing Glue table propagates to the caller."""
    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    # Simulate raising an exception for table not existing
    mock_get_details.side_effect = ClientError(
        {"ResponseMetadata": {"HTTPStatusCode": 404}},
        "get_table",
    )
    # NOTE(review): unlike the other tests, no "DataMapperId" key is supplied here —
    # presumably the lookup fails before the id is needed; confirm against validate_mapper.
    with pytest.raises(ClientError):
        handlers.validate_mapper({
            "Columns": ["column"],
            "QueryExecutor": "athena",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test",
                "Table": "test",
            },
        })
def test_it_rejects_non_parquet_tables(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails for a table whose format tuple describes a text-input table."""
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    get_existing_s3_locations.return_value = []
    mock_get_location.return_value = "s3://bucket/prefix/"
    # NOTE(review): 3-tuple (input fmt, output fmt, serde) differs from the
    # 2-tuple (serde, params) shape used by the other tests — verify which shape
    # get_format actually returns.
    mock_get_format.return_value = (
        "org.apache.hadoop.mapred.TextInputFormat",
        "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "org.openx.data.jsonserde.JsonSerDe",
    )
    with pytest.raises(ValueError):
        handlers.validate_mapper({
            "Columns": ["column"],
            "QueryExecutor": "athena",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test",
                "Table": "test",
            },
        })
def test_it_rejects_overlapping_s3_paths_parquet_format_tuple(mock_get_details, mock_get_format, mock_get_location, get_existing_s3_locations):
    """Validation fails for an S3 location overlap when get_format returns the
    (input format, output format, serde) tuple shape.

    Renamed: this function previously duplicated the name
    ``test_it_rejects_overlapping_s3_paths`` (defined earlier in this file),
    so Python rebound the name and pytest silently never collected or ran the
    first version. The distinct name lets both tests execute.
    """
    mock_get_details.return_value = get_table_stub({"Location": "s3://bucket/prefix/"})
    # An existing mapper already claims the exact same prefix.
    get_existing_s3_locations.return_value = ["s3://bucket/prefix/"]
    mock_get_location.return_value = "s3://bucket/prefix/"
    # NOTE(review): 3-tuple shape differs from the 2-tuple (serde, params) used
    # by the earlier overlap test — verify which shape get_format returns.
    mock_get_format.return_value = (
        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
        "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
        "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
    )
    with pytest.raises(ValueError):
        handlers.validate_mapper({
            "Columns": ["column"],
            "QueryExecutor": "athena",
            "QueryExecutorParameters": {
                "DataCatalogProvider": "glue",
                "Database": "test",
                "Table": "test"
            },
        })