def test_it_runs_for_parquet_composite_matches(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
    )
    item = del_queue_factory(
        [
            {
                "Column": "user_info.personal_information.first_name",
                "Value": "John",
            },
            {
                "Column": "user_info.personal_information.last_name",
                "Value": "Doe",
            },
        ],
        "id123",
        matchid_type="Composite",
        data_mappers=["test"],
    )
    object_key = "test/2019/08/20/test.parquet"
    data_loader(
        "basic.parquet", object_key, Metadata={"foo": "bar"}, CacheControl="cache"
    )
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "34567"))
    assert 2 == len(list(bucket.object_versions.filter(Prefix=object_key)))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
def test_it_runs_for_complex_types(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory("test", column_identifiers=["user_info.name"])
    item = del_queue_factory("matteo")
    object_key = "test/test.parquet"
    data_loader(
        "basic.parquet", object_key, Metadata={"foo": "bar"}, CacheControl="cache"
    )
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
def test_it_supports_data_access_roles(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
    data_access_role,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
        delete_old_versions=False,
        role_arn=data_access_role["Arn"],
    )
    item = del_queue_factory("12345")
    object_key = "test/2019/08/20/test.parquet"
    data_loader("basic.parquet", object_key)
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
def test_it_does_not_permit_unversioned_buckets(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_finished_waiter,
    job_table,
    s3_resource,
):
    try:
        # Arrange
        s3_resource.BucketVersioning(dummy_lake["bucket_name"]).suspend()
        glue_data_mapper_factory(
            "test",
            partition_keys=["year", "month", "day"],
            partitions=[["2019", "08", "20"]],
        )
        item = del_queue_factory("12345")
        object_key = "test/2019/08/20/test.parquet"
        data_loader("basic.parquet", object_key)
        bucket = dummy_lake["bucket"]
        job_id = job_factory(del_queue_items=[item], delete_previous_versions=False)[
            "Id"
        ]
        # Act
        job_finished_waiter.wait(
            TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
        )
        # Assert
        tmp = tempfile.NamedTemporaryFile()
        bucket.download_fileobj(object_key, tmp)
        assert (
            "FORGET_PARTIALLY_FAILED"
            == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"][
                "JobStatus"
            ]
        )
        assert 1 == len(query_parquet_file(tmp, "customer_id", "12345"))
    finally:
        s3_resource.BucketVersioning(dummy_lake["bucket_name"]).enable()
def test_it_runs_for_partitioned_data_with_non_string_partitions(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "10", "20"]],
        partition_key_types="int",
    )
    item = del_queue_factory("12345")
    object_key = "test/2019/10/20/test.parquet"
    data_loader(
        "basic.parquet", object_key, Metadata={"foo": "bar"}, CacheControl="cache"
    )
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "34567"))
    assert 2 == len(list(bucket.object_versions.filter(Prefix=object_key)))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
def test_it_handles_injection_attacks(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Generate a parquet file and add it to the lake
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
    )
    glue_data_mapper_factory(
        "test2", database="acceptancetests2", table="acceptancetests2"
    )
    legit_match_id = "12345"
    object_key = "test/2019/08/20/test.parquet"
    data_loader("basic.parquet", object_key)
    bucket = dummy_lake["bucket"]
    """
    Using single quotes as part of the match_id could be a SQL injection attack
    for reading information from other tables. While this should be prevented by
    configuring IAM, it is appropriate to test that the query_handler properly
    escapes the quotes and Athena doesn't access other tables.
    """
    cross_db_access = "foo')) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in ('12345"
    """
    The same attack as above, this time with the single quotes written as escaped
    characters, which the query_handler must neutralize in the same way.
    """
    cross_db_escaped = "foo\')) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in (\'12345"
    """
    Unicode smuggling is handled out of the box. Here is a test with "ʼ", which
    looks similar to a single quote.
    """
    unicode_smuggling = "fooʼ)) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in (ʼ12345"
    """
    Another common SQLi attack vector consists of fragmented attacks: tamper with
    the result of the SELECT by commenting out relevant match_ids with "--" after
    a successful escape. This attack doesn't work because Athena escapes single
    quotes by doubling them rather than using a backslash.
    Example: ... WHERE (user_id in ('foo', '\')) --','legit'))
    """
    commenting = ["\'", ")) --", legit_match_id]
    new_lines = ["\n--", legit_match_id, "\n"]
    del_queue_items = []
    for i in [
        legit_match_id,
        cross_db_access,
        cross_db_escaped,
        unicode_smuggling,
        *commenting,
        *new_lines,
    ]:
        del_queue_items.append(del_queue_factory(i))
    job_id = job_factory(del_queue_items=del_queue_items)["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(
        job_table.query(
            KeyConditionExpression=Key("Id").eq(job_id),
            ScanIndexForward=True,
            Limit=20,
            FilterExpression=Attr("Type").eq("JobEvent")
            & Attr("EventName").eq("ObjectUpdated"),
            ExclusiveStartKey={"Id": job_id, "Sk": str(0)},
        )["Items"]
    )
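# Illustrative sketch (not part of the solution under test): the injection test
# above relies on the standard SQL convention of escaping a single quote inside
# a string literal by doubling it. A query handler built on that convention
# might render match IDs roughly as below; this helper is a hypothetical
# example, not the project's actual query_handler implementation.
def _escape_match_id_example(match_id: str) -> str:
    # Double any single quote so Athena treats it as a literal character,
    # then wrap the value in quotes for use inside an IN (...) clause.
    return "'{}'".format(match_id.replace("'", "''"))


# Example: _escape_match_id_example("foo')) --") == "'foo'')) --'"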
def test_it_deletes_old_versions(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
    data_access_role,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
        delete_old_versions=True,
        role_arn=data_access_role["Arn"],
    )
    item = del_queue_factory("12345")
    object_key = "test/2019/08/20/test.parquet"
    bucket = dummy_lake["bucket"]
    # Create the object, add a deletion marker, then recreate it
    data_loader("basic.parquet", object_key)
    bucket.Object(object_key).delete()
    data_loader("basic.parquet", object_key)
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(list(bucket.object_versions.filter(Prefix=object_key)))