Example #1
def test_it_runs_for_parquet_composite_matches(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
    )
    item = del_queue_factory(
        [
            {
                "Column": "user_info.personal_information.first_name",
                "Value": "John"
            },
            {
                "Column": "user_info.personal_information.last_name",
                "Value": "Doe"
            },
        ],
        "id123",
        matchid_type="Composite",
        data_mappers=["test"],
    )
    object_key = "test/2019/08/20/test.parquet"
    data_loader("basic.parquet",
                object_key,
                Metadata={"foo": "bar"},
                CacheControl="cache")
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert ("COMPLETED" == job_table.get_item(Key={
        "Id": job_id,
        "Sk": job_id
    })["Item"]["JobStatus"])
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "34567"))
    assert 2 == len(list(bucket.object_versions.filter(Prefix=object_key)))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
Example #2
def test_it_runs_for_complex_types(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory("test", column_identifiers=["user_info.name"])
    item = del_queue_factory("matteo")
    object_key = "test/test.parquet"
    data_loader(
        "basic.parquet", object_key, Metadata={"foo": "bar"}, CacheControl="cache"
    )
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert (
        "COMPLETED"
        == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    )
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
Example #3
def test_it_supports_data_access_roles(del_queue_factory, job_factory,
                                       dummy_lake, glue_data_mapper_factory,
                                       data_loader, job_complete_waiter,
                                       job_table, data_access_role):
    # Arrange
    glue_data_mapper_factory("test",
                             partition_keys=["year", "month", "day"],
                             partitions=[["2019", "08", "20"]],
                             delete_old_versions=False,
                             role_arn=data_access_role["Arn"])
    item = del_queue_factory("12345")
    object_key = "test/2019/08/20/test.parquet"
    data_loader("basic.parquet", object_key)
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(TableName=job_table.name,
                             Key={
                                 "Id": {
                                     "S": job_id
                                 },
                                 "Sk": {
                                     "S": job_id
                                 }
                             })
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert "COMPLETED" == job_table.get_item(Key={
        "Id": job_id,
        "Sk": job_id
    })["Item"]["JobStatus"]
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
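Each example blocks on a job_complete_waiter (or job_finished_waiter) fixture that polls the DynamoDB job table until JobStatus reaches a terminal state. The fixture itself is not shown; a minimal sketch of how such a waiter could be assembled with botocore's waiter model, using hypothetical names and thresholds (the real fixture may be built differently):

import boto3
from botocore.waiter import WaiterModel, create_waiter_with_client

waiter_config = {
    "version": 2,
    "waiters": {
        "JobComplete": {
            "operation": "GetItem",
            "delay": 5,           # seconds between polls
            "maxAttempts": 120,
            "acceptors": [
                {
                    "matcher": "path",
                    "argument": "Item.JobStatus.S",
                    "expected": "COMPLETED",
                    "state": "success",
                },
                {
                    "matcher": "path",
                    "argument": "Item.JobStatus.S",
                    "expected": "FORGET_PARTIALLY_FAILED",
                    "state": "failure",
                },
            ],
        }
    },
}

dynamodb = boto3.client("dynamodb")
job_complete_waiter = create_waiter_with_client(
    "JobComplete", WaiterModel(waiter_config), dynamodb
)
# Usage mirrors the tests:
# job_complete_waiter.wait(TableName=..., Key={"Id": {"S": job_id}, "Sk": {"S": job_id}})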
Example #4
def test_it_does_not_permit_unversioned_buckets(
        del_queue_factory, job_factory, dummy_lake, glue_data_mapper_factory,
        data_loader, job_finished_waiter, job_table, s3_resource):
    try:
        # Arrange
        s3_resource.BucketVersioning(dummy_lake["bucket_name"]).suspend()
        glue_data_mapper_factory("test",
                                 partition_keys=["year", "month", "day"],
                                 partitions=[["2019", "08", "20"]])
        item = del_queue_factory("12345")
        object_key = "test/2019/08/20/test.parquet"
        data_loader("basic.parquet", object_key)
        bucket = dummy_lake["bucket"]
        job_id = job_factory(del_queue_items=[item],
                             delete_previous_versions=False)["Id"]
        # Act
        job_finished_waiter.wait(TableName=job_table.name,
                                 Key={
                                     "Id": {
                                         "S": job_id
                                     },
                                     "Sk": {
                                         "S": job_id
                                     }
                                 })
        # Assert
        tmp = tempfile.NamedTemporaryFile()
        bucket.download_fileobj(object_key, tmp)
        assert "FORGET_PARTIALLY_FAILED" == job_table.get_item(Key={
            "Id": job_id,
            "Sk": job_id
        })["Item"]["JobStatus"]
        assert 1 == len(query_parquet_file(tmp, "customer_id", "12345"))
    finally:
        s3_resource.BucketVersioning(dummy_lake["bucket_name"]).enable()
Example #5
def test_it_runs_for_partitioned_data_with_non_string_partitions(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "10", "20"]],
        partition_key_types="int",
    )
    item = del_queue_factory("12345")
    object_key = "test/2019/10/20/test.parquet"
    data_loader("basic.parquet",
                object_key,
                Metadata={"foo": "bar"},
                CacheControl="cache")
    bucket = dummy_lake["bucket"]
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert ("COMPLETED" == job_table.get_item(Key={
        "Id": job_id,
        "Sk": job_id
    })["Item"]["JobStatus"])
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "23456"))
    assert 1 == len(query_parquet_file(tmp, "customer_id", "34567"))
    assert 2 == len(list(bucket.object_versions.filter(Prefix=object_key)))
    assert {"foo": "bar"} == bucket.Object(object_key).metadata
    assert "cache" == bucket.Object(object_key).cache_control
Example #6
def test_it_handles_injection_attacks(del_queue_factory, job_factory, dummy_lake, glue_data_mapper_factory, data_loader,
                                      job_complete_waiter, job_table):
    # Generate a parquet file and add it to the lake
    glue_data_mapper_factory("test", partition_keys=["year", "month", "day"], partitions=[["2019", "08", "20"]])
    glue_data_mapper_factory("test2", database="acceptancetest2", table="acceptancetest2")
    legit_match_id = "12345"
    object_key = "test/2019/08/20/test.parquet"
    data_loader("basic.parquet", object_key)
    bucket = dummy_lake["bucket"]
    """
    Using single quotes as part of the match_id could be a SQL injection attack for reading information from other 
    tables. While this should be prevented by configuring IAM, it is appropriate to test that the query_handler properly
    escapes the quotes and Athena doesn't access other tables.
    """
    cross_db_access = "foo')) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in ('12345"
    """
    Using single quotes as part of the match_id could be a SQL injection attack for reading information from other 
    tables. While this should be prevented by configuring IAM, it is appropriate to test that the query_handler properly
    escapes the quotes and Athena doesn't access other tables.
    """
    cross_db_escaped = "foo\')) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in (\'12345"
    """
    Unicode smuggling is taken care out of the box. Here is a test with "ʼ", which is similar to single quote.
    """
    unicode_smuggling = "fooʼ)) UNION (select * from acceptancetests2.acceptancetests2 where customer_id in (ʼ12345"
    """
    Another common SQLi attack vector consists on fragmented attacks. Tamper the result of the select by commenting 
    out relevant match_ids by using "--" after a successful escape. This attack wouldn't work because Athena's
    way to escape single quotes are by doubling them rather than using backslash.
    Example: ... WHERE (user_id in ('foo', '\')) --','legit'))
    """
    commenting = ["\'", ")) --", legit_match_id]
    new_lines = ["\n--", legit_match_id, "\n"]
    del_queue_items = []
    for i in [legit_match_id, cross_db_access, cross_db_escaped, unicode_smuggling, *commenting, *new_lines]:
        del_queue_items.append(del_queue_factory(i))
    job_id = job_factory(del_queue_items=del_queue_items)["Id"]
    # Act
    job_complete_waiter.wait(TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}})
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert "COMPLETED" == job_table.get_item(Key={"Id": job_id, "Sk": job_id})["Item"]["JobStatus"]
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(job_table.query(
        KeyConditionExpression=Key('Id').eq(job_id),
        ScanIndexForward=True,
        Limit=20,
        FilterExpression=Attr('Type').eq("JobEvent") & Attr('EventName').eq("ObjectUpdated"),
        ExclusiveStartKey={
            "Id": job_id,
            "Sk": str(0)
        }
    )["Items"])
Example #7
def test_it_deletes_old_versions(
    del_queue_factory,
    job_factory,
    dummy_lake,
    glue_data_mapper_factory,
    data_loader,
    job_complete_waiter,
    job_table,
    data_access_role,
):
    # Arrange
    glue_data_mapper_factory(
        "test",
        partition_keys=["year", "month", "day"],
        partitions=[["2019", "08", "20"]],
        delete_old_versions=True,
        role_arn=data_access_role["Arn"],
    )
    item = del_queue_factory("12345")
    object_key = "test/2019/08/20/test.parquet"
    bucket = dummy_lake["bucket"]
    # Create the object, add a deletion marker, then recreate it
    data_loader("basic.parquet", object_key)
    bucket.Object("basic.parquet").delete()
    data_loader("basic.parquet", object_key)
    job_id = job_factory(del_queue_items=[item])["Id"]
    # Act
    job_complete_waiter.wait(
        TableName=job_table.name, Key={"Id": {"S": job_id}, "Sk": {"S": job_id}}
    )
    # Assert
    tmp = tempfile.NamedTemporaryFile()
    bucket.download_fileobj(object_key, tmp)
    assert ("COMPLETED" == job_table.get_item(Key={
        "Id": job_id,
        "Sk": job_id
    })["Item"]["JobStatus"])
    assert 0 == len(query_parquet_file(tmp, "customer_id", "12345"))
    assert 1 == len(list(bucket.object_versions.filter(Prefix=object_key)))
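The final assertion counts the versions that remain for the key after the job has run with delete_old_versions=True. When checking this by hand, the low-level ListObjectVersions call makes the distinction between surviving versions and delete markers explicit; a short sketch, with a hypothetical bucket name:

import boto3

s3 = boto3.client("s3")

# Hypothetical bucket name; the tests get theirs from the dummy_lake fixture.
response = s3.list_object_versions(
    Bucket="my-data-lake-bucket", Prefix="test/2019/08/20/test.parquet"
)
print(len(response.get("Versions", [])))       # expected: 1 after the job
print(len(response.get("DeleteMarkers", [])))  # expected: 0 after the job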