Example #1
from collections import Counter
from os import getenv


def _aggregate_stats(events):
    # get_session, parse_s3_url, revert_last, purge_queue, logger and
    # deletion_queue_url are helpers/globals defined elsewhere in the module.
    stats = Counter()

    for event in events:
        event_name = event["EventName"]
        event_data = event.get("EventData", {})
        if event_name in ["QuerySucceeded", "QueryFailed"]:
            statistics = event_data.get("Statistics", {})
            stats += Counter({
                "TotalQueryCount": 1,
                "TotalQuerySucceededCount": 1 if event_name == "QuerySucceeded" else 0,
                "TotalQueryFailedCount": 1 if event_name == "QueryFailed" else 0,
                "TotalQueryScannedInBytes": statistics.get("DataScannedInBytes", 0),
                "TotalQueryTimeInMillis": statistics.get("EngineExecutionTimeInMillis", 0),
            })
        if event_name in ["ObjectUpdated", "ObjectUpdateFailed", "ObjectRollbackFailed"]:
            stats += Counter({
                "TotalObjectUpdatedCount": 1 if event_name == "ObjectUpdated" else 0,
                "TotalObjectUpdateFailedCount": 1 if event_name == "ObjectUpdateFailed" else 0,
                "TotalObjectRollbackFailedCount": 1 if event_name == "ObjectRollbackFailed" else 0,
            })
            if event_name == "ObjectUpdateFailed":
                # TODO: this can only work when the deleteOldVersion flag is
                # True for all involved data mappers
                try:
                    data_access_role_arn = event_data["Message"].get(
                        "RoleArn", getenv("DataAccessRoleArn"))
                    session = get_session(data_access_role_arn)
                    client = session.client("s3")
                    s3_object = event_data["Message"]["Object"]
                    input_bucket, input_key = parse_s3_url(s3_object)
                    revert_last(client, input_bucket, input_key)
                except Exception:
                    logger.exception("Unable to revert last")
                    try:
                        purge_queue(deletion_queue_url)
                    except Exception:
                        logger.exception("Unable to purge queue")

    return stats
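
A minimal usage sketch for _aggregate_stats. The event names and the
Statistics fields come from the function body above; the sample values
themselves are made up.

sample_events = [
    {
        "EventName": "QuerySucceeded",
        "EventData": {
            "Statistics": {
                "DataScannedInBytes": 1024,
                "EngineExecutionTimeInMillis": 250,
            }
        },
    },
    {"EventName": "QueryFailed", "EventData": {}},
    {"EventName": "ObjectUpdated", "EventData": {}},
]

stats = _aggregate_stats(sample_events)
# Counter({'TotalQueryScannedInBytes': 1024, 'TotalQueryTimeInMillis': 250,
#          'TotalQueryCount': 2, 'TotalQuerySucceededCount': 1,
#          'TotalQueryFailedCount': 1, 'TotalObjectUpdatedCount': 1})
# Note: Counter addition drops zero-valued keys, so counters that never
# incremented (e.g. TotalObjectUpdateFailedCount) are absent.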

Example #2
import json


def _load_value(value):
    # s3 is assumed to be a module-level boto3 S3 service resource,
    # e.g. s3 = boto3.resource("s3").
    parsed_bucket, parsed_key = parse_s3_url(value)
    logger.info("Loading data from S3 key %s", parsed_key)
    obj = s3.Object(parsed_bucket, parsed_key).get()["Body"].read()
    return json.loads(obj)
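
A hypothetical call, assuming the object at this made-up URL contains JSON:

config = _load_value("s3://my-bucket/config/settings.json")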

Example #3
import pytest


def test_it_throws_for_invalid_urls():
    with pytest.raises(ValueError):
        parse_s3_url("not a url")
    with pytest.raises(ValueError):
        parse_s3_url(["s3://", "not", "string"])

Example #4
import json
from operator import itemgetter

import pyarrow as pa
import s3fs
from botocore.exceptions import ClientError
from pyarrow.lib import ArrowException


def execute(queue_url, message_body, receipt_handle):
    # validate_message, get_queue, get_session, parse_s3_url, load_parquet,
    # delete_matches_from_file, save, verify_object_versions_integrity,
    # delete_old_versions, emit_deletion_event, handle_error,
    # rollback_object_version, the custom exception types and logger are
    # defined elsewhere in the module.
    logger.info("Message received")
    queue = get_queue(queue_url)
    msg = queue.Message(receipt_handle)
    try:
        # Parse and validate incoming message
        validate_message(message_body)
        body = json.loads(message_body)
        session = get_session(body.get("RoleArn"))
        client = session.client("s3")
        cols, object_path, job_id = itemgetter("Columns", "Object", "JobId")(body)
        input_bucket, input_key = parse_s3_url(object_path)
        validate_bucket_versioning(client, input_bucket)
        creds = session.get_credentials().get_frozen_credentials()
        s3 = s3fs.S3FileSystem(key=creds.access_key,
                               secret=creds.secret_key,
                               token=creds.token,
                               default_cache_type='none',
                               requester_pays=True,
                               default_fill_cache=False,
                               version_aware=True)
        # Download the object in-memory and convert to PyArrow NativeFile
        logger.info("Downloading and opening %s object in-memory", object_path)
        with s3.open(object_path, "rb") as f:
            source_version = f.version_id
            logger.info("Using object version %s as source", source_version)
            infile = load_parquet(f)
            # Write new file in-memory
            logger.info("Generating new parquet file without matches")
            out_sink, stats = delete_matches_from_file(infile, cols)
        if stats["DeletedRows"] == 0:
            raise ValueError(
                "The object {} was processed successfully but no rows required deletion"
                .format(object_path))
        with pa.BufferReader(out_sink.getvalue()) as output_buf:
            new_version = save(s3, client, output_buf, input_bucket, input_key,
                               source_version)
            logger.info("New object version: %s", new_version)
            verify_object_versions_integrity(client, input_bucket, input_key,
                                             source_version, new_version)
        if body.get("DeleteOldVersions"):
            logger.info("Deleting object %s versions older than version %s",
                        input_key, new_version)
            delete_old_versions(client, input_bucket, input_key, new_version)
        msg.delete()
        emit_deletion_event(body, stats)
    except (KeyError, ArrowException) as e:
        err_message = "Parquet processing error: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IOError as e:
        err_message = "Unable to retrieve object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except MemoryError as e:
        err_message = "Insufficient memory to work on object: {}".format(
            str(e))
        handle_error(msg, message_body, err_message)
    except ClientError as e:
        err_message = "ClientError: {}".format(str(e))
        if e.operation_name == "PutObjectAcl":
            err_message += ". Redacted object uploaded successfully but unable to restore WRITE ACL"
        if e.operation_name == "ListObjectVersions":
            err_message += ". Could not verify redacted object version integrity"
        handle_error(msg, message_body, err_message)
    except ValueError as e:
        err_message = "Unprocessable message: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except DeleteOldVersionsError as e:
        err_message = "Unable to delete previous versions: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IntegrityCheckFailedError as e:
        err_description, client, bucket, key, version_id = e.args
        err_message = "Object version integrity check failed: {}".format(
            err_description)
        handle_error(msg, message_body, err_message)
        rollback_object_version(
            client,
            bucket,
            key,
            version_id,
            on_error=lambda err: handle_error(None, "{}", err,
                                              "ObjectRollbackFailed", False))
    except Exception as e:
        err_message = "Unknown error during message processing: {}".format(
            str(e))
        handle_error(msg, message_body, err_message)
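
A hedged invocation sketch for execute. The queue URL, receipt handle and
message fields below are made up; the exact message schema (in particular
the shape of "Columns") is an assumption, not confirmed by the code above.

sample_body = json.dumps({
    "JobId": "job-123",
    "Object": "s3://my-bucket/data/part-0000.parquet",
    "Columns": [{"Column": "customer_id", "MatchIds": ["c-123"]}],  # assumed shape
    "RoleArn": "arn:aws:iam::123456789012:role/DataAccessRole",
    "DeleteOldVersions": True,
})
execute(
    "https://sqs.us-east-1.amazonaws.com/123456789012/deletion-queue",  # made up
    sample_body,
    "example-receipt-handle",
)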

Example #5
def test_it_parses_s3_url():
    assert ["bucket", "test/key"] == parse_s3_url("s3://bucket/test/key")
    assert ["bucket", "key"] == parse_s3_url("s3://bucket/key")
    assert ["bucket"] == parse_s3_url("s3://bucket")