def _aggregate_stats(events):
    # Roll up per-event query and object statistics into a single Counter
    stats = Counter({})

    for event in events:
        event_name = event["EventName"]
        event_data = event.get("EventData", {})
        if event_name in ["QuerySucceeded", "QueryFailed"]:
            stats += Counter(
                {
                    "TotalQueryCount": 1,
                    "TotalQuerySucceededCount": 1 if event_name == "QuerySucceeded" else 0,
                    "TotalQueryFailedCount": 1 if event_name == "QueryFailed" else 0,
                    "TotalQueryScannedInBytes": event_data.get("Statistics", {}).get(
                        "DataScannedInBytes", 0
                    ),
                    "TotalQueryTimeInMillis": event_data.get("Statistics", {}).get(
                        "EngineExecutionTimeInMillis", 0
                    ),
                }
            )
        if event_name in [
            "ObjectUpdated",
            "ObjectUpdateFailed",
            "ObjectRollbackFailed",
        ]:
            stats += Counter(
                {
                    "TotalObjectUpdatedCount": 1 if event_name == "ObjectUpdated" else 0,
                    "TotalObjectUpdateFailedCount": 1 if event_name == "ObjectUpdateFailed" else 0,
                    "TotalObjectRollbackFailedCount": 1 if event_name == "ObjectRollbackFailed" else 0,
                }
            )
        if event_name == "ObjectUpdateFailed":
            # TODO: this can only work when deleteOldVersion flag is True for all involved data mappers
            try:
                data_access_role_arn = event_data["Message"].get(
                    "RoleArn", getenv("DataAccessRoleArn")
                )
                session = get_session(data_access_role_arn)
                client = session.client("s3")
                s3_object = event_data["Message"]["Object"]
                input_bucket, input_key = parse_s3_url(s3_object)
                # Restore the previous version of the object that failed to update
                revert_last(client, input_bucket, input_key)
            except Exception:
                logger.exception("Unable to revert last")
            try:
                # Stop further processing by clearing the deletion queue
                purge_queue(deletion_queue_url)
            except Exception:
                logger.exception("Unable to purge queue")

    return stats
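# A hedged usage sketch for _aggregate_stats: the event payloads below are
# illustrative assumptions based only on the keys the function reads, not
# captured from the real event producers. Note that Counter addition drops
# zero-valued keys, so counters for event types that never occur are absent
# from the result.
def test_it_aggregates_query_and_object_events():
    events = [
        {
            "EventName": "QuerySucceeded",
            "EventData": {
                "Statistics": {
                    "DataScannedInBytes": 1024,
                    "EngineExecutionTimeInMillis": 500,
                }
            },
        },
        {"EventName": "ObjectUpdated", "EventData": {}},
    ]
    stats = _aggregate_stats(events)
    assert stats["TotalQueryCount"] == 1
    assert stats["TotalQuerySucceededCount"] == 1
    assert stats["TotalQueryScannedInBytes"] == 1024
    assert stats["TotalQueryTimeInMillis"] == 500
    assert stats["TotalObjectUpdatedCount"] == 1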
def _load_value(value):
    # Values are provided as S3 URLs; fetch the referenced object and deserialise its JSON body
    parsed_bucket, parsed_key = parse_s3_url(value)
    logger.info("Loading data from S3 key %s", parsed_key)
    obj = s3.Object(parsed_bucket, parsed_key).get()["Body"].read()
    return json.loads(obj)
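# The helpers above rely on module-level names that are not shown in this
# excerpt (s3, logger, deletion_queue_url, plus the get_session, parse_s3_url,
# revert_last and purge_queue helpers). A minimal setup sketch for the directly
# constructible ones; the environment variable name is an assumption:
import json
import logging
from collections import Counter
from os import getenv

import boto3

logger = logging.getLogger(__name__)
s3 = boto3.resource("s3")
deletion_queue_url = getenv("DeletionQueueUrl")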
def test_it_throws_for_invalid_urls():
    with pytest.raises(ValueError):
        parse_s3_url("not a url")
    with pytest.raises(ValueError):
        parse_s3_url(["s3://", "not", "string"])
def execute(queue_url, message_body, receipt_handle):
    logger.info("Message received")
    queue = get_queue(queue_url)
    msg = queue.Message(receipt_handle)
    try:
        # Parse and validate incoming message
        validate_message(message_body)
        body = json.loads(message_body)
        session = get_session(body.get("RoleArn"))
        client = session.client("s3")
        cols, object_path, job_id = itemgetter("Columns", "Object", "JobId")(body)
        input_bucket, input_key = parse_s3_url(object_path)
        validate_bucket_versioning(client, input_bucket)
        creds = session.get_credentials().get_frozen_credentials()
        s3 = s3fs.S3FileSystem(
            key=creds.access_key,
            secret=creds.secret_key,
            token=creds.token,
            default_cache_type="none",
            requester_pays=True,
            default_fill_cache=False,
            version_aware=True,
        )
        # Download the object in-memory and convert to PyArrow NativeFile
        logger.info("Downloading and opening %s object in-memory", object_path)
        with s3.open(object_path, "rb") as f:
            source_version = f.version_id
            logger.info("Using object version %s as source", source_version)
            infile = load_parquet(f)
        # Write new file in-memory
        logger.info("Generating new parquet file without matches")
        out_sink, stats = delete_matches_from_file(infile, cols)
        if stats["DeletedRows"] == 0:
            raise ValueError(
                "The object {} was processed successfully but no rows required deletion".format(
                    object_path
                )
            )
        with pa.BufferReader(out_sink.getvalue()) as output_buf:
            new_version = save(
                s3, client, output_buf, input_bucket, input_key, source_version
            )
        logger.info("New object version: %s", new_version)
        verify_object_versions_integrity(
            client, input_bucket, input_key, source_version, new_version
        )
        if body.get("DeleteOldVersions"):
            logger.info(
                "Deleting object {} versions older than version {}".format(
                    input_key, new_version
                )
            )
            delete_old_versions(client, input_bucket, input_key, new_version)
        # Only acknowledge the message once the redacted object has been saved and verified
        msg.delete()
        emit_deletion_event(body, stats)
    # Map known failure modes to descriptive error messages and report them via handle_error
    except (KeyError, ArrowException) as e:
        err_message = "Parquet processing error: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IOError as e:
        err_message = "Unable to retrieve object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except MemoryError as e:
        err_message = "Insufficient memory to work on object: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except ClientError as e:
        err_message = "ClientError: {}".format(str(e))
        if e.operation_name == "PutObjectAcl":
            err_message += ". Redacted object uploaded successfully but unable to restore WRITE ACL"
        if e.operation_name == "ListObjectVersions":
            err_message += ". Could not verify redacted object version integrity"
        handle_error(msg, message_body, err_message)
    except ValueError as e:
        err_message = "Unprocessable message: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except DeleteOldVersionsError as e:
        err_message = "Unable to delete previous versions: {}".format(str(e))
        handle_error(msg, message_body, err_message)
    except IntegrityCheckFailedError as e:
        err_description, client, bucket, key, version_id = e.args
        err_message = "Object version integrity check failed: {}".format(err_description)
        handle_error(msg, message_body, err_message)
        # The new version could not be verified, so roll it back rather than leave it in place
        rollback_object_version(
            client,
            bucket,
            key,
            version_id,
            on_error=lambda err: handle_error(
                None, "{}", err, "ObjectRollbackFailed", False
            ),
        )
    except Exception as e:
        err_message = "Unknown error during message processing: {}".format(str(e))
        handle_error(msg, message_body, err_message)
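# execute() consumes a single SQS message describing one object to redact.
# Below is a minimal, hypothetical polling loop that could drive it; the
# DELETE_OBJECTS_QUEUE environment variable name, batch size and wait time are
# assumptions, not necessarily how the real entry point is wired.
import os

import boto3

def message_loop():
    queue_url = os.getenv("DELETE_OBJECTS_QUEUE")
    queue = boto3.resource("sqs").Queue(queue_url)
    while True:
        # Long-poll to avoid busy-waiting on an empty queue
        for message in queue.receive_messages(
            MaxNumberOfMessages=1, WaitTimeSeconds=20
        ):
            execute(queue_url, message.body, message.receipt_handle)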
def test_it_parses_s3_url():
    assert ["bucket", "test/key"] == parse_s3_url("s3://bucket/test/key")
    assert ["bucket", "key"] == parse_s3_url("s3://bucket/key")
    assert ["bucket"] == parse_s3_url("s3://bucket")
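# parse_s3_url itself is not shown in this excerpt. A minimal sketch that is
# consistent with the two tests above (a hypothetical implementation, assuming
# plain "s3://bucket[/key]" URLs):
def parse_s3_url(s3_url):
    if not isinstance(s3_url, str) or not s3_url.startswith("s3://"):
        raise ValueError("Invalid S3 URL: {}".format(s3_url))
    # "s3://bucket/key/with/slashes" -> ["bucket", "key/with/slashes"]
    return s3_url[len("s3://"):].split("/", 1)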