def handler(event, context):
    '''Report whether the job named by ``event['jobid']`` has finished.

    Marker objects ``<jobid>.job_started``, ``<jobid>.error`` and
    ``<jobid>.success`` are looked up in the bucket named by
    ``event['config']['log_bucket']``. ``context`` is not used.

    Returns:
        ``event`` itself, with a ``'postrunjson'`` entry added, once both the
        success marker and ``<jobid>.postrun.json`` exist.

    Raises:
        EC2StartingException: the ``job_started`` marker is missing.
        AWSEMJobErrorException: the ``error`` marker is present.
        Exception: the success marker exists but the postrun json does not.
        StillRunningException: the job started but has no success/error marker.
    '''
    # Strictly the limit is 32,768; stay a little under it to be safe.
    RESPONSE_JSON_CONTENT_INCLUSION_LIMIT = 30000

    # The log bucket is passed for all three s3Utils arguments.
    bucket_name = event['config']['log_bucket']
    s3 = s3_utils.s3Utils(bucket_name, bucket_name, bucket_name)

    # Object keys derived from the job id.
    jobid = event['jobid']
    started_key = "%s.job_started" % jobid
    success_key = "%s.success" % jobid
    error_key = "%s.error" % jobid
    log_key = "%s.log" % jobid
    postrun_key = "%s.postrun.json" % jobid

    # URLs that are only used inside error messages.
    log_url = "https://s3.amazonaws.com/%s/%s" % (bucket_name, log_key)
    postrun_url = "https://s3.amazonaws.com/%s/%s" % (bucket_name, postrun_key)

    # No start marker: the job has not begun yet.
    if not s3.does_key_exist(started_key):
        raise EC2StartingException(
            "Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # An error marker takes precedence over everything below.
    if s3.does_key_exist(error_key):
        raise AWSEMJobErrorException(
            "Job encountered an error check log at %s" % log_url)

    # Neither failed nor finished: signal the caller with the retry exception.
    if not s3.does_key_exist(success_key):
        raise StillRunningException("job %s still running" % jobid)

    if not s3.does_key_exist(postrun_key):
        raise Exception("Postrun json not found at %s" % postrun_url)

    postrun = json.loads(s3.read_s3(postrun_key))

    # Embed the full postrun json only if the combined size stays below the
    # limit; otherwise keep just the job output plus an explanatory note.
    combined_size = len(str(postrun)) + len(str(event))
    if combined_size >= RESPONSE_JSON_CONTENT_INCLUSION_LIMIT:
        attached = {
            'log': 'postrun json not included due to data size limit',
            'Job': {'Output': postrun['Job']['Output']},
        }
    else:
        attached = postrun
    event['postrunjson'] = attached

    print("completed successfully")
    return event
from core.utils import (
    powerup,
    StillRunningException,
    AWSEMJobErrorException,
)
import pytest
import mock


@powerup("wrapped_fun", mock.Mock(side_effect=StillRunningException("metadata")))
def wrapped_fun(event, context):
    """Raise ``StillRunningException`` unconditionally.

    Registered through ``powerup`` together with a ``Mock`` whose side effect
    is a different ``StillRunningException("metadata")``. The message below
    suggests this body is not expected to run -- NOTE(review): confirm against
    ``powerup``.
    """
    failure = StillRunningException("I should not be called")
    raise failure


# this will raise an error
@powerup("update_ffmeta_awsem", mock.Mock())
def update_ffmeta_error_fun(event, context):
    """Raise a plain ``Exception`` under the ``update_ffmeta_awsem`` name."""
    failure = Exception("I should raise an error")
    raise failure


@powerup("error_fun", mock.Mock())
def error_fun(event, context):
    """Raise a plain ``Exception`` under the ``error_fun`` name."""
    failure = Exception("lambda made a mess")
    raise failure


@powerup("awsem_error_fun", mock.Mock())
def awsem_error_fun(event, context):
    """Raise an ``AWSEMJobErrorException`` constructed with no arguments."""
    failure = AWSEMJobErrorException()
    raise failure
def wrapped_fun(event, context):
    """Raise ``StillRunningException`` unconditionally; no argument is read."""
    failure = StillRunningException("I should not be called")
    raise failure
def handler(event, context):
    '''Check on the job named by ``event['jobid']`` and report its state.

    Marker objects ``<jobid>.job_started``, ``<jobid>.error`` and
    ``<jobid>.success`` are looked up in the bucket named by
    ``event['config']['log_bucket']``. When ``event`` carries a non-empty
    ``'instance_id'``, the EC2 instance is inspected as well. ``context`` is
    not used.

    Returns:
        ``event``, after ``handle_postrun_json`` has been called on it, once
        the success marker exists.

    Raises:
        EC2StartingException: the ``job_started`` marker is missing.
        AWSEMJobErrorException: the ``error`` marker is present.
        EC2UnintendedTerminationException: the instance cannot be found, or
            its state is ``stopped``, ``shutting-down`` or ``terminated``.
        EC2IdleException: the instance looked idle for the past hour and the
            attempt to terminate it failed.
        StillRunningException: none of the above applied.
    '''
    # s3 bucket that stores the output
    bucket_name = event['config']['log_bucket']

    # marker keys for this job
    jobid = event['jobid']
    job_started = "%s.job_started" % jobid
    job_success = "%s.success" % jobid
    job_error = "%s.error" % jobid

    # the job must have started, otherwise fail
    if not does_key_exist(bucket_name, job_started):
        raise EC2StartingException(
            "Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # report a failed job (postrun json is handled first, with the 4th arg False)
    if does_key_exist(bucket_name, job_error):
        handle_postrun_json(bucket_name, jobid, event, False)
        raise AWSEMJobErrorException(
            "Job encountered an error check log using invoke log --job-id=%s" % jobid)

    # report a completed job
    if does_key_exist(bucket_name, job_success):
        handle_postrun_json(bucket_name, jobid, event)
        print("completed successfully")
        return event

    # Check whether the instance was terminated for no reason. The instance
    # checks are skipped entirely when the event carries no instance_id.
    instance_id = event.get('instance_id', '')
    if instance_id:
        try:
            res = boto3.client('ec2').describe_instances(
                InstanceIds=[instance_id])
        except Exception as e:
            if 'InvalidInstanceID.NotFound' in str(e):
                raise EC2UnintendedTerminationException(
                    "EC2 is no longer found for job %s - please rerun." % jobid)
            # Any other failure is propagated unchanged; a bare ``raise``
            # keeps the original traceback.
            raise

        if not res['Reservations']:
            raise EC2UnintendedTerminationException(
                "EC2 is no longer found for job %s - please rerun." % jobid)

        ec2_state = res['Reservations'][0]['Instances'][0]['State']['Name']
        if ec2_state in ['stopped', 'shutting-down', 'terminated']:
            errmsg = "EC2 is terminated unintendedly for job %s - please rerun." \
                % jobid
            printlog(errmsg)
            raise EC2UnintendedTerminationException(errmsg)

        # check CPU utilization for the past hour
        filesystem = '/dev/nvme1n1'  # doesn't matter for cpu utilization
        end = datetime.now(tzutc())
        start = end - timedelta(hours=1)
        jobstart_time = boto3.client('s3').get_object(
            Bucket=bucket_name, Key=job_started).get('LastModified')

        # Only judge idleness when the start marker is more than an hour old.
        if jobstart_time + timedelta(hours=1) < end:
            cw_res = TibannaResource(instance_id, filesystem, start, end).as_dict()
            if 'max_cpu_utilization_percent' in cw_res:
                max_cpu = cw_res['max_cpu_utilization_percent']
                if not max_cpu or max_cpu < 1.0:
                    # The instance wasn't terminated - otherwise it would have
                    # been captured by the state check above.
                    # NOTE(review): a successful termination is silent here and
                    # falls through to StillRunningException - confirm intended.
                    try:
                        boto3.client('ec2').terminate_instances(
                            InstanceIds=[instance_id])
                    except Exception as e:
                        # The whole literal is parenthesised and formatted with
                        # a 3-tuple. Previously ``%`` bound only to the second
                        # literal with a single argument, which raised
                        # TypeError instead of EC2IdleException.
                        errmsg = (
                            "Nothing has been running for the past hour for job %s,"
                            "but cannot terminate the instance (cpu utilization (%s) : %s"
                            % (jobid, str(max_cpu), str(e))
                        )
                        printlog(errmsg)
                        raise EC2IdleException(errmsg)

    # if none of the above
    raise StillRunningException("job %s still running" % jobid)