Exemple #1
0
def handler(event, context):
    '''
    Report the state of the job named by event['jobid'].

    The state is read from marker objects in the bucket given by
    event['config']['log_bucket']: <jobid>.job_started, <jobid>.error,
    <jobid>.success and <jobid>.postrun.json.

    Returns the event, with event['postrunjson'] filled in, once the success
    marker exists. Raises EC2StartingException when the started marker is
    missing, AWSEMJobErrorException when the error marker exists,
    StillRunningException when there is no success marker yet, and a plain
    Exception when the job succeeded but its postrun json is missing.
    '''
    # strictly the limit is 32,768 but stay below it just to be safe.
    RESPONSE_JSON_CONTENT_INCLUSION_LIMIT = 30000

    # the same bucket is passed for all three s3Utils arguments
    log_bucket = event['config']['log_bucket']
    s3 = s3_utils.s3Utils(log_bucket, log_bucket, log_bucket)

    # names of the marker / output objects for this job
    jobid = event['jobid']
    started_key = "%s.job_started" % jobid
    success_key = "%s.success" % jobid
    error_key = "%s.error" % jobid
    log_key = "%s.log" % jobid
    postrun_key = "%s.postrun.json" % jobid

    def s3_url(key):
        return "https://s3.amazonaws.com/%s/%s" % (log_bucket, key)

    # no started marker: the job has not begun yet
    if not s3.does_key_exist(started_key):
        raise EC2StartingException("Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # an error marker means the job failed; point the caller at the log
    if s3.does_key_exist(error_key):
        raise AWSEMJobErrorException("Job encountered an error check log at %s" % s3_url(log_key))

    # no success marker yet: signal the caller to retry later
    if not s3.does_key_exist(success_key):
        raise StillRunningException("job %s still running" % jobid)

    # the job succeeded, so its postrun json must be there
    if not s3.does_key_exist(postrun_key):
        raise Exception("Postrun json not found at %s" % s3_url(postrun_key))

    postrun = json.loads(s3.read_s3(postrun_key))
    response_size = len(str(event)) + len(str(postrun))
    if response_size >= RESPONSE_JSON_CONTENT_INCLUSION_LIMIT:
        # too large to embed in full: keep only the job output plus a note
        event['postrunjson'] = {
            'log': 'postrun json not included due to data size limit',
            'Job': {'Output': postrun['Job']['Output']},
        }
    else:
        event['postrunjson'] = postrun

    print("completed successfully")
    return event
Exemple #2
0
from core.utils import (
    powerup,
    StillRunningException,
    AWSEMJobErrorException,
)
import pytest
import mock


@powerup("wrapped_fun", mock.Mock(side_effect=StillRunningException("metadata")))
def wrapped_fun(event, context):
    """Test fixture: a powerup-wrapped function whose body always raises.

    The mock passed as powerup's second argument raises
    StillRunningException("metadata") whenever it is called. Presumably the
    decorator calls that mock before running this body, so the exception
    raised here should never surface -- confirm against core.utils.powerup.
    """
    message = "I should not be called"
    raise StillRunningException(message)


# this will raise an error
@powerup("update_ffmeta_awsem", mock.Mock())
def update_ffmeta_error_fun(event, context):
    """Test fixture: always raise a plain Exception.

    Registered with powerup under the name "update_ffmeta_awsem" and a
    default Mock as the second argument; neither event nor context is read.
    """
    message = "I should raise an error"
    raise Exception(message)


@powerup('error_fun', mock.Mock())
def error_fun(event, context):
    """Test fixture: always raise a plain Exception.

    Registered with powerup under the name 'error_fun' and a default Mock
    as the second argument; neither event nor context is read.
    """
    message = "lambda made a mess"
    raise Exception(message)


@powerup('awsem_error_fun', mock.Mock())
def awsem_error_fun(event, context):
    """Test fixture: always raise an AWSEMJobErrorException built with no arguments.

    Registered with powerup under the name 'awsem_error_fun' and a default
    Mock as the second argument; neither event nor context is read.
    """
    job_error = AWSEMJobErrorException()
    raise job_error

Exemple #3
0
def wrapped_fun(event, context):
    """Always raise StillRunningException; neither argument is read."""
    message = "I should not be called"
    raise StillRunningException(message)
Exemple #4
0
def handler(event, context):
    '''
    Report the state of the job named by event['jobid'].

    The state is read from marker objects in the bucket given by
    event['config']['log_bucket']: <jobid>.job_started, <jobid>.error and
    <jobid>.success. When event carries a non-empty 'instance_id', the EC2
    instance is also checked for unexpected termination and for idleness.

    Returns the event once the success marker exists (after
    handle_postrun_json has been called on it).

    Raises:
        EC2StartingException: the started marker does not exist yet.
        AWSEMJobErrorException: the error marker exists.
        EC2UnintendedTerminationException: the instance cannot be found or is
            stopped / shutting-down / terminated.
        EC2IdleException: the instance looked idle for the past hour but
            could not be terminated.
        StillRunningException: none of the above applies.
    '''

    # s3 bucket that stores the output
    bucket_name = event['config']['log_bucket']

    # names of the marker objects for this job
    jobid = event['jobid']
    job_started = "%s.job_started" % jobid
    job_success = "%s.success" % jobid
    job_error = "%s.error" % jobid

    # the job must have started, otherwise fail
    if not does_key_exist(bucket_name, job_started):
        raise EC2StartingException(
            "Failed to find jobid %s, ec2 is probably still booting" % jobid)

    # the job reported an error
    if does_key_exist(bucket_name, job_error):
        # NOTE(review): the trailing False presumably relaxes the postrun json
        # requirement for failed jobs - confirm against handle_postrun_json.
        handle_postrun_json(bucket_name, jobid, event, False)
        raise AWSEMJobErrorException(
            "Job encountered an error check log using invoke log --job-id=%s" %
            jobid)

    # the job completed
    if does_key_exist(bucket_name, job_success):
        handle_postrun_json(bucket_name, jobid, event)
        print("completed successfully")
        return event

    # instance checks are skipped when no instance_id is given in the event
    instance_id = event.get('instance_id', '')
    if instance_id:
        _raise_if_instance_gone(instance_id, jobid)
        _terminate_if_idle(bucket_name, job_started, instance_id, jobid)

    # if none of the above
    raise StillRunningException("job %s still running" % jobid)


def _raise_if_instance_gone(instance_id, jobid):
    '''Raise EC2UnintendedTerminationException if the instance is missing or no longer running.'''
    try:
        res = boto3.client('ec2').describe_instances(
            InstanceIds=[instance_id])
    except Exception as e:
        if 'InvalidInstanceID.NotFound' in str(e):
            raise EC2UnintendedTerminationException(
                "EC2 is no longer found for job %s - please rerun." % jobid)
        # any other failure is not ours to interpret; re-raise it unchanged
        raise

    if not res['Reservations']:
        raise EC2UnintendedTerminationException(
            "EC2 is no longer found for job %s - please rerun." % jobid)

    ec2_state = res['Reservations'][0]['Instances'][0]['State']['Name']
    if ec2_state in ['stopped', 'shutting-down', 'terminated']:
        errmsg = "EC2 is terminated unintendedly for job %s - please rerun." % jobid
        printlog(errmsg)
        raise EC2UnintendedTerminationException(errmsg)


def _terminate_if_idle(bucket_name, job_started, instance_id, jobid):
    '''Terminate the instance if its max CPU utilization over the past hour is missing or below 1.0.

    Only applies when the started marker is more than one hour old. Raises
    EC2IdleException when the terminate call fails; returns None otherwise,
    including after a successful termination.
    '''
    filesystem = '/dev/nvme1n1'  # doesn't matter for cpu utilization
    end = datetime.now(tzutc())
    start = end - timedelta(hours=1)
    jobstart_time = boto3.client('s3').get_object(
        Bucket=bucket_name, Key=job_started).get('LastModified')
    if jobstart_time + timedelta(hours=1) >= end:
        # the job started less than an hour ago; no idle check yet
        return

    cw_res = TibannaResource(instance_id, filesystem, start, end).as_dict()
    if 'max_cpu_utilization_percent' not in cw_res:
        return

    max_cpu = cw_res['max_cpu_utilization_percent']
    if not max_cpu or max_cpu < 1.0:
        # the instance wasn't terminated - otherwise the liveness check
        # would already have raised.
        try:
            boto3.client('ec2').terminate_instances(
                InstanceIds=[instance_id])
        except Exception as e:
            # Format the whole message against one tuple: '%' binds tighter
            # than '+', so formatting only the second literal would fail.
            errmsg = ("Nothing has been running for the past hour for job %s, "
                      "but cannot terminate the instance (cpu utilization %s): %s"
                      % (jobid, str(max_cpu), str(e)))
            printlog(errmsg)
            raise EC2IdleException(errmsg)