Example #1
0
def update_usage_stats(app, job_id):
    """Append a fresh system-stats sample to the job's ``usage_stats`` field.

    Runs ``/home/ubuntu/task_engine/system_stats.sh`` with the current epoch
    time as its single argument, strips leading/trailing newlines from the
    script's stdout and appends the result to whatever ``usage_stats``
    already holds on the job's record (empty string when absent).

    Returns ``False`` when ``job_id`` is falsy.  Otherwise returns ``None``,
    both after the record was updated and when launching the script failed.
    """
    if not job_id:
        return False

    print("Updating usage_stats")

    try:
        stats_cmd = ["/home/ubuntu/task_engine/system_stats.sh",
                     "{0}".format(time.time())]
        # Only stdout is piped, so the stderr half of communicate() is unused.
        sample = subprocess.Popen(stats_cmd,
                                  stdout=subprocess.PIPE).communicate()[0]
    except Exception as e:
        print("Failed to run system_stats.sh")
        print("Caught exception : {0}".format(e))
        return

    # Refresh credentials before talking to the job table.
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    update_record(record, "usage_stats",
                  record.get("usage_stats", "") + sample.strip('\n'))
Example #2
0
def retract_job(jobid):
    """Unpublish job ``jobid`` when the logged-in user owns it.

    The session is read from ``bottle.request.environ['beaker.session']``.
    If the record's ``username`` differs from the session's ``username`` the
    error template is rendered and the record is left untouched.  Otherwise
    ``i_ispublished`` is set to ``'0'``, the record is saved with
    ``overwrite=True`` and the retract confirmation template is rendered.
    """
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)

    record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid)
    owner = record['username']
    print(owner)

    # Guard clause: only the owner may retract.
    if owner != session["username"]:
        message = (
            "You are not the owner of this job :{0} \n"
            "Insufficient permissions to retract job"
        ).format(jobid)
        return template('./views/error.tpl',
                        error_str=message,
                        session=session)

    print("Username matches")

    record["i_ispublished"] = '0'
    record.save(overwrite=True)

    return template("./views/retract_confirm.tpl",
                    job_id=jobid,
                    title="Retract Confirmation",
                    session=session)
Example #3
0
def update_job_for_publish(request, job_id):
    """Flag job ``job_id`` as published and store its publish metadata.

    Sets ``i_ispublished`` to ``'1'``, copies the stripped ``jobname`` and
    ``jobdesc`` POST fields into ``jobname`` / ``description``, stamps
    ``publishdate`` with the local time (``%Y-%m-%d %H:%M:%S``) and saves
    the record with ``overwrite=True``.  Always returns ``True``.

    A POST field that is missing makes ``.get()`` return ``None`` and the
    ``.strip()`` call raise ``AttributeError``.
    """
    print("Updating job for publish")
    record = dutils.dynamodb_get(request.app.config["dyno.conn"], job_id)

    record["i_ispublished"] = '1'
    # (record field, POST field) pairs copied over after stripping whitespace.
    for record_key, form_key in (("jobname", 'jobname'),
                                 ("description", 'jobdesc')):
        record[record_key] = request.POST.get(form_key).strip()
    record["publishdate"] = str(time.strftime('%Y-%m-%d %H:%M:%S'))

    record.save(overwrite=True)
    return True
Example #4
0
def update_job_for_publish(request, job_id):
    """Flag job ``job_id`` as published and store its publish metadata.

    Sets ``i_ispublished`` to ``'1'``, copies the stripped ``jobname`` and
    ``jobdesc`` POST fields into ``jobname`` / ``description``, stamps
    ``publishdate`` with the local time (``%Y-%m-%d %H:%M:%S``) and saves
    the record with ``overwrite=True``.  Always returns ``True``.

    A POST field that is missing makes ``.get()`` return ``None`` and the
    ``.strip()`` call raise ``AttributeError``.
    """
    print("Updating job for publish")
    record = dutils.dynamodb_get(request.app.config["dyno.conn"], job_id)

    record["i_ispublished"] = '1'
    # (record field, POST field) pairs copied over after stripping whitespace.
    for record_key, form_key in (("jobname", 'jobname'),
                                 ("description", 'jobdesc')):
        record[record_key] = request.POST.get(form_key).strip()
    record["publishdate"] = str(time.strftime('%Y-%m-%d %H:%M:%S'))

    record.save(overwrite=True)
    return True
Example #5
0
def cancel_task(app, jobid):
    """Mark job ``jobid`` as cancelled at the user's request.

    Writes ``status``, ``reason`` and ``cancel_time`` (local time,
    ``%Y-%m-%d %H:%M:%S``) on the job's record through ``update_record``,
    emits a debug line summarising the record and returns ``True``.
    """
    debug_print("Cancelling task : {0}".format(jobid))

    record = dutils.dynamodb_get(app.config["dyno.conn"], jobid)
    cancelled_at = str(time.strftime('%Y-%m-%d %H:%M:%S'))

    # Applied in this order, one update_record call per field.
    changes = (
        ("status", "cancelled"),
        ("reason", "User request cancel"),
        ("cancel_time", cancelled_at),
    )
    for field, value in changes:
        update_record(record, field, value)

    summary = "{0} - {1} - {2}".format(
        record["job_id"], record["status"], record["reason"])
    debug_print(summary)
    return True
Example #6
0
def cancel_task(app, jobid):
    """Mark job ``jobid`` as cancelled at the user's request.

    Writes ``status``, ``reason`` and ``cancel_time`` (local time,
    ``%Y-%m-%d %H:%M:%S``) on the job's record through ``update_record``,
    emits a debug line summarising the record and returns ``True``.
    """
    debug_print("Cancelling task : {0}".format(jobid))

    record = dutils.dynamodb_get(app.config["dyno.conn"], jobid)
    cancelled_at = str(time.strftime("%Y-%m-%d %H:%M:%S"))

    # Applied in this order, one update_record call per field.
    changes = (
        ("status", "cancelled"),
        ("reason", "User request cancel"),
        ("cancel_time", cancelled_at),
    )
    for field, value in changes:
        update_record(record, field, value)

    summary = "{0} - {1} - {2}".format(
        record["job_id"], record["status"], record["reason"])
    debug_print(summary)
    return True
Example #7
0
def status_task(app, jobid):
    """Print and return the ``status`` field of job ``jobid``.

    When the module-level ``GLOBAL_VERBOSE`` flag is truthy, every
    key/value pair of the record is printed first in a two-column layout.

    Raises ``KeyError`` (or whatever the record type raises for a missing
    key) if the record has no ``status`` entry.
    """
    debug_print("Status task : {0}".format(jobid))

    record = dutils.dynamodb_get(app.config["dyno.conn"], jobid)

    if GLOBAL_VERBOSE:
        for key, value in record.items():
            print("|{0:10}  | {1:50}".format(key, value))

    current_status = record["status"]
    print(current_status)
    return current_status
Example #8
0
def status_task(app, jobid):
    """Print and return the ``status`` field of job ``jobid``.

    When the module-level ``GLOBAL_VERBOSE`` flag is truthy, every
    key/value pair of the record is printed first in a two-column layout.

    Raises ``KeyError`` (or whatever the record type raises for a missing
    key) if the record has no ``status`` entry.
    """
    debug_print("Status task : {0}".format(jobid))

    record = dutils.dynamodb_get(app.config["dyno.conn"], jobid)

    if GLOBAL_VERBOSE:
        for key, value in record.items():
            print("|{0:10}  | {1:50}".format(key, value))

    current_status = record["status"]
    print(current_status)
    return current_status
Example #9
0
def check_if_cancelled(app, job_id):
    """Report whether job ``job_id`` currently has status ``"cancelled"``.

    Returns ``False`` straight away for a falsy ``job_id``.  Otherwise the
    credentials are refreshed, the job's record is fetched and its
    ``status`` compared against ``"cancelled"``; the outcome is printed and
    returned as a bool.
    """
    if not job_id:
        return False

    print("Statecheck")
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    is_cancelled = record["status"] == "cancelled"
    print("Cancelled" if is_cancelled else "Job not cancelled")
    return is_cancelled
Example #10
0
def check_if_cancelled(app, job_id):
    """Report whether job ``job_id`` currently has status ``"cancelled"``.

    Returns ``False`` straight away for a falsy ``job_id``.  Otherwise the
    credentials are refreshed, the job's record is fetched and its
    ``status`` compared against ``"cancelled"``; the outcome is printed and
    returned as a bool.
    """
    if not job_id:
        return False

    print("Statecheck")
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    is_cancelled = record["status"] == "cancelled"
    print("Cancelled" if is_cancelled else "Job not cancelled")
    return is_cancelled
Example #11
0
def update_usage_stats(app, job_id):
    """Append a fresh system-stats sample to the job's ``usage_stats`` field.

    Runs ``/home/ubuntu/task_engine/system_stats.sh`` with the current epoch
    time as its single argument, strips leading/trailing newlines from the
    script's stdout and appends the result to whatever ``usage_stats``
    already holds on the job's record (empty string when absent).

    Returns ``False`` when ``job_id`` is falsy.  Otherwise returns ``None``,
    both after the record was updated and when launching the script failed.
    """
    if not job_id:
        return False

    print("Updating usage_stats")

    try:
        stats_cmd = ["/home/ubuntu/task_engine/system_stats.sh",
                     "{0}".format(time.time())]
        # Only stdout is piped, so the stderr half of communicate() is unused.
        sample = subprocess.Popen(stats_cmd,
                                  stdout=subprocess.PIPE).communicate()[0]
    except Exception as e:
        print("Failed to run system_stats.sh")
        print("Caught exception : {0}".format(e))
        return

    # Refresh credentials before talking to the job table.
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    update_record(record, "usage_stats",
                  record.get("usage_stats", "") + sample.strip('\n'))
Example #12
0
def retract_job(jobid):
    """Unpublish job ``jobid`` when the logged-in user owns it.

    The session is read from ``bottle.request.environ['beaker.session']``.
    If the record's ``username`` differs from the session's ``username`` the
    error template is rendered and the record is left untouched.  Otherwise
    ``i_ispublished`` is set to ``'0'``, the record is saved with
    ``overwrite=True`` and the retract confirmation template is rendered.
    """
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)

    record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid)
    owner = record['username']
    print(owner)

    # Guard clause: only the owner may retract.
    if owner != session["username"]:
        message = (
            "You are not the owner of this job :{0} \n"
            "Insufficient permissions to retract job"
        ).format(jobid)
        return template('./views/error.tpl',
                        error_str=message,
                        session=session)

    print("Username matches")

    record["i_ispublished"] = '0'
    record.save(overwrite=True)

    return template("./views/retract_confirm.tpl",
                    job_id=jobid,
                    title="Retract Confirmation",
                    session=session)
Example #13
0
def watch_loop(app):
    """
    Watch_loop looks at the definition of the autoscaling_groups and the active queues
    to determine whether :
        1. An instance needs to be removed from the scaling group and terminated
        2. A task has been in the active queue for long and appears to have timed out
           and needs to be moved to the pending queue, for re-attempt.
           Why would a task fail ?
           -> Hard error in task causes worker to fail
           -> Instance was lost mid run

    For every active queue of this stack the messages are drained in batches
    of up to 10:
        * ``kill_request`` messages terminate the sending instance (via
          ``kill_instance``) when the group has more instances than its
          minimum; the message is deleted either way.
        * Messages whose job cannot be fetched from dynamodb are deleted.
        * Messages whose job is "completed" or "failed" are deleted.
        * Everything else is handed to ``check_job_status``.

    Returns None. Calls ``exit(0)`` when an active queue does not have
    exactly one matching pending queue.
    """
    conf_man.update_creds_from_metadata_server(app)
    stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"]
    autoscale = get_autoscale_info(app, stack_name)
    print(autoscale)

    # Select all relevant queues in our cloudformation stack
    queues = [
        q for q in app.config["sqs.conn"].get_all_queues()
        if q.name.startswith(stack_name)
    ]
    # Split them into active queues and everything else (pending queues)
    active_q = [q for q in queues if "Active" in q.name]
    pending_q = [q for q in queues if "Active" not in q.name]

    for q in active_q:

        print("Active queue :  {0}".format(q.name))

        if "Test" in q.name:
            qtype = "test"
        elif "Prod" in q.name:
            qtype = "prod"
        else:
            logging.error("Unknown queue : {0}".format(q.name))
            # NOTE(review): this stops processing of all remaining active
            # queues, not just this one -- confirm `continue` was not meant.
            break

        # Find the corresponding pending queue to the current active queue
        p_qs = [pq for pq in pending_q if qtype in pq.name.lower()]
        if len(p_qs) != 1:
            logging.error(
                "Expected exactly one pending queue, found {0} : {1}".format(
                    len(p_qs), p_qs))
            exit(0)
        p_q = p_qs[0]
        print("Pending queue : {0}".format(p_q))

        # Same dict object as autoscale[qtype]; callees may update it in place.
        group = autoscale[qtype]
        print("Instances in this group :  {0}".format(group["instances"]))
        for i in group["instances"]:
            print(i.health_status)
            print(i.lifecycle_state)

        # Drain the queue: keep fetching batches until none are returned.
        # NOTE(review): messages left in the queue become visible again after
        # visibility_timeout=2s, so this loop presumably relies on
        # check_job_status being quick -- confirm.
        while True:
            messages = q.get_messages(num_messages=10,
                                      visibility_timeout=2,
                                      wait_time_seconds=1,
                                      message_attributes=['All'])
            if not messages:
                break

            for msg in messages:
                attrs = msg.message_attributes
                job_id = attrs["job_id"]["string_value"]
                instance_id = attrs["instance_id"]["string_value"]

                # Check if message is a kill_request
                if job_id == "kill_request":
                    logging.info(
                        "Received a kill_request from : {0}".format(
                            instance_id))
                    # Are there more machines than the minimum
                    if group["current"] > group["min"]:
                        logging.info(
                            "Instances in autoscale group current:{0} > min:{1}"
                            .format(group["current"], group["min"]))
                        logging.info("Kill : {0}".format(instance_id))
                        kill_instance(app, instance_id, group)
                    else:
                        # We do not have excess machines. So no kill requests
                        # need to be made.
                        logging.info(
                            "Kill request ignored: {0}".format(instance_id))
                    # The kill_request message is consumed either way.
                    q.delete_message(msg)
                    continue

                # If message is not a kill_request it is an active job.
                # Check if job has timed-out or the machine it is running on
                # has accidentally terminated
                try:
                    record = dutils.dynamodb_get(app.config["dyno.conn"],
                                                 job_id)
                except Exception as e:
                    logging.debug(
                        "Job {0} not found in dynamodb ({1}) \nDeleting the message"
                        .format(job_id, e))
                    q.delete_message(msg)
                    # The message is gone; there is nothing left to check.
                    continue

                if record and record["status"] in ["completed", "failed"]:
                    logging.debug(
                        "Job {0} is {1} -> Deleting the active job message"
                        .format(job_id, record["status"]))
                    q.delete_message(msg)
                else:
                    logging.debug(
                        "Job_id: {0}  Active on Instance: {1}".format(
                            job_id, instance_id))
                    check_job_status(app, msg, job_id, instance_id, group, q,
                                     p_q)
Example #14
0
def watch_loop(app):
    """
    Watch_loop looks at the definition of the autoscaling_groups and the active queues
    to determine whether :
        1. An instance needs to be removed from the scaling group and terminated
        2. A task has been in the active queue for long and appears to have timed out
           and needs to be moved to the pending queue, for re-attempt.
           Why would a task fail ?
           -> Hard error in task causes worker to fail
           -> Instance was lost mid run

    For every active queue of this stack the messages are drained in batches
    of up to 10:
        * ``kill_request`` messages terminate the sending instance (via
          ``kill_instance``) when the group has more instances than its
          minimum; the message is deleted either way.
        * Messages whose job cannot be fetched from dynamodb are deleted.
        * Messages whose job is "completed" or "failed" are deleted.
        * Everything else is handed to ``check_job_status``.

    Returns None. Calls ``exit(0)`` when an active queue does not have
    exactly one matching pending queue.
    """
    conf_man.update_creds_from_metadata_server(app)
    stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"]
    autoscale = get_autoscale_info(app, stack_name)
    print(autoscale)

    # Select all relevant queues in our cloudformation stack
    queues = [
        q for q in app.config["sqs.conn"].get_all_queues()
        if q.name.startswith(stack_name)
    ]
    # Split them into active queues and everything else (pending queues)
    active_q = [q for q in queues if "Active" in q.name]
    pending_q = [q for q in queues if "Active" not in q.name]

    for q in active_q:

        print("Active queue :  {0}".format(q.name))

        if "Test" in q.name:
            qtype = "test"
        elif "Prod" in q.name:
            qtype = "prod"
        else:
            logging.error("Unknown queue : {0}".format(q.name))
            # NOTE(review): this stops processing of all remaining active
            # queues, not just this one -- confirm `continue` was not meant.
            break

        # Find the corresponding pending queue to the current active queue
        p_qs = [pq for pq in pending_q if qtype in pq.name.lower()]
        if len(p_qs) != 1:
            logging.error(
                "Expected exactly one pending queue, found {0} : {1}".format(
                    len(p_qs), p_qs))
            exit(0)
        p_q = p_qs[0]
        print("Pending queue : {0}".format(p_q))

        # Same dict object as autoscale[qtype]; callees may update it in place.
        group = autoscale[qtype]
        print("Instances in this group :  {0}".format(group["instances"]))
        for i in group["instances"]:
            print(i.health_status)
            print(i.lifecycle_state)

        # Drain the queue: keep fetching batches until none are returned.
        # NOTE(review): messages left in the queue become visible again after
        # visibility_timeout=2s, so this loop presumably relies on
        # check_job_status being quick -- confirm.
        while True:
            messages = q.get_messages(num_messages=10,
                                      visibility_timeout=2,
                                      wait_time_seconds=1,
                                      message_attributes=['All'])
            if not messages:
                break

            for msg in messages:
                attrs = msg.message_attributes
                job_id = attrs["job_id"]["string_value"]
                instance_id = attrs["instance_id"]["string_value"]

                # Check if message is a kill_request
                if job_id == "kill_request":
                    logging.info(
                        "Received a kill_request from : {0}".format(
                            instance_id))
                    # Are there more machines than the minimum
                    if group["current"] > group["min"]:
                        logging.info(
                            "Instances in autoscale group current:{0} > min:{1}"
                            .format(group["current"], group["min"]))
                        logging.info("Kill : {0}".format(instance_id))
                        kill_instance(app, instance_id, group)
                    else:
                        # We do not have excess machines. So no kill requests
                        # need to be made.
                        logging.info(
                            "Kill request ignored: {0}".format(instance_id))
                    # The kill_request message is consumed either way.
                    q.delete_message(msg)
                    continue

                # If message is not a kill_request it is an active job.
                # Check if job has timed-out or the machine it is running on
                # has accidentally terminated
                try:
                    record = dutils.dynamodb_get(app.config["dyno.conn"],
                                                 job_id)
                except Exception as e:
                    logging.debug(
                        "Job {0} not found in dynamodb ({1}) \nDeleting the message"
                        .format(job_id, e))
                    q.delete_message(msg)
                    # The message is gone; there is nothing left to check.
                    continue

                if record and record["status"] in ["completed", "failed"]:
                    logging.debug(
                        "Job {0} is {1} -> Deleting the active job message"
                        .format(job_id, record["status"]))
                    q.delete_message(msg)
                else:
                    logging.debug(
                        "Job_id: {0}  Active on Instance: {1}".format(
                            job_id, instance_id))
                    check_job_status(app, msg, job_id, instance_id, group, q,
                                     p_q)
Example #15
0
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data,
             auth):
    """Run one job inside a scratch directory and record its progress.

    Stages, each reflected on the job's record through ``update_record``:
        1. ``get_inputs(app, inputs, auth)``     -> status "staging_inputs"
        2. ``apps.JOBS[jobtype](app, data)``     -> status "processing"
        3. ``put_outputs(app, outputs)``         -> status "staging_outputs"
        4. final status "completed" (job did not raise and returned 0) or
           "failed"

    The work happens in ``/tmp/task_executor_jobs/<job_id>``; the original
    working directory is restored on every exit path.  The scratch directory
    is removed at the end only when the module-level ``clean_tmp_dirs`` flag
    is truthy.

    Args:
        app: application object; ``app.config["dyno.conn"]`` is used to
            fetch the job record.
        jobtype: key into ``apps.JOBS`` selecting the callable to run.
        job_id: id of the job record; also names the scratch directory.
        executable: unused here; kept for interface compatibility.
        args: unused here; kept for interface compatibility.
        inputs: forwarded to ``get_inputs``.
        outputs: forwarded to ``put_outputs``.
        data: forwarded to the job callable.
        auth: forwarded to ``get_inputs``.

    Returns:
        False when staging inputs fails, the jobtype is unknown or uploading
        outputs fails.  True otherwise -- including when the job itself
        failed; in that case the failure is only visible in the record.
    """
    conf_man.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    ##############################################################################
    # Notify job execution start time
    ##############################################################################
    update_record(record, "start_time", time.time())

    ##############################################################################
    # Setup dirs for execution
    ##############################################################################
    cwd = os.getcwd()
    tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id)
    try:
        os.makedirs(tmpdir)
    except OSError:
        print("Tmpdir {0} exists. Deleting and recreating".format(tmpdir))
        shutil.rmtree(tmpdir)
        os.makedirs(tmpdir)
    os.chdir(tmpdir)

    try:
        ##########################################################################
        # Download the inputs to the temp folder
        ##########################################################################
        update_record(record, "status", "staging_inputs")
        stagein_start = time.time()
        try:
            get_inputs(app, inputs, auth)
        except Exception as e:
            print("Exception info : {0}".format(sys.exc_info()[0]))
            update_record(record, "ERROR",
                          "Failed to download inputs {0}".format(e))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            logging.error("Failed to download inputs")
            return False
        stagein_total = time.time() - stagein_start

        ##########################################################################
        # Run the job
        ##########################################################################
        # Check if job is valid; record the failure so the job does not stay
        # in a non-terminal state forever.
        if jobtype not in apps.JOBS:
            logging.error("Jobtype : {0} does not exist".format(jobtype))
            print("Unable to process jobtype : {0}".format(jobtype))
            update_record(record, "ERROR",
                          "Jobtype : {0} does not exist".format(jobtype))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            return False
        update_record(record, "status", "processing")
        print("JOBS :  {0}".format(apps.JOBS[jobtype]))

        job_ok = True
        returncode = 0
        process_start = time.time()
        try:
            returncode = apps.JOBS[jobtype](app, data)
            print("Returncode : {0}".format(returncode))
            conf_man.update_creds_from_metadata_server(app)
        except Exception as e:
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            update_record(record, "ERROR", str(e))
            print("Job execution failed : {0}".format(e))
            job_ok = False
        process_total = time.time() - process_start

        ##########################################################################
        # Upload the results to the S3
        ##########################################################################
        # Outputs are uploaded even when the job failed.
        record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
        update_record(record, "status", "staging_outputs")
        stageout_start = time.time()
        try:
            put_outputs(app, outputs)
        except Exception as e:
            print("Exception info : {0}".format(sys.exc_info()[0]))
            update_record(record, "ERROR",
                          "Failed to upload outputs {0}".format(e))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            logging.error("Failed to upload outputs")
            return False
        stageout_total = time.time() - stageout_start

        update_record(record, "z_stagein_dur", stagein_total)
        update_record(record, "z_stageout_dur", stageout_total)
        # NOTE(review): the 1 second subtracted here is not explained by
        # anything in this function -- confirm where it comes from.
        update_record(record, "z_processing_dur", process_total - 1)

        ##########################################################################
        # Final status: a job that raised must not end up "completed"
        ##########################################################################
        if job_ok and returncode == 0:
            update_record(record, "status", "completed")
            update_record(record, "complete_time", time.time())
        else:
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            if returncode != 0:
                update_record(record, "ERROR_CODE", returncode)

        if clean_tmp_dirs:
            shutil.rmtree(tmpdir)
        # NOTE(review): True is returned even for a failed job, as before;
        # confirm with callers whether the job outcome should be returned.
        return True
    finally:
        # Chdir back to the original folder on every exit path
        os.chdir(cwd)
Example #16
0
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth):
    """Run one job inside a scratch directory and record its progress.

    Stages, each reflected on the job's record through ``update_record``:
        1. ``get_inputs(app, inputs, auth)``     -> status "staging_inputs"
        2. ``apps.JOBS[jobtype](app, data)``     -> status "processing"
        3. ``put_outputs(app, outputs)``         -> status "staging_outputs"
        4. final status "completed" (job did not raise and returned 0) or
           "failed"

    The work happens in ``/tmp/task_executor_jobs/<job_id>``; the original
    working directory is restored on every exit path.  The scratch directory
    is removed at the end only when the module-level ``clean_tmp_dirs`` flag
    is truthy.

    Args:
        app: application object; ``app.config["dyno.conn"]`` is used to
            fetch the job record.
        jobtype: key into ``apps.JOBS`` selecting the callable to run.
        job_id: id of the job record; also names the scratch directory.
        executable: unused here; kept for interface compatibility.
        args: unused here; kept for interface compatibility.
        inputs: forwarded to ``get_inputs``.
        outputs: forwarded to ``put_outputs``.
        data: forwarded to the job callable.
        auth: forwarded to ``get_inputs``.

    Returns:
        False when staging inputs fails, the jobtype is unknown or uploading
        outputs fails.  True otherwise -- including when the job itself
        failed; in that case the failure is only visible in the record.
    """
    conf_man.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    ##############################################################################
    # Notify job execution start time
    ##############################################################################
    update_record(record, "start_time", time.time())

    ##############################################################################
    # Setup dirs for execution
    ##############################################################################
    cwd = os.getcwd()
    tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id)
    try:
        os.makedirs(tmpdir)
    except OSError:
        print("Tmpdir {0} exists. Deleting and recreating".format(tmpdir))
        shutil.rmtree(tmpdir)
        os.makedirs(tmpdir)
    os.chdir(tmpdir)

    try:
        ##########################################################################
        # Download the inputs to the temp folder
        ##########################################################################
        update_record(record, "status", "staging_inputs")
        stagein_start = time.time()
        try:
            get_inputs(app, inputs, auth)
        except Exception as e:
            print("Exception info : {0}".format(sys.exc_info()[0]))
            update_record(record, "ERROR", "Failed to download inputs {0}".format(e))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            logging.error("Failed to download inputs")
            return False
        stagein_total = time.time() - stagein_start

        ##########################################################################
        # Run the job
        ##########################################################################
        # Check if job is valid; record the failure so the job does not stay
        # in a non-terminal state forever.
        if jobtype not in apps.JOBS:
            logging.error("Jobtype : {0} does not exist".format(jobtype))
            print("Unable to process jobtype : {0}".format(jobtype))
            update_record(record, "ERROR", "Jobtype : {0} does not exist".format(jobtype))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            return False
        update_record(record, "status", "processing")
        print("JOBS :  {0}".format(apps.JOBS[jobtype]))

        job_ok = True
        returncode = 0
        process_start = time.time()
        try:
            returncode = apps.JOBS[jobtype](app, data)
            print("Returncode : {0}".format(returncode))
            conf_man.update_creds_from_metadata_server(app)
        except Exception as e:
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            update_record(record, "ERROR", str(e))
            print("Job execution failed : {0}".format(e))
            job_ok = False
        process_total = time.time() - process_start

        ##########################################################################
        # Upload the results to the S3
        ##########################################################################
        # Outputs are uploaded even when the job failed.
        record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
        update_record(record, "status", "staging_outputs")
        stageout_start = time.time()
        try:
            put_outputs(app, outputs)
        except Exception as e:
            print("Exception info : {0}".format(sys.exc_info()[0]))
            update_record(record, "ERROR", "Failed to upload outputs {0}".format(e))
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            logging.error("Failed to upload outputs")
            return False
        stageout_total = time.time() - stageout_start

        update_record(record, "z_stagein_dur", stagein_total)
        update_record(record, "z_stageout_dur", stageout_total)
        # NOTE(review): the 1 second subtracted here is not explained by
        # anything in this function -- confirm where it comes from.
        update_record(record, "z_processing_dur", process_total - 1)

        ##########################################################################
        # Final status: a job that raised must not end up "completed"
        ##########################################################################
        if job_ok and returncode == 0:
            update_record(record, "status", "completed")
            update_record(record, "complete_time", time.time())
        else:
            update_record(record, "status", "failed")
            update_record(record, "complete_time", time.time())
            if returncode != 0:
                update_record(record, "ERROR_CODE", returncode)

        if clean_tmp_dirs:
            shutil.rmtree(tmpdir)
        # NOTE(review): True is returned even for a failed job, as before;
        # confirm with callers whether the job outcome should be returned.
        return True
    finally:
        # Chdir back to the original folder on every exit path
        os.chdir(cwd)