def retract_job(jobid): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid) print record['username'] if record['username'] == session["username"]: print "Username matches" else: return template( './views/error.tpl', error_str= "You are not the owner of this job :{0} \nInsufficient permissions to retract job" .format(jobid), session=session) record["i_ispublished"] = '0' record.save(overwrite=True) return template("./views/retract_confirm.tpl", job_id=jobid, title="Retract Confirmation", session=session)
def published_jobs(): print "Hi" session = bottle.request.environ.get('beaker.session') require_login(session) current_user = session["user_id"] conf_man.update_creds_from_metadata_server(request.app) results = request.app.config["dyno.conn"].scan(i_ispublished__eq='1') table_tpl = [] for r in results: print r jobinfourl = request.app.get_url('/jobs')+"/"+str(r["job_id"]) joburl = '' if r["jobname"] : joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"]) else: joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"])) row = [joburl , str(r["description"]).replace('\r\n', '</br>'), str(r["username"]), str(r["publishdate"])] table_tpl.append(row) table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S'), reverse=True) return template("./views/published_jobs.tpl", title="Published Jobs", table=table, session=session)
def handle_login(): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) access_token = request.params.get("access_token") expires_in = request.params.get("expires_in") aws_client_id = request.app.config["server.aws_client_id"] user_id, name, email = identity.get_identity_from_token( access_token, aws_client_id) user_info = identity.find_user_role(request.app, user_id) if not user_info: return template("./views/login_reject.tpl", title="Turing - Login Rejected!", username=name, user_id=user_id, email=email, session=session) session["logged_in"] = True session["user_id"] = user_id session["username"] = name session["email"] = user_info["email"] #email session["user_role"] = user_info["role"] print session return template("./views/login_confirm.tpl", title="Turing - Login Success!", session=session)
def update_usage_stats(app, job_id): if not job_id: return False print "Updating usage_stats" try: cmd = [ "/home/ubuntu/task_engine/system_stats.sh", "{0}".format(time.time()) ] proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) out, err = proc.communicate() except Exception as e: print "Failed to run system_stats.sh" print "Caught exception : {0}".format(e) return cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) old = record.get("usage_stats", "") current = old + out.strip('\n') st = update_record(record, "usage_stats", current) return
def list_jobs(): session = bottle.request.environ.get('beaker.session') require_login(session) current_user = session["user_id"] conf_man.update_creds_from_metadata_server(request.app) results = request.app.config["dyno.conn"].scan(i_user_id__eq=current_user) table_tpl = [] print "Jobs: " print "-"*50 for r in results: jobinfourl = request.app.get_url('/jobs')+"/"+str(r["job_id"]) joburl = '' if r["jobname"] : joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"]) else: joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"])) row = [joburl , str(r["status"]), str(r["jobtype"]), str(r["submit_stamp"])] table_tpl.append(row) table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[3], '%Y-%m-%d %H:%M:%S'), reverse=True) return template("./views/jobs.tpl", title="Task Status", table=table, session=session)
def list_jobs_rest(): print "Rest Interface for list_task" session = bottle.request.environ.get('beaker.session') response.content_type = 'application/json' if request.POST.get("access_token"): print "Attempt to auth with access_token" user_info = validate_session(request.app, request.POST.get("access_token")) if not user_info : return {"status" : "Fail", "reason" : "Failed to authenticate"} session.update(user_info) session["logged_in"] = True #print "Session : ",session else: return {"status" : "Fail", "reason" : "access_token missing"} conf_man.update_creds_from_metadata_server(request.app) results = request.app.config["dyno.conn"].scan(i_user_id__eq=session['user_id']) table_tpl = {} table_tpl['items'] = {} print "Jobs: " print "-"*50 for i,r in enumerate(results): table_tpl['items'][i] = { "job_id" : str(r["job_id"]), "status" : str(r["status"]), "jobtype" : str(r["jobtype"]), "submit_stamp" : str(r["submit_stamp"])} table_tpl['status'] = "Success" return table_tpl
def handle_login():
    """Handle an OAuth login callback: verify the token, set up the session.

    Reads "access_token" from the request params, resolves it to an identity
    via identity.get_identity_from_token, and checks the user's role with
    identity.find_user_role.  Unknown users get the rejection template;
    known users get a populated beaker session and the confirmation page.
    """
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    access_token = request.params.get("access_token")
    # NOTE(review): expires_in is read but never used below.
    expires_in = request.params.get("expires_in")
    aws_client_id = request.app.config["server.aws_client_id"]
    user_id, name, email = identity.get_identity_from_token(access_token, aws_client_id);
    user_info = identity.find_user_role(request.app, user_id)
    if not user_info :
        # No role on record -> reject the login.
        return template("./views/login_reject.tpl",
                        title="Turing - Login Rejected!",
                        username = name,
                        user_id = user_id,
                        email = email,
                        session = session)
    session["logged_in"] = True
    session["user_id"] = user_id
    session["username"] = name
    # The role record's email is stored, not the token's email claim.
    session["email"] = user_info["email"] #email
    session["user_role"] = user_info["role"]
    print session
    return template("./views/login_confirm.tpl",
                    title="Turing - Login Success!",
                    session=session)
def usage_stats():
    """Render a dashboard of in-flight jobs plus autoscale-group usage.

    Scans Dynamo for jobs in any non-terminal state and summarizes each
    autoscaling group of this CloudFormation stack as percentage figures
    relative to the group's max size.
    """
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]
    conf_man.update_creds_from_metadata_server(request.app)
    # Every state a job passes through before completing/failing.
    results = request.app.config["dyno.conn"].scan(status__in=['active', 'staging_inputs', 'processing', 'staging_output'])
    table_tpl = []
    print "Jobs: "
    print "-"*50
    for r in results:
        print r["username"]
        row = [str(r["username"]), str(r["job_id"]), str(r["status"]),
               str(r["jobtype"]), str(r["submit_stamp"]), str(r["queue"])]
        table_tpl.append(row)
    stackname = request.app.config["instance.tags"]["aws:cloudformation:stack-name"]
    # Only the scaling groups belonging to this stack.
    myautoscale = [x for x in request.app.config["scale.conn"].get_all_groups() if x.name.startswith(stackname)]
    autoscale = {}
    for grp in myautoscale:
        instances = grp.instances
        count = len(instances)
        print grp.name
        # NOTE(review): str.strip() treats "<stack>-" as a character set, not
        # a prefix, so these debug prints can be misleading.  The actual
        # classification below uses slicing and is unaffected.
        print grp.name.strip("{0}-".format(stackname))
        print grp.name.strip("{0}-".format(stackname)).startswith('Test')
        # Drop the "<stackname>-" prefix to get the bare group name.
        grp_name = grp.name[len(stackname)+1:]
        if grp_name.startswith('Test'):
            # [min%, above-min%, desired-but-not-running%, max size]
            # (integer division under Python 2).
            autoscale['test'] = [grp.min_size*100/grp.max_size,
                                 (count-grp.min_size)*100/grp.max_size,
                                 (grp.desired_capacity-count)*100/grp.max_size,
                                 grp.max_size]
        elif grp_name.startswith('Prod'):
            autoscale['prod'] = [grp.min_size*100/grp.max_size,
                                 (count-grp.min_size)*100/grp.max_size,
                                 (grp.desired_capacity-count)*100/grp.max_size,
                                 grp.max_size]
        else:
            print "Error: could not find scaling groups"
    print autoscale
    # Column 4 is the submit timestamp; sort newest first.
    table = sorted(table_tpl, key=lambda row: datetime.datetime.strptime(row[4], '%Y-%m-%d %H:%M:%S'), reverse=True)
    return template("./views/usage_stats.tpl", title="Task Status",
                    table=table, autoscale=autoscale, session=session)
def submit_job_description():
    """Submit a new task from the posted form and show its confirmation page."""
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    new_job_id = _submit_task(request, session)
    return template("./views/submit_confirm.tpl",
                    job_id=new_job_id,
                    title="Task Confirmation",
                    session=session)
def publish_job_description(): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) job_id = request.POST.get('jobid') status = update_job_for_publish(request, job_id) print job_id, status return template("./views/publish_confirm.tpl", job_id=job_id, title="Publish Confirmation", session=session)
def check_if_cancelled(app, job_id): if not job_id: return False print "Statecheck" cm.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) if record["status"] == "cancelled": print "Cancelled" return True print "Job not cancelled" return False
def check_if_cancelled(app, job_id):
    """Return True if the job's Dynamo record has status "cancelled".

    Refreshes IAM credentials before the lookup; a missing/falsy job_id
    short-circuits to False.
    """
    if not job_id :
        return False
    print "Statecheck"
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
    if record["status"] == "cancelled":
        print "Cancelled"
        return True
    print "Job not cancelled"
    return False
def job_info(job_id): session = bottle.request.environ.get('beaker.session') conf_man.update_creds_from_metadata_server(request.app) response.content_type = 'application/json' pairs = get_job_info(request, job_id) result = {} result['items'] = {} print "Pairs : ", pairs for i,p in enumerate(pairs): result['items'][i] = {p[0]:p[1]} if p[0] == "status": result['status'] = p[1] #print result return result
def watch_loop(app): cloudwatch = get_connection(app) while 1: status = conf_man.update_creds_from_metadata_server(app) if status : cloudwatch = get_connection(app) for q in app.config["sqs.conn"].get_all_queues(): q_attr = q.get_attributes() visible = q_attr['ApproximateNumberOfMessages'] inflight = q_attr['ApproximateNumberOfMessagesNotVisible'] total = visible+inflight r= cloudwatch.put_metric_data("SQS", "ApproximateNumberOfTotalMessages", value=total, unit="Count", dimensions = {"QueueName" : q.name}) logging.debug("[{0}] queue:{1} Total:{2} Visible:{3} Inflight:{4}".format(datetime.now().isoformat(), q.name, total, visible, inflight)) print r time.sleep(60)
def job_info(job_id):
    """Return a JSON document describing one job.

    Shape: {'items': {index: {key: value}}, 'status': <job status>}, where
    the key/value pairs come from get_job_info.
    """
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    response.content_type = 'application/json'
    pairs = get_job_info(request, job_id)
    result = {}
    result['items'] = {}
    print "Pairs : ", pairs
    for i, p in enumerate(pairs):
        result['items'][i] = {p[0]: p[1]}
        # Promote the status pair to the top level of the response.
        if p[0] == "status":
            result['status'] = p[1]
    #print result
    return result
def upload_to_s3(): session = bottle.request.environ.get('beaker.session') require_login(session) conf_man.update_creds_from_metadata_server(request.app) job_id = str(uuid.uuid4()) exp_time = tstamp_plus_nmins(60) bucket_name = "klab-jobs" #"klab-webofscience" # print "Uploads page" vals = { "redirect_url": "{0}/{1}".format(request.app.config["server.url"], "upload_confirm"), "aws_key_id": request.app.config["instance.tags"]["S3UploadKeyId"], "job_id": job_id, "exp_date": exp_time, "bucket_name": bucket_name } print "Uploading with key : {0}".format( request.app.config["instance.tags"]["S3UploadKeyId"]) policy, signature = get_signature_and_policy(request.app, vals) vals["policy"] = policy vals["signature"] = signature print "policy, signature : ", policy, signature return template('./views/upload.tpl', name="", email="", username="", redirect_url=vals["redirect_url"], aws_key_id=vals["aws_key_id"], exp_date=vals["exp_date"], job_id=vals["job_id"], bucket_name=vals["bucket_name"], policy=policy, signature=signature, alert=False, title="Upload data", session=session)
def job_cancel(job_id):
    """Mark a job cancelled in Dynamo and bounce back to its status page.

    Sets status/reason/cancel_time on the job record; an unknown job_id
    renders the error template.
    """
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    conf_man.update_creds_from_metadata_server(request.app)
    dyntable = request.app.config['dyno.conn']
    try:
        tstamp = str(time.strftime('%Y-%m-%d %H:%M:%S'))
        item = dyntable.get_item(job_id=job_id)
        item["status"] = "cancelled"
        item["reason"] = "User request cancel"
        item["cancel_time"] = tstamp
        dynamodb_update(dyntable, item)
    except ItemNotFound:
        # Consistency fix: render the standard error template (as the other
        # cancel handler in this file does) instead of a bare string.
        return template("./views/error.tpl",
                        session=session,
                        error_str="The requested job_id was not found in the jobs database")
    redirect('/jobs/' + job_id)
def upload_to_s3():
    """Render the direct-to-S3 upload form.

    Generates a fresh job_id, an expiry timestamp 60 minutes out, and a
    signed policy document, then feeds them to the upload template so the
    browser can POST straight to the "klab-jobs" bucket.
    """
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    conf_man.update_creds_from_metadata_server(request.app)
    job_id = str(uuid.uuid4())
    # Policy validity window: 60 minutes from now.
    exp_time = tstamp_plus_nmins(60)
    bucket_name = "klab-jobs" #"klab-webofscience"
    # print "Uploads page"
    vals = { "redirect_url" : "{0}/{1}".format(request.app.config["server.url"], "upload_confirm"),
             "aws_key_id" : request.app.config["instance.tags"]["S3UploadKeyId"],
             "job_id" : job_id,
             "exp_date" : exp_time,
             "bucket_name" : bucket_name }
    print "Uploading with key : {0}".format(request.app.config["instance.tags"]["S3UploadKeyId"])
    # Sign the policy over the collected form values.
    policy, signature = get_signature_and_policy(request.app, vals)
    vals["policy"] = policy
    vals["signature"] = signature
    print "policy, signature : ", policy, signature
    return template('./views/upload.tpl',
                    name = "",
                    email = "",
                    username = "",
                    redirect_url = vals["redirect_url"],
                    aws_key_id = vals["aws_key_id"],
                    exp_date = vals["exp_date"],
                    job_id = vals["job_id"],
                    bucket_name = vals["bucket_name"],
                    policy = policy,
                    signature = signature,
                    alert=False,
                    title="Upload data",
                    session=session)
def retract_job(jobid):
    """Unpublish a job: flips i_ispublished back to '0'.

    Only the record's owner (matched against the session username) may
    retract; everyone else gets the error template.
    """
    session = bottle.request.environ.get('beaker.session')
    conf_man.update_creds_from_metadata_server(request.app)
    record = dutils.dynamodb_get(request.app.config["dyno.conn"], jobid)
    print record['username']
    # NOTE(review): no require_login here — an anonymous session would raise
    # KeyError on session["username"]; confirm this route is gated elsewhere.
    if record['username'] == session["username"]:
        print "Username matches"
    else:
        return template('./views/error.tpl',
                        error_str="You are not the owner of this job :{0} \nInsufficient permissions to retract job".format(jobid),
                        session=session)
    record["i_ispublished"] = '0'
    record.save(overwrite=True)
    return template("./views/retract_confirm.tpl",
                    job_id=jobid,
                    title="Retract Confirmation",
                    session=session)
def job_cancel(job_id):
    """Mark a job cancelled and redirect back to its detail page.

    Stamps status="cancelled", a human-readable reason, and the cancel time
    onto the Dynamo item; unknown job_ids render the error template.
    """
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    conf_man.update_creds_from_metadata_server(request.app)
    dyntable = request.app.config['dyno.conn']
    try:
        tstamp = str(time.strftime('%Y-%m-%d %H:%M:%S'))
        item = dyntable.get_item(job_id=job_id)
        item["status"] = "cancelled"
        item["reason"] = "User request cancel"
        item["cancel_time"] = tstamp
        dynamodb_update(dyntable, item)
    except boto.dynamodb2.exceptions.ItemNotFound:
        return template("./views/error.tpl",
                        session=session,
                        error_str="The requested job_id was not found in the jobs database")
    redirect('/jobs/' + job_id)
def update_usage_stats(app, job_id):
    """Append one system_stats.sh sample to the job's "usage_stats" field.

    Best-effort: a falsy job_id returns False, and a failure to launch the
    stats script just logs and returns without touching the record.
    """
    if not job_id :
        return False
    print "Updating usage_stats"
    try:
        cmd = ["/home/ubuntu/task_engine/system_stats.sh", "{0}".format(time.time())]
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        out, err = proc.communicate()
    except Exception as e:
        # Stats collection must never break the job: swallow and bail.
        print "Failed to run system_stats.sh"
        print "Caught exception : {0}".format(e)
        return
    cm.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
    old = record.get("usage_stats", "")
    # Append the new sample (edge newlines trimmed) to the stored blob.
    current = old + out.strip('\n')
    # NOTE(review): st is never used afterwards.
    st = update_record(record, "usage_stats", current)
    return
def published_jobs():
    """List every published job (i_ispublished == '1'), newest first."""
    print "Hi"
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]
    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(i_ispublished__eq='1')
    table_tpl = []
    for r in results:
        print r
        jobinfourl = request.app.get_url('/jobs') + "/" + str(r["job_id"])
        joburl = ''
        # Prefer the human-readable job name as the link text.
        if r["jobname"]:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"])
        else:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"]))
        row = [
            joburl,
            str(r["description"]).replace('\r\n', '</br>'),
            str(r["username"]),
            str(r["publishdate"])
        ]
        table_tpl.append(row)
    # Column 3 holds the publish timestamp; sort newest first.
    table = sorted(table_tpl,
                   key=lambda row: datetime.datetime.strptime(
                       row[3], '%Y-%m-%d %H:%M:%S'),
                   reverse=True)
    return template("./views/published_jobs.tpl",
                    title="Published Jobs",
                    table=table,
                    session=session)
def list_jobs():
    """List the logged-in user's jobs, most recently submitted first."""
    session = bottle.request.environ.get('beaker.session')
    require_login(session)
    current_user = session["user_id"]
    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(i_user_id__eq=current_user)
    table_tpl = []
    print "Jobs: "
    print "-" * 50
    for r in results:
        jobinfourl = request.app.get_url('/jobs') + "/" + str(r["job_id"])
        joburl = ''
        # Prefer the human-readable job name as the link text.
        if r["jobname"]:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, r["jobname"])
        else:
            joburl = '<a href="{0}">{1}</a>'.format(jobinfourl, str(r["job_id"]))
        row = [
            joburl,
            str(r["status"]),
            str(r["jobtype"]),
            str(r["submit_stamp"])
        ]
        table_tpl.append(row)
    # Column 3 is the submit timestamp; sort newest first.
    table = sorted(table_tpl,
                   key=lambda row: datetime.datetime.strptime(
                       row[3], '%Y-%m-%d %H:%M:%S'),
                   reverse=True)
    return template("./views/jobs.tpl",
                    title="Task Status",
                    table=table,
                    session=session)
def list_jobs_rest():
    """JSON endpoint listing the authenticated user's jobs.

    Auth is via a POSTed "access_token"; on success the beaker session is
    populated and {'status': 'Success', 'items': {...}} is returned.
    """
    print "Rest Interface for list_task"
    session = bottle.request.environ.get('beaker.session')
    response.content_type = 'application/json'
    if request.POST.get("access_token"):
        print "Attempt to auth with access_token"
        user_info = validate_session(request.app,
                                     request.POST.get("access_token"))
        if not user_info:
            return {"status": "Fail", "reason": "Failed to authenticate"}
        session.update(user_info)
        session["logged_in"] = True
        #print "Session : ",session
    else:
        return {"status": "Fail", "reason": "access_token missing"}
    conf_man.update_creds_from_metadata_server(request.app)
    results = request.app.config["dyno.conn"].scan(
        i_user_id__eq=session['user_id'])
    table_tpl = {}
    table_tpl['items'] = {}
    print "Jobs: "
    print "-" * 50
    for i, r in enumerate(results):
        # One entry per job, keyed by scan order.
        table_tpl['items'][i] = {
            "job_id": str(r["job_id"]),
            "status": str(r["status"]),
            "jobtype": str(r["jobtype"]),
            "submit_stamp": str(r["submit_stamp"])
        }
    table_tpl['status'] = "Success"
    return table_tpl
def watch_loop(app): cloudwatch = get_connection(app) while 1: status = conf_man.update_creds_from_metadata_server(app) if status: cloudwatch = get_connection(app) for q in app.config["sqs.conn"].get_all_queues(): q_attr = q.get_attributes() visible = q_attr['ApproximateNumberOfMessages'] inflight = q_attr['ApproximateNumberOfMessagesNotVisible'] total = visible + inflight r = cloudwatch.put_metric_data("SQS", "ApproximateNumberOfTotalMessages", value=total, unit="Count", dimensions={"QueueName": q.name}) logging.debug( "[{0}] queue:{1} Total:{2} Visible:{3} Inflight:{4}".format( datetime.now().isoformat(), q.name, total, visible, inflight)) print r time.sleep(60)
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth): # Save current folder and chdir to a temporary folder conf_man.update_creds_from_metadata_server(app) record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) ############################################################################## # Notify job execution start time ############################################################################## update_record(record, "start_time", time.time()) ############################################################################## # Setup dirs for execution ############################################################################## cwd = os.getcwd() tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id) try: os.makedirs(tmpdir) except: print "Tmpdir {0} exists. Deleting and recreating".format(tmpdir) shutil.rmtree(tmpdir) os.makedirs(tmpdir) os.chdir(tmpdir) ############################################################################## # Download the inputs to the temp folder ############################################################################## update_record(record, "status", "staging_inputs") stagein_start = time.time() try: get_inputs(app, inputs, auth) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to download inputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error("Failed to download inputs") return False stagein_total = time.time() - stagein_start ############################################################################## # Download the inputs to the temp folder ############################################################################## # Check if job is valid update_record(record, "status", "processing") if jobtype not in apps.JOBS: logging.error("Jobtype : {0} does not exist".format(jobtype)) print "Unable to process jobtype : {0}".format(jobtype) return False print "JOBS 
: ", apps.JOBS[jobtype] status = True returncode = 0 process_start = time.time() try: returncode = apps.JOBS[jobtype](app, data) print "Returncode : {0}".format(returncode) conf_man.update_creds_from_metadata_server(app) except Exception as e: update_record(record, "status", "Failed"); update_record(record, "complete_time", time.time()) update_record(record, "ERROR", str(e)); print "Job execution failed : {0}".format(e) status = False process_total = time.time() - process_start ############################################################################## # Upload the results to the S3 ############################################################################## record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) update_record(record, "status", "staging_outputs") stageout_start = time.time() # Upload the result to S3 try: put_outputs(app, outputs) except Exception as e: print "Exception info : ".format(sys.exc_info()[0]) update_record(record, "ERROR", "Failed to upload outputs {0}".format(e)) update_record(record, "status", "failed") update_record(record, "complete_time", time.time()) logging.error( "Failed to upload inputs") return False stageout_total = time.time() - stageout_start update_record(record, "z_stagein_dur", stagein_total) update_record(record, "z_stageout_dur", stageout_total) update_record(record, "z_processing_dur", process_total - 1) if returncode != 0 : update_record(record, "status", "failed"); update_record(record, "complete_time", time.time()) update_record(record, "ERROR_CODE", returncode); status = False else: update_record(record, "status", "completed") update_record(record, "complete_time", time.time()) if clean_tmp_dirs: shutil.rmtree(tmpdir) # Chdir back to the original folder os.chdir(cwd) return True
def task_loop(app): sqs_conn = app.config["sqs.conn"] pending = app.config["instance.tags"]["JobsQueueName"] active = app.config["instance.tags"]["ActiveQueueName"] pending_q = sqs_conn.get_queue(pending) active_q = sqs_conn.get_queue(active) while 1: # Wait to read a message from the pending_q msg = pending_q.read(wait_time_seconds=20) print "Received message from pending_q" if msg: # Too many things could fail here, do a blanket # Try catch try: sreq = json.loads(msg.get_body())["Message"] if not sreq : continue app.config["current_msg_handle"] = msg data = ast.literal_eval(sreq) job_id = data.get('job_id') jobtype = data.get('jobtype') executable = data.get('executable') args = data.get('args') inputs = data.get('inputs') inputs = data.get('inputs') outputs = data.get('outputs') user_auth = {"user" : data.get('i_user_id'), "role" : data.get('i_user_role'), "token" : data.get('i_token'), "keyid" : data.get('i_keyid'), "keysecret" : data.get('i_keysecret')} # Post the job to the active queue and delete it from the pending queue attr, current_msg = sns_sqs.post_message_to_active(app, active_q, msg.get_body(), job_id) print "Posted job from pending to active queue" if not pending_q.delete_message(msg): print "Deleting message from pending queue failed" for key in data: print "{0} : {1}".format(key, data[key]) print "Starting task" status = exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, user_auth) print "Status : ", status if status == True: conf_man.send_success_mail(data, app) else: conf_man.send_failure_mail(data, app) except Exception as e: print "Job failed to complete : {0}".format(sys.exc_info()[0]) print "Trace : ", inspect.trace() else: print "{0}: Waiting for job description".format(time.time()) seppukku.die_at_hour_edge(app, dry_run=True) logging.debug("{0}: Waiting for job description".format(time.time())) conf_man.update_creds_from_metadata_server(app)
def exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, auth):
    """Execute one job end to end: stage inputs, run, stage outputs.

    Every phase transition and timing figure is written back to the job's
    Dynamo record via update_record.  Returns False on staging or jobtype
    errors; returns True once stage-out succeeds.
    """
    # Save current folder and chdir to a temporary folder
    conf_man.update_creds_from_metadata_server(app)
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)

    ##########################################################################
    # Notify job execution start time
    ##########################################################################
    update_record(record, "start_time", time.time())

    ##########################################################################
    # Setup dirs for execution
    ##########################################################################
    cwd = os.getcwd()
    tmpdir = "/tmp/task_executor_jobs/{0}".format(job_id)
    try:
        os.makedirs(tmpdir)
    except:
        # Any failure is treated as "dir already exists": recreate it fresh.
        print "Tmpdir {0} exists. Deleting and recreating".format(tmpdir)
        shutil.rmtree(tmpdir)
        os.makedirs(tmpdir)
    os.chdir(tmpdir)

    ##########################################################################
    # Download the inputs to the temp folder
    ##########################################################################
    update_record(record, "status", "staging_inputs")
    stagein_start = time.time()
    try:
        get_inputs(app, inputs, auth)
    except Exception as e:
        # NOTE(review): format string has no {0}; the exc type is dropped.
        print "Exception info : ".format(sys.exc_info()[0])
        update_record(record, "ERROR", "Failed to download inputs {0}".format(e))
        update_record(record, "status", "failed")
        update_record(record, "complete_time", time.time())
        logging.error("Failed to download inputs")
        return False
    stagein_total = time.time() - stagein_start

    # Check if job is valid
    update_record(record, "status", "processing")
    if jobtype not in apps.JOBS:
        logging.error("Jobtype : {0} does not exist".format(jobtype))
        print "Unable to process jobtype : {0}".format(jobtype)
        return False

    print "JOBS : ", apps.JOBS[jobtype]
    status = True
    returncode = 0
    process_start = time.time()
    try:
        # The registered handler for this jobtype does the real work.
        returncode = apps.JOBS[jobtype](app, data)
        print "Returncode : {0}".format(returncode)
        conf_man.update_creds_from_metadata_server(app)
    except Exception as e:
        update_record(record, "status", "Failed")
        update_record(record, "complete_time", time.time())
        update_record(record, "ERROR", str(e))
        print "Job execution failed : {0}".format(e)
        status = False
    process_total = time.time() - process_start

    ##########################################################################
    # Upload the results to the S3
    ##########################################################################
    # Re-read the record in case it changed while the job ran.
    record = dutils.dynamodb_get(app.config["dyno.conn"], job_id)
    update_record(record, "status", "staging_outputs")
    stageout_start = time.time()
    # Upload the result to S3
    try:
        put_outputs(app, outputs)
    except Exception as e:
        # NOTE(review): same missing {0} placeholder as above, and the log
        # message says "inputs" though this is the output stage.
        print "Exception info : ".format(sys.exc_info()[0])
        update_record(record, "ERROR", "Failed to upload outputs {0}".format(e))
        update_record(record, "status", "failed")
        update_record(record, "complete_time", time.time())
        logging.error("Failed to upload inputs")
        return False
    stageout_total = time.time() - stageout_start

    update_record(record, "z_stagein_dur", stagein_total)
    update_record(record, "z_stageout_dur", stageout_total)
    # NOTE(review): the -1 second adjustment is unexplained — confirm intent.
    update_record(record, "z_processing_dur", process_total - 1)

    if returncode != 0:
        update_record(record, "status", "failed")
        update_record(record, "complete_time", time.time())
        update_record(record, "ERROR_CODE", returncode)
        status = False
    else:
        update_record(record, "status", "completed")
        update_record(record, "complete_time", time.time())

    if clean_tmp_dirs:
        shutil.rmtree(tmpdir)

    # Chdir back to the original folder
    # NOTE(review): returns True even when status is False (nonzero
    # returncode or handler exception) — confirm callers expect this.
    os.chdir(cwd)
    return True
def watch_loop(app): """ Watch_loop looks at the definition of the autoscaling_groups and the active queues to determine whether : 1. An instance needs to be removed from the scaling group and terminated 2. A task has been in the active queue for long and appears to have timed out and needs to be moved to the pending queue, for re-attempt. Why would a task fail ? -> Hard error in task causes worker to fail -> Instance was lost mid run """ status = conf_man.update_creds_from_metadata_server(app) stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"] autoscale = get_autoscale_info(app, stack_name) print autoscale # Select all relevant queues in our cloudformation stack queues = [ q for q in app.config["sqs.conn"].get_all_queues() if q.name.startswith(stack_name) ] # Select only the active queues active_q = [q for q in queues if "Active" in q.name] pending_q = [q for q in queues if "Active" not in q.name] for q in active_q: print "Active queue : ", q.name qtype = None if "Test" in q.name: qtype = "test" elif "Prod" in q.name: qtype = "prod" else: logging.error("Unknown queue : ".format(q.name)) break # Find the corresponding pending queue to the current active queue p_q = None p_qs = [pq for pq in pending_q if qtype in pq.name.lower()] if len(p_qs) == 1: p_q = p_qs[0] print "Pending queue : {0}".format(p_q) else: logging.error("Found too many pending queues : {0}".format(p_qs)) exit(0) print "Instances in this group : ", autoscale[qtype]["instances"] for i in autoscale[qtype]["instances"]: print i.health_status print i.lifecycle_state while (1): """ Here we get all messages in the current queue and check the following conditions: 1. No more messages to check -> Break 2. If messages exists -> Check if it is a kill_request. 
-> Kill the instance and decrement the autoscale group desired count -> """ messages = q.get_messages(num_messages=10, visibility_timeout=2, wait_time_seconds=1, message_attributes=['All']) if not messages: break for msg in messages: # Check if message is a kill_request if msg.message_attributes["job_id"][ "string_value"] == "kill_request": logging.info("Received a kill_request from : ".format( msg.message_attributes["instance_id"]["string_value"])) # Are there more machines than the minimum if autoscale[qtype]["current"] > autoscale[qtype]["min"]: logging.info( "Instances in autoscale group current:{0} > min:{1}" .format(autoscale[qtype]["current"], autoscale[qtype]["min"])) logging.info("Kill : {0}".format( msg.message_attributes["instance_id"] ["string_value"])) kill_instance( app, msg.message_attributes["instance_id"] ["string_value"], autoscale[qtype]) q.delete_message(msg) # Message is a regular job else: # We do not have excess machines. So no kill requests need to be made. # However the message needs to be deleted logging.info("Kill request ignored: {0}".format( msg.message_attributes["instance_id"] ["string_value"])) q.delete_message(msg) # If message is not a kill_request it is an active job. # Check if job has timed-out or the machine it is running on has # accidentally terminated else: job_id = msg.message_attributes["job_id"]["string_value"] instance_id = msg.message_attributes["instance_id"][ "string_value"] try: record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) except Exception, e: logging.debug( "JOb {0} not found in dynamodb \nDeleting the message" ) q.delete_message(msg) record = None if record and record["status"] in ["completed", "failed"]: logging.debug( "Job {0} is {1} -> Deleting the active job message" .format(job_id, record["status"])) q.delete_message(msg) else: logging.debug( "Job_id: {0} Active on Instance: {1}".format( job_id, instance_id)) check_job_status(app, msg, job_id, instance_id, autoscale[qtype], q, p_q)
def watch_loop(app): """ Watch_loop looks at the definition of the autoscaling_groups and the active queues to determine whether : 1. An instance needs to be removed from the scaling group and terminated 2. A task has been in the active queue for long and appears to have timed out and needs to be moved to the pending queue, for re-attempt. Why would a task fail ? -> Hard error in task causes worker to fail -> Instance was lost mid run """ status = conf_man.update_creds_from_metadata_server(app) stack_name = app.config["instance.tags"]["aws:cloudformation:stack-name"] autoscale = get_autoscale_info(app, stack_name) print autoscale # Select all relevant queues in our cloudformation stack queues = [q for q in app.config["sqs.conn"].get_all_queues() if q.name.startswith(stack_name)] # Select only the active queues active_q = [q for q in queues if "Active" in q.name] pending_q = [q for q in queues if "Active" not in q.name] for q in active_q: print "Active queue : ", q.name qtype = None if "Test" in q.name: qtype = "test" elif "Prod" in q.name: qtype = "prod" else: logging.error("Unknown queue : ".format(q.name)) break # Find the corresponding pending queue to the current active queue p_q = None p_qs = [pq for pq in pending_q if qtype in pq.name.lower()] if len(p_qs) == 1: p_q = p_qs[0] print "Pending queue : {0}".format(p_q) else: logging.error("Found too many pending queues : {0}".format(p_qs)) exit(0) print "Instances in this group : ", autoscale[qtype]["instances"] for i in autoscale[qtype]["instances"]: print i.health_status print i.lifecycle_state while (1): """ Here we get all messages in the current queue and check the following conditions: 1. No more messages to check -> Break 2. If messages exists -> Check if it is a kill_request. 
-> Kill the instance and decrement the autoscale group desired count -> """ messages = q.get_messages(num_messages=10, visibility_timeout=2, wait_time_seconds=1, message_attributes=['All']) if not messages: break for msg in messages: # Check if message is a kill_request if msg.message_attributes["job_id"]["string_value"] == "kill_request": logging.info("Received a kill_request from : ".format(msg.message_attributes["instance_id"]["string_value"])) # Are there more machines than the minimum if autoscale[qtype]["current"] > autoscale[qtype]["min"]: logging.info("Instances in autoscale group current:{0} > min:{1}".format(autoscale[qtype]["current"], autoscale[qtype]["min"])) logging.info("Kill : {0}".format(msg.message_attributes["instance_id"]["string_value"])) kill_instance(app, msg.message_attributes["instance_id"]["string_value"], autoscale[qtype]) q.delete_message(msg) # Message is a regular job else: # We do not have excess machines. So no kill requests need to be made. # However the message needs to be deleted logging.info("Kill request ignored: {0}".format(msg.message_attributes["instance_id"]["string_value"])) q.delete_message(msg) # If message is not a kill_request it is an active job. # Check if job has timed-out or the machine it is running on has # accidentally terminated else: job_id = msg.message_attributes["job_id"]["string_value"] instance_id = msg.message_attributes["instance_id"]["string_value"] try: record = dutils.dynamodb_get(app.config["dyno.conn"], job_id) except Exception, e: logging.debug("JOb {0} not found in dynamodb \nDeleting the message") q.delete_message(msg) record = None if record and record["status"] in ["completed", "failed"]: logging.debug("Job {0} is {1} -> Deleting the active job message".format(job_id, record["status"])) q.delete_message(msg) else: logging.debug("Job_id: {0} Active on Instance: {1}".format(job_id, instance_id)) check_job_status(app, msg, job_id, instance_id, autoscale[qtype], q, p_q)
>>>>>>> 856f3c026f2bf7cae078da071f7353178bc5f27b monitoring_enabled=False, instance_profile_arn=role, dry_run=DRY_RUN) if __name__ == "__main__": mappings = load_mapping_csvs("ami_mapping.csv") app = bottle.default_app() try: app.config.load_config("production.conf") except Exception as e: logging.error("Exception {0} in load_config".format(e)) exit(-1) cm.update_creds_from_metadata_server(app) #instances = ["m4.10xlarge", "c4.8xlarge", "m4.large", "m4.xlarge", "c4.xlarge" ] # instances = ["m4.10xlarge", "c4.8xlarge"] # "m4.large", "m4.xlarge", "c4.xlarge" ] instances = ["c4.8xlarge"] # "m4.large", "m4.xlarge", "c4.xlarge" ] for instance in instances : for m in mappings: print m["region_code"] cm.init(app, m["region_code"]) print app.config["ec2.conn"] status = start_instance(app, m["ami"], instance) print "{0} {1} {2}".format(m["region_code"], instance, status) <<<<<<< HEAD break; =======
def task_loop(app): sqs_conn = app.config["sqs.conn"] pending = app.config["instance.tags"]["JobsQueueName"] active = app.config["instance.tags"]["ActiveQueueName"] pending_q = sqs_conn.get_queue(pending) active_q = sqs_conn.get_queue(active) while 1: # Wait to read a message from the pending_q msg = pending_q.read(wait_time_seconds=20) print "Received message from pending_q" if msg: # Too many things could fail here, do a blanket # Try catch try: sreq = json.loads(msg.get_body())["Message"] if not sreq: continue app.config["current_msg_handle"] = msg data = ast.literal_eval(sreq) job_id = data.get('job_id') jobtype = data.get('jobtype') executable = data.get('executable') args = data.get('args') inputs = data.get('inputs') inputs = data.get('inputs') outputs = data.get('outputs') user_auth = { "user": data.get('i_user_id'), "role": data.get('i_user_role'), "token": data.get('i_token'), "keyid": data.get('i_keyid'), "keysecret": data.get('i_keysecret') } # Post the job to the active queue and delete it from the pending queue attr, current_msg = sns_sqs.post_message_to_active( app, active_q, msg.get_body(), job_id) print "Posted job from pending to active queue" if not pending_q.delete_message(msg): print "Deleting message from pending queue failed" for key in data: print "{0} : {1}".format(key, data[key]) print "Starting task" status = exec_job(app, jobtype, job_id, executable, args, inputs, outputs, data, user_auth) print "Status : ", status if status == True: conf_man.send_success_mail(data, app) else: conf_man.send_failure_mail(data, app) except Exception as e: print "Job failed to complete : {0}".format(sys.exc_info()[0]) print "Trace : ", inspect.trace() else: print "{0}: Waiting for job description".format(time.time()) seppukku.die_at_hour_edge(app, dry_run=True) logging.debug("{0}: Waiting for job description".format( time.time())) conf_man.update_creds_from_metadata_server(app)