def perf(opts, conf, args):
    """Print task throughput per instance type for the selected instances.

    A shell snippet is run on each instance returned by
    ``aws.filter_instances``; it cats the ``task_count`` and ``task_last``
    files when both exist and echoes ``0`` otherwise.  Results that parse
    as two integers are aggregated per ``instance_type`` and reported as:

    * "Tasks per hour": an overall figure plus one row per instance type,
      sorted from highest to lowest rate;
    * "Tasks per US$": one row per instance type for which at least one
      spot request price is known, sorted the same way.

    Instance types whose accumulated uptime is zero, and types without any
    known positive price, are left out of the respective table instead of
    raising ``ZeroDivisionError``.

    Args:
        opts: passed through to ``aws.filter_instances``, ``ssh_cmd_list``
            and ``run_cmd_list``.
        conf: passed through to the same helpers and to
            ``aws.get_spot_request_dict``.
        args: not used by this function.

    Returns:
        None.  All output goes to stdout.
    """

    def task_count_last(result):
        """Return ``(count, last)`` parsed from ``result[1]``, or None."""
        fields = result[1].split()
        try:
            return int(fields[0]), int(fields[1])
        except (IndexError, ValueError):
            # Fewer than two fields (e.g. the "0" fallback) or non-numeric text.
            return None

    script = ['if', '[', '-f', 'task_count', ']', '&&', '[', '-f', 'task_last', '];',
              'then', 'cat', 'task_count;', 'cat', 'task_last;', 'else', 'echo', '0;', 'fi']
    instances = aws.filter_instances(opts, conf)
    idict = dict((i.dns_name, i) for i in instances)
    sdict = aws.get_spot_request_dict(conf)

    # Aggregate per instance type.  result[0] is used as the host key into
    # idict and result[1] is the text handed to task_count_last().
    data = {}
    results = run_cmd_list(opts, conf, ssh_cmd_list(opts, conf, script, instances),
                           show_output=False, capture_stderr=False)
    for result in results:
        inst = idict.get(result[0])
        if not inst:
            continue
        sir = sdict.get(inst.spot_instance_request_id)
        price = None
        if sir:
            price = float(sir.price)
        tasks = task_count_last(result)
        if not tasks:
            continue
        task_count, task_last = tasks
        # Divided by 3600 to get hours -- assumes aws.get_uptime() returns
        # seconds; TODO confirm against the aws module.
        uptime = aws.get_uptime(task_last, inst.launch_time) / 3600.0
        stat = data.setdefault(inst.instance_type,
                               dict(n=0, uptime_sum=0.0, task_sum=0, price_sum=0.0, price_n=0))
        stat['n'] += 1
        stat['uptime_sum'] += uptime
        stat['task_sum'] += task_count
        if price is not None:
            # Count priced instances separately so the mean price is not
            # diluted by instances that have no spot request.
            stat['price_sum'] += price
            stat['price_n'] += 1

    tph = []
    tpd = []
    total_tasks = 0.0
    total_uptime = 0
    total_n = 0
    for itype, stat in data.items():
        total_tasks += stat['task_sum']
        total_uptime += stat['uptime_sum']
        total_n += stat['n']
        if not stat['uptime_sum']:
            # No measurable uptime: a rate cannot be computed for this type.
            continue
        tasks_per_hour = stat['task_sum'] / stat['uptime_sum']
        tph.append((tasks_per_hour, itype))
        if stat['price_n']:
            mprice = stat['price_sum'] / stat['price_n']
            if mprice > 0:
                tpd.append((tasks_per_hour / mprice, itype))
    tph.sort(reverse=True)
    tpd.sort(reverse=True)

    if not total_n:
        return
    if total_uptime:
        overall = total_tasks / total_uptime * total_n
    else:
        overall = 0.0
    print("Tasks per hour (%.02f)" % (overall,))
    for tasks_per_hour, itype in tph:
        print(" %s %.02f" % (itype, tasks_per_hour))
    print("Tasks per US$")
    for tasks_per_dollar, itype in tpd:
        print(" %s %.02f" % (itype, tasks_per_dollar))
def status(opts, conf):
    """Print the filtered instances and every spot instance request.

    Writes an "Active Instances" section (image id, formatted uptime and
    public DNS name per instance) when ``aws.filter_instances`` returns
    anything, then a "Spot Requests" section (id, region, type, creation
    time, price, state and status per request) when the EC2 connection
    reports any spot instance requests.

    Args:
        opts: passed through to ``aws.filter_instances``.
        conf: passed to ``aws.get_ec2_conn`` and ``aws.filter_instances``.

    Returns:
        None.  All output goes to stdout.
    """
    ec2 = aws.get_ec2_conn(conf)

    instances = aws.filter_instances(opts, conf)
    if instances:
        print("Active Instances")
        now = time.time()
        for i in instances:
            uptime = aws.get_uptime(now, i.launch_time)
            # The leading ' ' item indents the row under the section header.
            print("%s %s %s %s" % (' ', i.image_id, aws.format_uptime(uptime), i.public_dns_name))

    requests = ec2.get_all_spot_instance_requests()
    if requests:
        print("Spot Requests")
        for r in requests:
            print(" %s %s %s %s $%s %s %s" % (r.id, r.region, r.type, r.create_time, r.price, r.state, r.status))
def status(opts, conf):
    """Print running instances and active/open spot requests.

    Instances are fetched with the ``instance-state-name: running`` filter
    and listed under "Running Instances" with image id, formatted uptime,
    public DNS name and tags.  Spot requests are fetched with the
    ``state: [active, open]`` filter and listed under "Active Spot Requests"
    with id, region, type, creation time, price, state, status and tags.
    A section is omitted entirely when its query returns nothing.

    Args:
        opts: passed through to the ``aws`` query helpers.
        conf: passed through to the ``aws`` query helpers.

    Returns:
        None.  All output goes to stdout.
    """
    now = time.time()

    running = aws.filter_instances(opts, conf, {'instance-state-name': 'running'})
    if running:
        print("Running Instances")
        for inst in running:
            age = aws.get_uptime(now, inst.launch_time)
            # The leading ' ' item indents the row under the section header.
            print("%s %s %s %s %s" % (' ', inst.image_id, aws.format_uptime(age),
                                      inst.public_dns_name, inst.tags))

    spot_requests = aws.get_all_spot_instance_requests(opts, conf, {'state': ['active', 'open']})
    if spot_requests:
        print("Active Spot Requests")
        for req in spot_requests:
            row = (req.id, req.region, req.type, req.create_time,
                   req.price, req.state, req.status, req.tags)
            print(" %s %s %s %s $%s %s %s %s" % row)
def task_loop():
    """Read tasks from the SQS work queue and run them until told to stop.

    Each pass of the outer loop reads the queue once.  If a message is
    returned, its body is written to ``./brenda-go`` inside the project
    directory and started as a subprocess (the "active" task).  The loop
    then waits for the active process and for the S3-push process of the
    previously completed task, periodically extending the visibility of
    their queue messages.  Afterwards the finished active task is handed to
    ``start_s3_push_process`` and becomes the new push task.

    When neither an active nor a push task remains, ``read_done_file()``
    decides what happens next: ``"poll"`` sleeps 15 seconds and retries,
    ``"smart"`` keeps polling until the computed minutes-after-the-hour
    value reaches ``smart_shutdown_threshold``, and any other value ends
    the loop.

    Takes no arguments; ``conf``, ``opts``, ``args``, ``local``,
    ``work_dir``, ``task_names``, ``visibility_timeout``,
    ``visibility_timeout_reassert``, ``smart_shutdown_threshold`` and the
    helper functions it calls are not defined in this block and must come
    from an enclosing scope.

    Raises:
        error.ValueErrorRetry: the active task's process exited non-zero.
        ValueError: the push task's process exited non-zero.

    ``cleanup_all()`` is always called on the way out.
    """
    try:
        # reset tasks
        local.task_active = None
        local.task_push = None

        # get SQS work queue
        q = aws.get_sqs_queue(conf)

        # Loop over tasks. There are up to two different tasks at any
        # given moment that we are processing concurrently:
        #
        # 1. Active task -- usually a blender render operation.
        # 2. S3 push task -- a task which pushes the products of the
        #                    previous active task (such as rendered
        #                    frames) to S3.
        while True:
            # reset active task
            local.task_active = None

            # initialize active task object
            task = State()
            task.msg = None
            task.proc = None
            task.retcode = None
            task.outdir = None
            task.id = 0

            # Get a task from the SQS work queue. This is normally
            # a short script that runs blender to render one
            # or more frames.
            queuemsg = q.get_messages(message_attributes=['config'])

            # output some debug info
            # NOTE(review): task.msg is still None at this point (it is only
            # assigned from queuemsg below), so this always prints None.
            print "queue read:", task.msg
            if local.task_push:
                print "push task:", local.task_push.__dict__
            else:
                print "no task push task"

            # process task
            if len(queuemsg) > 0:
                # only the first returned message is used
                task.msg = queuemsg[0]

                # assign an ID to task
                local.task_id_counter += 1
                task.id = local.task_id_counter

                # register active task
                local.task_active = task

                # create output directory
                task.outdir = os.path.join(work_dir, "brenda-outdir%d.tmp" % (task.id,))
                utils.rmtree(task.outdir)
                utils.mkdir(task.outdir)

                # Create a config dictionary using combination of global and task-specific config values
                taskconfig = conf.copy()
                if 'config' in task.msg.message_attributes:
                    taskconfig.update(json.loads(task.msg.message_attributes['config']['string_value']))

                # Store outdir in task config for later use
                taskconfig['OUTDIR'] = task.outdir

                if not 'BLENDER_FILE' in taskconfig:
                    taskconfig['BLENDER_FILE'] = '*.blend'

                print "task-specific config:", taskconfig

                # get the task script
                script = task.msg.get_body()
                print "script len:", len(script)

                # do macro substitution on the task script
                # NOTE(review): replace() needs string values; entries coming
                # from the JSON 'config' attribute may not be strings -- verify.
                # Keys are substituted in dict iteration order, so a key that
                # is a prefix of another key can clobber the longer macro.
                for k in taskconfig:
                    script = script.replace('$' + k, taskconfig[k])

                # add shebang if absent
                if not script.startswith("#!"):
                    script = "#!/bin/bash\n" + script

                # Make sure we're working with the correct project directory
                # FIXME - this is likely not the most efficient way of doing it, and probably leads to unnecessary
                # downloads from s3. Ideally we would keep all project directories and switch between them,
                # but currently brenda only supports one working project directory at a time
                proj_dir = get_project(taskconfig, taskconfig['BLENDER_PROJECT'])

                # mount additional EBS volumes
                aws.mount_additional_ebs(taskconfig, proj_dir)

                # cd to project directory, where we will run blender from
                with utils.Cd(proj_dir) as cd:
                    # write script file and make it executable
                    script_fn = "./brenda-go"
                    with open(script_fn, 'w') as f:
                        f.write(script)
                    st = os.stat(script_fn)
                    os.chmod(script_fn, st.st_mode | (stat.S_IEXEC|stat.S_IXGRP|stat.S_IXOTH))

                    # run the script
                    print "------- Run script %s -------" % (os.path.realpath(script_fn),)
                    print script,
                    print "--------------------------"
                    task.proc = Subprocess([script_fn])

                print "active task:", local.task_active.__dict__

            # Wait for active and S3-push tasks to complete,
            # while periodically reasserting with SQS to
            # acknowledge that tasks are still pending.
            # (If we don't reassert with SQS frequently enough,
            # it will assume we died, and put our tasks back
            # in the queue. "frequently enough" means within
            # visibility_timeout.)
            count = 0
            while True:
                reassert = (count >= visibility_timeout_reassert)
                # Note: this loop rebinds `task`; the object created at the
                # top of the outer loop stays reachable via local.task_active.
                # task_names[i] is compared with 'active' and 'push' below --
                # presumably task_names == ('active', 'push'); verify.
                for i, task in enumerate((local.task_active, local.task_push)):
                    if task:
                        name = task_names[i]
                        if task.proc is not None:
                            # test if process has finished
                            task.retcode = task.proc.poll()
                            if task.retcode is not None:
                                # process has finished
                                task.proc = None

                                # did process finish with errors?
                                if task.retcode != 0:
                                    errtxt = "fatal error in %s task" % (name,)
                                    if name == 'active':
                                        raise error.ValueErrorRetry(errtxt)
                                    else:
                                        raise ValueError(errtxt)

                                # Process finished successfully. If S3-push process,
                                # tell SQS that the task completed successfully.
                                if name == 'push':
                                    print "******* TASK", task.id, "COMMITTED to S3"
                                    q.delete_message(task.msg)
                                    task.msg = None
                                    local.task_count += 1
                                    task_complete_accounting(local.task_count)

                                # active task completed?
                                if name == 'active':
                                    print "******* TASK", task.id, "READY-FOR-PUSH"

                        # tell SQS that we are still working on the task
                        if reassert and task.proc is not None:
                            print "******* REASSERT", name, task.id
                            task.msg.change_visibility(visibility_timeout)

                # break out of loop only when no pending tasks remain
                if ((not local.task_active or local.task_active.proc is None)
                        and (not local.task_push or local.task_push.proc is None)):
                    break

                # setup for next process poll iteration
                if reassert:
                    count = 0
                time.sleep(1)
                count += 1

            # clean up the S3-push task
            cleanup(local.task_push, 'push')
            local.task_push = None

            # start a concurrent push task to commit files generated by
            # just-completed active task (such as blender render frames) to S3
            # (local.task_active is only set when a message was read in this
            # iteration, so taskconfig is bound whenever this branch runs)
            if local.task_active:
                local.task_active.proc = start_s3_push_process(opts, args, taskconfig, local.task_active.outdir)
                local.task_push = local.task_active
                local.task_active = None

            # if no active task and no S3-push task, we are done (unless DONE is set to "poll")
            if not local.task_active and not local.task_push:
                action = read_done_file()
                if action == "poll":
                    print "Polling for more work..."
                    time.sleep(15)
                elif action == "smart":
                    now = time.time()
                    try:
                        instance_id = aws.get_instance_id_self()
                        spot_request_id = aws.get_spot_request_from_instance_id(conf, instance_id)
                        launch_time = aws.get_launch_time(conf, spot_request_id)
                        if launch_time:
                            spottime = aws.get_uptime(now, launch_time)
                            # assumes spottime is in seconds -- TODO confirm
                            minutes_after_hour = (spottime / 60) % 60
                            print "Smart poll: ", minutes_after_hour
                            if minutes_after_hour >= smart_shutdown_threshold:
                                print "Smart poll threshold passed, shutting down (%d minutes after the hour with no work in queue)" % (minutes_after_hour)
                                # update the value of DONE config var for clean shutdown
                                conf['DONE'] = 'shutdown'
                                write_done_file()
                                break;
                        else:
                            print "Smart poll: no launch_time for spot request %s" % (spot_request_id)
                    except Exception, e:
                        # best effort: a failed lookup must not stop polling
                        print "Smart poll failed!", e
                    time.sleep(15)
                else:
                    break

    finally:
        cleanup_all()
def instances(opts, conf):
    """Print one line per filtered instance.

    Each line holds the instance state, image id, formatted uptime, public
    DNS name and tags, separated by single spaces.  Uptime is measured
    against a single timestamp taken before the listing starts.

    Args:
        opts: passed through to ``aws.filter_instances``.
        conf: passed through to ``aws.filter_instances``.

    Returns:
        None.  All output goes to stdout.
    """
    now = time.time()
    for inst in aws.filter_instances(opts, conf):
        age = aws.get_uptime(now, inst.launch_time)
        fields = (
            inst.state,
            inst.image_id,
            aws.format_uptime(age),
            inst.public_dns_name,
            inst.tags,
        )
        print("%s %s %s %s %s" % fields)
def instances(opts, conf):
    """Print image id, formatted uptime and public DNS name per instance.

    One space-separated line is written for every instance returned by
    ``aws.filter_instances``.  Uptime is measured against a single
    timestamp taken before the listing starts.

    Args:
        opts: passed through to ``aws.filter_instances``.
        conf: passed through to ``aws.filter_instances``.

    Returns:
        None.  All output goes to stdout.
    """
    now = time.time()
    for inst in aws.filter_instances(opts, conf):
        age = aws.get_uptime(now, inst.launch_time)
        fields = (inst.image_id, aws.format_uptime(age), inst.public_dns_name)
        print("%s %s %s" % fields)
def perf(opts, conf, args):
    """Print task throughput per instance type for the selected instances.

    A shell snippet is run on each instance returned by
    ``aws.filter_instances``; it cats the ``task_count`` and ``task_last``
    files when both exist and echoes ``0`` otherwise.  Results that parse
    as two integers are aggregated per ``instance_type`` and reported as:

    * "Tasks per hour": an overall figure plus one row per instance type,
      sorted from highest to lowest rate;
    * "Tasks per US$": one row per instance type for which at least one
      spot request price is known, sorted the same way.

    Instance types whose accumulated uptime is zero, and types without any
    known positive price, are left out of the respective table instead of
    raising ``ZeroDivisionError``.

    Args:
        opts: passed through to ``aws.filter_instances``, ``ssh_cmd_list``
            and ``run_cmd_list``.
        conf: passed through to the same helpers and to
            ``aws.get_spot_request_dict``.
        args: not used by this function.

    Returns:
        None.  All output goes to stdout.
    """

    def task_count_last(result):
        """Return ``(count, last)`` parsed from ``result[1]``, or None."""
        fields = result[1].split()
        try:
            return int(fields[0]), int(fields[1])
        except (IndexError, ValueError):
            # Fewer than two fields (e.g. the "0" fallback) or non-numeric text.
            return None

    script = [
        "if", "[", "-f", "task_count", "]", "&&", "[", "-f", "task_last", "];",
        "then", "cat", "task_count;", "cat", "task_last;",
        "else", "echo", "0;",
        "fi",
    ]
    instances = aws.filter_instances(opts, conf)
    idict = dict((i.dns_name, i) for i in instances)
    sdict = aws.get_spot_request_dict(conf)

    # Aggregate per instance type.  result[0] is used as the host key into
    # idict and result[1] is the text handed to task_count_last().
    data = {}
    results = run_cmd_list(
        opts, conf, ssh_cmd_list(opts, conf, script, instances), show_output=False, capture_stderr=False
    )
    for result in results:
        inst = idict.get(result[0])
        if not inst:
            continue
        sir = sdict.get(inst.spot_instance_request_id)
        price = None
        if sir:
            price = float(sir.price)
        tasks = task_count_last(result)
        if not tasks:
            continue
        task_count, task_last = tasks
        # Divided by 3600 to get hours -- assumes aws.get_uptime() returns
        # seconds; TODO confirm against the aws module.
        uptime = aws.get_uptime(task_last, inst.launch_time) / 3600.0
        stat = data.setdefault(
            inst.instance_type, dict(n=0, uptime_sum=0.0, task_sum=0, price_sum=0.0, price_n=0)
        )
        stat["n"] += 1
        stat["uptime_sum"] += uptime
        stat["task_sum"] += task_count
        if price is not None:
            # Count priced instances separately so the mean price is not
            # diluted by instances that have no spot request.
            stat["price_sum"] += price
            stat["price_n"] += 1

    tph = []
    tpd = []
    total_tasks = 0.0
    total_uptime = 0
    total_n = 0
    for itype, stat in data.items():
        total_tasks += stat["task_sum"]
        total_uptime += stat["uptime_sum"]
        total_n += stat["n"]
        if not stat["uptime_sum"]:
            # No measurable uptime: a rate cannot be computed for this type.
            continue
        tasks_per_hour = stat["task_sum"] / stat["uptime_sum"]
        tph.append((tasks_per_hour, itype))
        if stat["price_n"]:
            mprice = stat["price_sum"] / stat["price_n"]
            if mprice > 0:
                tpd.append((tasks_per_hour / mprice, itype))
    tph.sort(reverse=True)
    tpd.sort(reverse=True)

    if not total_n:
        return
    if total_uptime:
        overall = total_tasks / total_uptime * total_n
    else:
        overall = 0.0
    print("Tasks per hour (%.02f)" % (overall,))
    for tasks_per_hour, itype in tph:
        print(" %s %.02f" % (itype, tasks_per_hour))
    print("Tasks per US$")
    for tasks_per_dollar, itype in tpd:
        print(" %s %.02f" % (itype, tasks_per_dollar))