def next_work_unit(self, node):
    """Return the queued work unit belonging to the most urgent job.

    Every unit returned by ``self.grid.get_queued()`` is grouped under its
    job id.  Each job is then scored as ``deadline - wall_secs(wall_time)``,
    read from the first unit of the job, and the first unit of the job with
    the smallest score is returned.  When several jobs share the smallest
    score, the one encountered first is kept.

    ``node`` is accepted for interface compatibility; this implementation
    does not read it.

    Returns ``None`` when nothing is queued.
    """
    units_by_job = defaultdict(list)
    for queued in self.grid.get_queued():
        units_by_job[queued.job.job_id].append(queued)

    if not units_by_job:
        return None

    def slack(job_id):
        # Point of difference from FCFS: every job has to be inspected
        # before the earliest deadline is known.
        head = units_by_job[job_id][0]
        return head.job.deadline - walltime.wall_secs(head.job.wall_time)

    # min() keeps the first of several equal candidates, which matches a
    # strict "<" scan over the same iteration order.
    most_urgent = min(units_by_job, key=slack)
    return units_by_job[most_urgent][0]
def allocate_work_units(self):
    """Place queued work units on the free cores of every free node.

    The whole pass runs while holding ``self.grid.queue_lock``.  If nothing
    is queued, a log line is written and the method returns.  Otherwise,
    for each queue name in ``self.grid.node_queue`` and each node yielded
    by ``self.grid.get_free_node(queue)``:

    1. Queued units for which "now + wall time" is already past the job
       deadline are killed.
    2. For each free core (``node['cores'] - len(node['work_units'])``),
       ``self.next_work_unit(node, queue)`` is asked for a unit, which is
       then handed to ``self.allocate_work_unit(node, unit)``.

    If ``next_work_unit`` raises, the traceback is written to the log and
    the process is terminated with ``os._exit(1)``.  If allocation raises
    ``NodeUnavailableException``, the node is marked ``"DEAD"`` and its
    remaining cores are skipped.
    """
    with self.grid.queue_lock:
        # Check that there are jobs to schedule
        if len(self.grid.get_queued()) == 0:
            self.write_to_log("Waiting for tasks to schedule.\n")
            return

        # Write the job queue to the log
        self.write_queue_to_log()

        for queue in self.grid.node_queue.keys():
            free_nodes = False

            for node in self.grid.get_free_node(queue):
                free_nodes = True

                # Kill any work units which have no chance of finishing
                # before the deadline.
                for unit in self.grid.get_queued():
                    if (int(time.time()) + walltime.wall_secs(unit.job.wall_time)) > unit.job.deadline:
                        unit.kill_msg = "Killed by scheduler: Unable to complete work_unit by deadline."
                        unit.kill()

                # Want to allocate on all free cores on the node
                for free_core in range(0, (node['cores'] - len(node['work_units']))):
                    # Get the next work unit to allocate
                    try:
                        unit = self.next_work_unit(node, queue)
                    except Exception:
                        # A crashing allocator is treated as fatal: record
                        # the traceback, then stop the whole server.
                        self.write_to_log("Work unit allocator crashed\n")
                        self.log.write(traceback.format_exc())
                        self.log.close()
                        print("Error in Scheduler. Shutting down Server.")
                        os._exit(1)

                    # No work units to allocate for this queue, continue
                    if unit is None:
                        continue

                    # Output to log file
                    self.write_to_log(
                        "Allocating work unit " + str(unit.work_unit_id)
                        + " of job " + str(unit.job.job_id)
                        + " on node " + str(node['node_id']) + ".\n\n"
                    )

                    # If allocating the work unit has failed, mark the node
                    # dead and stop using it.  Without the break the loop
                    # would keep trying the remaining cores of a node that
                    # has just been found unreachable.
                    try:
                        self.allocate_work_unit(node, unit)
                    except NodeUnavailableException:
                        self.write_to_log("Failed to allocated job!\n")
                        self.grid.nodes[node['node_id']]['status'] = "DEAD"
                        break

            # Find a cleaner way to do this!
            if not free_nodes:
                self.write_to_log("Waiting for free nodes of type %s." % queue)
def monitor_tasks(self):
    """Reap finished tasks and kill tasks that have run out of time.

    Prints ``self.tasks`` when it is non-empty, then checks every task:

    * finished tasks are passed to ``self.finish_task`` and removed from
      ``self.tasks``;
    * tasks that have been running longer than their wall time are passed
      to ``self.kill_task`` with the reason "Exceeded Wall time.";
    * tasks past their deadline are passed to ``self.kill_task`` with the
      reason "Exceeded deadline.".

    Killed tasks are left in ``self.tasks``; only finished ones are removed
    here.
    """
    if len(self.tasks) != 0:
        print(self.tasks)

    # Iterate over a snapshot because finished entries are deleted below.
    for i, task in list(self.tasks.items()):
        # Check if a task has finished
        if task.has_finished():
            self.finish_task(task)
            del self.tasks[i]

        # Kill task if it has exceeded its wall time
        elif (int(time.time()) - task.running_ts) > walltime.wall_secs(task.wall_time):
            self.kill_task(task, "Exceeded Wall time.")
            # The work unit id fills the first placeholder and the job id
            # the second, matching the wording of the message.
            print("Work unit %s of job %s killed: Exceeded Wall Time." % (task.work_unit_id, task.job_id))

        # Kill task if it exceeds its deadline (for fairness)
        elif int(time.time()) > task.deadline:
            self.kill_task(task, "Exceeded deadline.")
            print("Work unit %s of job %s killed: Exceeded Deadline." % (task.work_unit_id, task.job_id))
def next_deadline_work_unit(self, node):
    """Choose the most urgent affordable work unit for ``node``.

    Only queued units whose job type equals ``node['type']`` are
    considered, grouped by job id.  A job is eligible when its
    ``budget_per_node_hour`` is at least ``node['cost']``.  Eligible jobs
    are ranked by ``int(deadline) - wall_secs(wall_time)``; the smallest
    value wins, and an exact tie goes to the job with the strictly higher
    budget (otherwise to the job seen first).

    Returns the first unit of the winning job, or ``None`` when no job of
    this type is queued or none can afford the node.
    """
    candidates = defaultdict(list)
    for queued in self.grid.get_queued():
        # Only want jobs of the specified type in the queue.
        if queued.job.job_type == node['type']:
            candidates[queued.job.job_id].append(queued)

    # No jobs to schedule of this type!
    if not candidates:
        return None

    # Get the node's cost from the node JSON
    price = node['cost']

    best_slack = None
    chosen = None
    for job_units in candidates.values():
        head = job_units[0]
        offer = head.job.budget_per_node_hour

        # Skip jobs whose budget does not cover this node.
        if not (offer >= price):
            continue

        slack = int(head.job.deadline) - walltime.wall_secs(head.job.wall_time)

        if best_slack is None or slack < best_slack:
            # First eligible job, or a strictly earlier one.
            best_slack = slack
            chosen = head
        elif slack == best_slack and offer > chosen.job.budget_per_node_hour:
            # Same urgency: prefer the job that pays more.
            chosen = head

    return chosen
def monitor_tasks(self):
    """Check every tracked task once and act on its state.

    ``self.tasks`` is printed when non-empty.  Then, per task:

    * ``has_finished()`` true: ``self.finish_task(task)`` is called and the
      entry is deleted from ``self.tasks``;
    * running time above ``walltime.wall_secs(task.wall_time)``:
      ``self.kill_task(task, "Exceeded Wall time.")``;
    * current time past ``task.deadline``:
      ``self.kill_task(task, "Exceeded deadline.")``.

    Entries for killed tasks are not removed by this method.
    """
    if len(self.tasks) != 0:
        print(self.tasks)

    # A list copy is walked so that entries can be deleted safely.
    for key, task in list(self.tasks.items()):
        # Check if a task has finished
        if task.has_finished():
            self.finish_task(task)
            del self.tasks[key]
            continue

        now = int(time.time())

        # Kill task if it has exceeded its wall time
        if (now - task.running_ts) > walltime.wall_secs(task.wall_time):
            self.kill_task(task, "Exceeded Wall time.")
            # Arguments follow the order of the placeholders: work unit
            # first, then job.
            print("Work unit %s of job %s killed: Exceeded Wall Time."
                  % (task.work_unit_id, task.job_id))
            continue

        # Kill task if it exceeds its deadline (for fairness)
        if now > task.deadline:
            self.kill_task(task, "Exceeded deadline.")
            print("Work unit %s of job %s killed: Exceeded Deadline."
                  % (task.work_unit_id, task.job_id))
def add_job(self, flags, wall_time, deadline, budget, job_type, name):
    """Validate a job submission and register it in ``self.jobs``.

    Args:
        flags: Stored on the job unchanged.
        wall_time: Wall time in the format accepted by
            ``walltime.strptime`` (the error text names DD:HH:MM:SS).
        deadline: Deadline as a ``"YYYY-MM-DD HH:MM:SS"`` string; it is
            converted with ``time.mktime``, i.e. read as local time.
        budget: Anything ``int()`` accepts; must not be negative.
        job_type: A key of ``self.node_queue``, or ``None`` to use
            ``"DEFAULT"`` (assumed to be a key of ``self.node_queue``).
        name: Stored on the job unchanged.

    Returns:
        The newly created ``Job``, stored under ``self.next_job_id``
        (which is then incremented).

    Raises:
        InvalidJobTypeException: Unknown job type, or wall time above the
            limit in ``self.node_queue[job_type][1]``.
        InvalidJobBudgetException: Budget is not an integer or is negative.
        InvalidWallTimeFormatException: Wall time cannot be parsed.
        InvalidJobDeadlineFormatException: Deadline is not a string in the
            expected format, or cannot be represented as a timestamp.
        InvalidJobDeadlineException: Deadline is in the past, or closer
            than one wall time from now.
    """
    # Need to check job_type is a valid queue
    if job_type is None:
        job_type = "DEFAULT"
    elif job_type not in self.node_queue.keys():
        raise InvalidJobTypeException(
            "Invalid Job Type specified: %s. Valid job types are: %s."
            % (job_type, ", ".join(self.node_queue.keys())))

    # Check for valid budget
    try:
        budget = int(budget)
    except (TypeError, ValueError):
        raise InvalidJobBudgetException(
            "Invalid Budget specified: %s. Format: amount in cents as a whole number."
            % budget)

    # NOTE(review): a budget of exactly 0 is accepted although the message
    # says "greater than 0" - confirm which of the two is intended.
    if budget < 0:
        raise InvalidJobBudgetException(
            "Invalid Budget specified: %s. Budget must be greater than 0"
            % budget)

    # Check that wall_time is valid
    try:
        wall_stripped = walltime.strptime(wall_time)
    except WallTimeFormatException:
        raise InvalidWallTimeFormatException(
            "Invalid Wall Time specified: %s. Format: DD:HH:MM:SS."
            % wall_time)

    # Check that deadline format is valid.  TypeError covers a non-string
    # deadline such as None; OverflowError covers dates mktime cannot
    # represent.  Both are reported the same way as a malformed string.
    try:
        deadline_since_epoch = time.mktime(
            time.strptime(deadline, "%Y-%m-%d %H:%M:%S"))
    except (TypeError, ValueError, OverflowError):
        raise InvalidJobDeadlineFormatException(
            "Invalid Deadline specified: %s. Format: YYYY-MM-DD HH:MM:SS"
            % deadline)

    now = int(time.time())
    wall_seconds = walltime.wall_secs(wall_stripped)

    # Check that deadline is valid
    if deadline_since_epoch <= now:
        raise InvalidJobDeadlineException(
            "Invalid Deadline specified: %s. Deadline specified is in the past."
            % deadline)

    # Check that deadline is reasonable
    if (deadline_since_epoch - wall_seconds) < now:
        raise InvalidJobDeadlineException(
            "Error: Current time plus wall time is later than the specified deadline. Please adjust either and resubmit."
        )

    # Check that wall time is within acceptable range for job queue placement
    queue_limit = self.node_queue[job_type][1]
    if queue_limit is not None and wall_seconds > walltime.wall_secs(queue_limit):
        raise InvalidJobTypeException(
            "Invalid Job Type specified: %s. Wall time %s is too large. Wall time must be shorter than %s for job type %s."
            % (job_type, walltime.strftime(wall_stripped), queue_limit, job_type))

    #
    # All tests passed, add to grid.
    #
    job = Job(job_id=self.next_job_id,
              flags=flags,
              wall_time=wall_stripped,
              deadline=deadline_since_epoch,
              budget=budget,
              job_type=job_type,
              name=name)

    self.jobs[self.next_job_id] = job
    self.next_job_id += 1

    return job
def add_job(self, flags, wall_time, deadline, budget, job_type, name):
    """Check a job request and, if it is acceptable, add it to the grid.

    ``job_type`` must be a key of ``self.node_queue``; ``None`` selects
    ``"DEFAULT"`` (assumed to be a key of ``self.node_queue``).
    ``budget`` must convert with ``int()`` and must not be negative.
    ``wall_time`` must parse with ``walltime.strptime``.  ``deadline``
    must be a ``"YYYY-MM-DD HH:MM:SS"`` string, read as local time by
    ``time.mktime``; it has to lie in the future and leave room for one
    full wall time.  When ``self.node_queue[job_type][1]`` is not ``None``
    it is the longest wall time the queue accepts.

    ``flags`` and ``name`` are handed to ``Job`` untouched.

    Returns the created ``Job``; it is stored in ``self.jobs`` under the
    id taken from ``self.next_job_id``, which is then advanced by one.

    Raises InvalidJobTypeException, InvalidJobBudgetException,
    InvalidWallTimeFormatException, InvalidJobDeadlineFormatException or
    InvalidJobDeadlineException for the corresponding failed check.
    """
    # Need to check job_type is a valid queue
    if job_type is None:
        job_type = "DEFAULT"
    elif job_type not in self.node_queue.keys():
        raise InvalidJobTypeException(
            "Invalid Job Type specified: %s. Valid job types are: %s." %
            (job_type, ", ".join(self.node_queue.keys()))
        )

    # Check for valid budget
    try:
        budget = int(budget)
    except (TypeError, ValueError):
        raise InvalidJobBudgetException("Invalid Budget specified: %s. Format: amount in cents as a whole number." % budget)

    # NOTE(review): zero passes this check even though the message reads
    # "greater than 0" - confirm the intended lower bound.
    if budget < 0:
        raise InvalidJobBudgetException("Invalid Budget specified: %s. Budget must be greater than 0" % budget)

    # Check that wall_time is valid
    try:
        wall_stripped = walltime.strptime(wall_time)
    except WallTimeFormatException:
        raise InvalidWallTimeFormatException("Invalid Wall Time specified: %s. Format: DD:HH:MM:SS." % wall_time)

    # Check that deadline format is valid.  A deadline that is not a
    # string (for example None) raises TypeError, and an out-of-range date
    # can raise OverflowError; report both as a format problem instead of
    # letting them escape to the caller.
    try:
        parsed_deadline = time.strptime(deadline, "%Y-%m-%d %H:%M:%S")
        deadline_since_epoch = time.mktime(parsed_deadline)
    except (TypeError, ValueError, OverflowError):
        raise InvalidJobDeadlineFormatException("Invalid Deadline specified: %s. Format: YYYY-MM-DD HH:MM:SS" % deadline)

    current_ts = int(time.time())
    requested_secs = walltime.wall_secs(wall_stripped)

    # Check that deadline is valid
    if deadline_since_epoch <= current_ts:
        raise InvalidJobDeadlineException("Invalid Deadline specified: %s. Deadline specified is in the past." % deadline)

    # Check that deadline is reasonable
    if (deadline_since_epoch - requested_secs) < current_ts:
        raise InvalidJobDeadlineException(
            "Error: Current time plus wall time is later than the specified deadline. Please adjust either and resubmit."
        )

    # Check that wall time is within acceptable range for job queue placement
    max_wall = self.node_queue[job_type][1]
    if max_wall is not None and requested_secs > walltime.wall_secs(max_wall):
        raise InvalidJobTypeException(
            "Invalid Job Type specified: %s. Wall time %s is too large. Wall time must be shorter than %s for job type %s."
            % (job_type, walltime.strftime(wall_stripped), max_wall, job_type)
        )

    #
    # All tests passed, add to grid.
    #
    new_id = self.next_job_id
    job = Job(
        job_id=new_id,
        flags=flags,
        wall_time=wall_stripped,
        deadline=deadline_since_epoch,
        budget=budget,
        job_type=job_type,
        name=name,
    )

    self.jobs[new_id] = job
    self.next_job_id += 1

    return job
# Print a summary of every node in the response, skipping dead ones.
# NOTE(review): the loop nesting below is reconstructed from a collapsed
# line - confirm that the blank separator belongs to each node and that
# the exit follows the loop.
for node_id in request.response:
    node = request.response[node_id]

    if node['status'] == "DEAD":
        continue

    print("Node: %s" % (node_id))
    print("Status: %s" % node['status'])
    print("CPU: %s" % (float(node['cpu']) / int(node['cores'])))
    # Divide by a float: with an integer cost, Python 2 "/" would truncate
    # and drop the fractional part (150 would print as "$ 1.00").
    print("Cost: $ %0.2f" % (node['cost'] / 100.0))
    print("Cores: %s" % node['cores'])
    print("Type: %s" % node['type'])

    free_spots = int(node['cores']) - len(node['work_units'])
    print("Free Spots: %s" % free_spots)

    if free_spots == 0:
        # Earliest moment any running unit reaches created_ts + wall time.
        unit_ends = [
            int(unit['created_ts']) + walltime.wall_secs(walltime.strptime(unit['wall_time']))
            for unit in node['work_units']
        ]
        # With no work units there is nothing to take a minimum of;
        # localtime(None) then reports the current time.
        earliest_end = min(unit_ends) if unit_ends else None
        print("Next free: %s" % time.asctime(time.localtime(earliest_end)))

    # Blank line between nodes
    print("")

# NOTE(review): exits with status 1 after a normal listing - confirm intent.
sys.exit(1)

#
# Begin Client
#

# Check the files exist before starting to avoid creating