def finalize_tasks(self, poll_interval=5.):
    self.user_input = InputThread()
    self.user_input.start()
    while self.keep_running:
        self.active_tasks_semaphore.acquire()
        new_active_tasks = []
        for mytask in self.active_tasks:
            if self.poll_task_status(mytask) is None:
                # This should probably be handled in a better way
                new_active_tasks.append(mytask)
            else:
                mytask.finalize()
        self.active_tasks = new_active_tasks
        self.active_tasks_semaphore.release()
        # Jon: I think we can get rid of the watchdog as I'm already
        # throwing this at the db
        time.sleep(poll_interval)
        if self.watchdog_count == 30:
            logger.debug('TaskServer is alive')
            for mytask in self.active_tasks:
                try:
                    child_proc = mytask.process.children()[0]
                    if psutil.pid_exists(child_proc.pid):
                        cpu = child_proc.cpu_percent(interval=1.0)
                        mem_pct = child_proc.memory_percent()
                        # memory_percent() returns a percentage, so divide by 100
                        # before scaling by total RAM; 1048576 == 2**20 converts
                        # bytes to MB
                        mem_mb = (mem_pct / 100.) * psutil.virtual_memory().total / 1048576
                        # save in task object
                        mytask.max_mem = max(mytask.max_mem, mem_mb)
                        mytask.cpu_load_avg += cpu / 100.
                        mytask.n_cpu_load_polls += 1
                        # echo out to screen
                        logger.debug(
                            'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%,'
                            ' mem={mem:.1f}%, Naffinity={aff}'.format(
                                obsnum=mytask.obs, task=mytask.task,
                                pid=child_proc.pid, cpu=cpu, mem=mem_pct,
                                aff=len(child_proc.cpu_affinity())))
                except:
                    pass
            self.watchdog_count = 0
        else:
            self.watchdog_count += 1
        self.keyboard_input = self.user_input.get_user_input()
        if self.keyboard_input is not None:
            handle_keyboard_input(self, self.keyboard_input)
    return
def start(self, dbi, ActionClass=None, action_args=()):
    ###
    # Start scheduler, loop forever and handle checking for new obsids, new/removed taskmanagers etc..
    # This loop can be terminated by q + Enter, and paused by p + Enter
    ###
    self.user_input = InputThread()
    self.user_input.start()
    self.initial_startup = True  # The scheduler is just starting; on the first run any new obs need to be assigned to the proper taskmanagers
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    last_checked_for_stills = time.time()

    while self.keep_running:
        if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            last_checked_for_stills = time.time()
            logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

        self.ext_command_hook()
        self.get_new_active_obs()
        self.update_action_queue(ActionClass, action_args)

        launched_actions_copy = copy.copy(self.launched_actions)
        # Launch actions that can be scheduled
        for tm in launched_actions_copy:
            tm_info = self.dbi.get_still_info(tm)
            if self.check_taskmanager(tm) is False:  # Check if the TaskManager is still available, if not it will pop it out
                continue
            while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on
                if action_from_queue is not False:
                    if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                        break
                else:
                    break

        self.clean_completed_actions(self.dbi)

        keyboard_input = self.user_input.get_user_input()
        if keyboard_input is not None and keyboard_input != '':
            handle_keyboard_input(self, keyboard_input)
        else:
            time.sleep(self.sleep_time)
        self.initial_startup = False  # We've run once now, all obs were assigned via roundrobin if they were not previously
    self.shutdown()
def start(self, dbi, ActionClass=None, action_args=()):
    ###
    # Start scheduler, loop forever and handle checking for new obsids, new/removed taskmanagers etc..
    # This loop can be terminated by q + Enter, and paused by p + Enter
    ###
    self.user_input = InputThread()
    self.user_input.start()
    self.initial_startup = True  # The scheduler is just starting; on the first run any new obs need to be assigned to the proper taskmanagers
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    last_checked_for_stills = time.time()

    while self.keep_running:
        num_of_open_obs = len(self.dbi.list_open_observations())
        if num_of_open_obs == 0 and self.sg.aws_enabled == "1":
            # shutdown aws nodes... (log before shutdown(), since shutdown() exits the process)
            logger.debug("Shutting down AWS nodes as we are out of data to process...")
            for still in self.launched_actions:
                self.post_to_server(still, "HALT_NOW")
            self.shutdown()

        if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            last_checked_for_stills = time.time()
            logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

        self.ext_command_hook()
        self.get_new_active_obs()
        if self.check_if_stills_reporting_full() is not True:
            self.update_action_queue(ActionClass, action_args)

        launched_actions_copy = copy.copy(self.launched_actions)
        # Launch actions that can be scheduled
        for tm in launched_actions_copy:
            tm_info = self.dbi.get_still_info(tm)
            if self.check_taskmanager(tm) is False:  # Check if the TaskManager is still available, if not it will pop it out
                continue
            while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on
                if action_from_queue is not False:
                    if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                        break
                else:
                    break

        self.clean_completed_actions(self.dbi)

        keyboard_input = self.user_input.get_user_input()
        if keyboard_input is not None and keyboard_input != '':
            handle_keyboard_input(self, keyboard_input)
        else:
            time.sleep(self.sleep_time)
        self.initial_startup = False  # We've run once now, all obs were assigned via roundrobin if they were not previously
    self.shutdown()
def finalize_tasks(self, poll_interval=5.):
    self.user_input = InputThread()
    self.user_input.start()
    while self.keep_running:
        self.active_tasks_semaphore.acquire()
        new_active_tasks = []
        for mytask in self.active_tasks:
            if self.poll_task_status(mytask) is None:
                new_active_tasks.append(mytask)  # This should probably be handled in a better way
            else:
                mytask.finalize()
        self.active_tasks = new_active_tasks
        self.active_tasks_semaphore.release()
        # Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
        time.sleep(poll_interval)
        if self.watchdog_count == 30:
            logger.debug('TaskServer is alive')
            for mytask in self.active_tasks:
                try:
                    child_proc = mytask.process.children()[0]
                    if psutil.pid_exists(child_proc.pid):
                        logger.debug('Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'.format(
                            obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid,
                            cpu=child_proc.cpu_percent(interval=1.0),
                            mem=child_proc.memory_percent(),
                            aff=len(child_proc.cpu_affinity())))
                except:
                    pass
            self.watchdog_count = 0
        else:
            self.watchdog_count += 1
        self.keyboard_input = self.user_input.get_user_input()
        if self.keyboard_input is not None:
            handle_keyboard_input(self, self.keyboard_input)
    return
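
# The watchdog block above samples CPU and memory for the first child of each
# task's process with psutil. Below is a minimal standalone sketch of that
# sampling pattern; the `sample_child` helper and its return convention are
# illustrative only and are not part of this codebase.
import psutil

def sample_child(proc):
    """Return (cpu_pct, mem_pct, mem_mb) for proc's first child, or None."""
    children = proc.children()
    if not children:
        return None
    child = children[0]
    if not psutil.pid_exists(child.pid):
        return None
    cpu_pct = child.cpu_percent(interval=1.0)   # CPU % sampled over a 1 s window
    mem_pct = child.memory_percent()            # % of total system RAM
    mem_mb = (mem_pct / 100.) * psutil.virtual_memory().total / 2 ** 20
    return cpu_pct, mem_pct, mem_mb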
class Scheduler(ThreadingMixIn, HTTPServer): ### # A Scheduler reads a DataBaseInterface to determine what Actions can be # taken, and then schedules them on stills according to priority.''' ### def __init__(self, task_clients, workflow, sg): global logger logger = sg.logger self.myhostname = socket.gethostname() HTTPServer.__init__(self, (self.myhostname, 8080), MonitorHandler) # Class us into HTTPServer so we can make calls from TaskHandler into this class via self.server. self.sg = sg # Might as well have it around in case I find I need something from it... Its just a little memory self.nstills = len(sg.hosts) # preauto self.actions_per_still = sg.actions_per_still self.transfers_per_still = sg.transfers_per_still # Jon : This isn't used... self.block_size = sg.block_size # preauto self.timeout = sg.timeout self.sleep_time = sg.sleep_time self.lock_all_neighbors_to_same_still = workflow.lock_all_neighbors_to_same_still self.active_obs = [] self.active_obs_dict = {} self.action_queue = [] self.dbi = sg.dbi self.launched_actions = {} self.keep_running = False self.failcount = {} self.wf = workflow # Jon: Moved the workflow class to instantiated on object creation, should do the same for dbi probably self.task_clients = {} self.stills = [] signal.signal(signal.SIGINT, self.signal_handler) # Enabled clean shutdown after Cntrl-C event. logger.info("Starting monitoring interface.") threading.Thread(target=self.serve_forever).start() # Launch a thread of a multithreaded http server to view information on currently running tasks # If task_clients is set to AUTO then check the db for still servers if task_clients[0].host_port[0] == "AUTO": self.find_all_taskmanagers() self.auto = 1 else: self.auto = 0 self.task_clients = task_clients def signal_handler(self, signum, frame): logger.info("Caught Ctrl-C, Initiating clean shutdown.") self.shutdown() def find_all_taskmanagers(self): ### # find_all_taskmanagers : Check the database for all available stills with status OK # Should also remove stills that have gone offline. ### logger.debug("looking for TaskManagers...") self.stills = self.dbi.get_available_stills() while len(self.stills) < 1: logger.debug("Can't find any TaskManagers! Waiting for 10sec and trying again") time.sleep(10) self.stills = self.dbi.get_available_stills() for still in self.stills: if still.hostname not in self.task_clients: logger.debug("Discovery of new TaskManager : %s" % still.hostname) self.task_clients[still.hostname] = TaskClient(self.dbi, still.hostname, self.wf, still.port, self.sg) self.launched_actions[still.hostname] = [] return def ext_command_hook(self): return def check_taskmanager(self, tm): tm_info = self.dbi.get_still_info(tm) since = datetime.datetime.now() - datetime.timedelta(minutes=3) if tm_info.status != "OK" or tm_info.last_checkin < since: # Status not OK or hasn't checked-in in over 3min. logger.info("Removing offline TaskManager : %s" % tm_info.hostname) self.launched_actions.pop(tm_info.hostname, None) self.task_clients.pop(tm_info.hostname, None) for obs in self.dbi.get_obs_assigned_to_still(tm_info.hostname): if obs.obsnum in self.active_obs_dict: self.active_obs_dict.pop(obs.obsnum) self.active_obs.remove(obs.obsnum) return False elif tm_info.cur_num_of_tasks >= tm_info.max_num_of_tasks: # Check to ensure we are not at max # of tasks for the taskmanager return False return True def start(self, dbi, ActionClass=None, action_args=()): ### # Start scheduler, loop forever and handle checking for new obsid's, new/removed taskmanagers etc.. 
# This loop can be terminated by q + Enter, and paused by p + enter ### self.user_input = InputThread() self.user_input.start() self.initial_startup = True # The scheduler is just starting, for the first run if we have new obs we need this to assign to proper taskmanagers self.tm_cycle = cycle(self.stills) self.keep_running = True logger.info('Starting Scheduler') self.dbi = dbi last_checked_for_stills = time.time() while self.keep_running: if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK: self.find_all_taskmanagers() last_checked_for_stills = time.time() logger.debug("Number of TaskManagers : %s" % len(self.task_clients)) self.ext_command_hook() self.get_new_active_obs() self.update_action_queue(ActionClass, action_args) launched_actions_copy = copy.copy(self.launched_actions) # Launch actions that can be scheduled for tm in launched_actions_copy: tm_info = self.dbi.get_still_info(tm) if self.check_taskmanager(tm) is False: # Check if the TaskManager is still available, if not it will pop it out continue while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks: # I think this will work # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False))) action_from_queue = self.pop_action_queue(tm, tx=False) # FIXME: MIght still be having a small issue when a TM goes offline and back on if action_from_queue is not False: if self.launch_action(action_from_queue) != "OK": # If we had a connection error stop trying until TM checks back in break else: break self.clean_completed_actions(self.dbi) keyboard_input = self.user_input.get_user_input() if keyboard_input is not None and keyboard_input != '': handle_keyboard_input(self, keyboard_input) else: time.sleep(self.sleep_time) self.initial_startup = False # We've run once now, all obs were assigned via roundrobin if they were not previously self.shutdown() def shutdown(self): logger.info("Shutting down...") self.keep_running = False HTTPServer.shutdown(self) sys.exit(0) def get_all_neighbors(self, obsnum): ### # get_all_neighbors: Go down (and up) the rabbit hole and find ALL the neighbors of a particular obsid ### neighbor_obs_nums = [] neighbor_obs_nums.append(obsnum) # Go ahead and add the current obsid to the list low_obs, high_obs = self.dbi.get_neighbors(obsnum) while high_obs is not None: # Traverse the list UP to find all neighbors above this one neighbor_obs_nums.append(high_obs) high_obs = self.dbi.get_neighbors(high_obs)[1] while low_obs is not None: # Traverse the list DOWN to find all neighbors above this one neighbor_obs_nums.append(low_obs) low_obs = self.dbi.get_neighbors(low_obs)[0] return neighbor_obs_nums def pop_action_queue(self, still, tx=False): '''Return highest priority action for the given still.''' # Seems like we're going through all the actions to find the ones for the particular still.. 
# Should think about optimizing at some point for i in xrange(len(self.action_queue)): a = self.action_queue[i] if a.still == still and a.is_transfer == tx: return self.action_queue.pop(i) return False def get_launched_actions(self, still, tx=False): return [a for a in self.launched_actions[still] if a.is_transfer == tx] def launch_action(self, a): '''Launch the specified Action and record its launch for tracking later.''' self.launched_actions[a.still].append(a) connection_status = a.launch() return connection_status def kill_action(self, a): logger.info('Scheduler.kill_action: called on (%s,%s)' % (a.task, a.obs)) connection_status = a.run_remote_task(action_type="KILL_TASK") return connection_status def clean_completed_actions(self, dbi): '''Check launched actions for completion, timeout or fail''' for still in self.launched_actions: updated_actions = [] for action in self.launched_actions[still]: obs = dbi.get_obs(action.obs) status = obs.status pid = dbi.get_obs_pid(action.obs) try: self.failcount[str(action.obs) + status] except(KeyError): self.failcount[str(action.obs) + status] = 0 if status == action.task: logger.info('Task %s for obs %s on still %s completed successfully.' % (action.task, action.obs, still)) elif action.timed_out(): logger.info('Task %s for obs %s on still %s TIMED OUT.' % (action.task, action.obs, still)) if self.kill_action(action) != "OK": break self.failcount[str(action.obs) + status] += 1 # XXX make db entry for documentation elif pid == -9: self.failcount[str(action.obs) + status] += 1 logger.info('Task %s for obs %s on still %s HAS DIED. failcount=%d' % (action.task, action.obs, still, self.failcount[str(action.obs) + status])) else: # still active updated_actions.append(action) self.launched_actions[still] = updated_actions def already_launched(self, action): '''Determine if this action has already been launched. Enforces fact that only one valid action can be taken for a given obs at any one time.''' for a in self.launched_actions[action.still]: if a.obs == action.obs: return True return False def get_new_active_obs(self): '''Check for any new obs that may have appeared. 
Actions for these obs may potentially take priority over ones currently active.''' observations = [] observations += self.dbi.list_open_observations_on_tm() for tm_name in self.launched_actions: observations += self.dbi.list_open_observations_on_tm(tm_hostname=tm_name) for open_obs in observations: if open_obs not in self.active_obs_dict: self.active_obs_dict[open_obs] = len(self.active_obs) self.active_obs.append(open_obs) return def update_action_queue(self, ActionClass=None, action_args=()): '''Based on the current list of active obs (which you might want to update first), generate a prioritized list of actions that can be taken.''' actions = [] for myobs in self.active_obs: myobs_info = self.dbi.get_obs(myobs) if myobs_info.current_stage_in_progress == "FAILED" or myobs_info.current_stage_in_progress == "KILLED" or myobs_info.status == "COMPLETE" or (myobs_info.stillhost not in self.task_clients and myobs_info.stillhost): self.active_obs_dict.pop(myobs_info.obsnum) self.active_obs.remove(myobs_info.obsnum) logger.debug("update_action_queue: Removing obsid : %s, task : %s, Status: %s, TM: %s" % (myobs_info.obsnum, myobs_info.current_stage_in_progress, myobs_info.status, myobs_info.stillhost)) else: myaction = self.get_action(myobs, ActionClass=ActionClass, action_args=action_args) if (myaction is not None) and (self.already_launched(myaction) is not True): if self.wf.prioritize_obs == 1: myaction.set_priority(self.determine_priority(myaction)) actions.append(myaction) actions.sort(action_cmp, reverse=True) # place most important actions first self.action_queue = actions # completely throw out previous action list return def get_action(self, obsnum, ActionClass=None, action_args=()): '''Find the next actionable step for obs f (one for which all prerequisites have been met. Return None if no action is available. This function is allowed to return actions that have already been launched. ActionClass: a subclass of Action, for customizing actions. None defaults to the standard Action''' obsinfo = self.dbi.get_obs(obsnum) status = obsinfo.status if obsinfo.current_stage_in_progress == "FAILED" or obsinfo.current_stage_in_progress == "KILLED": return None if status == 'COMPLETE': # logger.debug("COMPLETE for obsid : %s" % obsnum) # You can see how often completeds are checked by uncommenting this.. its a LOT return None # obs is complete if status == '' or status == 'NEW': # Not yet ready for processing. return None # Check that the still assigned to the obs is currently in the list of active stills # !!!!FIXME!!! - Maybe fixed? if obsinfo.stillhost is not None: if any(still for still in self.stills if still.hostname == obsinfo.stillhost): pass else: return None if self.wf.neighbors == 1: # FIX ME, I don't want to call the same thing twice.. 
its ugly neighbors = self.dbi.get_neighbors(obsnum) if None in neighbors: cur_step_index = self.wf.workflow_actions_endfile.index(status) next_step = self.wf.workflow_actions_endfile[cur_step_index + 1] else: # this is a normal file cur_step_index = self.wf.workflow_actions.index(status) next_step = self.wf.workflow_actions[cur_step_index + 1] neighbor_status = [self.dbi.get_obs_status(n) for n in neighbors if n is not None] else: cur_step_index = self.wf.workflow_actions.index(status) next_step = self.wf.workflow_actions[cur_step_index + 1] neighbor_status = 0 still = self.dbi.get_obs_still_host(obsnum) if not still: if self.initial_startup is True: still = self.tm_cycle.next().hostname # Balance out all the nodes on startup else: still = self.obs_to_still(obsnum) # Get a still for a new obsid if one doesn't already exist. if still is False: return None self.dbi.set_obs_still_host(obsnum, still) # Assign the still to the obsid if self.lock_all_neighbors_to_same_still == 1 and self.wf.neighbors == 1: for neighbor in self.get_all_neighbors(obsnum): self.dbi.set_obs_still_host(neighbor, still) if still != 0: # If the obsnum is assigned to a server that doesn't exist at the moment we need to skip it, maybe reassign later if ActionClass is None: ActionClass = Action a = ActionClass(obsnum, next_step, neighbor_status, self.task_clients[still], self.wf, still, timeout=self.timeout) if self.wf.neighbors == 1: if a.has_prerequisites() is True: return a else: return a # logging.debug('scheduler.get_action: (%s,%s) does not have prereqs' % (a.task, a.obs)) return None def determine_priority(self, action): '''Assign a priority to an action based on its status and the time order of the obs to which this action is attached.''' pol, jdcnt = int(action.obs) / 2 ** 32, int(action.obs) % 2 ** 32 # XXX maybe not make this have to explicitly match dbi bits return jdcnt * 4 + pol # prioritize first by time, then by pol # XXX might want to prioritize finishing a obs already started before # moving to the latest one (at least, up to a point) to avoid a # build up of partial obs. But if you prioritize obs already # started too excessively, then the queue could eventually fill with # partially completed tasks that are failing for some reason def obs_to_still(self, obs): ############## # Check if a obsid has a still already, if it does simply return it. If it does not then lets find the lowest # loaded (cpu) one and assign it. If none are under 80% then lets just wait around, they're busy enough as is. ############## mystill = self.dbi.get_obs_still_host(obs) if mystill: if mystill in self.task_clients: return mystill else: # We couldn't find its still server as its not in task_clients for whatever reason so punt for now logger.debug("Obs attached to non-existant STILL OBS : %s, STILL %s" % (obs, mystill)) return 0 else: still = self.dbi.get_most_available_still() if still is not False: return still else: return False
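
# determine_priority() above packs "time first, then polarization" into a
# single integer: assuming an obsnum laid out as pol * 2**32 + jdcnt (inferred
# from the bit arithmetic, not stated in this section), the low 32 bits hold
# the time counter and the high bits the pol code, so a later time counter
# always outranks pol because pol < 4. A quick illustration:
#
#     obs_a = 2 * 2 ** 32 + 1000    # pol 2, time counter 1000
#     obs_b = 0 * 2 ** 32 + 1001    # pol 0, time counter 1001
#     priority = lambda o: (o % 2 ** 32) * 4 + o // 2 ** 32
#     priority(obs_a)  # 4002
#     priority(obs_b)  # 4004 -> obs_b sorts ahead of obs_a in the action queue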
class TaskServer(HTTPServer):
    allow_reuse_address = True

    def __init__(self, dbi, sg, data_dir='.', port=14204, handler=TaskHandler,
                 path_to_do_scripts=".", drmaa_shared='/shared'):
        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()
        # Class us into HTTPServer so we can make calls from TaskHandler into this class via self.server.
        self.httpd = HTTPServer.__init__(self, (self.myhostname, port), handler)
        self.active_tasks_semaphore = threading.Semaphore()
        self.active_tasks = []
        self.dbi = dbi
        self.sg = sg
        self.data_dir = data_dir
        self.keep_running = False
        self.watchdog_count = 0
        self.port = port
        self.path_to_do_scripts = path_to_do_scripts
        self.logger = sg.logger
        self.drmaa_session = ''
        self.drmaa_shared = drmaa_shared
        self.shutting_down = False
        # signal.signal(signal.SIGINT, self.signal_handler)  # Enabled clean shutdown after Cntrl-C event.

    def append_task(self, t):
        self.active_tasks_semaphore.acquire()  # Jon : Not sure why we're doing this, we only have one primary thread
        self.active_tasks.append(t)
        self.active_tasks_semaphore.release()

    def poll_task_status(self, task):
        if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?
            try:
                task_info = self.drmaa_session.jobStatus(task.jid)
            except:
                task_info = "failed"
                logger.debug("TS: poll_task_status : DRMAA jobstatus failed for jid : %s" % task.jid)
            if task_info == "done" or task_info == "failed":  # Check if task is done or failed..
                poll_status = True
            else:
                poll_status = None
            # attributes: retval. : jobId, hasExited, hasSignal, terminatedSignal, hasCoreDump, wasAborted, exitStatus, and resourceUsage
        else:
            try:
                poll_status = task.process.poll()  # race condition due to threading, might fix later, pretty rare
            except:
                poll_status = None
        time.sleep(2)
        return poll_status

    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()
        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)  # This should probably be handled in a better way
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()
            # Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            logger.debug('Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'.format(
                                obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid,
                                cpu=child_proc.cpu_percent(interval=1.0),
                                mem=child_proc.memory_percent(),
                                aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1
            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return

    def kill(self, pid):
        try:
            for task in self.active_tasks:
                if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?
                    if int(task.jid) == int(pid):
                        task.kill()
                        break
                else:
                    if int(task.process.pid) == int(pid):
                        task.kill()
                        break
        except:
            logger.exception("Problem killing off task: %s w/ pid : %s" % (task, pid))

    def kill_all(self):
        for task in self.active_tasks:
            task.kill()
            break

    def checkin_timer(self):
        #
        # Just a timer that updates this still's last_checkin time in the database every 10 seconds
        #
        while self.keep_running is True:
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir,
                                   status="OK", max_tasks=self.sg.actions_per_still,
                                   cur_tasks=len(self.active_tasks))
            time.sleep(10)
        return 0

    def start(self):
        psutil.cpu_percent()
        time.sleep(1)
        self.keep_running = True
        t = threading.Thread(target=self.finalize_tasks)
        t.daemon = True
        t.start()
        logger.info('Starting Task Server')
        logger.info("using code at: " + __file__)
        logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts)
        logger.info("Data_dir : %s" % self.data_dir)
        logger.info("Port : %s" % self.port)
        if self.sg.cluster_scheduler == 1:
            logger.info("Initializing DRMAA interface to cluster scheduler")
            import drmaa
            self.drmaa_session = drmaa.Session()  # Start the interface session to DRMAA to control GridEngine
            self.drmaa_session.initialize()
        try:
            # Set up a thread that just updates the last checkin time for this still every 10 seconds
            timer_thread = threading.Thread(target=self.checkin_timer)
            timer_thread.daemon = True  # Make it a daemon so that when ctrl-c happens this thread goes away
            timer_thread.start()  # Start heartbeat
            self.serve_forever()  # Start the listener server
        finally:
            self.shutdown()
        return

    def shutdown(self):
        if self.shutting_down is False:  # check to see if we're already shutting down so we don't step over multiple threads attempting this.
            self.shutting_down = True
            logger.debug("Shutting down task_server")
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OFFLINE")
            self.keep_running = False
            parentproc = psutil.Process()
            myprocs = parentproc.children(recursive=True)
            for proc in myprocs:
                logger.debug("Killing nicely -> Pid: %s - Proc: %s" % (proc.pid, proc.name()))
                proc.terminate()
            gone, alive = psutil.wait_procs(myprocs, timeout=3)
            for proc in alive:
                logger.debug("Killing with gusto -> Pid: %s - Proc: %s" % (proc.pid, proc.name()))
                proc.kill()
            HTTPServer.shutdown(self)
            if self.sg.cluster_scheduler == 1:
                self.drmaa_session.exit()  # Terminate DRMAA session
            sys.exit(0)
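
# The shutdown() method above relies on psutil's two-phase "terminate, wait,
# then kill" pattern for cleaning up child processes. A self-contained sketch
# of that pattern, independent of TaskServer state (the 3 second grace period
# mirrors the timeout used above):
import psutil

def stop_children(grace=3):
    """SIGTERM every child of this process, then SIGKILL any survivors."""
    children = psutil.Process().children(recursive=True)
    for proc in children:
        proc.terminate()                            # polite request first
    gone, alive = psutil.wait_procs(children, timeout=grace)
    for proc in alive:                              # anything still running gets killed
        proc.kill()
    return len(gone), len(alive)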
class TaskServer(HTTPServer):
    allow_reuse_address = True

    def __init__(self, dbi, sg, data_dir='.', port=14204, handler=TaskHandler,
                 path_to_do_scripts=".", drmaa_shared='/shared', workflow=None):
        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()
        # Class us into HTTPServer so we can make calls from TaskHandler into
        # this class via self.server.
        self.httpd = HTTPServer.__init__(self, (self.myhostname, port), handler)
        self.active_tasks_semaphore = threading.Semaphore()
        self.active_tasks = []
        self.dbi = dbi
        self.sg = sg
        self.data_dir = data_dir
        self.keep_running = False
        self.watchdog_count = 0
        self.port = port
        self.path_to_do_scripts = path_to_do_scripts
        self.logger = sg.logger
        self.drmaa_session = ''
        self.drmaa_shared = drmaa_shared
        self.shutting_down = False
        self.wf = workflow
        # signal.signal(signal.SIGINT, self.signal_handler)  # Enabled clean
        # shutdown after Cntrl-C event.

    def append_task(self, t):
        # Jon : Not sure why we're doing this, we only have one primary thread
        self.active_tasks_semaphore.acquire()
        self.active_tasks.append(t)
        self.active_tasks_semaphore.release()

    def poll_task_status(self, task):
        # Do we need to interface with a cluster scheduler?
        if self.sg.cluster_scheduler == 1:
            try:
                task_info = self.drmaa_session.jobStatus(task.jid)
            except:
                task_info = "failed"
                logger.debug("TS: poll_task_status : DRMAA jobstatus failed for jid : %s" % task.jid)
            # Check if task is done or failed..
            if task_info == "done" or task_info == "failed":
                poll_status = True
            else:
                poll_status = None
            # attributes: retval. : jobId, hasExited, hasSignal, terminatedSignal, hasCoreDump,
            # wasAborted, exitStatus, and resourceUsage
        else:
            try:
                # race condition due to threading, might fix later, pretty rare
                poll_status = task.process.poll()
            except:
                poll_status = None
        time.sleep(2)
        return poll_status

    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()
        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    # This should probably be handled in a better way
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()
            # Jon: I think we can get rid of the watchdog as I'm already
            # throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            cpu = child_proc.cpu_percent(interval=1.0)
                            mem_pct = child_proc.memory_percent()
                            # memory_percent() returns a percentage, so divide by 100
                            # before scaling by total RAM; 1048576 == 2**20 converts
                            # bytes to MB
                            mem_mb = (mem_pct / 100.) * psutil.virtual_memory().total / 1048576
                            # save in task object
                            mytask.max_mem = max(mytask.max_mem, mem_mb)
                            mytask.cpu_load_avg += cpu / 100.
                            mytask.n_cpu_load_polls += 1
                            # echo out to screen
                            logger.debug(
                                'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%,'
                                ' mem={mem:.1f}%, Naffinity={aff}'.format(
                                    obsnum=mytask.obs, task=mytask.task,
                                    pid=child_proc.pid, cpu=cpu, mem=mem_pct,
                                    aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1
            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return

    def kill(self, pid):
        try:
            for task in self.active_tasks:
                if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?
                    if int(task.jid) == int(pid):
                        task.kill()
                        break
                else:
                    if int(task.process.pid) == int(pid):
                        task.kill()
                        break
        except:
            logger.exception("Problem killing off task: %s w/ pid : %s" % (task, pid))

    def kill_all(self):
        for task in self.active_tasks:
            task.kill()
            break

    def checkin_timer(self):
        #
        # Just a timer that updates this still's last_checkin time in the database every 10 seconds
        #
        while self.keep_running is True:
            hostname = platform.node()
            ip_addr = self.get_ip_address()
            cpu_usage = os.getloadavg()[1]  # using the 5 min load avg
            ntasks = len(self.active_tasks)
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir,
                                   status="OK", max_tasks=self.sg.actions_per_still,
                                   cur_tasks=ntasks)
            time.sleep(10)
        return 0

    def mc_checkin_thread(self):
        import mc_utils
        while self.keep_running is True:
            hostname = platform.node()
            ip_addr = self.get_ip_address()
            cpu_usage = os.getloadavg()[1]  # using the 5 min load avg
            ntasks = len(self.active_tasks)
            # get general info
            ncpu = psutil.cpu_count()
            s = self.dbi.Session()
            still = s.query(Still).filter(Still.hostname == hostname).one()
            status = still.status
            s.close()
            # get memory usage
            vmem = psutil.virtual_memory()
            vmem_tot = vmem.total / 1024 / 1024 / 1024
            vmem_pct = vmem.percent
            # get disk usage
            du = psutil.disk_usage('/')
            du_tot = du.total / 1024 / 1024 / 1024
            du_pct = du.percent
            # get time since boot in days
            now = time.time()
            boot_time = psutil.boot_time()
            dt_boot = now - boot_time
            dt_boot_days = dt_boot / 60 / 60 / 24
            # call functions for interacting with M&C
            mc_utils.add_mc_server_status(hostname, ip_addr, ncpu, cpu_usage,
                                          dt_boot_days, vmem_pct, vmem_tot,
                                          du_pct, du_tot)
            # sleep
            time.sleep(TIME_INT_FOR_NEW_MC_CHECK)
        return 0

    def start(self):
        psutil.cpu_percent()
        time.sleep(1)
        self.keep_running = True
        t = threading.Thread(target=self.finalize_tasks)
        t.daemon = True
        t.start()
        logger.info('Starting Task Server')
        logger.info("using code at: " + __file__)
        logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts)
        logger.info("Data_dir : %s" % self.data_dir)
        logger.info("Port : %s" % self.port)
        if self.sg.cluster_scheduler == 1:
            logger.info("Initializing DRMAA interface to cluster scheduler")
            import drmaa
            # Start the interface session to DRMAA to control GridEngine
            self.drmaa_session = drmaa.Session()
            self.drmaa_session.initialize()
        try:
            # Set up a thread that just updates the last checkin time for this
            # still every 10s
            timer_thread = threading.Thread(target=self.checkin_timer)
            # Make it a daemon so that when ctrl-c happens this thread goes
            # away
            timer_thread.daemon = True
            timer_thread.start()  # Start heartbeat
            if self.wf.log_to_mc:
                # Also set up an M&C status thread; this must be launched
                # before serve_forever(), which blocks until shutdown
                mc_thread = threading.Thread(target=self.mc_checkin_thread)
                # Make it a daemon so that when ctrl-c happens this thread
                # goes away
                mc_thread.daemon = True
                mc_thread.start()
            self.serve_forever()  # Start the listener server
        finally:
            self.shutdown()
        return

    def shutdown(self):
        if self.shutting_down is False:
            # check to see if we're already shutting down so we don't step
            # over multiple threads attempting this
            self.shutting_down = True
            logger.debug("Shutting down task_server")
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OFFLINE")
            self.keep_running = False
            parentproc = psutil.Process()
            myprocs = parentproc.children(recursive=True)
            for proc in myprocs:
                logger.debug("Killing nicely -> Pid: %s - Proc: %s" % (proc.pid, proc.name()))
                proc.terminate()
            gone, alive = psutil.wait_procs(myprocs, timeout=3)
            for proc in alive:
                logger.debug("Killing with gusto -> Pid: %s - Proc: %s" % (proc.pid, proc.name()))
                proc.kill()
            HTTPServer.shutdown(self)
            if self.sg.cluster_scheduler == 1:
                self.drmaa_session.exit()  # Terminate DRMAA session
            sys.exit(0)

    def get_ip_address(self):
        """Return an IP address for this machine as a string.

        https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python

        This is actually not well defined -- machines have multiple
        interfaces, each with its own IP address. We use eth0 if it exists,
        otherwise the first one that isn't `lo`.

        Copied from hera_mc/scripts/mc_server_status_daemon.py
        """
        try:
            addrs = netifaces.ifaddresses('eth0')
        except ValueError:
            for ifname in sorted(netifaces.interfaces()):
                if ifname != 'lo':
                    addrs = netifaces.ifaddresses(ifname)
                    break
            else:
                # triggered if we never did the 'break'
                return '?.?.?.?'
        return addrs[netifaces.AF_INET][0]['addr']
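
# A minimal sketch of how this TaskServer variant might be brought up,
# assuming a DataBaseInterface `dbi`, a settings object `sg` (with a `logger`
# attribute and the fields referenced above), and a workflow object `wf` that
# are constructed elsewhere in this package -- the variable names and paths
# here are placeholders:
#
#     ts = TaskServer(dbi, sg, data_dir='/data', port=14204,
#                     path_to_do_scripts='/opt/do_scripts', workflow=wf)
#     ts.start()   # blocks in serve_forever(); shutdown() runs on exit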
class Scheduler(ThreadingMixIn, HTTPServer): ### # A Scheduler reads a DataBaseInterface to determine what Actions can be # taken, and then schedules them on stills according to priority.''' ### def __init__(self, task_clients, workflow, sg): global logger logger = sg.logger self.myhostname = socket.gethostname() HTTPServer.__init__(self, (self.myhostname, 8080), MonitorHandler) # Class us into HTTPServer so we can make calls from TaskHandler into this class via self.server. self.sg = sg # Might as well have it around in case I find I need something from it... Its just a little memory self.nstills = len(sg.hosts) # preauto self.actions_per_still = sg.actions_per_still self.transfers_per_still = sg.transfers_per_still # Jon : This isn't used... self.block_size = sg.block_size # preauto self.timeout = sg.timeout self.sleep_time = sg.sleep_time self.lock_all_neighbors_to_same_still = workflow.lock_all_neighbors_to_same_still self.active_obs = [] self.active_obs_dict = {} self.action_queue = [] self.dbi = sg.dbi self.launched_actions = {} self.keep_running = False self.failcount = {} self.wf = workflow # Jon: Moved the workflow class to instantiated on object creation, should do the same for dbi probably self.task_clients = {} self.stills = [] signal.signal(signal.SIGINT, self.signal_handler) # Enabled clean shutdown after Cntrl-C event. logger.info("Starting monitoring interface.") threading.Thread(target=self.serve_forever).start() # Launch a thread of a multithreaded http server to view information on currently running tasks # If task_clients is set to AUTO then check the db for still servers if task_clients[0].host_port[0] == "AUTO": self.find_all_taskmanagers() self.auto = 1 else: self.auto = 0 self.task_clients = task_clients def signal_handler(self, signum, frame): logger.info("Caught Ctrl-C, Initiating clean shutdown.") self.shutdown() def find_all_taskmanagers(self): ### # find_all_taskmanagers : Check the database for all available stills with status OK # Should also remove stills that have gone offline. ### logger.debug("looking for TaskManagers...") self.stills = self.dbi.get_available_stills() while len(self.stills) < 1: logger.debug("Can't find any TaskManagers! Waiting for 10sec and trying again") time.sleep(10) self.stills = self.dbi.get_available_stills() for still in self.stills: if still.hostname not in self.task_clients: logger.debug("Discovery of new TaskManager : %s" % still.hostname) self.task_clients[still.hostname] = TaskClient(self.dbi, still.hostname, self.wf, still.port, self.sg) self.launched_actions[still.hostname] = [] return def ext_command_hook(self): return def check_taskmanager(self, tm): tm_info = self.dbi.get_still_info(tm) since = datetime.datetime.now() - datetime.timedelta(minutes=3) if tm_info.status != "OK" or tm_info.last_checkin < since: # Status not OK or hasn't checked-in in over 3min. logger.info("Removing offline TaskManager : %s" % tm_info.hostname) self.launched_actions.pop(tm_info.hostname, None) self.task_clients.pop(tm_info.hostname, None) for obs in self.dbi.get_obs_assigned_to_still(tm_info.hostname): if obs.obsnum in self.active_obs_dict: self.active_obs_dict.pop(obs.obsnum) self.active_obs.remove(obs.obsnum) return False elif tm_info.cur_num_of_tasks >= tm_info.max_num_of_tasks: # Check to ensure we are not at max # of tasks for the taskmanager return False return True def start(self, dbi, ActionClass=None, action_args=()): ### # Start scheduler, loop forever and handle checking for new obsid's, new/removed taskmanagers etc.. 
# This loop can be terminated by q + Enter, and paused by p + enter ### self.user_input = InputThread() self.user_input.start() self.initial_startup = True # The scheduler is just starting, for the first run if we have new obs we need this to assign to proper taskmanagers self.tm_cycle = cycle(self.stills) self.keep_running = True logger.info('Starting Scheduler') self.dbi = dbi last_checked_for_stills = time.time() while self.keep_running: if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK: self.find_all_taskmanagers() last_checked_for_stills = time.time() logger.debug("Number of TaskManagers : %s" % len(self.task_clients)) self.ext_command_hook() self.get_new_active_obs() self.update_action_queue(ActionClass, action_args) launched_actions_copy = copy.copy(self.launched_actions) # Launch actions that can be scheduled for tm in launched_actions_copy: tm_info = self.dbi.get_still_info(tm) if self.check_taskmanager(tm) is False: # Check if the TaskManager is still available, if not it will pop it out continue while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks: # I think this will work # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False))) action_from_queue = self.pop_action_queue(tm, tx=False) # FIXME: MIght still be having a small issue when a TM goes offline and back on if action_from_queue is not False: if self.launch_action(action_from_queue) != "OK": # If we had a connection error stop trying until TM checks back in break else: break self.clean_completed_actions(self.dbi) keyboard_input = self.user_input.get_user_input() if keyboard_input is not None and keyboard_input != '': handle_keyboard_input(self, keyboard_input) else: time.sleep(self.sleep_time) self.initial_startup = False # We've run once now, all obs were assigned via roundrobin if they were not previously self.shutdown() def shutdown(self): logger.info("Shutting down...") self.keep_running = False HTTPServer.shutdown(self) sys.exit(0) def get_all_neighbors(self, obsnum): ### # get_all_neighbors: Go down (and up) the rabbit hole and find ALL the neighbors of a particular obsid ### neighbor_obs_nums = [] neighbor_obs_nums.append(obsnum) # Go ahead and add the current obsid to the list low_obs, high_obs = self.dbi.get_neighbors(obsnum) while high_obs is not None: # Traverse the list UP to find all neighbors above this one neighbor_obs_nums.append(high_obs) high_obs = self.dbi.get_neighbors(high_obs)[1] while low_obs is not None: # Traverse the list DOWN to find all neighbors above this one neighbor_obs_nums.append(low_obs) low_obs = self.dbi.get_neighbors(low_obs)[0] return neighbor_obs_nums def pop_action_queue(self, still, tx=False): '''Return highest priority action for the given still.''' # Seems like we're going through all the actions to find the ones for the particular still.. 
# Should think about optimizing at some point for i in xrange(len(self.action_queue)): a = self.action_queue[i] if a.still == still and a.is_transfer == tx: return self.action_queue.pop(i) return False def get_launched_actions(self, still, tx=False): return [a for a in self.launched_actions[still] if a.is_transfer == tx] def launch_action(self, a): '''Launch the specified Action and record its launch for tracking later.''' self.launched_actions[a.still].append(a) connection_status = a.launch() return connection_status def kill_action(self, a): logger.info('Scheduler.kill_action: called on (%s,%s)' % (a.task, a.obs)) connection_status = a.run_remote_task(action_type="KILL_TASK") return connection_status def clean_completed_actions(self, dbi): '''Check launched actions for completion, timeout or fail''' for still in self.launched_actions: updated_actions = [] for action in self.launched_actions[still]: obs = dbi.get_obs(action.obs) status = obs.status pid = dbi.get_obs_pid(action.obs) try: self.failcount[str(action.obs) + status] except(KeyError): self.failcount[str(action.obs) + status] = 0 if status == action.task: logger.info('Task %s for obs %s on still %s completed successfully.' % (action.task, action.obs, still)) elif action.timed_out(): logger.info('Task %s for obs %s on still %s TIMED OUT.' % (action.task, action.obs, still)) if self.kill_action(action) != "OK": break self.failcount[str(action.obs) + status] += 1 # XXX make db entry for documentation elif pid == -9: self.failcount[str(action.obs) + status] += 1 logger.info('Task %s for obs %s on still %s HAS DIED. failcount=%d' % (action.task, action.obs, still, self.failcount[str(action.obs) + status])) else: # still active updated_actions.append(action) self.launched_actions[still] = updated_actions def already_launched(self, action): '''Determine if this action has already been launched. Enforces fact that only one valid action can be taken for a given obs at any one time.''' for a in self.launched_actions[action.still]: if a.obs == action.obs: return True return False def get_new_active_obs(self): '''Check for any new obs that may have appeared. 
Actions for these obs may potentially take priority over ones currently active.''' observations = [] observations += self.dbi.list_open_observations_on_tm() for tm_name in self.launched_actions: observations += self.dbi.list_open_observations_on_tm(tm_hostname=tm_name) for open_obs in observations: if open_obs not in self.active_obs_dict: self.active_obs_dict[open_obs] = len(self.active_obs) self.active_obs.append(open_obs) return def update_action_queue(self, ActionClass=None, action_args=()): '''Based on the current list of active obs (which you might want to update first), generate a prioritized list of actions that can be taken.''' actions = [] for myobs in self.active_obs: myobs_info = self.dbi.get_obs(myobs) if myobs_info.current_stage_in_progress == "FAILED" or myobs_info.current_stage_in_progress == "KILLED" or myobs_info.status == "COMPLETE" or (myobs_info.stillhost not in self.task_clients and myobs_info.stillhost): self.active_obs_dict.pop(myobs_info.obsnum) self.active_obs.remove(myobs_info.obsnum) logger.debug("update_action_queue: Removing obsid : %s, task : %s, Status: %s, TM: %s" % (myobs_info.obsnum, myobs_info.current_stage_in_progress, myobs_info.status, myobs_info.stillhost)) else: myaction = self.get_action(myobs, ActionClass=ActionClass, action_args=action_args) if (myaction is not None) and (self.already_launched(myaction) is not True): if self.wf.prioritize_obs == 1: myaction.set_priority(self.determine_priority(myaction)) actions.append(myaction) actions.sort(action_cmp, reverse=True) # place most important actions first self.action_queue = actions # completely throw out previous action list return def get_action(self, obsnum, ActionClass=None, action_args=()): '''Find the next actionable step for obs f (one for which all prerequisites have been met. Return None if no action is available. This function is allowed to return actions that have already been launched. ActionClass: a subclass of Action, for customizing actions. None defaults to the standard Action''' obsinfo = self.dbi.get_obs(obsnum) status = obsinfo.status if obsinfo.current_stage_in_progress == "FAILED" or obsinfo.current_stage_in_progress == "KILLED": return None if status == 'COMPLETE': # logger.debug("COMPLETE for obsid : %s" % obsnum) # You can see how often completeds are checked by uncommenting this.. its a LOT return None # obs is complete if status == '' or status == 'NEW': # Not yet ready for processing. return None # Check that the still assigned to the obs is currently in the list of active stills # !!!!FIXME!!! - Maybe fixed? if obsinfo.stillhost is not None: if any(still for still in self.stills if still.hostname == obsinfo.stillhost): pass else: return None if self.wf.neighbors == 1: # FIX ME, I don't want to call the same thing twice.. 
its ugly neighbors = self.dbi.get_neighbors(obsnum) if None in neighbors: cur_step_index = self.wf.workflow_actions_endfile.index(status) next_step = self.wf.workflow_actions_endfile[cur_step_index + 1] else: # this is a normal file cur_step_index = self.wf.workflow_actions.index(status) next_step = self.wf.workflow_actions[cur_step_index + 1] neighbor_status = [self.dbi.get_obs_status(n) for n in neighbors if n is not None] else: cur_step_index = self.wf.workflow_actions.index(status) next_step = self.wf.workflow_actions[cur_step_index + 1] neighbor_status = 0 still = self.dbi.get_obs_still_host(obsnum) if not still: if self.initial_startup is True: still = self.tm_cycle.next().hostname # Balance out all the nodes on startup else: still = self.obs_to_still(obsnum) # Get a still for a new obsid if one doesn't already exist. if still is False: return None self.dbi.set_obs_still_host(obsnum, still) # Assign the still to the obsid if self.lock_all_neighbors_to_same_still == 1 and self.wf.neighbors == 1: for neighbor in self.get_all_neighbors(obsnum): self.dbi.set_obs_still_host(neighbor, still) if still != 0: # If the obsnum is assigned to a server that doesn't exist at the moment we need to skip it, maybe reassign later if ActionClass is None: ActionClass = Action a = ActionClass(obsnum, next_step, neighbor_status, self.task_clients[still], self.wf, still, timeout=self.timeout) if self.wf.neighbors == 1: if a.has_prerequisites() is True: return a else: return a # logging.debug('scheduler.get_action: (%s,%s) does not have prereqs' % (a.task, a.obs)) return None def determine_priority(self, action): '''Assign a priority to an action based on its status and the time order of the obs to which this action is attached.''' jdcnt = (float(file2jd(action.obs))- 2415020)/0.005 #get the jd and convert to an integer (0.005 is the length of a typical PAPER obs) pol = str2pol[file2pol(action.obs)] #pol, jdcnt = int(action.obs) / 2 ** 32, int(action.obs) % 2 ** 32 # XXX maybe not make this have to explicitly match dbi bits return jdcnt * 4 + pol # prioritize first by time, then by pol # XXX might want to prioritize finishing a obs already started before # moving to the latest one (at least, up to a point) to avoid a # build up of partial obs. But if you prioritize obs already # started too excessively, then the queue could eventually fill with # partially completed tasks that are failing for some reason def obs_to_still(self, obs): ############## # Check if a obsid has a still already, if it does simply return it. If it does not then lets find the lowest # loaded (cpu) one and assign it. If none are under 80% then lets just wait around, they're busy enough as is. ############## mystill = self.dbi.get_obs_still_host(obs) if mystill: if mystill in self.task_clients: return mystill else: # We couldn't find its still server as its not in task_clients for whatever reason so punt for now logger.debug("Obs attached to non-existant STILL OBS : %s, STILL %s" % (obs, mystill)) return 0 else: still = self.dbi.get_most_available_still() if still is not False: return still else: return False
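
# In this Scheduler variant, determine_priority() derives the time counter
# from the Julian date encoded in the obs filename instead of from bit
# fields: (JD - 2415020) / 0.005 counts 0.005-day (~7.2 minute) observation
# slots since JD 2415020, and the polarization code from str2pol breaks ties.
# A rough worked example (file2jd and str2pol come from the wider package and
# are assumed here):
#
#     jd = 2456789.123
#     jdcnt = (jd - 2415020) / 0.005              # ~8353824.6 slots
#     # priority = jdcnt * 4 + str2pol[file2pol(obs_filename)]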
def start(self, dbi, ActionClass=None, action_args=()):
    ###
    # Start scheduler, loop forever and handle checking for new obsids, new/removed taskmanagers etc..
    # This loop can be terminated by q + Enter, and paused by p + Enter
    ###
    self.user_input = InputThread()
    self.user_input.start()
    # The scheduler is just starting, for the first run if we have new obs
    # we need this to assign to proper taskmanagers
    self.initial_startup = True
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    last_checked_for_stills = time.time()
    last_checked_for_mc = time.time()

    while self.keep_running:
        if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            last_checked_for_stills = time.time()
            logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

        if self.wf.log_to_mc:
            import mc_utils
            now = time.time()
            dt_check = now - last_checked_for_mc  # seconds since the last M&C check-in
            if dt_check > TIME_INT_FOR_NEW_MC_CHECK:
                # get total number of running tasks
                ntasks = 0
                launched_actions_copy = copy.deepcopy(self.launched_actions)
                for tm in launched_actions_copy:
                    ntasks += len(self.get_launched_actions(tm, tx=False))
                # get time since check-in in minutes
                dt_check_min = dt_check / 60
                # get time since boot in hours
                boot_time = psutil.boot_time()
                dt_boot = now - boot_time
                dt_boot_hr = dt_boot / 60 / 60
                # log to M&C
                status = "OK"
                logger.debug(
                    "Logging to M&C : {0} status, {1:5.2f} min since last check; {2}"
                    " tasks running; {3:10.3f} hr since boot".format(
                        status, dt_check_min, str(ntasks), dt_boot_hr))
                mc_utils.add_mc_rtp_status(status, dt_check_min, ntasks, dt_boot_hr)
                last_checked_for_mc = now  # reset the M&C check-in timer

        self.ext_command_hook()
        self.get_new_active_obs()
        self.update_action_queue(ActionClass, action_args)

        launched_actions_copy = copy.copy(self.launched_actions)
        # Launch actions that can be scheduled
        for tm in launched_actions_copy:
            tm_info = self.dbi.get_still_info(tm)
            # Check if the TaskManager is still available, if not it will
            # pop it out
            if self.check_taskmanager(tm) is False:
                continue
            # I think this will work
            while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:
                # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                # FIXME: Might still be having a small issue when a TM goes
                # offline and back on
                action_from_queue = self.pop_action_queue(tm, tx=False)
                if action_from_queue is not False:
                    # If we had a connection error stop trying until TM
                    # checks back in
                    if self.launch_action(action_from_queue) != "OK":
                        break
                else:
                    break

        self.clean_completed_actions(self.dbi)

        keyboard_input = self.user_input.get_user_input()
        if keyboard_input is not None and keyboard_input != '':
            handle_keyboard_input(self, keyboard_input)
        else:
            time.sleep(self.sleep_time)
        # We've run once now, all obs were assigned via roundrobin if they
        # were not previously
        self.initial_startup = False
    self.shutdown()
class TaskServer(HTTPServer): allow_reuse_address = True def __init__(self, dbi, sg, data_dir='.', port=14204, handler=TaskHandler, path_to_do_scripts=".", drmaa_shared='/shared'): global logger logger = sg.logger self.myhostname = socket.gethostname() self.httpd = HTTPServer.__init__( self, (self.myhostname, port), handler ) # Class us into HTTPServer so we can make calls from TaskHandler into this class via self.server. self.active_tasks_semaphore = threading.Semaphore() self.active_tasks = [] self.dbi = dbi self.sg = sg self.data_dir = data_dir self.keep_running = False self.watchdog_count = 0 self.port = port self.path_to_do_scripts = path_to_do_scripts self.logger = sg.logger self.drmaa_session = '' self.drmaa_shared = drmaa_shared self.shutting_down = False # signal.signal(signal.SIGINT, self.signal_handler) # Enabled clean shutdown after Cntrl-C event. def append_task(self, t): self.active_tasks_semaphore.acquire( ) # Jon : Not sure why we're doing this, we only have one primary thread self.active_tasks.append(t) self.active_tasks_semaphore.release() def poll_task_status(self, task): if self.sg.cluster_scheduler == 1: # Do we need to interface with a cluster scheduler? try: task_info = self.drmaa_session.jobStatus(task.jid) except: task_info = "failed" logger.debug( "TS: poll_task_status : DRMAA jobstatus failed for jid : %s" % task.jid) if task_info == "done" or task_info == "failed": # Check if task is done or failed.. poll_status = True else: poll_status = None # attributes: retval. : jobId, hasExited, hasSignal, terminatedSignal, hasCoreDump, wasAborted, exitStatus, and resourceUsage else: try: poll_status = task.process.poll( ) # race condition due to threading, might fix later, pretty rare except: poll_status = None time.sleep(2) return poll_status def finalize_tasks(self, poll_interval=5.): self.user_input = InputThread() self.user_input.start() while self.keep_running: self.active_tasks_semaphore.acquire() new_active_tasks = [] for mytask in self.active_tasks: if self.poll_task_status(mytask) is None: new_active_tasks.append( mytask ) # This should probably be handled in a better way else: mytask.finalize() self.active_tasks = new_active_tasks self.active_tasks_semaphore.release() # Jon: I think we can get rid of the watchdog as I'm already throwing this at the db time.sleep(poll_interval) if self.watchdog_count == 30: logger.debug('TaskServer is alive') for mytask in self.active_tasks: try: child_proc = mytask.process.children()[0] if psutil.pid_exists(child_proc.pid): logger.debug( 'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}' .format( obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid, cpu=child_proc.cpu_percent(interval=1.0), mem=child_proc.memory_percent(), aff=len(child_proc.cpu_affinity()))) except: pass self.watchdog_count = 0 else: self.watchdog_count += 1 self.keyboard_input = self.user_input.get_user_input() if self.keyboard_input is not None: handle_keyboard_input(self, self.keyboard_input) return def kill(self, pid): try: for task in self.active_tasks: if self.sg.cluster_scheduler == 1: # Do we need to interface with a cluster scheduler? 
if int(task.jid) == int(pid): task.kill() break else: if int(task.process.pid) == int(pid): task.kill() break except: logger.exception("Problem killing off task: %s w/ pid : %s" % (task, pid)) def kill_all(self): for task in self.active_tasks: task.kill() break def checkin_timer(self): # # Just a timer that will update that its last_checkin time in the database every 5min # while self.keep_running is True: hostname = socket.gethostname() ip_addr = socket.gethostbyname(hostname) cpu_usage = os.getloadavg()[1] #using the 5 min load avg self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OK", max_tasks=self.sg.actions_per_still, cur_tasks=len(self.active_tasks)) time.sleep(10) return 0 def start(self): psutil.cpu_percent() time.sleep(1) self.keep_running = True t = threading.Thread(target=self.finalize_tasks) t.daemon = True t.start() logger.info('Starting Task Server') logger.info("using code at: " + __file__) logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts) logger.info("Data_dir : %s" % self.data_dir) logger.info("Port : %s" % self.port) if self.sg.cluster_scheduler == 1: logger.info("Initilizing DRMAA interface to cluster scheduler") import drmaa self.drmaa_session = drmaa.Session( ) # Start the interface session to DRMAA to control GridEngine self.drmaa_session.initialize() try: # Setup a thread that just updates the last checkin time for this still every 5min timer_thread = threading.Thread(target=self.checkin_timer) timer_thread.daemon = True # Make it a daemon so that when ctrl-c happens this thread goes away timer_thread.start() # Start heartbeat self.serve_forever() # Start the lisetenser server finally: self.shutdown() return def shutdown(self): if self.shutting_down is False: # check to see if we're already shutting down so we don't step over multiple threads attempting this. self.shutting_down = True logger.debug("Shutting down task_server") hostname = socket.gethostname() ip_addr = socket.gethostbyname(hostname) cpu_usage = psutil.cpu_percent() self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OFFLINE") self.keep_running = False parentproc = psutil.Process() myprocs = parentproc.children(recursive=True) for proc in myprocs: logger.debug("Killing nicely -> Pid: %s - Proc: %s" % (proc.pid, proc.name)) proc.terminate() gone, alive = psutil.wait_procs(myprocs, timeout=3) for proc in alive: logger.debug("Killing with gusto -> Pid: %s - Proc: %s" % (proc.pid, proc.name)) proc.kill() HTTPServer.shutdown(self) if self.sg.cluster_scheduler == 1: self.drmaa_session.exit() # Terminate DRMAA sessionmaker sys.exit(0)