def start(self, dbi, ActionClass=None, action_args=()):
    """Run the scheduler loop until ``self.keep_running`` is cleared.

    Every pass: halt the stills and shut down when AWS mode is on and no
    observations remain open, periodically rescan for TaskManagers, refresh
    the action queue and hand queued actions to each TaskManager up to its
    ``max_num_of_tasks``, clean up completed actions, then either act on
    keyboard input or sleep for ``self.sleep_time``.

    Per the original note the loop is ended with q + Enter and paused with
    p + Enter; that handling lives in ``handle_keyboard_input`` (not shown).

    :param dbi: database interface, stored on ``self.dbi``.
    :param ActionClass: forwarded unchanged to ``update_action_queue``.
    :param action_args: forwarded unchanged to ``update_action_queue``.
    """
    self.user_input = InputThread()
    self.user_input.start()
    # The scheduler is just starting: on the first pass new obs still have to
    # be assigned to their proper taskmanagers.
    self.initial_startup = True
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    tm_scan_stamp = time.time()

    while self.keep_running:
        open_obs_count = len(self.dbi.list_open_observations())
        # NOTE(review): nesting of this branch was rebuilt from a flattened
        # source; shutdown() is assumed to run once, after all HALT_NOW posts.
        if open_obs_count == 0 and self.sg.aws_enabled == "1":
            for still in self.launched_actions:
                self.post_to_server(still, "HALT_NOW")
            self.shutdown()
            logger.debug("Shutting down AWS nodes as we are out of data to process...")

        if time.time() - tm_scan_stamp > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            tm_scan_stamp = time.time()
            logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

        self.ext_command_hook()
        self.get_new_active_obs()

        # NOTE(review): assumes queue refresh and launching are both skipped
        # while the stills report full -- confirm against the original layout.
        if self.check_if_stills_reporting_full() is not True:
            self.update_action_queue(ActionClass, action_args)
            # Iterate over a snapshot; check_taskmanager may pop entries.
            for tm in copy.copy(self.launched_actions):
                still_info = self.dbi.get_still_info(tm)
                # Skip TaskManagers that are no longer available.
                if self.check_taskmanager(tm) is False:
                    continue

                # Keep launching until this TaskManager is at capacity.
                while len(self.get_launched_actions(tm, tx=False)) < still_info.max_num_of_tasks:
                    # FIXME: Might still be having a small issue when a TM
                    # goes offline and back on
                    next_action = self.pop_action_queue(tm, tx=False)
                    if next_action is False:
                        break
                    # On a connection error stop trying until the TM checks
                    # back in.
                    if self.launch_action(next_action) != "OK":
                        break

        self.clean_completed_actions(self.dbi)

        typed = self.user_input.get_user_input()
        if typed is None or typed == '':
            time.sleep(self.sleep_time)
        else:
            handle_keyboard_input(self, typed)
        # We've run once now; all obs were assigned via roundrobin if they
        # were not previously.
        self.initial_startup = False
    self.shutdown()
def finalize_tasks(self, poll_interval=5.):
    """Poll active tasks, finalize finished ones and record resource usage.

    Runs until ``self.keep_running`` is cleared. Each pass:

    1. Under ``self.active_tasks_semaphore``, keeps tasks whose
       ``poll_task_status`` is ``None`` and calls ``finalize()`` on the rest.
    2. Sleeps ``poll_interval`` seconds.
    3. Whenever ``self.watchdog_count`` reaches 30, logs a heartbeat and, for
       each active task, samples the first child process: updates
       ``max_mem`` (MB), ``cpu_load_avg`` and ``n_cpu_load_polls`` on the
       task and logs the figures. The counter is then reset.
    4. Forwards any keyboard input to ``handle_keyboard_input``.

    :param poll_interval: seconds to sleep between polls.
    """
    self.user_input = InputThread()
    self.user_input.start()

    while self.keep_running:
        self.active_tasks_semaphore.acquire()
        try:
            new_active_tasks = []
            for mytask in self.active_tasks:
                # This should probably be handled in a better way
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
        finally:
            # Always release, otherwise an exception while polling would
            # leave the semaphore held forever.
            self.active_tasks_semaphore.release()

        # Jon: I think we can get rid of the watchdog as I'm already
        # throwing this at the db
        time.sleep(poll_interval)
        if self.watchdog_count == 30:
            logger.debug('TaskServer is alive')
            for mytask in self.active_tasks:
                try:
                    child_proc = mytask.process.children()[0]
                    if psutil.pid_exists(child_proc.pid):
                        cpu = child_proc.cpu_percent(interval=1.0)
                        mem_pct = child_proc.memory_percent()
                        # memory_percent() is a percentage (0-100), so scale
                        # by 1/100 before converting bytes to MB (2**20).
                        mem_mb = (mem_pct / 100.
                                  * psutil.virtual_memory().total / 1048576)
                        # save in task object
                        mytask.max_mem = max(mytask.max_mem, mem_mb)
                        mytask.cpu_load_avg += cpu / 100.
                        mytask.n_cpu_load_polls += 1
                        # echo out to screen
                        logger.debug(
                            'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%,'
                            ' mem={mem:.1f}%, Naffinity={aff}'.format(
                                obsnum=mytask.obs, task=mytask.task,
                                pid=child_proc.pid, cpu=cpu, mem=mem_pct,
                                aff=len(child_proc.cpu_affinity())))
                except Exception as err:
                    # Best effort: the child may not exist yet or may have
                    # exited between calls. Do not swallow
                    # KeyboardInterrupt/SystemExit.
                    logger.debug('Could not gather process info: %r', err)
            self.watchdog_count = 0
        else:
            self.watchdog_count += 1

        self.keyboard_input = self.user_input.get_user_input()
        if self.keyboard_input is not None:
            handle_keyboard_input(self, self.keyboard_input)
def start(self, dbi, ActionClass=None, action_args=()):
    """Drive the scheduler until ``self.keep_running`` turns false.

    On each pass the method periodically rescans for TaskManagers, runs the
    external command hook, picks up new active observations, refreshes the
    action queue, launches queued actions on every available TaskManager up
    to its ``max_num_of_tasks``, cleans completed actions, and finally
    either handles keyboard input or sleeps ``self.sleep_time``.

    Per the original note the loop is ended with q + Enter and paused with
    p + Enter; that handling lives in ``handle_keyboard_input`` (not shown).

    :param dbi: database interface, stored on ``self.dbi``.
    :param ActionClass: forwarded unchanged to ``update_action_queue``.
    :param action_args: forwarded unchanged to ``update_action_queue``.
    """
    self.user_input = InputThread()
    self.user_input.start()
    # Scheduler is just starting: during the first pass new obs need this
    # flag so they get assigned to proper taskmanagers.
    self.initial_startup = True
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    previous_tm_scan = time.time()

    while self.keep_running:
        seconds_since_scan = time.time() - previous_tm_scan
        if seconds_since_scan > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            previous_tm_scan = time.time()
            logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

        self.ext_command_hook()
        self.get_new_active_obs()
        self.update_action_queue(ActionClass, action_args)

        # Launch actions that can be scheduled. Work on a snapshot because
        # check_taskmanager may pop an unavailable TaskManager.
        tm_snapshot = copy.copy(self.launched_actions)
        for tm in tm_snapshot:
            still_info = self.dbi.get_still_info(tm)
            if self.check_taskmanager(tm) is False:
                continue

            while True:
                running = self.get_launched_actions(tm, tx=False)
                if not len(running) < still_info.max_num_of_tasks:
                    break
                # FIXME: Might still be having a small issue when a TM goes
                # offline and back on
                queued = self.pop_action_queue(tm, tx=False)
                if queued is False:
                    break
                # A connection error means: stop trying until the TM checks
                # back in.
                if self.launch_action(queued) != "OK":
                    break

        self.clean_completed_actions(self.dbi)

        pressed = self.user_input.get_user_input()
        if pressed is None or pressed == '':
            time.sleep(self.sleep_time)
        else:
            handle_keyboard_input(self, pressed)
        # First pass is over; all obs were assigned via roundrobin if they
        # were not previously.
        self.initial_startup = False
    self.shutdown()
def finalize_tasks(self, poll_interval=5.):
    """Poll active tasks, finalize finished ones and log process info.

    Runs until ``self.keep_running`` is cleared. Each pass:

    1. Under ``self.active_tasks_semaphore``, keeps tasks whose
       ``poll_task_status`` is ``None`` and calls ``finalize()`` on the rest.
    2. Sleeps ``poll_interval`` seconds.
    3. Whenever ``self.watchdog_count`` reaches 30, logs a heartbeat plus
       cpu / memory / affinity figures for the first child process of each
       active task, then resets the counter; otherwise increments it.
    4. Forwards any keyboard input to ``handle_keyboard_input``.

    :param poll_interval: seconds to sleep between polls.
    """
    self.user_input = InputThread()
    self.user_input.start()

    while self.keep_running:
        self.active_tasks_semaphore.acquire()
        try:
            new_active_tasks = []
            for mytask in self.active_tasks:
                # This should probably be handled in a better way
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
        finally:
            # Always release, otherwise an exception while polling would
            # leave the semaphore held forever.
            self.active_tasks_semaphore.release()

        # Jon: I think we can get rid of the watchdog as I'm already
        # throwing this at the db
        time.sleep(poll_interval)
        if self.watchdog_count == 30:
            logger.debug('TaskServer is alive')
            for mytask in self.active_tasks:
                try:
                    child_proc = mytask.process.children()[0]
                    if psutil.pid_exists(child_proc.pid):
                        logger.debug(
                            'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'
                            .format(
                                obsnum=mytask.obs,
                                task=mytask.task,
                                pid=child_proc.pid,
                                cpu=child_proc.cpu_percent(interval=1.0),
                                mem=child_proc.memory_percent(),
                                aff=len(child_proc.cpu_affinity())))
                except Exception as err:
                    # Best effort: the child may not exist yet or may have
                    # exited between calls. Do not swallow
                    # KeyboardInterrupt/SystemExit.
                    logger.debug('Could not gather process info: %r', err)
            self.watchdog_count = 0
        else:
            self.watchdog_count += 1

        self.keyboard_input = self.user_input.get_user_input()
        if self.keyboard_input is not None:
            handle_keyboard_input(self, self.keyboard_input)
def finalize_tasks(self, poll_interval=5.):
    """Poll active tasks, finalize finished ones and log process info.

    Loops while ``self.keep_running`` is true. On every pass the active-task
    list is rebuilt under ``self.active_tasks_semaphore``: tasks for which
    ``poll_task_status`` returns ``None`` are kept, all others get
    ``finalize()`` called. After sleeping ``poll_interval`` seconds, a
    heartbeat and per-task process figures (first child process only) are
    logged whenever ``self.watchdog_count`` reaches 30. Keyboard input, if
    any, is passed to ``handle_keyboard_input``.

    :param poll_interval: seconds to sleep between polls.
    """
    self.user_input = InputThread()
    self.user_input.start()

    while self.keep_running:
        self.active_tasks_semaphore.acquire()
        try:
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    # This should probably be handled in a better way
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
        finally:
            # Release even when polling/finalizing raises, so other users of
            # the semaphore are not blocked forever.
            self.active_tasks_semaphore.release()

        # Jon: I think we can get rid of the watchdog as I'm already
        # throwing this at the db
        time.sleep(poll_interval)
        if self.watchdog_count == 30:
            logger.debug('TaskServer is alive')
            for mytask in self.active_tasks:
                try:
                    child_proc = mytask.process.children()[0]
                    if psutil.pid_exists(child_proc.pid):
                        logger.debug('Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'.format(
                            obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid,
                            cpu=child_proc.cpu_percent(interval=1.0),
                            mem=child_proc.memory_percent(),
                            aff=len(child_proc.cpu_affinity())))
                except Exception as err:
                    # Best effort only: a task may have no child process, or
                    # it may vanish mid-sample. KeyboardInterrupt/SystemExit
                    # are deliberately not caught.
                    logger.debug('Could not gather process info: %r', err)
            self.watchdog_count = 0
        else:
            self.watchdog_count += 1

        self.keyboard_input = self.user_input.get_user_input()
        if self.keyboard_input is not None:
            handle_keyboard_input(self, self.keyboard_input)
def start(self, dbi, ActionClass=None, action_args=()):
    """Run the scheduler loop until ``self.keep_running`` is cleared.

    Each pass periodically rescans for TaskManagers, optionally reports
    status to M&C (when ``self.wf.log_to_mc`` is set), runs the external
    command hook, picks up new active observations, refreshes the action
    queue, launches queued actions on every available TaskManager up to its
    ``max_num_of_tasks``, cleans completed actions, and then either handles
    keyboard input or sleeps ``self.sleep_time``.

    Per the original note the loop is ended with q + Enter and paused with
    p + Enter; that handling lives in ``handle_keyboard_input`` (not shown).

    :param dbi: database interface, stored on ``self.dbi``.
    :param ActionClass: forwarded unchanged to ``update_action_queue``.
    :param action_args: forwarded unchanged to ``update_action_queue``.
    """
    self.user_input = InputThread()
    self.user_input.start()
    # The scheduler is just starting, for the first run if we have new obs
    # we need this to assign to proper taskmanagers
    self.initial_startup = True
    self.tm_cycle = cycle(self.stills)
    self.keep_running = True
    logger.info('Starting Scheduler')
    self.dbi = dbi
    last_checked_for_stills = time.time()
    last_checked_for_mc = time.time()

    while self.keep_running:
        if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
            self.find_all_taskmanagers()
            last_checked_for_stills = time.time()
            logger.debug("Number of TaskManagers : %s", len(self.task_clients))

        if self.wf.log_to_mc:
            # Imported lazily so mc_utils is only needed when M&C logging is
            # switched on.
            import mc_utils
            now = time.time()
            # time.time() values are plain floats, so this is already a
            # number of seconds (there is no .total_seconds()).
            dt_check = now - last_checked_for_mc
            if dt_check > TIME_INT_FOR_NEW_MC_CHECK:
                # get total number of running tasks; only the keys are
                # iterated, so a shallow snapshot is enough
                ntasks = 0
                for tm in copy.copy(self.launched_actions):
                    ntasks += len(self.get_launched_actions(tm, tx=False))

                # get time since check-in in minutes
                dt_check_min = dt_check / 60

                # get time since boot in hours
                boot_time = psutil.boot_time()
                dt_boot = now - boot_time
                dt_boot_hr = dt_boot / 60 / 60

                # log to M&C
                status = "OK"
                # Four arguments -> field indices 0..3 (the original used
                # {4}, which raised IndexError).
                logger.debug(
                    "Logging to M&C : {0} status, {1:5.2f} min since last check; {2}"
                    " tasks running; {3:10.3f} hr since boot".format(
                        status, dt_check_min, str(ntasks), dt_boot_hr))
                mc_utils.add_mc_rtp_status(status, dt_check_min, ntasks,
                                           dt_boot_hr)
                # Restart the interval; without this every later pass would
                # report again.
                last_checked_for_mc = now

        self.ext_command_hook()
        self.get_new_active_obs()
        self.update_action_queue(ActionClass, action_args)

        # Launch actions that can be scheduled. Iterate over a snapshot
        # because check_taskmanager may pop an unavailable TaskManager.
        launched_actions_copy = copy.copy(self.launched_actions)
        for tm in launched_actions_copy:
            tm_info = self.dbi.get_still_info(tm)
            # Check if the TaskManager is still available, if not it will
            # pop it out
            if self.check_taskmanager(tm) is False:
                continue

            # I think this will work
            while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:
                # FIXME: Might still be having a small issue when a TM goes
                # offline and back on
                action_from_queue = self.pop_action_queue(tm, tx=False)
                if action_from_queue is False:
                    break
                # If we had a connection error stop trying until TM checks
                # back in
                if self.launch_action(action_from_queue) != "OK":
                    break

        self.clean_completed_actions(self.dbi)

        keyboard_input = self.user_input.get_user_input()
        if keyboard_input is not None and keyboard_input != '':
            handle_keyboard_input(self, keyboard_input)
        else:
            time.sleep(self.sleep_time)
        # We've run once now, all obs were assigned via roundrobin if they
        # were not previously
        self.initial_startup = False
    self.shutdown()