Example #1
0
    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
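            # Rebuilding active_tasks below has to be guarded: append_task
            # (called from the request-handler thread) takes the same semaphore.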
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    # This should probably be handled in a better way
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            # Jon: I think we can get rid of the watchdog as I'm already
            # throwing this at the db
            time.sleep(poll_interval)
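            # Watchdog: the block below only runs every 30th pass through this
            # loop, i.e. roughly every 150 s with the default 5 s poll_interval.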
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            cpu = child_proc.cpu_percent(interval=1.0)
                            mem_pct = child_proc.memory_percent()
                            # memory_percent() returns a percentage, so divide by 100
                            mem_mb = mem_pct / 100. * psutil.virtual_memory(
                            ).total / 1048576  # == 2**20, to convert to MB
                            # save in task object
                            mytask.max_mem = max(mytask.max_mem, mem_mb)
                            mytask.cpu_load_avg += cpu / 100.
                            mytask.n_cpu_load_polls += 1
                            # echo out to screen
                            logger.debug(
                                'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%,'
                                ' mem={mem:.1f}%, Naffinity={aff}'.format(
                                    obsnum=mytask.obs,
                                    task=mytask.task,
                                    pid=child_proc.pid,
                                    cpu=cpu,
                                    mem=mem_pct,
                                    aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return
Example #2
0
    def start(self, dbi, ActionClass=None, action_args=()):
        ###
        #  Start the scheduler: loop forever, checking for new obsids and new/removed taskmanagers, etc.
        #      This loop can be terminated by q + Enter and paused by p + Enter.
        ###
        self.user_input = InputThread()
        self.user_input.start()
        self.initial_startup = True  # The scheduler is just starting; on the first run, any new obs need this flag so they get assigned to the proper taskmanagers
        self.tm_cycle = cycle(self.stills)
        self.keep_running = True
        logger.info('Starting Scheduler')
        self.dbi = dbi
        last_checked_for_stills = time.time()

        while self.keep_running:

            if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
                self.find_all_taskmanagers()
                last_checked_for_stills = time.time()
                logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

            self.ext_command_hook()
            self.get_new_active_obs()

            self.update_action_queue(ActionClass, action_args)
            launched_actions_copy = copy.copy(self.launched_actions)
            # Launch actions that can be scheduled
            for tm in launched_actions_copy:
                tm_info = self.dbi.get_still_info(tm)
                if self.check_taskmanager(tm) is False:  # Check whether the TaskManager is still available; if not, it gets popped from tracking
                    continue

                while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                    # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                    action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on

                    if action_from_queue is not False:
                        if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                            break
                    else:
                        break

            self.clean_completed_actions(self.dbi)

            keyboard_input = self.user_input.get_user_input()
            if keyboard_input is not None and keyboard_input != '':
                handle_keyboard_input(self, keyboard_input)
            else:
                time.sleep(self.sleep_time)
            self.initial_startup = False  # We've run once now; all obs were assigned via round-robin if they were not previously
        self.shutdown()
Example #3
0
    def start(self, dbi, ActionClass=None, action_args=()):
        ###
        #  Start the scheduler: loop forever, checking for new obsids and new/removed taskmanagers, etc.
        #      This loop can be terminated by q + Enter and paused by p + Enter.
        ###
        self.user_input = InputThread()
        self.user_input.start()
        self.initial_startup = True  # The scheduler is just starting; on the first run, any new obs need this flag so they get assigned to the proper taskmanagers
        self.tm_cycle = cycle(self.stills)
        self.keep_running = True
        logger.info('Starting Scheduler')
        self.dbi = dbi
        last_checked_for_stills = time.time()
        while self.keep_running:

            num_of_open_obs = len(self.dbi.list_open_observations())
            if num_of_open_obs == 0 and self.sg.aws_enabled == "1":
                # shutdown aws nodes...
                logger.debug("Shutting down AWS nodes as we are out of data to process...")
                for still in self.launched_actions:
                    self.post_to_server(still, "HALT_NOW")
                self.shutdown()  # shutdown() exits the process, so log and halt the stills before calling it

            if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
                self.find_all_taskmanagers()
                last_checked_for_stills = time.time()
                logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

            self.ext_command_hook()
            self.get_new_active_obs()
            if self.check_if_stills_reporting_full() is not True:
                self.update_action_queue(ActionClass, action_args)
            launched_actions_copy = copy.copy(self.launched_actions)
            # Launch actions that can be scheduled
            for tm in launched_actions_copy:
                tm_info = self.dbi.get_still_info(tm)
                if self.check_taskmanager(tm) is False:  # Check whether the TaskManager is still available; if not, it gets popped from tracking
                    continue

                while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                    # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                    action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on

                    if action_from_queue is not False:
                        if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                            break
                    else:
                        break

            self.clean_completed_actions(self.dbi)

            keyboard_input = self.user_input.get_user_input()
            if keyboard_input is not None and keyboard_input != '':
                handle_keyboard_input(self, keyboard_input)
            else:
                time.sleep(self.sleep_time)
            self.initial_startup = False  # We've run once now; all obs were assigned via round-robin if they were not previously
        self.shutdown()
Example #4
0
    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(
                        mytask
                    )  # This should probably be handled in a better way
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            #  Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            logger.debug(
                                'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'
                                .format(
                                    obsnum=mytask.obs,
                                    task=mytask.task,
                                    pid=child_proc.pid,
                                    cpu=child_proc.cpu_percent(interval=1.0),
                                    mem=child_proc.memory_percent(),
                                    aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return
Example #5
0
    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)   # This should probably be handled in a better way
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            #  Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            logger.debug('Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'.format(
                                obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid, cpu=child_proc.cpu_percent(interval=1.0),
                                mem=child_proc.memory_percent(), aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return
Example #6
0
class Scheduler(ThreadingMixIn, HTTPServer):
    ###
    # A Scheduler reads a DataBaseInterface to determine what Actions can be
    # taken, and then schedules them on stills according to priority.
    ###
    def __init__(self, task_clients, workflow, sg):

        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()

        HTTPServer.__init__(self, (self.myhostname, 8080), MonitorHandler)  # Initialize as an HTTPServer so MonitorHandler can call back into this class via self.server.
        self.sg = sg  # Might as well have it around in case I find I need something from it...  It's just a little memory
        self.nstills = len(sg.hosts)  # preauto
        self.actions_per_still = sg.actions_per_still
        self.transfers_per_still = sg.transfers_per_still  # Jon : This isn't used...
        self.block_size = sg.block_size  # preauto
        self.timeout = sg.timeout
        self.sleep_time = sg.sleep_time

        self.lock_all_neighbors_to_same_still = workflow.lock_all_neighbors_to_same_still
        self.active_obs = []
        self.active_obs_dict = {}
        self.action_queue = []
        self.dbi = sg.dbi
        self.launched_actions = {}

        self.keep_running = False
        self.failcount = {}
        self.wf = workflow  # Jon: Moved the workflow class to be instantiated on object creation; should do the same for dbi probably
        self.task_clients = {}
        self.stills = []

        signal.signal(signal.SIGINT, self.signal_handler)  # Enable clean shutdown on a Ctrl-C event.

        logger.info("Starting monitoring interface.")
        threading.Thread(target=self.serve_forever).start()  # Launch the monitoring HTTP server in its own thread so information on currently running tasks can be viewed

        # If task_clients is set to AUTO then check the db for still servers
        if task_clients[0].host_port[0] == "AUTO":
            self.find_all_taskmanagers()
            self.auto = 1
        else:
            self.auto = 0
            self.task_clients = task_clients

    def signal_handler(self, signum, frame):
        logger.info("Caught Ctrl-C, Initiating clean shutdown.")
        self.shutdown()

    def find_all_taskmanagers(self):
        ###
        # find_all_taskmanagers : Check the database for all available stills with status OK
        #  Should also remove stills that have gone offline.
        ###
        logger.debug("looking for TaskManagers...")
        self.stills = self.dbi.get_available_stills()
        while len(self.stills) < 1:
            logger.debug("Can't find any TaskManagers! Waiting for 10sec and trying again")
            time.sleep(10)
            self.stills = self.dbi.get_available_stills()

        for still in self.stills:
            if still.hostname not in self.task_clients:
                logger.debug("Discovery of new TaskManager : %s" % still.hostname)
                self.task_clients[still.hostname] = TaskClient(self.dbi, still.hostname, self.wf, still.port, self.sg)
                self.launched_actions[still.hostname] = []
        return

    def ext_command_hook(self):
        return

    def check_taskmanager(self, tm):
        tm_info = self.dbi.get_still_info(tm)
        since = datetime.datetime.now() - datetime.timedelta(minutes=3)
        if tm_info.status != "OK" or tm_info.last_checkin < since:  # Status not OK or hasn't checked-in in over 3min.
            logger.info("Removing offline TaskManager : %s" % tm_info.hostname)
            self.launched_actions.pop(tm_info.hostname, None)
            self.task_clients.pop(tm_info.hostname, None)
            for obs in self.dbi.get_obs_assigned_to_still(tm_info.hostname):

                if obs.obsnum in self.active_obs_dict:
                    self.active_obs_dict.pop(obs.obsnum)
                    self.active_obs.remove(obs.obsnum)

            return False

        elif tm_info.cur_num_of_tasks >= tm_info.max_num_of_tasks:  # Check to ensure we are not at max # of tasks for the taskmanager
            return False

        return True

    def start(self, dbi, ActionClass=None, action_args=()):
        ###
        #  Start the scheduler: loop forever, checking for new obsids and new/removed taskmanagers, etc.
        #      This loop can be terminated by q + Enter and paused by p + Enter.
        ###
        self.user_input = InputThread()
        self.user_input.start()
        self.initial_startup = True  # The scheduler is just starting; on the first run, any new obs need this flag so they get assigned to the proper taskmanagers
        self.tm_cycle = cycle(self.stills)
        self.keep_running = True
        logger.info('Starting Scheduler')
        self.dbi = dbi
        last_checked_for_stills = time.time()

        while self.keep_running:
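            # Periodically re-scan the database for TaskManagers that have come
            # online or gone away since the last check.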

            if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
                self.find_all_taskmanagers()
                last_checked_for_stills = time.time()
                logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

            self.ext_command_hook()
            self.get_new_active_obs()

            self.update_action_queue(ActionClass, action_args)
            launched_actions_copy = copy.copy(self.launched_actions)
            # Launch actions that can be scheduled
            for tm in launched_actions_copy:
                tm_info = self.dbi.get_still_info(tm)
                if self.check_taskmanager(tm) is False:  # Check whether the TaskManager is still available; if not, it gets popped from tracking
                    continue

                while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                    # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                    action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on

                    if action_from_queue is not False:
                        if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                            break
                    else:
                        break

            self.clean_completed_actions(self.dbi)

            keyboard_input = self.user_input.get_user_input()
            if keyboard_input is not None and keyboard_input != '':
                handle_keyboard_input(self, keyboard_input)
            else:
                time.sleep(self.sleep_time)
            self.initial_startup = False  # We've run once now; all obs were assigned via round-robin if they were not previously
        self.shutdown()

    def shutdown(self):
        logger.info("Shutting down...")
        self.keep_running = False
        HTTPServer.shutdown(self)
        sys.exit(0)

    def get_all_neighbors(self, obsnum):
        ###
        # get_all_neighbors: Go down (and up) the rabbit hole and find ALL the neighbors of a particular obsid
        ###
        neighbor_obs_nums = []
        neighbor_obs_nums.append(obsnum)  # Go ahead and add the current obsid to the list

        low_obs, high_obs = self.dbi.get_neighbors(obsnum)
        while high_obs is not None:  # Traverse the list UP to find all neighbors above this one
            neighbor_obs_nums.append(high_obs)
            high_obs = self.dbi.get_neighbors(high_obs)[1]

        while low_obs is not None:  # Traverse the list DOWN to find all neighbors below this one
            neighbor_obs_nums.append(low_obs)
            low_obs = self.dbi.get_neighbors(low_obs)[0]
        return neighbor_obs_nums

    def pop_action_queue(self, still, tx=False):
        '''Return highest priority action for the given still.'''
        # Seems like we're going through all the actions to find the ones for the particular still..
        # Should think about optimizing at some point
        for i in xrange(len(self.action_queue)):
            a = self.action_queue[i]
            if a.still == still and a.is_transfer == tx:
                return self.action_queue.pop(i)
        return False

    def get_launched_actions(self, still, tx=False):
        return [a for a in self.launched_actions[still] if a.is_transfer == tx]

    def launch_action(self, a):
        '''Launch the specified Action and record its launch for tracking later.'''
        self.launched_actions[a.still].append(a)
        connection_status = a.launch()
        return connection_status

    def kill_action(self, a):
        logger.info('Scheduler.kill_action: called on (%s,%s)' % (a.task, a.obs))
        connection_status = a.run_remote_task(action_type="KILL_TASK")
        return connection_status

    def clean_completed_actions(self, dbi):
        '''Check launched actions for completion, timeout or fail'''

        for still in self.launched_actions:
            updated_actions = []
            for action in self.launched_actions[still]:
                obs = dbi.get_obs(action.obs)
                status = obs.status
                pid = dbi.get_obs_pid(action.obs)

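                # failcount is keyed on obs number + current status, so retries
                # are counted separately for each workflow step of each obs.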
                try:
                    self.failcount[str(action.obs) + status]
                except(KeyError):
                    self.failcount[str(action.obs) + status] = 0

                if status == action.task:
                    logger.info('Task %s for obs %s on still %s completed successfully.' % (action.task, action.obs, still))

                elif action.timed_out():
                    logger.info('Task %s for obs %s on still %s TIMED OUT.' % (action.task, action.obs, still))
                    if self.kill_action(action) != "OK":
                        break
                    self.failcount[str(action.obs) + status] += 1
                    # XXX make db entry for documentation

                elif pid == -9:
                    self.failcount[str(action.obs) + status] += 1
                    logger.info('Task %s for obs %s on still %s HAS DIED. failcount=%d' % (action.task, action.obs, still, self.failcount[str(action.obs) + status]))

                else:  # still active
                    updated_actions.append(action)

            self.launched_actions[still] = updated_actions

    def already_launched(self, action):
        '''Determine if this action has already been launched.  Enforces
        fact that only one valid action can be taken for a given obs
        at any one time.'''
        for a in self.launched_actions[action.still]:
            if a.obs == action.obs:
                return True
        return False

    def get_new_active_obs(self):
        '''Check for any new obs that may have appeared.  Actions for
        these obs may potentially take priority over ones currently
        active.'''

        observations = []
        observations += self.dbi.list_open_observations_on_tm()
        for tm_name in self.launched_actions:
            observations += self.dbi.list_open_observations_on_tm(tm_hostname=tm_name)

        for open_obs in observations:
            if open_obs not in self.active_obs_dict:
                self.active_obs_dict[open_obs] = len(self.active_obs)
                self.active_obs.append(open_obs)
        return

    def update_action_queue(self, ActionClass=None, action_args=()):
        '''Based on the current list of active obs (which you might want
        to update first), generate a prioritized list of actions that
        can be taken.'''

        actions = []
        for myobs in self.active_obs:
            myobs_info = self.dbi.get_obs(myobs)
            if myobs_info.current_stage_in_progress == "FAILED" or myobs_info.current_stage_in_progress == "KILLED" or myobs_info.status == "COMPLETE" or (myobs_info.stillhost not in self.task_clients and myobs_info.stillhost):
                self.active_obs_dict.pop(myobs_info.obsnum)
                self.active_obs.remove(myobs_info.obsnum)

                logger.debug("update_action_queue: Removing obsid : %s, task : %s, Status: %s, TM: %s" % (myobs_info.obsnum, myobs_info.current_stage_in_progress, myobs_info.status, myobs_info.stillhost))
            else:
                myaction = self.get_action(myobs, ActionClass=ActionClass, action_args=action_args)
                if (myaction is not None) and (self.already_launched(myaction) is not True):
                    if self.wf.prioritize_obs == 1:
                        myaction.set_priority(self.determine_priority(myaction))

                    actions.append(myaction)

        actions.sort(action_cmp, reverse=True)  # place most important actions first
        self.action_queue = actions  # completely throw out previous action list

        return

    def get_action(self, obsnum, ActionClass=None, action_args=()):
        '''Find the next actionable step for obsnum (one for which all
        prerequisites have been met).  Return None if no action is available.
        This function is allowed to return actions that have already been
        launched.
        ActionClass: a subclass of Action, for customizing actions.
            None defaults to the standard Action'''
        obsinfo = self.dbi.get_obs(obsnum)
        status = obsinfo.status

        if obsinfo.current_stage_in_progress == "FAILED" or obsinfo.current_stage_in_progress == "KILLED":
            return None

        if status == 'COMPLETE':
            # logger.debug("COMPLETE for obsid : %s" % obsnum)  # You can see how often completeds are checked by uncommenting this.. its a LOT
            return None  # obs is complete

        if status == '' or status == 'NEW':
            # Not yet ready for processing.
            return None

        # Check that the still assigned to the obs is currently in the list of active stills
        # !!!!FIXME!!!  - Maybe fixed?
        if obsinfo.stillhost is not None:
            if any(still for still in self.stills if still.hostname == obsinfo.stillhost):
                pass
            else:
                return None
        if self.wf.neighbors == 1:  # FIXME: I don't want to call the same thing twice.. it's ugly
            neighbors = self.dbi.get_neighbors(obsnum)

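            # A missing neighbor means this obs sits at an end of the observation
            # chain, so it follows the end-file workflow rather than the normal one.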
            if None in neighbors:
                cur_step_index = self.wf.workflow_actions_endfile.index(status)
                next_step = self.wf.workflow_actions_endfile[cur_step_index + 1]

            else:  # this is a normal file
                cur_step_index = self.wf.workflow_actions.index(status)
                next_step = self.wf.workflow_actions[cur_step_index + 1]

            neighbor_status = [self.dbi.get_obs_status(n) for n in neighbors if n is not None]
        else:
            cur_step_index = self.wf.workflow_actions.index(status)
            next_step = self.wf.workflow_actions[cur_step_index + 1]
            neighbor_status = 0

        still = self.dbi.get_obs_still_host(obsnum)

        if not still:
            if self.initial_startup is True:

                still = self.tm_cycle.next().hostname  # Balance out all the nodes on startup
            else:

                still = self.obs_to_still(obsnum)  # Get a still for a new obsid if one doesn't already exist.
                if still is False:
                    return None

            self.dbi.set_obs_still_host(obsnum, still)  # Assign the still to the obsid

            if self.lock_all_neighbors_to_same_still == 1 and self.wf.neighbors == 1:
                for neighbor in self.get_all_neighbors(obsnum):
                    self.dbi.set_obs_still_host(neighbor, still)

        if still != 0:  # If the obsnum is assigned to a server that doesn't exist at the moment we need to skip it, maybe reassign later
            if ActionClass is None:
                ActionClass = Action

            a = ActionClass(obsnum, next_step, neighbor_status, self.task_clients[still], self.wf, still, timeout=self.timeout)
            if self.wf.neighbors == 1:
                if a.has_prerequisites() is True:
                    return a
            else:
                return a
        # logging.debug('scheduler.get_action: (%s,%s) does not have prereqs' % (a.task, a.obs))
        return None

    def determine_priority(self, action):
        '''Assign a priority to an action based on its status and the time
        order of the obs to which this action is attached.'''

        pol, jdcnt = int(action.obs) / 2 ** 32, int(action.obs) % 2 ** 32  # XXX maybe not make this have to explicitly match dbi bits
        return jdcnt * 4 + pol  # prioritize first by time, then by pol
        # XXX might want to prioritize finishing an obs already started before
        # moving to the latest one (at least, up to a point) to avoid a build-up
        # of partial obs.  But if you prioritize obs already started too
        # excessively, then the queue could eventually fill with partially
        # completed tasks that are failing for some reason.
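        # Rough worked example (illustrative numbers only): with obs encoded as
        # pol * 2**32 + jdcnt, obs A with (pol=1, jdcnt=100) gets priority 401
        # and obs B with (pol=0, jdcnt=101) gets priority 404, so the later
        # time slot always wins regardless of pol.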

    def obs_to_still(self, obs):
        ##############
        #   Check if an obsid has a still already; if it does, simply return it.  If it does not, then let's find the lowest
        #   loaded (cpu) one and assign it.  If none are under 80% then let's just wait around, they're busy enough as is.
        ##############
        mystill = self.dbi.get_obs_still_host(obs)
        if mystill:
            if mystill in self.task_clients:
                return mystill
            else:  # We couldn't find its still server as its not in task_clients for whatever reason so punt for now
                logger.debug("Obs attached to non-existant STILL OBS : %s, STILL %s" % (obs, mystill))
                return 0
        else:
            still = self.dbi.get_most_available_still()
            if still is not False:
                return still
            else:
                return False
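
# A rough usage sketch (illustrative only, not the project's actual entry point).
# It assumes a settings/globals object `sg` with a dbi attached, a workflow
# object `wf`, and a TaskClient pointed at "AUTO" as handled in __init__ above;
# the port number below is just the TaskServer default, not a requirement.
#
#     clients = [TaskClient(sg.dbi, "AUTO", wf, 14204, sg)]
#     scheduler = Scheduler(clients, wf, sg)
#     scheduler.start(sg.dbi)   # blocks; quit with q + Enter or Ctrl-C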
Example #7
0
class TaskServer(HTTPServer):
    allow_reuse_address = True

    def __init__(self, dbi, sg, data_dir='.', port=14204, handler=TaskHandler, path_to_do_scripts=".", drmaa_shared='/shared'):
        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()
        self.httpd = HTTPServer.__init__(self, (self.myhostname, port), handler)  # Class us into HTTPServer so we can make calls from TaskHandler into this class via self.server.
        self.active_tasks_semaphore = threading.Semaphore()
        self.active_tasks = []
        self.dbi = dbi
        self.sg = sg
        self.data_dir = data_dir
        self.keep_running = False
        self.watchdog_count = 0
        self.port = port
        self.path_to_do_scripts = path_to_do_scripts
        self.logger = sg.logger
        self.drmaa_session = ''
        self.drmaa_shared = drmaa_shared
        self.shutting_down = False

        # signal.signal(signal.SIGINT, self.signal_handler)  # Enabled clean shutdown after Cntrl-C event.

    def append_task(self, t):
        self.active_tasks_semaphore.acquire()  # Jon : Not sure why we're doing this, we only have one primary thread
        self.active_tasks.append(t)
        self.active_tasks_semaphore.release()

    def poll_task_status(self, task):
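        # Returns None while the task is still running; any non-None value
        # tells finalize_tasks that the task has finished (done or failed).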
        if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?
            try:
                task_info = self.drmaa_session.jobStatus(task.jid)
            except:
                task_info = "failed"
                logger.debug("TS: poll_task_status : DRMAA jobstatus failed for jid : %s" % task.jid)
            if task_info == "done" or task_info == "failed":  # Check if task is done or failed..
                poll_status = True
            else:
                poll_status = None
            # retval attributes: jobId, hasExited, hasSignal, terminatedSignal, hasCoreDump, wasAborted, exitStatus, and resourceUsage
        else:
            try:
                poll_status = task.process.poll()  # race condition due to threading, might fix later, pretty rare
            except:
                poll_status = None
                time.sleep(2)

        return poll_status

    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)   # This should probably be handled in a better way
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            #  Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            logger.debug('Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'.format(
                                obsnum=mytask.obs, task=mytask.task, pid=child_proc.pid, cpu=child_proc.cpu_percent(interval=1.0),
                                mem=child_proc.memory_percent(), aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return

    def kill(self, pid):
        try:
            for task in self.active_tasks:
                if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?

                    if int(task.jid) == int(pid):
                        task.kill()
                        break
                else:
                    if int(task.process.pid) == int(pid):
                        task.kill()
                        break
        except:
            logger.exception("Problem killing off task: %s  w/  pid : %s" % (task, pid))

    def kill_all(self):
        for task in self.active_tasks:
            task.kill()

    def checkin_timer(self):
        #
        # Just a timer that updates this still's last_checkin time in the database every 10 seconds
        #
        while self.keep_running is True:
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OK", max_tasks=self.sg.actions_per_still, cur_tasks=len(self.active_tasks))
            time.sleep(10)
        return 0

    def start(self):
        psutil.cpu_percent()
        time.sleep(1)
        self.keep_running = True
        t = threading.Thread(target=self.finalize_tasks)
        t.daemon = True
        t.start()
        logger.info('Starting Task Server')
        logger.info("using code at: " + __file__)
        logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts)
        logger.info("Data_dir : %s" % self.data_dir)
        logger.info("Port : %s" % self.port)

        if self.sg.cluster_scheduler == 1:
            logger.info("Initilizing DRMAA interface to cluster scheduler")
            import drmaa
            self.drmaa_session = drmaa.Session()  # Start the interface session to DRMAA to control GridEngine
            self.drmaa_session.initialize()
        try:
            # Set up a thread that just updates the last checkin time for this still every 10 seconds
            timer_thread = threading.Thread(target=self.checkin_timer)
            timer_thread.daemon = True  # Make it a daemon so that when ctrl-c happens this thread goes away
            timer_thread.start()  # Start heartbeat
            self.serve_forever()  # Start the listening server
        finally:
            self.shutdown()
        return

    def shutdown(self):
        if self.shutting_down is False:  # check whether we're already shutting down so multiple threads don't step on each other attempting this.
            self.shutting_down = True
            logger.debug("Shutting down task_server")
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname, ip_addr, self.port, int(cpu_usage), self.data_dir, status="OFFLINE")
            self.keep_running = False
            parentproc = psutil.Process()
            myprocs = parentproc.children(recursive=True)
            for proc in myprocs:
                logger.debug("Killing nicely -> Pid: %s - Proc: %s" % (proc.pid, proc.name))
                proc.terminate()
            gone, alive = psutil.wait_procs(myprocs, timeout=3)
            for proc in alive:
                logger.debug("Killing with gusto -> Pid: %s - Proc: %s" % (proc.pid, proc.name))
                proc.kill()
            HTTPServer.shutdown(self)
            if self.sg.cluster_scheduler == 1:
                self.drmaa_session.exit()  # Terminate the DRMAA session

            sys.exit(0)
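
# A rough usage sketch (illustrative only): the dbi and sg objects come from the
# surrounding package's configuration and are assumed here rather than shown,
# and the paths below are made up for the example.
#
#     ts = TaskServer(dbi, sg, data_dir='/data', port=14204,
#                     path_to_do_scripts='/opt/do_scripts')
#     ts.start()   # blocks in serve_forever(); Ctrl-C or shutdown() to stop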
Example #8
0
class TaskServer(HTTPServer):
    allow_reuse_address = True

    def __init__(self,
                 dbi,
                 sg,
                 data_dir='.',
                 port=14204,
                 handler=TaskHandler,
                 path_to_do_scripts=".",
                 drmaa_shared='/shared',
                 workflow=None):
        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()
        # Class us into HTTPServer so we can make calls from TaskHandler into
        # this class via self.server.
        self.httpd = HTTPServer.__init__(self, (self.myhostname, port),
                                         handler)
        self.active_tasks_semaphore = threading.Semaphore()
        self.active_tasks = []
        self.dbi = dbi
        self.sg = sg
        self.data_dir = data_dir
        self.keep_running = False
        self.watchdog_count = 0
        self.port = port
        self.path_to_do_scripts = path_to_do_scripts
        self.logger = sg.logger
        self.drmaa_session = ''
        self.drmaa_shared = drmaa_shared
        self.shutting_down = False
        self.wf = workflow

        # signal.signal(signal.SIGINT, self.signal_handler)  # Enabled clean
        # shutdown after Cntrl-C event.

    def append_task(self, t):
        # Jon : Not sure why we're doing this, we only have one primary thread
        self.active_tasks_semaphore.acquire()
        self.active_tasks.append(t)
        self.active_tasks_semaphore.release()

    def poll_task_status(self, task):
        # Do we need to interface with a cluster scheduler?
        if self.sg.cluster_scheduler == 1:
            try:
                task_info = self.drmaa_session.jobStatus(task.jid)
            except:
                task_info = "failed"
                logger.debug(
                    "TS: poll_task_status : DRMAA jobstatus failed for jid : %s"
                    % task.jid)
            # Check if task is done or failed..
            if task_info == "done" or task_info == "failed":
                poll_status = True
            else:
                poll_status = None
            # retval attributes: jobId, hasExited, hasSignal, terminatedSignal,
            #                    hasCoreDump, wasAborted, exitStatus, and resourceUsage
        else:
            try:
                # race condition due to threading, might fix later, pretty rare
                poll_status = task.process.poll()
            except:
                poll_status = None
                time.sleep(2)

        return poll_status

    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    # This should probably be handled in a better way
                    new_active_tasks.append(mytask)
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            # Jon: I think we can get rid of the watchdog as I'm already
            # throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            cpu = child_proc.cpu_percent(interval=1.0)
                            mem_pct = child_proc.memory_percent()
                            # memory_percent() returns a percentage, so divide by 100
                            mem_mb = mem_pct / 100. * psutil.virtual_memory(
                            ).total / 1048576  # == 2**20, to convert to MB
                            # save in task object
                            mytask.max_mem = max(mytask.max_mem, mem_mb)
                            mytask.cpu_load_avg += cpu / 100.
                            mytask.n_cpu_load_polls += 1
                            # echo out to screen
                            logger.debug(
                                'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%,'
                                ' mem={mem:.1f}%, Naffinity={aff}'.format(
                                    obsnum=mytask.obs,
                                    task=mytask.task,
                                    pid=child_proc.pid,
                                    cpu=cpu,
                                    mem=mem_pct,
                                    aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return

    def kill(self, pid):
        try:
            for task in self.active_tasks:
                if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?

                    if int(task.jid) == int(pid):
                        task.kill()
                        break
                else:
                    if int(task.process.pid) == int(pid):
                        task.kill()
                        break
        except:
            logger.exception("Problem killing off task: %s  w/  pid : %s" %
                             (task, pid))

    def kill_all(self):
        for task in self.active_tasks:
            task.kill()

    def checkin_timer(self):
        #
        # Just a timer that updates this still's last_checkin time in the database every 10 seconds
        #
        while self.keep_running is True:
            hostname = platform.node()
            ip_addr = self.get_ip_address()
            cpu_usage = os.getloadavg()[1]  # using the 5 min load avg
            ntasks = len(self.active_tasks)
            self.dbi.still_checkin(hostname,
                                   ip_addr,
                                   self.port,
                                   int(cpu_usage),
                                   self.data_dir,
                                   status="OK",
                                   max_tasks=self.sg.actions_per_still,
                                   cur_tasks=ntasks)

            time.sleep(10)
        return 0

    def mc_checkin_thread(self):
        import mc_utils

        while self.keep_running is True:
            hostname = platform.node()
            ip_addr = self.get_ip_address()
            cpu_usage = os.getloadavg()[1]  # using the 5 min load avg
            ntasks = len(self.active_tasks)

            # get general info
            ncpu = psutil.cpu_count()
            s = self.dbi.Session()
            still = s.query(Still).filter(Still.hostname == hostname).one()
            status = still.status
            s.close()

            # get memory usage
            vmem = psutil.virtual_memory()
            vmem_tot = vmem.total / 1024 / 1024 / 1024
            vmem_pct = vmem.percent

            # get disk usage
            du = psutil.disk_usage('/')
            du_tot = du.total / 1024 / 1024 / 1024
            du_pct = du.percent

            # get time since boot in hr
            now = time.time()
            boot_time = psutil.boot_time()
            dt_boot = now - boot_time
            dt_boot_days = dt_boot / 60 / 60 / 24

            # call functions for interacting with M&C
            mc_utils.add_mc_server_status(hostname, ip_addr, ncpu, cpu_usage,
                                          dt_boot_days, vmem_pct, vmem_tot,
                                          du_pct, du_tot)

            # sleep
            time.sleep(TIME_INT_FOR_NEW_MC_CHECK)

        return 0

    def start(self):
        psutil.cpu_percent()
        time.sleep(1)
        self.keep_running = True
        t = threading.Thread(target=self.finalize_tasks)
        t.daemon = True
        t.start()
        logger.info('Starting Task Server')
        logger.info("using code at: " + __file__)
        logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts)
        logger.info("Data_dir : %s" % self.data_dir)
        logger.info("Port : %s" % self.port)

        if self.sg.cluster_scheduler == 1:
            logger.info("Initilizing DRMAA interface to cluster scheduler")
            import drmaa
            # Start the interface session to DRMAA to control GridEngine
            self.drmaa_session = drmaa.Session()
            self.drmaa_session.initialize()
        try:
            # Setup a thread that just updates the last checkin time for this
            # still every 10s
            timer_thread = threading.Thread(target=self.checkin_timer)
            # Make it a daemon so that when ctrl-c happens this thread goes
            # away
            timer_thread.daemon = True
            timer_thread.start()  # Start heartbeat

            if self.wf.log_to_mc:
                # Also set up an M&C status thread
                mc_thread = threading.Thread(target=self.mc_checkin_thread)
                # Make it a daemon so that when ctrl-c happens this thread
                # goes away
                mc_thread.daemon = True
                mc_thread.start()

            # serve_forever() blocks until shutdown, so it has to come after
            # the helper threads are started
            self.serve_forever()  # Start the listening server
        finally:
            self.shutdown()
        return

    def shutdown(self):
        if self.shutting_down is False:
            # check whether we're already shutting down so multiple threads
            #    don't step on each other attempting this
            self.shutting_down = True
            logger.debug("Shutting down task_server")
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname,
                                   ip_addr,
                                   self.port,
                                   int(cpu_usage),
                                   self.data_dir,
                                   status="OFFLINE")
            self.keep_running = False
            parentproc = psutil.Process()
            myprocs = parentproc.children(recursive=True)
            for proc in myprocs:
                logger.debug("Killing nicely -> Pid: %s - Proc: %s" %
                             (proc.pid, proc.name))
                proc.terminate()
            gone, alive = psutil.wait_procs(myprocs, timeout=3)
            for proc in alive:
                logger.debug("Killing with gusto -> Pid: %s - Proc: %s" %
                             (proc.pid, proc.name))
                proc.kill()
            HTTPServer.shutdown(self)
            if self.sg.cluster_scheduler == 1:
                self.drmaa_session.exit()  # Terminate the DRMAA session

            sys.exit(0)

    def get_ip_address(self):
        """Return an IP address for this machine as a string.

        https://stackoverflow.com/questions/24196932/how-can-i-get-the-ip-address-of-eth0-in-python

        This is actually not well defined -- machines have multiple interfaces,
        each with its own IP address. We use eth0 if it exists, otherwise the
        first one that isn't `lo`.

        Copied from hera_mc/scripts/mc_server_status_daemon.py
        """
        try:
            addrs = netifaces.ifaddresses('eth0')
        except ValueError:
            for ifname in sorted(netifaces.interfaces()):
                if ifname != 'lo':
                    addrs = netifaces.ifaddresses(ifname)
                    break
            else:  # triggered if we never did the 'break'
                return '?.?.?.?'

        return addrs[netifaces.AF_INET][0]['addr']
Example #9
0
class Scheduler(ThreadingMixIn, HTTPServer):
    ###
    # A Scheduler reads a DataBaseInterface to determine what Actions can be
    # taken, and then schedules them on stills according to priority.
    ###
    def __init__(self, task_clients, workflow, sg):

        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()

        HTTPServer.__init__(self, (self.myhostname, 8080), MonitorHandler)  # Initialize as an HTTPServer so MonitorHandler can call back into this class via self.server.
        self.sg = sg  # Might as well have it around in case I find I need something from it...  It's just a little memory
        self.nstills = len(sg.hosts)  # preauto
        self.actions_per_still = sg.actions_per_still
        self.transfers_per_still = sg.transfers_per_still  # Jon : This isn't used...
        self.block_size = sg.block_size  # preauto
        self.timeout = sg.timeout
        self.sleep_time = sg.sleep_time

        self.lock_all_neighbors_to_same_still = workflow.lock_all_neighbors_to_same_still
        self.active_obs = []
        self.active_obs_dict = {}
        self.action_queue = []
        self.dbi = sg.dbi
        self.launched_actions = {}

        self.keep_running = False
        self.failcount = {}
        self.wf = workflow  # Jon: Moved the workflow class to be instantiated on object creation; should do the same for dbi probably
        self.task_clients = {}
        self.stills = []

        signal.signal(signal.SIGINT, self.signal_handler)  # Enable clean shutdown on a Ctrl-C event.

        logger.info("Starting monitoring interface.")
        threading.Thread(target=self.serve_forever).start()  # Launch the monitoring HTTP server in its own thread so information on currently running tasks can be viewed

        # If task_clients is set to AUTO then check the db for still servers
        if task_clients[0].host_port[0] == "AUTO":
            self.find_all_taskmanagers()
            self.auto = 1
        else:
            self.auto = 0
            self.task_clients = task_clients

    def signal_handler(self, signum, frame):
        logger.info("Caught Ctrl-C, Initiating clean shutdown.")
        self.shutdown()

    def find_all_taskmanagers(self):
        ###
        # find_all_taskmanagers : Check the database for all available stills with status OK
        #  Should also remove stills that have gone offline.
        ###
        logger.debug("looking for TaskManagers...")
        self.stills = self.dbi.get_available_stills()
        while len(self.stills) < 1:
            logger.debug("Can't find any TaskManagers! Waiting for 10sec and trying again")
            time.sleep(10)
            self.stills = self.dbi.get_available_stills()

        for still in self.stills:
            if still.hostname not in self.task_clients:
                logger.debug("Discovery of new TaskManager : %s" % still.hostname)
                self.task_clients[still.hostname] = TaskClient(self.dbi, still.hostname, self.wf, still.port, self.sg)
                self.launched_actions[still.hostname] = []
        return

    def ext_command_hook(self):
        return

    def check_taskmanager(self, tm):
        tm_info = self.dbi.get_still_info(tm)
        since = datetime.datetime.now() - datetime.timedelta(minutes=3)
        if tm_info.status != "OK" or tm_info.last_checkin < since:  # Status not OK or hasn't checked-in in over 3min.
            logger.info("Removing offline TaskManager : %s" % tm_info.hostname)
            self.launched_actions.pop(tm_info.hostname, None)
            self.task_clients.pop(tm_info.hostname, None)
            for obs in self.dbi.get_obs_assigned_to_still(tm_info.hostname):

                if obs.obsnum in self.active_obs_dict:
                    self.active_obs_dict.pop(obs.obsnum)
                    self.active_obs.remove(obs.obsnum)

            return False

        elif tm_info.cur_num_of_tasks >= tm_info.max_num_of_tasks:  # Check to ensure we are not at max # of tasks for the taskmanager
            return False

        return True

    def start(self, dbi, ActionClass=None, action_args=()):
        ###
        #  Start the scheduler: loop forever, checking for new obsids and new/removed taskmanagers, etc.
        #      This loop can be terminated by q + Enter and paused by p + Enter.
        ###
        self.user_input = InputThread()
        self.user_input.start()
        self.initial_startup = True  # The scheduler is just starting; on the first run, any new obs need this flag so they get assigned to the proper taskmanagers
        self.tm_cycle = cycle(self.stills)
        self.keep_running = True
        logger.info('Starting Scheduler')
        self.dbi = dbi
        last_checked_for_stills = time.time()

        while self.keep_running:

            if (time.time() - last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
                self.find_all_taskmanagers()
                last_checked_for_stills = time.time()
                logger.debug("Number of TaskManagers : %s" % len(self.task_clients))

            self.ext_command_hook()
            self.get_new_active_obs()

            self.update_action_queue(ActionClass, action_args)
            launched_actions_copy = copy.copy(self.launched_actions)
            # Launch actions that can be scheduled
            for tm in launched_actions_copy:
                tm_info = self.dbi.get_still_info(tm)
                if self.check_taskmanager(tm) is False:  # Check whether the TaskManager is still available; if not, it gets popped from tracking
                    continue

                while len(self.get_launched_actions(tm, tx=False)) < tm_info.max_num_of_tasks:  # I think this will work
                    # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                    action_from_queue = self.pop_action_queue(tm, tx=False)  # FIXME: Might still be having a small issue when a TM goes offline and back on

                    if action_from_queue is not False:
                        if self.launch_action(action_from_queue) != "OK":  # If we had a connection error stop trying until TM checks back in
                            break
                    else:
                        break

            self.clean_completed_actions(self.dbi)

            keyboard_input = self.user_input.get_user_input()
            if keyboard_input is not None and keyboard_input != '':
                handle_keyboard_input(self, keyboard_input)
            else:
                time.sleep(self.sleep_time)
            self.initial_startup = False  # We've run once now; all obs were assigned via round-robin if they were not previously
        self.shutdown()

    def shutdown(self):
        logger.info("Shutting down...")
        self.keep_running = False
        HTTPServer.shutdown(self)
        sys.exit(0)

    def get_all_neighbors(self, obsnum):
        ###
        # get_all_neighbors: Go down (and up) the rabbit hole and find ALL the neighbors of a particular obsid
        ###
        neighbor_obs_nums = []
        neighbor_obs_nums.append(obsnum)  # Go ahead and add the current obsid to the list

        low_obs, high_obs = self.dbi.get_neighbors(obsnum)
        while high_obs is not None:  # Traverse the list UP to find all neighbors above this one
            neighbor_obs_nums.append(high_obs)
            high_obs = self.dbi.get_neighbors(high_obs)[1]

        while low_obs is not None:  # Traverse the list DOWN to find all neighbors below this one
            neighbor_obs_nums.append(low_obs)
            low_obs = self.dbi.get_neighbors(low_obs)[0]
        return neighbor_obs_nums

    def pop_action_queue(self, still, tx=False):
        '''Return highest priority action for the given still.'''
        # Seems like we're going through all the actions to find the ones for the particular still..
        # Should think about optimizing at some point
        for i in xrange(len(self.action_queue)):
            a = self.action_queue[i]
            if a.still == still and a.is_transfer == tx:
                return self.action_queue.pop(i)
        return False

    def get_launched_actions(self, still, tx=False):
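        '''Return the actions already launched on the given still, filtered by whether they are transfer actions.'''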
        return [a for a in self.launched_actions[still] if a.is_transfer == tx]

    def launch_action(self, a):
        '''Launch the specified Action and record its launch for tracking later.'''
        self.launched_actions[a.still].append(a)
        connection_status = a.launch()
        return connection_status

    def kill_action(self, a):
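        '''Ask the remote TaskManager to kill the task associated with this action.'''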
        logger.info('Scheduler.kill_action: called on (%s,%s)' % (a.task, a.obs))
        connection_status = a.run_remote_task(action_type="KILL_TASK")
        return connection_status

    def clean_completed_actions(self, dbi):
        '''Check launched actions for completion, timeout or fail'''

        for still in self.launched_actions:
            updated_actions = []
            for action in self.launched_actions[still]:
                obs = dbi.get_obs(action.obs)
                status = obs.status
                pid = dbi.get_obs_pid(action.obs)

                self.failcount.setdefault(str(action.obs) + status, 0)

                if status == action.task:
                    logger.info('Task %s for obs %s on still %s completed successfully.' % (action.task, action.obs, still))

                elif action.timed_out():
                    logger.info('Task %s for obs %s on still %s TIMED OUT.' % (action.task, action.obs, still))
                    if self.kill_action(action) != "OK":
                        break
                    self.failcount[str(action.obs) + status] += 1
                    # XXX make db entry for documentation

                elif pid == -9:
                    self.failcount[str(action.obs) + status] += 1
                    logger.info('Task %s for obs %s on still %s HAS DIED. failcount=%d' % (action.task, action.obs, still, self.failcount[str(action.obs) + status]))

                else:  # still active
                    updated_actions.append(action)

            self.launched_actions[still] = updated_actions

    def already_launched(self, action):
        '''Determine if this action has already been launched.  Enforces
        fact that only one valid action can be taken for a given obs
        at any one time.'''
        for a in self.launched_actions[action.still]:
            if a.obs == action.obs:
                return True
        return False

    def get_new_active_obs(self):
        '''Check for any new obs that may have appeared.  Actions for
        these obs may potentially take priority over ones currently
        active.'''

        observations = []
        observations += self.dbi.list_open_observations_on_tm()
        for tm_name in self.launched_actions:
            observations += self.dbi.list_open_observations_on_tm(tm_hostname=tm_name)

        for open_obs in observations:
            if open_obs not in self.active_obs_dict:
                self.active_obs_dict[open_obs] = len(self.active_obs)
                self.active_obs.append(open_obs)
        return

    def update_action_queue(self, ActionClass=None, action_args=()):
        '''Based on the current list of active obs (which you might want
        to update first), generate a prioritized list of actions that
        can be taken.'''

        actions = []
        for myobs in self.active_obs:
            myobs_info = self.dbi.get_obs(myobs)
            if myobs_info.current_stage_in_progress == "FAILED" or myobs_info.current_stage_in_progress == "KILLED" or myobs_info.status == "COMPLETE" or (myobs_info.stillhost not in self.task_clients and myobs_info.stillhost):
                self.active_obs_dict.pop(myobs_info.obsnum)
                self.active_obs.remove(myobs_info.obsnum)

                logger.debug("update_action_queue: Removing obsid : %s, task : %s, Status: %s, TM: %s" % (myobs_info.obsnum, myobs_info.current_stage_in_progress, myobs_info.status, myobs_info.stillhost))
            else:
                myaction = self.get_action(myobs, ActionClass=ActionClass, action_args=action_args)
                if (myaction is not None) and (self.already_launched(myaction) is not True):
                    if self.wf.prioritize_obs == 1:
                        myaction.set_priority(self.determine_priority(myaction))

                    actions.append(myaction)

        actions.sort(action_cmp, reverse=True)  # place most important actions first
        self.action_queue = actions  # completely throw out previous action list

        return

    def get_action(self, obsnum, ActionClass=None, action_args=()):
        '''Find the next actionable step for the given obsnum (one for which all
        prerequisites have been met).  Return None if no action is available.
        This function is allowed to return actions that have already been
        launched.
        ActionClass: a subclass of Action, for customizing actions.
            None defaults to the standard Action'''
        obsinfo = self.dbi.get_obs(obsnum)
        status = obsinfo.status

        if obsinfo.current_stage_in_progress == "FAILED" or obsinfo.current_stage_in_progress == "KILLED":
            return None

        if status == 'COMPLETE':
            # logger.debug("COMPLETE for obsid : %s" % obsnum)  # You can see how often completeds are checked by uncommenting this.. its a LOT
            return None  # obs is complete

        if status == '' or status == 'NEW':
            # Not yet ready for processing.
            return None

        # Check that the still assigned to the obs is currently in the list of active stills
        # !!!!FIXME!!!  - Maybe fixed?
        if obsinfo.stillhost is not None:
            if any(still for still in self.stills if still.hostname == obsinfo.stillhost):
                pass
            else:
                return None
        if self.wf.neighbors == 1:  # FIX ME, I don't want to call the same thing twice.. it's ugly
            neighbors = self.dbi.get_neighbors(obsnum)

            if None in neighbors:
                cur_step_index = self.wf.workflow_actions_endfile.index(status)
                next_step = self.wf.workflow_actions_endfile[cur_step_index + 1]

            else:  # this is a normal file
                cur_step_index = self.wf.workflow_actions.index(status)
                next_step = self.wf.workflow_actions[cur_step_index + 1]

            neighbor_status = [self.dbi.get_obs_status(n) for n in neighbors if n is not None]
        else:
            cur_step_index = self.wf.workflow_actions.index(status)
            next_step = self.wf.workflow_actions[cur_step_index + 1]
            neighbor_status = 0

        still = self.dbi.get_obs_still_host(obsnum)

        if not still:
            if self.initial_startup is True:
                still = self.tm_cycle.next().hostname  # Balance out all the nodes on startup
            else:
                still = self.obs_to_still(obsnum)  # Get a still for a new obsid if one doesn't already exist.
                if still is False:
                    return None

            self.dbi.set_obs_still_host(obsnum, still)  # Assign the still to the obsid

            if self.lock_all_neighbors_to_same_still == 1 and self.wf.neighbors == 1:
                for neighbor in self.get_all_neighbors(obsnum):
                    self.dbi.set_obs_still_host(neighbor, still)

        if still != 0:  # If the obsnum is assigned to a server that doesn't exist at the moment we need to skip it, maybe reassign later
            if ActionClass is None:
                ActionClass = Action

            a = ActionClass(obsnum, next_step, neighbor_status, self.task_clients[still], self.wf, still, timeout=self.timeout)
            if self.wf.neighbors == 1:
                if a.has_prerequisites() is True:
                    return a
            else:
                return a
        # logging.debug('scheduler.get_action: (%s,%s) does not have prereqs' % (a.task, a.obs))
        return None

    def determine_priority(self, action):
        '''Assign a priority to an action based on its status and the time
        order of the obs to which this action is attached.'''
        jdcnt = (float(file2jd(action.obs)) - 2415020) / 0.005  # get the JD and convert to an integer step (0.005 days is the length of a typical PAPER obs)
        pol = str2pol[file2pol(action.obs)]
        #pol, jdcnt = int(action.obs) / 2 ** 32, int(action.obs) % 2 ** 32  # XXX maybe not make this have to explicitly match dbi bits
        return jdcnt * 4 + pol  # prioritize first by time, then by pol
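        # Worked example (hypothetical numbers): jdcnt = 500000 and pol = 2
        # gives priority 500000 * 4 + 2 = 2000002, so obs are ordered first by
        # time step, with ties broken by polarization.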
        # XXX might want to prioritize finishing an obs already started before
        # moving to the latest one (at least, up to a point) to avoid a
        # build-up of partial obs.  But if you prioritize obs already
        # started too heavily, then the queue could eventually fill with
        # partially completed tasks that are failing for some reason.

    def obs_to_still(self, obs):
        ##############
        #   Check if an obsid has a still already; if it does, simply return it.  If it does not, find the lowest
        #   loaded (cpu) one and assign it.  If none are under 80% then just wait around, they're busy enough as is.
        ##############
        mystill = self.dbi.get_obs_still_host(obs)
        if mystill:
            if mystill in self.task_clients:
                return mystill
            else:  # We couldn't find its still server as its not in task_clients for whatever reason so punt for now
                logger.debug("Obs attached to non-existant STILL OBS : %s, STILL %s" % (obs, mystill))
                return 0
        else:
            still = self.dbi.get_most_available_still()
            if still is not False:
                return still
            else:
                return False
Example #10
0
    def start(self, dbi, ActionClass=None, action_args=()):
        ###
        #  Start scheduler, loop forever and handle checking for new obsids, new/removed taskmanagers, etc.
        #      This loop can be terminated by q + Enter, and paused by p + Enter
        ###
        self.user_input = InputThread()
        self.user_input.start()
        # The scheduler is just starting; on the first run any new obs
        # need to be assigned to the proper taskmanagers
        self.initial_startup = True
        self.tm_cycle = cycle(self.stills)
        self.keep_running = True
        logger.info('Starting Scheduler')
        self.dbi = dbi
        last_checked_for_stills = time.time()
        last_checked_for_mc = time.time()

        while self.keep_running:
            if (time.time() -
                    last_checked_for_stills) > TIME_INT_FOR_NEW_TM_CHECK:
                self.find_all_taskmanagers()
                last_checked_for_stills = time.time()
                logger.debug("Number of TaskManagers : %s" %
                             len(self.task_clients))

            if self.wf.log_to_mc:
                import mc_utils
                now = time.time()
                dt_check = now - last_checked_for_mc
                if dt_check > TIME_INT_FOR_NEW_MC_CHECK:
                    # get total number of running tasks
                    ntasks = 0
                    launched_actions_copy = copy.deepcopy(
                        self.launched_actions)
                    for tm in launched_actions_copy:
                        ntasks += len(self.get_launched_actions(tm, tx=False))

                    # get time since check-in in minutes
                    dt_check_min = dt_check / 60

                    # get time since boot in hours
                    boot_time = psutil.boot_time()
                    dt_boot = now - boot_time
                    dt_boot_hr = dt_boot / 60 / 60

                    # log to M&C
                    status = "OK"
                    logger.debug(
                        "Logging to M&C : {0} status, {1:5.2f} min since last check; {2}"
                        " tasks running; {3:10.3f} hr since boot".format(
                            status, dt_check_min, str(ntasks), dt_boot_hr))
                    mc_utils.add_mc_rtp_status(status, dt_check_min, ntasks,
                                               dt_boot_hr)
                    last_checked_for_mc = now

            self.ext_command_hook()
            self.get_new_active_obs()

            self.update_action_queue(ActionClass, action_args)
            launched_actions_copy = copy.copy(self.launched_actions)
            # Launch actions that can be scheduled
            for tm in launched_actions_copy:
                tm_info = self.dbi.get_still_info(tm)
                # Check if the TaskManager is still available, if not it will
                # pop it out
                if self.check_taskmanager(tm) is False:
                    continue

                # I think this will work
                while len(self.get_launched_actions(
                        tm, tx=False)) < tm_info.max_num_of_tasks:
                    # logger.debug("Number of launched actions : %s" % len(self.get_launched_actions(tm, tx=False)))
                    # FIXME: Might still be having a small issue when a TM goes
                    # offline and back on
                    action_from_queue = self.pop_action_queue(tm, tx=False)

                    if action_from_queue is not False:
                        # If we had a connection error stop trying until TM
                        # checks back in
                        if self.launch_action(action_from_queue) != "OK":
                            break
                    else:
                        break

            self.clean_completed_actions(self.dbi)

            keyboard_input = self.user_input.get_user_input()
            if keyboard_input is not None and keyboard_input != '':
                handle_keyboard_input(self, keyboard_input)
            else:
                time.sleep(self.sleep_time)
            # We've run once now, all obs were assigned via roundrobin if they
            # were not previously
            self.initial_startup = False
        self.shutdown()
Example #11
0
class TaskServer(HTTPServer):
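    # HTTP server run on each TaskManager/still node: it accepts requests from
    # the Scheduler via TaskHandler, launches tasks locally or through a DRMAA
    # cluster scheduler, and tracks the resulting processes until they finish.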
    allow_reuse_address = True

    def __init__(self,
                 dbi,
                 sg,
                 data_dir='.',
                 port=14204,
                 handler=TaskHandler,
                 path_to_do_scripts=".",
                 drmaa_shared='/shared'):
        global logger
        logger = sg.logger
        self.myhostname = socket.gethostname()
        # Initialize the HTTPServer base class so TaskHandler can make calls back into this class via self.server.
        self.httpd = HTTPServer.__init__(self, (self.myhostname, port), handler)
        self.active_tasks_semaphore = threading.Semaphore()
        self.active_tasks = []
        self.dbi = dbi
        self.sg = sg
        self.data_dir = data_dir
        self.keep_running = False
        self.watchdog_count = 0
        self.port = port
        self.path_to_do_scripts = path_to_do_scripts
        self.logger = sg.logger
        self.drmaa_session = ''
        self.drmaa_shared = drmaa_shared
        self.shutting_down = False

        # signal.signal(signal.SIGINT, self.signal_handler)  # Enabled clean shutdown after Cntrl-C event.

    def append_task(self, t):
        self.active_tasks_semaphore.acquire()  # Jon : Not sure why we're doing this, we only have one primary thread
        self.active_tasks.append(t)
        self.active_tasks_semaphore.release()

    def poll_task_status(self, task):
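        # Returns None while the task is still running; any other value means
        # the task has finished (or failed) and can be finalized.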
        if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?
            try:
                task_info = self.drmaa_session.jobStatus(task.jid)
            except:
                task_info = "failed"
                logger.debug(
                    "TS: poll_task_status : DRMAA jobstatus failed for jid : %s"
                    % task.jid)
            if task_info == "done" or task_info == "failed":  # Check if task is done or failed..
                poll_status = True
            else:
                poll_status = None
            # attributes: retval. :  jobId, hasExited, hasSignal, terminatedSignal, hasCoreDump, wasAborted, exitStatus, and resourceUsage
        else:
            try:
                poll_status = task.process.poll()  # race condition due to threading, might fix later, pretty rare
            except:
                poll_status = None
                time.sleep(2)

        return poll_status

    def finalize_tasks(self, poll_interval=5.):
        self.user_input = InputThread()
        self.user_input.start()

        while self.keep_running:
            self.active_tasks_semaphore.acquire()
            new_active_tasks = []
            for mytask in self.active_tasks:
                if self.poll_task_status(mytask) is None:
                    new_active_tasks.append(mytask)  # This should probably be handled in a better way
                else:
                    mytask.finalize()
            self.active_tasks = new_active_tasks
            self.active_tasks_semaphore.release()

            #  Jon: I think we can get rid of the watchdog as I'm already throwing this at the db
            time.sleep(poll_interval)
            if self.watchdog_count == 30:
                logger.debug('TaskServer is alive')
                for mytask in self.active_tasks:
                    try:
                        child_proc = mytask.process.children()[0]
                        if psutil.pid_exists(child_proc.pid):
                            logger.debug(
                                'Proc info on {obsnum}:{task}:{pid} - cpu={cpu:.1f}%, mem={mem:.1f}%, Naffinity={aff}'
                                .format(
                                    obsnum=mytask.obs,
                                    task=mytask.task,
                                    pid=child_proc.pid,
                                    cpu=child_proc.cpu_percent(interval=1.0),
                                    mem=child_proc.memory_percent(),
                                    aff=len(child_proc.cpu_affinity())))
                    except:
                        pass
                self.watchdog_count = 0
            else:
                self.watchdog_count += 1

            self.keyboard_input = self.user_input.get_user_input()
            if self.keyboard_input is not None:
                handle_keyboard_input(self, self.keyboard_input)
        return

    def kill(self, pid):
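        # Kill the single active task whose DRMAA job id (cluster mode) or
        # process pid matches the given pid.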
        try:
            for task in self.active_tasks:
                if self.sg.cluster_scheduler == 1:  # Do we need to interface with a cluster scheduler?

                    if int(task.jid) == int(pid):
                        task.kill()
                        break
                else:
                    if int(task.process.pid) == int(pid):
                        task.kill()
                        break
        except:
            logger.exception("Problem killing off task: %s  w/  pid : %s" %
                             (task, pid))

    def kill_all(self):
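        # Kill every task currently tracked as active.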
        for task in self.active_tasks:
            task.kill()

    def checkin_timer(self):
        #
        # Just a timer that periodically updates this still's last_checkin time in the database (see the sleep interval below)
        #
        while self.keep_running is True:
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = os.getloadavg()[1]  #using the 5 min load avg
            self.dbi.still_checkin(hostname,
                                   ip_addr,
                                   self.port,
                                   int(cpu_usage),
                                   self.data_dir,
                                   status="OK",
                                   max_tasks=self.sg.actions_per_still,
                                   cur_tasks=len(self.active_tasks))
            time.sleep(10)
        return 0

    def start(self):
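        # Prime psutil's CPU counter, start the task-finalizing and heartbeat
        # threads, then serve HTTP requests until shutdown.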
        psutil.cpu_percent()
        time.sleep(1)
        self.keep_running = True
        t = threading.Thread(target=self.finalize_tasks)
        t.daemon = True
        t.start()
        logger.info('Starting Task Server')
        logger.info("using code at: " + __file__)
        logger.info("Path to do_ Scripts : %s" % self.path_to_do_scripts)
        logger.info("Data_dir : %s" % self.data_dir)
        logger.info("Port : %s" % self.port)

        if self.sg.cluster_scheduler == 1:
            logger.info("Initilizing DRMAA interface to cluster scheduler")
            import drmaa
            self.drmaa_session = drmaa.Session()  # Start the interface session to DRMAA to control GridEngine
            self.drmaa_session.initialize()
        try:
            # Set up a thread that periodically updates the last checkin time for this still
            timer_thread = threading.Thread(target=self.checkin_timer)
            timer_thread.daemon = True  # Make it a daemon so that when ctrl-c happens this thread goes away
            timer_thread.start()  # Start heartbeat
            self.serve_forever()  # Start the listening server
        finally:
            self.shutdown()
        return

    def shutdown(self):
        if self.shutting_down is False:  # check that we're not already shutting down so multiple threads don't step on each other attempting this
            self.shutting_down = True
            logger.debug("Shutting down task_server")
            hostname = socket.gethostname()
            ip_addr = socket.gethostbyname(hostname)
            cpu_usage = psutil.cpu_percent()
            self.dbi.still_checkin(hostname,
                                   ip_addr,
                                   self.port,
                                   int(cpu_usage),
                                   self.data_dir,
                                   status="OFFLINE")
            self.keep_running = False
            parentproc = psutil.Process()
            myprocs = parentproc.children(recursive=True)
            for proc in myprocs:
                logger.debug("Killing nicely -> Pid: %s - Proc: %s" %
                             (proc.pid, proc.name()))
                proc.terminate()
            gone, alive = psutil.wait_procs(myprocs, timeout=3)
            for proc in alive:
                logger.debug("Killing with gusto -> Pid: %s - Proc: %s" %
                             (proc.pid, proc.name()))
                proc.kill()
            HTTPServer.shutdown(self)
            if self.sg.cluster_scheduler == 1:
                self.drmaa_session.exit()  # Terminate the DRMAA session

            sys.exit(0)