def sm_sync(self):
        '''Resynchronize with the script manager'''
        # get this cache first -- it's no problem if this data is old, but bad things
        # happen when this data is newer than the list of running processes in scriptm
        # initialize so a failed copy leaves us with an empty (not undefined) cache
        process_groups_cache = []
        self.lock.acquire()
        try:
            process_groups_cache = self.process_groups.values()
        except:
            self.logger.error("error copying process_groups.values()", exc_info=True)
        finally:
            self.lock.release()

        try:
            pgroups = ComponentProxy("script-manager").get_jobs([{'id':'*', 'state':'running'}])
        except (ComponentLookupError, xmlrpclib.Fault):
            self.logger.error("Failed to communicate with script manager")
            return
        live = [item['id'] for item in pgroups]
        
        for each in process_groups_cache:
            if each.mode == 'script' and each.script_id not in live:
                self.logger.info("Found dead pg for script job %s" % (each.script_id))
                result = ComponentProxy("script-manager").wait_jobs([{'id':each.script_id, 'exit_status':'*'}])
                self.logger.info("wait returned %r" % result)
                for r in result:
                    if r['id'] == each.script_id:
                        each.exit_status = r['exit_status']
                        self.reserve_resources_until(each.location, None, each.jobid)
Example #2
    def event_driver(self):
        """core part that drives the clock"""

        if self.go_next:
            # only increment the clock when go_next is set; this lets the scheduler place multiple jobs at the same time stamp
            self.clock_increment()

        machine = self.get_current_event_machine()
        #        print "[%s]: %s, machine=%s, event=%s, job=%s" % (
        #                                            self.implementation,
        #                                            self.get_current_date_time(),
        #                                            self.get_current_event_machine(),
        #                                            self.get_current_event_type(),
        #                                            self.get_current_event_job(),
        #                                            )

        if machine == INTREPID:
            self.bgsched.schedule_jobs()
            util = ComponentProxy("queue-manager").get_util()
            self.log_info(util, "mira_util_mesh")

        if machine == EUREKA:
            self.csched.schedule_jobs()

        if self.go_next:
            ComponentProxy("queue-manager").calc_loss_of_capacity()
Example #3
    def signal_process_groups(self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                if pg.mode == "script":
                    try:
                        ComponentProxy("script-manager").signal_jobs(
                            [{
                                'id': pg.script_id
                            }], signame)
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self.logger.error(
                            "Failed to communicate with script manager when killing job"
                        )
                else:
                    try:
                        ComponentProxy("forker").signal(pg.head_pid, signame)
                    except:
                        self.logger.error(
                            "Failed to communicate with forker when signalling job"
                        )

                if signame == "SIGKILL" and not pg.true_mpi_args:
                    self._mark_partition_for_cleaning(pg.location[0], pg.jobid)

        return my_process_groups
    def _get_exit_status (self):
        try:
            running = ComponentProxy("forker").active_list()
        except:
            self.logger.error("failed to contact forker component for list of running jobs")
            return

        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None:
                # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
                # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
                # maybe the second choice is better
                try:
                    dead_dict = ComponentProxy("forker").get_status(each.head_pid)
                except (ComponentLookupError, xmlrpclib.Fault):
                    self.logger.error("failed call for get_status from forker component for pg %s", each.head_pid)
                    return
                
                if dead_dict is None:
                    self.logger.info("process group %i: job %s/%s exited with unknown status", each.id, each.jobid, each.user)
                    each.exit_status = 1234567
                else:
                    each.exit_status = dead_dict["exit_status"]
                    if dead_dict["signum"] == 0:
                        self.logger.info("process group %i: job %s/%s exited with status %i", 
                            each.id, each.jobid, each.user, each.exit_status)
                    else:
                        if dead_dict["core_dump"]:
                            core_dump_str = ", core dumped"
                        else:
                            core_dump_str = ""
                        self.logger.info("process group %i: job %s/%s terminated with signal %s%s", 
                            each.id, each.jobid, each.user, dead_dict["signum"], core_dump_str)
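
The FIXME above asks whether a failed status lookup should be retried before declaring the job dead. A minimal sketch of that idea, assuming nothing beyond what is shown here (the _status_misses attribute and MAX_STATUS_MISSES threshold are hypothetical, not part of Cobalt):

    MAX_STATUS_MISSES = 3  # hypothetical: how many misses to tolerate before giving up

    def _note_status_miss(self, pg):
        # Count consecutive failed get_status lookups for a process group and
        # only report an unknown exit status once the threshold is reached.
        pg._status_misses = getattr(pg, '_status_misses', 0) + 1
        if pg._status_misses < self.MAX_STATUS_MISSES:
            return False  # keep waiting; the forker may just be slow to answer
        return True       # give up and let the caller record the sentinel exit status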
Example #5
    def _get_exit_status(self):

        #common to bgsystem

        running = []
        active_forker_components = []
        for forker_component in ['bg_mpirun_forker', 'user_script_forker']:
            try:
                running.extend(
                    ComponentProxy(forker_component).active_list(
                        "process group"))
                active_forker_components.append(forker_component)
            except:
                self.logger.error(
                    "failed to contact %s component for list of running jobs",
                    forker_component)

        for each in self.process_groups.itervalues():
            if each.head_pid not in running and each.exit_status is None and each.forker in active_forker_components:
                # FIXME: i bet we should consider a retry thing here -- if we fail enough times, just
                # assume the process is dead?  or maybe just say there's no exit code the first time it happens?
                # maybe the second choice is better
                try:
                    if each.head_pid is not None:
                        dead_dict = ComponentProxy(each.forker).get_status(
                            each.head_pid)
                    else:
                        dead_dict = None
                except:
                    self.logger.error(
                        "%s: RPC to get_status method in %s component failed",
                        each.label, each.forker)
                    return

                if dead_dict is None:
                    self.logger.info("%s: job exited with unknown status",
                                     each.label)
                    # FIXME: should we use a negative number instead to indicate internal errors? --brt
                    each.exit_status = 1234567
                else:
                    each.exit_status = dead_dict["exit_status"]
                    if dead_dict["signum"] == 0:
                        self.logger.info("%s: job exited with status %i",
                                         each.label, each.exit_status)
                    else:
                        if dead_dict["core_dump"]:
                            core_dump_str = ", core dumped"
                        else:
                            core_dump_str = ""
                        self.logger.info("%s: terminated with signal %s%s",
                                         each.label, dead_dict["signum"],
                                         core_dump_str)
                    self.reserve_resources_until(each.location, None,
                                                 each.jobid)
Example #6
def check_dependencies(dependency_string):

    if dependency_string.lower() == 'none':
        #we are removing all job dependencies.
        print "Removing job dependencies"
        return

    deps = set(dependency_string.split(":"))
    
    query = []
    for dep in deps:
        try:
            query.append({"jobid": int(dep)})
        except ValueError:
            # skip anything that isn't a numeric jobid
            pass
    
    jobs = ComponentProxy("queue-manager").get_jobs(query)
    
    job_ids = set( [str(j["jobid"]) for j in jobs] )
    
    missing = deps.difference(job_ids)
    
    if missing:
        print "WARNING: dependencies %s do not match jobs currently in the "\
                "queue" % ":".join(missing)
Example #7
    def check_reservations(self):
        ret = ""
        reservations = self.reservations.values()
        for i in range(len(reservations)):
            for j in range(i + 1, len(reservations)):
                # if at least one reservation is cyclic, we want *that* reservation to be the one getting its overlaps method
                # called
                if reservations[i].cycle is not None:
                    res1 = reservations[i]
                    res2 = reservations[j]
                else:
                    res1 = reservations[j]
                    res2 = reservations[i]

                # we subtract a little bit because the overlaps method isn't really meant to do this
                # it will report warnings when one reservation starts at the same time another ends
                if res1.overlaps(res2.start, res2.duration - 0.00001):
                    # now we need to check for overlap in space
                    results = ComponentProxy(self.COMP_SYSTEM).get_partitions(
                        [{'name': p, 'children': '*', 'parents': '*'}
                         for p in res2.partitions.split(":")])
                    for p in res1.partitions.split(":"):
                        for r in results:
                            if p == r['name'] or p in r['children'] or p in r['parents']:
                                ret += "Warning: reservation '%s' overlaps reservation '%s'\n" % (res1.name, res2.name)

        return ret
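
The 0.00001 subtracted above keeps back-to-back reservations from being flagged when one ends exactly as the other begins. A standalone sketch (not Cobalt's actual overlaps() implementation) with made-up times illustrates the boundary case:

def touches_or_overlaps(start1, duration1, start2, duration2):
    # An inclusive endpoint test like this flags intervals that merely touch.
    return start1 <= start2 + duration2 and start2 <= start1 + duration1

# Reservation 1 runs 3600-7200, reservation 2 runs 0-3600; they only touch at 3600.
print touches_or_overlaps(3600, 3600, 0, 3600)            # True  -- spurious warning
print touches_or_overlaps(3600, 3600, 0, 3600 - 0.00001)  # False -- the epsilon avoids it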
Example #8
    def q_add(self, *args, **kwargs):
        '''Add a reservation to tracking.
        Side Effects:
            - Add a queue to be tracked.
            - If there is no cqm-associated queue, create a reservation queue.
            - Set policies for the new queue.
            - Emit various creation messages.

        '''

        qm = ComponentProxy("queue-manager")
        try:
            queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
        except ComponentLookupError:
            logger.error(
                "unable to contact queue manager when adding reservation")
            raise

        try:
            specs = args[0]
            for spec in specs:
                if "res_id" not in spec or spec['res_id'] == '*':
                    spec['res_id'] = bgsched_id_gen.get()
            reservations = Cobalt.Data.DataDict.q_add(self, *args, **kwargs)

        except KeyError, err:
            raise ReservationError(
                "Error: a reservation named %s already exists" % err)
Example #9
    def add_process_groups(self, specs):
        """Create a process group.
        
        Arguments:
        spec -- dictionary hash specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(
                        spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info(
                        "process group %s: job %s/%s failed to set the kernel; %s",
                        pgroup.id, pgroup.jobid, pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy(
                            "script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError(
                            "system::add_process_groups failed to communicate with script-manager"
                        )
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info(
                        "job %s/%s: process group %s created to track script",
                        pgroup.jobid, pgroup.user, pgroup.id)
                    # walltime is given in minutes; hold the resources until the expected end time
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info(
                            "process group %s: job %s/%s using kernel %s",
                            pgroup.id, pgroup.jobid, pgroup.user,
                            pgroup.kernel)
                    script_pgroups.append(pgroup)
Example #10
 def start(self):
     """Start the process group by forking to _mpirun()"""
     try:
         data = self.prefork()
         self.head_pid = ComponentProxy("forker").fork(data)
     except:
         self.logger.error(
             "problem forking: pg %s did not find a child pid", self.id)
 def __init__(self, *args, **kwargs):
     BGSched.__init__(self, *args, **kwargs)
     self.get_current_time = ComponentProxy("event-manager").get_current_time
     self.COMP_QUEUE_MANAGER = "cluster-queue-manager"
     self.COMP_SYSTEM = "cluster-system"
     self.queues = Cobalt.Components.bgsched.QueueDict(self.COMP_QUEUE_MANAGER)
     self.jobs = Cobalt.Components.bgsched.JobDict(self.COMP_QUEUE_MANAGER)
     self.running_job_walltime_prediction = False
Example #12
 def unregister_with_slp(self):
     try:
         name = self.instance.name
     except AttributeError:
         return
     try:
         ComponentProxy("service-location").unregister(name)
     except Exception, e:
         self.logger.error("unregister_with_slp() [%s]" % (e))
Example #13
class QueueDict(ForeignDataDict):
    """Dictionary for the queue metadata cache.

    """
    item_cls = Queue
    key = 'name'
    __oserror__ = Cobalt.Util.FailureMode("QM Connection (queue)")
    __function__ = ComponentProxy("queue-manager").get_queues
    __fields__ = ['name', 'state', 'policy', 'priority']
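
QueueDict (like JobDict further down) follows Cobalt's ForeignDataDict pattern: __function__ is the remote call used to refresh the cache and __fields__ limits which attributes are mirrored locally. A hedged sketch of how such a cache is typically used from the scheduler side, assuming the Sync() and q_get() methods provided by the ForeignDataDict/DataDict base classes:

    def log_running_queues(self):
        # Hypothetical helper: refresh the cached queue data, then report
        # every queue the queue-manager says is currently running.
        self.queues.Sync()
        for queue in self.queues.q_get([{'name': '*', 'state': 'running'}]):
            self.logger.info("queue %s uses policy %s", queue.name, queue.policy)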
Example #14
    def _run_reservation_jobs(self, reservations_cache):
        # handle each reservation separately, as they shouldn't be competing for resources
        for cur_res in reservations_cache.itervalues():
            #print "trying to run res jobs in", cur_res.name, self.started_jobs
            queue = cur_res.queue
            if not (self.queues.has_key(queue)
                    and self.queues[queue].state == 'running'):
                continue

            temp_jobs = self.jobs.q_get([{'is_runnable': True, 'queue': queue}])
            active_jobs = []
            for j in temp_jobs:
                if not self.started_jobs.has_key(j.jobid) and cur_res.job_within_reservation(j):
                    active_jobs.append(j)

            if not active_jobs:
                continue
            active_jobs.sort(self.utilitycmp)

            job_location_args = []
            for job in active_jobs:
                job_location_args.append({
                    'jobid': str(job.jobid),
                    'nodes': job.nodes,
                    'queue': job.queue,
                    'required': cur_res.partitions.split(":"),
                    'utility_score': job.score,
                    'walltime': job.walltime,
                    'attrs': job.attrs,
                    'user': job.user,
                })

            # there's no backfilling in reservations
            try:
                best_partition_dict = ComponentProxy(
                    self.COMP_SYSTEM).find_job_location(job_location_args, [])
            except:
                self.logger.error("failed to connect to system component")
                best_partition_dict = {}

            for jobid in best_partition_dict:
                job = self.jobs[int(jobid)]
                self._start_job(job, best_partition_dict[jobid],
                                {str(job.jobid): cur_res.res_id})
Example #15
 def add_jobs (self, specs):
     """Add a job to the process manager."""
     self.logger.info("add_jobs(%r)" % (specs))
     jobs = self.jobs.q_add(specs)
     system_specs = \
         ComponentProxy("system").add_jobs([job.to_rx() for job in jobs])
     for system_spec in system_specs:
         job = self.jobs[system_spec['id']]
         job.state = "running"
     return jobs
Example #16
 def register_with_slp(self):
     try:
         name = self.instance.name
     except AttributeError:
         self.logger.error("register_with_slp() [unknown component]")
         return
     try:
         ComponentProxy("service-location").register(name, self.url)
     except Exception, e:
         self.logger.error("register_with_slp() [%s]" % (e))
Example #17
    def signal_process_groups (self, specs, signame="SIGINT"):
        my_process_groups = self.process_groups.q_get(specs)
        for pg in my_process_groups:
            if pg.exit_status is None:
                try:
                    ComponentProxy("forker").signal(pg.head_pid, signame)
                except:
                    self.logger.error("Failed to communicate with forker when signalling job")

        return my_process_groups
    def __init__(self, *args, **kwargs):
        BGSched.__init__(self, *args, **kwargs)

        self.get_current_time = ComponentProxy("event-manager").get_current_time

        predict_scheme = kwargs.get("predict", False)
        if predict_scheme:
            self.running_job_walltime_prediction = bool(int(predict_scheme[2]))
        else:
            self.running_job_walltime_prediction = False
Example #19
 def start(self):
     """Start the process group by contact the appropriate forker component"""
     try:
         data = self.prefork()
         self.head_pid = ComponentProxy(self.forker, retry=False).fork([self.executable] + self.args, self.tag,
             "Job %s/%s/%s" %(self.jobid, self.user, self.id), self.env, data, self.runid)
     except:
         _logger.error("Job %s/%s/%s: problem forking; %s did not return a child id", self.jobid, self.user, self.id,
             self.forker)
         raise
Example #20
 def get_mate_jobs_status_local(self, remote_jobid):
     '''return mate job status, invoked by local functions'''
     status_dict = {}
     try:
         status_dict = ComponentProxy(REMOTE_QUEUE_MANAGER).get_mate_job_status(remote_jobid)
     except:
         self.logger.error("failed to connect to remote queue-manager component!")
         status_dict = {'status':'notconnected'}
         self.dbglog.LogMessage("failed to connect to remote queue-manager component!")
     return status_dict
Example #21
 def q_del(self, *args, **kwargs):
     reservations = Cobalt.Data.DataDict.q_del(self, *args, **kwargs)
     qm = ComponentProxy('queue-manager')
     queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
     spec = [{'name': reservation.queue} for reservation in reservations \
             if reservation.createdQueue and reservation.queue in queues and \
             not self.q_get([{'queue':reservation.queue}])]
     try:
         qm.set_queues(spec, {'state': "dead"}, "bgsched")
     except Exception, e:
         logger.error("problem disabling reservation queue (%s)" % e)
 def signal(self, signame="SIGINT"):
     """
     Send the given signal to the forker-managed task for this process group, if one has been started.
     """
     logstr = "ProcessGroup:signal:"
     LOGGER.debug(logstr + "%s:%s" % (self.jobid, signame))
     try:
         if self.local_id:
             ComponentProxy("forker").signal(self.local_id, signame)
     except OSError as ose:
         LOGGER.exception(logstr + "failure for PG %s: %s" % (self.id, ose))
Example #23
    def _start_job(self, job, partition_list):
        cqm = ComponentProxy(self.COMP_QUEUE_MANAGER)

        try:
            self.logger.info("trying to start job %d on partition %r" %
                             (job.jobid, partition_list))
            cqm.run_jobs([{'tag': "job", 'jobid': job.jobid}], partition_list)
        except ComponentLookupError:
            self.logger.error("failed to connect to queue manager")
            return

        self.started_jobs[job.jobid] = self.get_current_time()
Example #24
 def check_jobs (self):
     """Finish jobs that are no longer running on the system."""
     self.logger.info("check_jobs()")
     local_job_specs = [job.to_rx(["id"]) for job in self.jobs.values() if job.state != 'finished']
     try:
         system_job_specs = ComponentProxy("system").get_jobs(local_job_specs)
     except ComponentLookupError:
         self.logger.error("check_jobs() [unable to contact system]")
         return
     system_job_ids = [spec['id'] for spec in system_job_specs]
     for job in self.jobs.values():
         if job.id not in system_job_ids and job.state != "finished":
             job.state = "finished"
Example #25
    def q_add(self, *args, **kwargs):
        qm = ComponentProxy(self.COMP_QUEUE_MANAGER)
        try:
            queues = [spec['name'] for spec in qm.get_queues([{'name': "*"}])]
        except ComponentLookupError:
            logger.error(
                "unable to contact queue manager when adding reservation")
            raise

        try:
            reservations = Cobalt.Data.DataDict.q_add(self, *args, **kwargs)
        except KeyError, e:
            raise ReservationError(
                "Error: a reservation named %s already exists" % e)
Example #26
class JobDict(ForeignDataDict):
    """Dictionary of job metadata from cqm for job location purposes.

    """
    item_cls = Job
    key = 'jobid'
    __oserror__ = Cobalt.Util.FailureMode("QM Connection (job)")
    __function__ = ComponentProxy("queue-manager").get_jobs
    __fields__ = [
        'nodes', 'location', 'jobid', 'state', 'index', 'walltime', 'queue',
        'user', 'submittime', 'starttime', 'project', 'is_runnable',
        'is_active', 'has_resources', 'score', 'attrs', 'walltime_p',
        'geometry'
    ]
Example #27
 def update(self, spec):
     if spec.has_key("users"):
         qm = ComponentProxy(self.COMP_QUEUE_MANAGER)
         try:
             qm.set_queues([{
                 'name': self.queue,
             }], {'users': spec['users']}, "bgsched")
         except ComponentLookupError:
             logger.error(
                 "unable to contact queue manager when updating reservation users"
             )
             raise
     # try the above first -- if we can't contact the queue-manager, don't update the users
     Data.update(self, spec)
Example #28
    def invoke_mpi_from_script(self, spec):
        '''Run an mpirun job that was invoked by a script.'''
        self.state = 'running'

        stdin = spec.get("stdin", self.stdin)
        stdout = spec.get("stdout", self.stdout)
        stderr = spec.get("stderr", self.stderr)

        try:
            pgroup = ComponentProxy("system").add_process_groups([{
                'jobid': self.jobid,
                'tag': 'process-group',
                'user': self.user,
                'stdout': stdout,
                'stderr': stderr,
                'cobalt_log_file': self.cobalt_log_file,
                'cwd': self.cwd,
                'location': self.location,
                'stdin': stdin,
                'true_mpi_args': spec['true_mpi_args'],
                'env': {'path': self.path},
                'size': 0,
                'args': [],
                'executable': "this will be ignored",
            }])
        except (ComponentLookupError, xmlrpclib.Fault):
            self.log.error("Job %s: Failed to start up user script job" %
                           (self.jobid))
            return

        if not pgroup[0].has_key('id'):
            self.log.error("Process Group creation failed for Job %s" %
                           self.jobid)
            self.set('state', 'sm-failure')
        else:
            self.mpi_system_id = pgroup[0]['id']
Example #29
    def launch_script(self, config_option, host, jobid, user, group_name):
        '''Start our script processes used for node prep and cleanup.

        '''
        script = get_cluster_system_config(config_option, None)
        if script is None:
            self.logger.error("Job %s/%s: %s not defined in the "\
                    "cluster_system section of the cobalt config file!",
                    user, jobid, config_option)
            return None
        else:
            cmd = ["/usr/bin/ssh", host, script, 
                    str(jobid), user, group_name]
            return ComponentProxy("system_script_forker").fork(cmd, "system epilogue", 
                    "Job %s/%s" % (jobid, user))
 def start(self):
     """
     Starts the process group by:
     1.  Precompiling the data set for the job
     2.  Calling the forker with the job data
     3.  Saving the local_id from the forker
     ###  Still not sure about this, future work here...
     """
     #try:
     data = self.prefork()
     local_id = ComponentProxy("forker").fork(data)
     print "****************************************************"
     print "                  Local ID is %s" % local_id
     print "****************************************************"
     self.local_id = local_id