Example #1
    def __setstate__(self, state):
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.diag_pids = dict()
        self.pending_script_waits = sets.Set()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

        self.configure()
        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
        self.lock = threading.Lock()
        self.statistics = Statistics()
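
This __setstate__ is one half of the standard pickle protocol: __getstate__ (shown in the later examples) saves only the durable data (managed partitions and per-partition flags), while __setstate__ rebuilds everything transient (locks, caches, bridge status, the background update thread) before reapplying the saved flags. Below is a minimal, self-contained sketch of the same pattern; the class and the partition name are illustrative, not part of Cobalt:

import pickle
import threading

class RestorableComponent(object):
    """Keep only durable data across a pickle round-trip; rebuild the rest."""

    def __init__(self):
        self.managed = set()           # durable
        self._cache = {}               # transient
        self._lock = threading.Lock()  # transient and not picklable

    def __getstate__(self):
        # persist only what should survive a restart
        return {'version': 1, 'managed': self.managed}

    def __setstate__(self, state):
        # rebuild transient attributes, then reapply the saved data
        self._cache = {}
        self._lock = threading.Lock()
        self.managed = state['managed']

comp = RestorableComponent()
comp.managed.add('ANL-R00-M0-512')   # made-up partition name
restored = pickle.loads(pickle.dumps(comp))
assert restored.managed == comp.managed
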
Example #2
    def __setstate__(self, state):
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = sets.Set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()
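
The simulator variant differs from the bridge-backed one mainly in what it persists (config_file, failed_components, no background update thread), but both save per-partition flags as a (scheduled, functional, queue) tuple keyed by partition name and reapply them only for partitions that still exist after configure(). A small stand-alone sketch of that flags round-trip, with a plain dict standing in for PartitionDict and made-up partition names:

class Partition(object):
    def __init__(self, name):
        self.name = name
        self.scheduled = False
        self.functional = False
        self.queue = "default"

def save_flags(partitions):
    # mirrors __getstate__ above: one (scheduled, functional, queue) tuple per partition
    return dict((p.name, (p.scheduled, p.functional, p.queue))
                for p in partitions.values())

def restore_flags(partitions, flags):
    # mirrors __setstate__ above: silently skip partitions that are no longer defined
    for pname, (sched, func, queue) in flags.items():
        if pname in partitions:
            partitions[pname].scheduled = sched
            partitions[pname].functional = func
            partitions[pname].queue = queue

old = {'ANL-R00-1024': Partition('ANL-R00-1024')}   # partition names are made up
old['ANL-R00-1024'].scheduled = True

new = {'ANL-R00-1024': Partition('ANL-R00-1024'),
       'ANL-R01-1024': Partition('ANL-R01-1024')}
restore_flags(new, save_flags(old))
assert new['ANL-R00-1024'].scheduled is True
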
Example #3
class BGSystem (BGBaseSystem):
    
    """Blue Gene system component.
    
    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """
    
    name = "system"
    implementation = "bgsystem"
    
    logger = logger

    
    _configfields = ['diag_script_location', 'diag_log_file', 'kernel']
    _config = ConfigParser.ConfigParser()
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('bgsystem'):
        print '''"bgsystem" section missing from cobalt config file'''
        sys.exit(1)
    config = _config._sections['bgsystem']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields))
        sys.exit(1)
    if config.get('kernel') == "true":
        _kernel_configfields = ['bootprofiles', 'partitionboot']
        mfields = [field for field in _kernel_configfields if not config.has_key(field)]
        if mfields:
            print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (" ".join(mfields))
            sys.exit(1)

    def __init__ (self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)
        self.process_groups.item_cls = BGProcessGroup
        self.diag_pids = dict()
        self.configure()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
    
    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] =  (sched, func, queue)
        return {'managed_partitions':self._managed_partitions, 'version':1,
                'partition_flags': flags}
    
    def __setstate__(self, state):
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.diag_pids = dict()
        self.pending_script_waits = sets.Set()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

        self.configure()
        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)
        
        self.update_relatives()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)
    save_me = automatic(save_me)

    def _get_node_card(self, name, state):
        if not self.node_card_cache.has_key(name):
            self.node_card_cache[name] = NodeCard(name, state)
            
        return self.node_card_cache[name]

    def _new_partition_dict(self, partition_def, bp_cache={}):
        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        node_list = []

        if partition_def.small:
            bp_name = partition_def.base_partitions[0].id
            for nc in partition_def._node_cards:
                node_list.append(self._get_node_card(bp_name + "-" + nc.id, nc.state))
        else:
            try:
                for bp in partition_def.base_partitions:
                    if bp.id not in bp_cache:
                        bp_cache[bp.id] = []
                        for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                            bp_cache[bp.id].append(self._get_node_card(bp.id + "-" + nc.id, nc.state))
                    node_list += bp_cache[bp.id]
            except BridgeException:
                print "Error communicating with the bridge during initial config.  Terminating."
                sys.exit(1)

        d = dict(
            name = partition_def.id,
            queue = "default",
            size = NODES_PER_NODECARD * len(node_list),
            bridge_partition = partition_def,
            node_cards = node_list,
            switches = [ s.id for s in partition_def.switches ],
            state = _get_state(partition_def),
        )
        return d


    def _detect_wiring_deps(self, partition, wiring_cache={}):
        def _kernel():
            s2 = sets.Set(p.switches)

            if s1.intersection(s2):
                p._wiring_conflicts.add(partition.name)
                partition._wiring_conflicts.add(p.name)
                self.logger.debug("%s and %s have a wiring conflict" % (partition.name, p.name))

        s1 = sets.Set(partition.switches)

        if wiring_cache.has_key(partition.size):
            for p in wiring_cache[partition.size]:
                if partition.name!=p.name:
                    _kernel()
        else:
            wiring_cache[partition.size] = [partition]
            for p in self._partitions.values():
                if p.size==partition.size and partition.name!=p.name:
                    wiring_cache[partition.size].append(p)
                    _kernel()

 
    def configure (self):
        
        """Read partition data from the bridge."""
        self.logger.info("configure()")
        try:
            system_def = Cobalt.bridge.PartitionList.by_filter()
        except BridgeException:
            print "Error communicating with the bridge during initial config.  Terminating."
            sys.exit(1)

                
        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()
        
        tmp_list = []

        wiring_cache = {}
        bp_cache = {}
        
        for partition_def in system_def:
            tmp_list.append(self._new_partition_dict(partition_def, bp_cache))
        
        partitions.q_add(tmp_list)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

        # find the wiring deps
        start = time.time()
        for p in self._partitions.values():
            self._detect_wiring_deps(p, wiring_cache)

        end = time.time()
        self.logger.info("took %f seconds to find wiring deps" % (end - start))
 
        # update state information
        for p in self._partitions.values():
            if p.state != "busy":
                for nc in p.node_cards:
                    if nc.used_by:
                        p.state = "blocked (%s)" % nc.used_by
                        break
                for dep_name in p._wiring_conflicts:
                    if self._partitions[dep_name].state == "busy":
                        p.state = "blocked-wiring (%s)" % dep_name
                        break
        
   
    def update_partition_state(self):
        """Use the quicker bridge method that doesn't return nodecard information to update the states of the partitions"""
        
        def _start_partition_cleanup(p):
            self.logger.info("partition %s: marking partition for cleaning", p.name)
            p.cleanup_pending = True
            partitions_cleanup.append(p)
            _set_partition_cleanup_state(p)
            p.reserved_until = False
            p.reserved_by = None
            p.used_by = None

        def _set_partition_cleanup_state(p):
            p.state = "cleanup"
            for part in p._children:
                if part.bridge_partition.state == "RM_PARTITION_FREE":
                    part.state = "blocked (%s)" % (p.name,)
                else:
                    part.state = "cleanup"
            for part in p._parents:
                if part.state == "idle":
                    part.state = "blocked (%s)" % (p.name,)

        while True:
            try:
                system_def = Cobalt.bridge.PartitionList.info_by_filter()
            except BridgeException:
                self.logger.error("Error communicating with the bridge to update partition state information.")
                self.bridge_in_error = True
                time.sleep(5) # wait a little bit...
                continue # then try again
    
            try:
                bg_object = Cobalt.bridge.BlueGene.by_serial()
                for bp in bg_object.base_partitions:
                    for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                        self.node_card_cache[bp.id + "-" + nc.id].state = nc.state
            except:
                self.logger.error("Error communicating with the bridge to update nodecard state information.")
                self.bridge_in_error = True
                time.sleep(5) # wait a little bit...
                continue # then try again

            self.bridge_in_error = False
            busted_switches = []
            for s in bg_object.switches:
                if s.state != "RM_SWITCH_UP":
                    busted_switches.append(s.id)

            # set all of the nodecards to not busy
            for nc in self.node_card_cache.values():
                nc.used_by = ''

            # update the state of each partition
            self._partitions_lock.acquire()
            now = time.time()
            partitions_cleanup = []
            self.offline_partitions = []
            missing_partitions = set(self._partitions.keys())
            new_partitions = []
            try:
                for partition in system_def:
                    missing_partitions.discard(partition.id)
                    if self._partitions.has_key(partition.id):
                        p = self._partitions[partition.id]
                        p.state = _get_state(partition)
                        p.bridge_partition = partition
                        p._update_node_cards()
                        if p.reserved_until and now > p.reserved_until:
                            p.reserved_until = False
                            p.reserved_by = None
                    else:
                        new_partitions.append(partition)


                # remove the missing partitions and their wiring relations
                for pname in missing_partitions:
                    self.logger.info("missing partition removed: %s", pname)
                    p = self._partitions[pname]
                    for dep_name in p._wiring_conflicts:
                        self.logger.debug("removing wiring dependency from: %s", dep_name)
                        self._partitions[dep_name]._wiring_conflicts.discard(p.name)
                    if p.name in self._managed_partitions:
                        self._managed_partitions.discard(p.name)
                    del self._partitions[p.name]

                bp_cache = {}
                wiring_cache = {}
                # throttle the adding of new partitions so updating of
                # machine state doesn't get bogged down
                for partition in new_partitions[:8]:
                    self.logger.info("new partition found: %s", partition.id)
                    bridge_p = Cobalt.bridge.Partition.by_id(partition.id)
                    self._partitions.q_add([self._new_partition_dict(bridge_p, bp_cache)])
                    p = self._partitions[bridge_p.id]
                    p.bridge_partition = partition
                    self._detect_wiring_deps(p, wiring_cache)

                # if partitions were added or removed, then update the relationships between partitions
                if len(missing_partitions) > 0 or len(new_partitions) > 0:
                    self.update_relatives()

                for p in self._partitions.values():
                    if p.cleanup_pending:
                        if p.used_by:
                            # if the partition has a pending cleanup request, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                        else:
                            # if the cleanup has already been initiated, then see how it's going
                            busy = []
                            parts = list(p._all_children)
                            parts.append(p)
                            for part in parts:
                                if part.bridge_partition.state != "RM_PARTITION_FREE":
                                    busy.append(part.name)
                            if len(busy) > 0:
                                _set_partition_cleanup_state(p)
                                self.logger.info("partition %s: still cleaning; busy partition(s): %s", p.name, ", ".join(busy))
                            else:
                                p.cleanup_pending = False
                                self.logger.info("partition %s: cleaning complete", p.name)
                    if p.state == "busy":
                        # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                        if not p.reserved_by:
                            p.reserved_until = False
                    elif p.state != "cleanup":
                        if p.reserved_until:
                            p.state = "allocated"
                            for part in p._parents:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name,)
                            for part in p._children:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name,)
                        elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by:
                            # if the job assigned to the partition has completed, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                            continue
                        for diag_part in self.pending_diags:
                            if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                                p.state = "blocked by pending diags"
                        for nc in p.node_cards:
                            if nc.used_by:
                                p.state = "blocked (%s)" % nc.used_by
                            if nc.state != "RM_NODECARD_UP":
                                p.state = "hardware offline: nodecard %s" % nc.id
                                self.offline_partitions.append(p.name)
                        for s in p.switches:
                            if s in busted_switches:
                                p.state = "hardware offline: switch %s" % s 
                                self.offline_partitions.append(p.name)
                        for dep_name in p._wiring_conflicts:
                            if self._partitions[dep_name].state in ["busy", "allocated", "cleanup"]:
                                p.state = "blocked-wiring (%s)" % dep_name
                                break
                        for part_name in self.failed_diags:
                            part = self._partitions[part_name]
                            if p.name == part.name:
                                p.state = "failed diags"
                            elif p.name in part.parents or p.name in part.children:
                                p.state = "blocked by failed diags"
            except:
                self.logger.error("error in update_partition_state", exc_info=True)

            self._partitions_lock.release()

            # cleanup partitions and set their kernels back to the default (while _not_ holding the lock)
            pnames_cleaned = []
            for p in partitions_cleanup:
                self.logger.info("partition %s: starting partition destruction", p.name)
                pnames_destroyed = []
                parts = list(p._all_children)
                parts.append(p)
                for part in parts:
                    pnames_cleaned.append(part.name)
                    try:
                        bpart = part.bridge_partition
                        if bpart.state != "RM_PARTITION_FREE":
                            bpart.destroy()
                            pnames_destroyed.append(part.name)
                    except Cobalt.bridge.IncompatibleState:
                        pass
                    except:
                        self.logger.info("partition %s: an exception occurred while attempting to destroy partition %s",
                            p.name, part.name)
                if len(pnames_destroyed) > 0:
                    self.logger.info("partition %s: partition destruction initiated for %s", p.name, ", ".join(pnames_destroyed))
                else:
                    self.logger.info("partition %s: no partition destruction was required", p.name)
                try:
                    self._clear_kernel(p.name)
                    self.logger.info("partition %s: kernel settings cleared", p.name)
                except:
                    self.logger.error("partition %s: failed to clear kernel settings", p.name)
            job_filter = Cobalt.bridge.JobFilter()
            job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG
            jobs = Cobalt.bridge.JobList.by_filter(job_filter)
            for job in jobs:
                if job.partition_id in pnames_cleaned:
                    try:
                        job.cancel()
                        self.logger.info("partition %s: task %d canceled", job.partition_id, job.db_id)
                    except (Cobalt.bridge.IncompatibleState, Cobalt.bridge.JobNotFound):
                        pass

            time.sleep(10)

    def _mark_partition_for_cleaning(self, pname, jobid):
        self._partitions_lock.acquire()
        try:
            p = self._partitions[pname]
            if p.used_by == jobid:
                p.cleanup_pending = True
                self.logger.info("partition %s: partition marked for cleanup", pname)
            elif p.used_by != None:
                self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \
                    "for cleanup", pname, jobid, p.used_by)
        except:
            self.logger.exception("partition %s: unexpected exception while marking the partition for cleanup", pname)
        self._partitions_lock.release()

    def _validate_kernel(self, kernel):
        if self.config.get('kernel') != 'true':
            return True
        kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel)
        return os.path.exists(kernel_dir)

    def _set_kernel(self, partition, kernel):
        '''Set the kernel to be used by jobs run on the specified partition'''
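        # The active boot image is a per-partition symlink:
        #   <partitionboot>/<partition> -> <bootprofiles>/<kernel>
        # The link is only rewritten when it does not already point at the
        # requested profile; _clear_kernel() later points it back at "default".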
        if self.config.get('kernel') != 'true':
            if kernel != "default":
                raise Exception("custom kernel capabilities disabled")
            return
        partition_link = "%s/%s" % (os.path.expandvars(self.config.get('partitionboot')), partition)
        kernel_dir = "%s/%s" % (os.path.expandvars(self.config.get('bootprofiles')), kernel)
        try:
            current = os.readlink(partition_link)
        except OSError:
            self.logger.error("partition %s: failed to read partitionboot location %s" % (partition, partition_link))
            raise Exception("failed to read partitionboot location %s" % (partition_link,))
        if current != kernel_dir:
            if not self._validate_kernel(kernel):
                self.logger.error("partition %s: kernel directory \"%s\" does not exist" % (partition, kernel_dir))
                raise Exception("kernel directory \"%s\" does not exist" % (kernel_dir,))
            self.logger.info("partition %s: updating boot image; currently set to \"%s\"" % (partition, current.split('/')[-1]))
            try:
                os.unlink(partition_link)
                os.symlink(kernel_dir, partition_link)
            except OSError:
                self.logger.error("partition %s: failed to reset boot location" % (partition,))
                raise Exception("failed to reset boot location for partition" % (partition,))
            self.logger.info("partition %s: boot image updated; now set to \"%s\"" % (partition, kernel))

    def _clear_kernel(self, partition):
        '''Set the kernel to be used by a partition to the default value'''
        if self.config.get('kernel') == 'true':
            try:
                self._set_kernel(partition, "default")
            except:
                logger.error("partition %s: failed to reset boot location" % (partition,))

    def generate_xml(self):
        """This method produces an XML file describing the managed partitions, suitable for use with the simulator."""
        ret = "<BG>\n"
        ret += "<PartitionList>\n"
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"
        
        ret += "</PartitionList>\n"

        ret += "</BG>\n"
            
        return ret
    generate_xml = exposed(generate_xml)
    
    def add_process_groups (self, specs):
        
        """Create a process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """
        
        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info("process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, 
                        pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager")
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info("job %s/%s: process group %s created to track script", pgroup.jobid, pgroup.user, pgroup.id)
                    self.reserve_resources_until(spec['location'], time.time() + 60*float(spec['walltime']), pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info("process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user, 
                            pgroup.kernel)
                    script_pgroups.append(pgroup)

        # start up non-script mode jobs
        process_groups = self.process_groups.q_add(other_specs)
        for pgroup in process_groups:
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info("job %s/%s: process group %s created to track mpirun status", pgroup.jobid, pgroup.user, pgroup.id)
            try:
                if not pgroup.true_mpi_args:
                    self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag
                # should be added to the process group that wait_process_group uses to determine when a process group is no
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info("process group %s: job %s/%s failed to set the kernel; %s", pgroup.id, pgroup.jobid, 
                    pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info("process group %s: job %s/%s using kernel %s", pgroup.id, pgroup.jobid, pgroup.user,
                        pgroup.kernel)
                pgroup.start()
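
For reference, these are the spec keys that add_process_groups actually reads; a real spec submitted by cqm carries additional fields (jobid, user, executable, and so on) that are not shown here, and the partition name is made up:

import time

spec = {
    'mode': 'script',              # "script" jobs are routed through the script-manager
    'location': ['ANL-R00-1024'],  # location[0] selects the partition (made-up name)
    'kernel': 'default',           # any other value triggers _set_kernel()
    'walltime': 30,                # minutes
}

# script jobs reserve their resources until roughly now + walltime
reserved_until = time.time() + 60 * float(spec['walltime'])
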
Example #4
class Simulator (BGBaseSystem):
    
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """
    
    name = "system"
    implementation = "simulator"
    
    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180

    def __init__ (self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = sets.Set()
        if self.config_file is not None:
            self.configure(self.config_file)
    
    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] =  (sched, func, queue)
        return {'managed_partitions':self._managed_partitions, 'version':2, 'config_file':self.config_file, 'partition_flags': flags}
    
    def __setstate__(self, state):
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = sets.Set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()
        
    def save_me(self):
        Component.save(self)
    save_me = automatic(save_me)

    # check whether this is a mesh partition
    def check_mesh(self, location_name):
        location = location_name.strip()
        specs = location.split("-")
        mesh_mask = specs[len(specs) - 2]
        if len(mesh_mask) == 1:
            #print location, " Torus"
            return True
        else:
            #print location, " Mesh"
            return False


    def configure (self, config_file):
        
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)
                
            return self.node_card_cache[name]
            
            
        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" % config_file)
            self.logger.error("exiting...")
            traceback.print_exc(file=sys.stdout)
            sys.exit(1)
            
        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)
        
        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32
                
        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()
        
        tmp_list = []
        part_key = {}

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}
        
        for partition_def in system_def.getiterator("Block"): 
            # skip duplicated partition
            if part_key.has_key(partition_def.get("name")):
                continue
            else:
                part_key[partition_def.get("name")]=""
            
            node_list = []
            switch_list = []
            
            for nc in partition_def.getiterator("NodeBoard"): 
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)
            
            # skip 1K mesh partition if its a default partition file
            if config_file == "partition_xml/2013-10-10-mira_pure.xml":
                if NODES_PER_NODECARD * nc_count == 1024 and self.check_mesh(partition_def.get("name")):
                    continue
                
            if not wiring_cache.has_key(nc_count):
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))                    
            
            tmp_list.append( dict(
                name = partition_def.get("name"),
                queue = partition_def.get("queue", "default"),
                size = NODES_PER_NODECARD * nc_count,
                node_cards = node_list,
                switches = switch_list,
                state = "idle",
                ))
        
        partitions.q_add(tmp_list)
        
        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = sets.Set( p.switches )
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = sets.Set( other.switches )
                    
                    if s1.intersection(s2):
                        self.logger.info("found a wiring dep between %s and %s", p.name, other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)
        
            
        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)
        print "Total partitions: ", len(self._partitions)

    
    def reserve_partition (self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """
        
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size))
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        self._partitions_lock.release()
        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()
        
        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True
    reserve_partition = exposed(reserve_partition)
    
    def release_partition (self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False
                
        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        self._partitions_lock.release()
        
        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True
    release_partition = exposed(release_partition)
    
    def add_process_groups (self, specs):
        
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """
        
        self.logger.info("add_process_groups(%r)" % (specs))
        
        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode') == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)
        
        # start up script jobs
        new_pgroups = []
        if script_specs:
            try:
                for spec in script_specs:
                    script_pgroup = ComponentProxy("script-manager").add_jobs([spec])
                    new_pgroup = self.process_groups.q_add([spec])
                    new_pgroup[0].script_id = script_pgroup[0]['id']
                    self.reserve_resources_until(spec['location'], time.time() + 60*float(spec['walltime']),
                        new_pgroup[0].jobid)
                    new_pgroups.append(new_pgroup[0])
            except (ComponentLookupError, xmlrpclib.Fault):
                raise ProcessGroupCreationError("system::add_process_groups failed to communicate with script-manager")

        process_groups = self.process_groups.q_add(other_specs)
        for process_group in process_groups:
            self.start(process_group)
            
        return new_pgroups + process_groups
    add_process_groups = exposed(query(all_fields=True)(add_process_groups))
    
    def get_process_groups (self, specs):
        """Query process_groups from the simulator."""
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))
    
    def wait_process_groups (self, specs):
        """get process groups that have finished running."""
        self.logger.info("wait_process_groups(%r)" % (specs))
        process_groups = [pg for pg in self.process_groups.q_get(specs) if pg.exit_status is not None]
        for process_group in process_groups:
            # jobs that were launched on behalf of the script manager shouldn't release the partition
            if not process_group.true_mpi_args:
                self.reserve_resources_until(process_group.location, None, process_group.jobid)
            del self.process_groups[process_group.id]
        return process_groups
    wait_process_groups = exposed(query(wait_process_groups))
    
    def signal_process_groups (self, specs, signame="SIGINT"):
        """Simulate the signaling of a process_group."""
        self.logger.info("signal_process_groups(%r, %r)" % (specs, signame))
        process_groups = self.process_groups.q_get(specs)
        for process_group in process_groups:
            if process_group.mode == "script":
                try:
                    pgroup = ComponentProxy("script-manager").signal_jobs([{'id':process_group.script_id}], "SIGTERM")
                except (ComponentLookupError, xmlrpclib.Fault):
                    logger.error("Failed to communicate with script manager when killing job")
            else:
                process_group.signals.append(signame)
        return process_groups
    signal_process_groups = exposed(query(signal_process_groups))
    
    def start (self, process_group):
        thread.start_new_thread(self._mpirun, (process_group, ))
    
    def _mpirun (self, process_group):
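        # Simulate an mpirun invocation: parse the generated argv, emit fake
        # FE_MPI/BE_MPI console output, reserve the partition, "run" for a
        # random MIN_RUN_TIME..MAX_RUN_TIME seconds (honoring SIGTERM/SIGKILL),
        # then release the partition and record the exit status.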
        argv = process_group._get_argv()
        try:
            stdout = open(process_group.stdout or "/dev/null", "a")
        except:
            stdout = open("/dev/null", "a")
        try:
            stderr = open(process_group.stderr or "/dev/null", "a")
        except:
            stderr = open("/dev/null", "a")
        
        try:
            clfn = process_group.cobalt_log_file or "/dev/null"
            cobalt_log_file = open(clfn, "a")
            print >> cobalt_log_file, "%s\n" % " ".join(argv[1:])
            cobalt_log_file.close()
        except:
            logger.error("Job %s/%s: unable to open cobaltlog file %s", process_group.id, process_group.user, clfn, exc_info = True)
        
        try:
            partition = argv[argv.index("-partition") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-partition' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-partition' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        try:
            mode = argv[argv.index("-mode") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-mode' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-mode' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        try:
            size = argv[argv.index("-np") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-np' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-np' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        try:
            size = int(size)
        except ValueError:
            print >> stderr, "ERROR: '-np' got invalid value %r" % (size)
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
        
        print >> stdout, "ENVIRONMENT"
        print >> stdout, "-----------"
        for key, value in process_group.env.iteritems():
            print >> stdout, "%s=%s" % (key, value)
        print >> stdout
        
        print >> stderr, "FE_MPI (Info) : Initializing MPIRUN"
        reserved = self.reserve_partition(partition, size)
        if not reserved:
            print >> stderr, "BE_MPI (ERROR): Failed to run process on partition"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        
        
        hardware_failure = False
        for nc in self.partitions[partition].node_cards:
            if nc.id in self.failed_components:
                hardware_failure = True
                break
        for switch in self.partitions[partition].switches:
            if switch in self.failed_components:
                hardware_failure = True
                break

        if hardware_failure:
            excuses = ["incorrectly polarized packet accelerator", "the Internet is full", "side fumbling detected", "unilateral phase detractors offline", ]
            print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state"
            print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state"
            print >> stderr, "BE_MPI (Info) : Checking for block error text:"
            print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice(excuses)
            print >> stderr, "BE_MPI (Info) : Starting cleanup sequence"
            time.sleep(20)
            self.release_partition(partition)
            print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot."
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return


        
        print >> stderr, "FE_MPI (Info) : process group with id", process_group.id
        print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate"
        
        print >> stdout, "Running process_group: %s" % " ".join(argv)
        
        start_time = time.time()
        run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME)
        my_exit_status = 0
         
        self.logger.info("process group %d running for about %f seconds", process_group.id, run_time)
        while time.time() < (start_time + run_time):
            if "SIGKILL" in process_group.signals:
                process_group.exit_status = 1
                return
            elif "SIGTERM" in process_group.signals:
                print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM"
                my_exit_status = 1
                break
            else:
                time.sleep(1) # tumblers better than pumpers
        
        print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')"
        print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated"
        print >> stderr, "BE_MPI (Info) : Releasing partition", partition
        released = self.release_partition(partition)
        if not released:
            print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
        print >> stderr, "BE_MPI (Info) : BE completed"
        print >> stderr, "FE_MPI (Info) : FE completed"
        print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status
        
        process_group.exit_status = my_exit_status
    
    
    def update_partition_state(self):
        # first, set all of the nodecards to not busy
        for nc in self.node_card_cache.values():
            nc.used_by = ''

        self._partitions_lock.acquire()
        try:
            for p in self._partitions.values():
                p._update_node_cards()
                
            now = time.time()
            
            # since we don't have the bridge, a partition which isn't busy
            # should be set to idle and then blocked states can be derived
            for p in self._partitions.values():
                if p.state != "busy":
                    p.state = "idle"
                if p.reserved_until and now > p.reserved_until:
                    p.reserved_until = None
                    p.reserved_by = None
                    
            for p in self._partitions.values():
                if p.state == "busy":
                    # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                    if not p.reserved_by:
                        p.reserved_until = False
                else:
                    if p.reserved_until:
                        p.state = "allocated"
                        for part in p._parents:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name,)
                        for part in p._children:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name,)
                    for diag_part in self.pending_diags:
                        if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                            p.state = "blocked by pending diags"
                    for nc in p.node_cards:
                        if nc.used_by:
                            p.state = "blocked (%s)" % nc.used_by
                            break
                    for dep_name in p._wiring_conflicts:
                        if self._partitions[dep_name].state in ["allocated", "busy"]:
                            p.state = "blocked-wiring (%s)" % dep_name
                            break
                    for part_name in self.failed_diags:
                        part = self._partitions[part_name]
                        if p.name == part.name:
                            p.state = "failed diags"
                        elif p.name in part.parents or p.name in part.children:
                            p.state = "blocked by failed diags"
        except:
            self.logger.error("error in update_partition_state", exc_info=True)
        
        self._partitions_lock.release()
    update_partition_state = automatic(update_partition_state)

    def add_failed_components(self, component_names):
        success = []
        for name in component_names:
            if self.node_card_cache.has_key(name):
                self.failed_components.add(name)
                success.append(name)
            else:
                for p in self._partitions.values():
                    if name in p.switches:
                        self.failed_components.add(name)
                        success.append(name)
                        break
        return success
    add_failed_components = exposed(add_failed_components)
    
    def del_failed_components(self, component_names):
        success = []
        for name in component_names:
            try:
                self.failed_components.remove(name)
                success.append(name)
            except KeyError:
                pass
            
        return success
    del_failed_components = exposed(del_failed_components)
    
    def list_failed_components(self, component_names):
        return list(self.failed_components)
    list_failed_components = exposed(list_failed_components)
    
    def launch_diags(self, partition, test_name):
        exit_value = 0
        for nc in partition.node_cards:
            if nc.id in self.failed_components:
                exit_value = 1
        for switch in partition.switches:
            if switch in self.failed_components:
                exit_value = 2

        self.finish_diags(partition, test_name, exit_value)
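
Simulator.configure() above expects an XML file with a <BG> root containing <Block> elements, each holding <NodeBoard> and <Switch> children; partition size is derived as 32 nodes per node board. A minimal sketch of parsing such a file the same way; the sample document and its names are made up, and configure() itself uses the older getiterator() spelling:

from xml.etree import ElementTree

# Made-up sample document in the shape configure() parses:
SAMPLE = """
<BG>
  <Block name="SIM-BLOCK-01" queue="default">
    <NodeBoard id="R00-M0-N00" />
    <NodeBoard id="R00-M0-N01" />
    <Switch id="R00-M0-S0" />
  </Block>
</BG>
"""

NODES_PER_NODECARD = 32
root = ElementTree.fromstring(SAMPLE)
for block in root.iter("Block"):
    node_cards = [nc.get("id") for nc in block.iter("NodeBoard")]
    switches = [s.get("id") for s in block.iter("Switch")]
    print("%s: %d nodes, switches %s"
          % (block.get("name"), NODES_PER_NODECARD * len(node_cards), switches))
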
Example #5
class BGSystem(BGBaseSystem):
    """Blue Gene system component.
    
    Methods:
    configure -- load partitions from the bridge API
    add_process_groups -- add (start) an mpirun process on the system (exposed, ~query)
    get_process_groups -- retrieve mpirun processes (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- update partition state from the bridge API (runs as a thread)
    """

    name = "system"
    implementation = "bgsystem"

    logger = logger

    _configfields = ['diag_script_location', 'diag_log_file', 'kernel']
    _config = ConfigParser.ConfigParser()
    _config.read(Cobalt.CONFIG_FILES)
    if not _config._sections.has_key('bgsystem'):
        print '''"bgsystem" section missing from cobalt config file'''
        sys.exit(1)
    config = _config._sections['bgsystem']
    mfields = [field for field in _configfields if not config.has_key(field)]
    if mfields:
        print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (
            " ".join(mfields))
        sys.exit(1)
    if config.get('kernel') == "true":
        _kernel_configfields = ['bootprofiles', 'partitionboot']
        mfields = [
            field for field in _kernel_configfields
            if not config.has_key(field)
        ]
        if mfields:
            print "Missing option(s) in cobalt config file [bgsystem] section: %s" % (
                " ".join(mfields))
            sys.exit(1)

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)
        self.process_groups.item_cls = BGProcessGroup
        self.diag_pids = dict()
        self.configure()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 1,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.diag_pids = dict()
        self.pending_script_waits = sets.Set()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []

        self.configure()
        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        # initiate the process before starting any threads
        thread.start_new_thread(self.update_partition_state, tuple())
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def _get_node_card(self, name, state):
        if not self.node_card_cache.has_key(name):
            self.node_card_cache[name] = NodeCard(name, state)

        return self.node_card_cache[name]

    def _new_partition_dict(self, partition_def, bp_cache={}):
        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        node_list = []

        if partition_def.small:
            bp_name = partition_def.base_partitions[0].id
            for nc in partition_def._node_cards:
                node_list.append(
                    self._get_node_card(bp_name + "-" + nc.id, nc.state))
        else:
            try:
                for bp in partition_def.base_partitions:
                    if bp.id not in bp_cache:
                        bp_cache[bp.id] = []
                        for nc in Cobalt.bridge.NodeCardList.by_base_partition(
                                bp):
                            bp_cache[bp.id].append(
                                self._get_node_card(bp.id + "-" + nc.id,
                                                    nc.state))
                    node_list += bp_cache[bp.id]
            except BridgeException:
                print "Error communicating with the bridge during initial config.  Terminating."
                sys.exit(1)

        d = dict(
            name=partition_def.id,
            queue="default",
            size=NODES_PER_NODECARD * len(node_list),
            bridge_partition=partition_def,
            node_cards=node_list,
            switches=[s.id for s in partition_def.switches],
            state=_get_state(partition_def),
        )
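        # Worked example (names and values are hypothetical): a non-small partition
        # covering one base partition with 16 node cards yields something like
        #   {'name': 'ANL-R00-M0-512', 'queue': 'default', 'size': 512, ...}
        # since size = NODES_PER_NODECARD (32) * len(node_list) = 32 * 16.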
        return d

    def _detect_wiring_deps(self, partition, wiring_cache={}):
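        # Two same-sized partitions that share at least one switch cannot be wired
        # up at the same time, so each such pair is recorded as a mutual wiring
        # conflict.  wiring_cache groups candidate partitions by size so that a
        # given size class only has to be scanned once per configure() pass.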
        def _kernel():
            s2 = sets.Set(p.switches)

            if s1.intersection(s2):
                p._wiring_conflicts.add(partition.name)
                partition._wiring_conflicts.add(p.name)
                self.logger.debug("%s and %s havening problems" %
                                  (partition.name, p.name))

        s1 = sets.Set(partition.switches)

        if wiring_cache.has_key(partition.size):
            for p in wiring_cache[partition.size]:
                if partition.name != p.name:
                    _kernel()
        else:
            wiring_cache[partition.size] = [partition]
            for p in self._partitions.values():
                if p.size == partition.size and partition.name != p.name:
                    wiring_cache[partition.size].append(p)
                    _kernel()

    def configure(self):
        """Read partition data from the bridge."""

        self.logger.info("configure()")
        try:
            system_def = Cobalt.bridge.PartitionList.by_filter()
        except BridgeException:
            print "Error communicating with the bridge during initial config.  Terminating."
            sys.exit(1)

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def:
            tmp_list.append(self._new_partition_dict(partition_def, bp_cache))

        partitions.q_add(tmp_list)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

        # find the wiring deps
        start = time.time()
        for p in self._partitions.values():
            self._detect_wiring_deps(p, wiring_cache)

        end = time.time()
        self.logger.info("took %f seconds to find wiring deps" % (end - start))

        # update state information
        for p in self._partitions.values():
            if p.state != "busy":
                for nc in p.node_cards:
                    if nc.used_by:
                        p.state = "blocked (%s)" % nc.used_by
                        break
                for dep_name in p._wiring_conflicts:
                    if self._partitions[dep_name].state == "busy":
                        p.state = "blocked-wiring (%s)" % dep_name
                        break

    def update_partition_state(self):
        """Use the quicker bridge method that doesn't return nodecard information to update the states of the partitions"""
        def _start_partition_cleanup(p):
            self.logger.info("partition %s: marking partition for cleaning",
                             p.name)
            p.cleanup_pending = True
            partitions_cleanup.append(p)
            _set_partition_cleanup_state(p)
            p.reserved_until = False
            p.reserved_by = None
            p.used_by = None

        def _set_partition_cleanup_state(p):
            p.state = "cleanup"
            for part in p._children:
                if part.bridge_partition.state == "RM_PARTITION_FREE":
                    part.state = "blocked (%s)" % (p.name, )
                else:
                    part.state = "cleanup"
            for part in p._parents:
                if part.state == "idle":
                    part.state = "blocked (%s)" % (p.name, )

        while True:
            try:
                system_def = Cobalt.bridge.PartitionList.info_by_filter()
            except BridgeException:
                self.logger.error(
                    "Error communicating with the bridge to update partition state information."
                )
                self.bridge_in_error = True
                time.sleep(5)  # wait a little bit...
                continue  # then try again

            try:
                bg_object = Cobalt.bridge.BlueGene.by_serial()
                for bp in bg_object.base_partitions:
                    for nc in Cobalt.bridge.NodeCardList.by_base_partition(bp):
                        self.node_card_cache[bp.id + "-" +
                                             nc.id].state = nc.state
            except:
                self.logger.error(
                    "Error communicating with the bridge to update nodecard state information."
                )
                self.bridge_in_error = True
                time.sleep(5)  # wait a little bit...
                continue  # then try again

            self.bridge_in_error = False
            busted_switches = []
            for s in bg_object.switches:
                if s.state != "RM_SWITCH_UP":
                    busted_switches.append(s.id)

            # set all of the nodecards to not busy
            for nc in self.node_card_cache.values():
                nc.used_by = ''

            # update the state of each partition
            self._partitions_lock.acquire()
            now = time.time()
            partitions_cleanup = []
            self.offline_partitions = []
            missing_partitions = set(self._partitions.keys())
            new_partitions = []
            try:
                for partition in system_def:
                    missing_partitions.discard(partition.id)
                    if self._partitions.has_key(partition.id):
                        p = self._partitions[partition.id]
                        p.state = _get_state(partition)
                        p.bridge_partition = partition
                        p._update_node_cards()
                        if p.reserved_until and now > p.reserved_until:
                            p.reserved_until = False
                            p.reserved_by = None
                    else:
                        new_partitions.append(partition)

                # remove the missing partitions and their wiring relations
                for pname in missing_partitions:
                    self.logger.info("missing partition removed: %s", pname)
                    p = self._partitions[pname]
                    for dep_name in p._wiring_conflicts:
                        self.logger.debug(
                            "removing wiring dependency from: %s", dep_name)
                        self._partitions[dep_name]._wiring_conflicts.discard(
                            p.name)
                    if p.name in self._managed_partitions:
                        self._managed_partitions.discard(p.name)
                    del self._partitions[p.name]

                bp_cache = {}
                wiring_cache = {}
                # throttle the adding of new partitions so updating of
                # machine state doesn't get bogged down
                for partition in new_partitions[:8]:
                    self.logger.info("new partition found: %s", partition.id)
                    bridge_p = Cobalt.bridge.Partition.by_id(partition.id)
                    self._partitions.q_add(
                        [self._new_partition_dict(bridge_p, bp_cache)])
                    p = self._partitions[bridge_p.id]
                    p.bridge_partition = partition
                    self._detect_wiring_deps(p, wiring_cache)

                # if partitions were added or removed, then update the relationships between partitions
                if len(missing_partitions) > 0 or len(new_partitions) > 0:
                    self.update_relatives()

                for p in self._partitions.values():
                    if p.cleanup_pending:
                        if p.used_by:
                            # if the partition has a pending cleanup request, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                        else:
                            # if the cleanup has already been initiated, then see how it's going
                            busy = []
                            parts = list(p._all_children)
                            parts.append(p)
                            for part in parts:
                                if part.bridge_partition.state != "RM_PARTITION_FREE":
                                    busy.append(part.name)
                            if len(busy) > 0:
                                _set_partition_cleanup_state(p)
                                self.logger.info(
                                    "partition %s: still cleaning; busy partition(s): %s",
                                    p.name, ", ".join(busy))
                            else:
                                p.cleanup_pending = False
                                self.logger.info(
                                    "partition %s: cleaning complete", p.name)
                    if p.state == "busy":
                        # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                        if not p.reserved_by:
                            p.reserved_until = False
                    elif p.state != "cleanup":
                        if p.reserved_until:
                            p.state = "allocated"
                            for part in p._parents:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name, )
                            for part in p._children:
                                if part.state == "idle":
                                    part.state = "blocked (%s)" % (p.name, )
                        elif p.bridge_partition.state == "RM_PARTITION_FREE" and p.used_by:
                            # if the job assigned to the partition has completed, then set the state so that cleanup will be
                            # performed
                            _start_partition_cleanup(p)
                            continue
                        for diag_part in self.pending_diags:
                            if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                                p.state = "blocked by pending diags"
                        for nc in p.node_cards:
                            if nc.used_by:
                                p.state = "blocked (%s)" % nc.used_by
                            if nc.state != "RM_NODECARD_UP":
                                p.state = "hardware offline: nodecard %s" % nc.id
                                self.offline_partitions.append(p.name)
                        for s in p.switches:
                            if s in busted_switches:
                                p.state = "hardware offline: switch %s" % s
                                self.offline_partitions.append(p.name)
                        for dep_name in p._wiring_conflicts:
                            if self._partitions[dep_name].state in [
                                    "busy", "allocated", "cleanup"
                            ]:
                                p.state = "blocked-wiring (%s)" % dep_name
                                break
                        for part_name in self.failed_diags:
                            part = self._partitions[part_name]
                            if p.name == part.name:
                                p.state = "failed diags"
                            elif p.name in part.parents or p.name in part.children:
                                p.state = "blocked by failed diags"
            except:
                self.logger.error("error in update_partition_state",
                                  exc_info=True)

            self._partitions_lock.release()

            # cleanup partitions and set their kernels back to the default (while _not_ holding the lock)
            pnames_cleaned = []
            for p in partitions_cleanup:
                self.logger.info(
                    "partition %s: starting partition destruction", p.name)
                pnames_destroyed = []
                parts = list(p._all_children)
                parts.append(p)
                for part in parts:
                    pnames_cleaned.append(part.name)
                    try:
                        bpart = part.bridge_partition
                        if bpart.state != "RM_PARTITION_FREE":
                            bpart.destroy()
                            pnames_destroyed.append(part.name)
                    except Cobalt.bridge.IncompatibleState:
                        pass
                    except:
                        self.logger.info(
                            "partition %s: an exception occurred while attempting to destroy partition %s",
                            p.name, part.name)
                if len(pnames_destroyed) > 0:
                    self.logger.info(
                        "partition %s: partition destruction initiated for %s",
                        p.name, ", ".join(pnames_destroyed))
                else:
                    self.logger.info(
                        "partition %s: no partition destruction was required",
                        p.name)
                try:
                    self._clear_kernel(p.name)
                    self.logger.info("partition %s: kernel settings cleared",
                                     p.name)
                except:
                    self.logger.error(
                        "partition %s: failed to clear kernel settings",
                        p.name)
            job_filter = Cobalt.bridge.JobFilter()
            job_filter.job_type = Cobalt.bridge.JOB_TYPE_ALL_FLAG
            jobs = Cobalt.bridge.JobList.by_filter(job_filter)
            for job in jobs:
                if job.partition_id in pnames_cleaned:
                    try:
                        job.cancel()
                        self.logger.info("partition %s: task %d canceled",
                                         job.partition_id, job.db_id)
                    except (Cobalt.bridge.IncompatibleState,
                            Cobalt.bridge.JobNotFound):
                        pass

            time.sleep(10)

    def _mark_partition_for_cleaning(self, pname, jobid):
        self._partitions_lock.acquire()
        try:
            p = self._partitions[pname]
            if p.used_by == jobid:
                p.cleanup_pending = True
                self.logger.info("partition %s: partition marked for cleanup",
                                 pname)
            elif p.used_by != None:
                self.logger.info("partition %s: job %s was not the current partition user (%s); partition not marked " + \
                    "for cleanup", pname, jobid, p.used_by)
        except:
            self.logger.exception(
                "partition %s: unexpected exception while marking the partition for cleanup",
                pname)
        self._partitions_lock.release()

    def _validate_kernel(self, kernel):
        if self.config.get('kernel') != 'true':
            return True
        kernel_dir = "%s/%s" % (os.path.expandvars(
            self.config.get('bootprofiles')), kernel)
        return os.path.exists(kernel_dir)

    def _set_kernel(self, partition, kernel):
        '''Set the kernel to be used by jobs run on the specified partition'''
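        # The boot image is selected by repointing a per-partition symlink at a
        # kernel profile directory.  Illustrative layout (both roots come from the
        # [bgsystem] config section, so the concrete paths are assumptions):
        #   <partitionboot>/<partition>  ->  <bootprofiles>/<kernel>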
        if self.config.get('kernel') != 'true':
            if kernel != "default":
                raise Exception("custom kernel capabilities disabled")
            return
        partition_link = "%s/%s" % (os.path.expandvars(
            self.config.get('partitionboot')), partition)
        kernel_dir = "%s/%s" % (os.path.expandvars(
            self.config.get('bootprofiles')), kernel)
        try:
            current = os.readlink(partition_link)
        except OSError:
            self.logger.error(
                "partition %s: failed to read partitionboot location %s" %
                (partition, partition_link))
            raise Exception("failed to read partitionboot location %s" %
                            (partition_link, ))
        if current != kernel_dir:
            if not self._validate_kernel(kernel):
                self.logger.error(
                    "partition %s: kernel directory \"%s\" does not exist" %
                    (partition, kernel_dir))
                raise Exception("kernel directory \"%s\" does not exist" %
                                (kernel_dir, ))
            self.logger.info(
                "partition %s: updating boot image; currently set to \"%s\"" %
                (partition, current.split('/')[-1]))
            try:
                os.unlink(partition_link)
                os.symlink(kernel_dir, partition_link)
            except OSError:
                self.logger.error(
                    "partition %s: failed to reset boot location" %
                    (partition, ))
                raise Exception("failed to reset boot location for partition" %
                                (partition, ))
            self.logger.info(
                "partition %s: boot image updated; now set to \"%s\"" %
                (partition, kernel))

    def _clear_kernel(self, partition):
        '''Set the kernel to be used by a partition to the default value'''
        if self.config.get('kernel') == 'true':
            try:
                self._set_kernel(partition, "default")
            except:
                logger.error("partition %s: failed to reset boot location" %
                             (partition, ))

    def generate_xml(self):
        """This method produces an XML file describing the managed partitions, suitable for use with the simulator."""
        ret = "<BG>\n"
        ret += "<PartitionList>\n"
        for p_name in self._managed_partitions:
            p = self._partitions[p_name]

            ret += "   <Partition name='%s'>\n" % p.name
            for nc in p.node_cards:
                ret += "      <NodeCard id='%s' />\n" % nc.id
            for s in p.switches:
                ret += "      <Switch id='%s' />\n" % s
            ret += "   </Partition>\n"

        ret += "</PartitionList>\n"

        ret += "</BG>\n"

        return ret

    generate_xml = exposed(generate_xml)

    def add_process_groups(self, specs):
        """Create a process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode', False) == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        script_pgroups = []
        if script_specs:
            for spec in script_specs:
                try:
                    self._set_kernel(
                        spec.get('location')[0], spec.get('kernel', "default"))
                except Exception, e:
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    pgroup.exit_status = 1
                    self.logger.info(
                        "process group %s: job %s/%s failed to set the kernel; %s",
                        pgroup.id, pgroup.jobid, pgroup.user, e)
                else:
                    try:
                        script_pgroup = ComponentProxy(
                            "script-manager").add_jobs([spec])
                    except (ComponentLookupError, xmlrpclib.Fault):
                        self._clear_kernel(spec.get('location')[0])
                        # FIXME: jobs that were already started are not reported
                        raise ProcessGroupCreationError(
                            "system::add_process_groups failed to communicate with script-manager"
                        )
                    new_pgroup = self.process_groups.q_add([spec])
                    pgroup = new_pgroup[0]
                    pgroup.script_id = script_pgroup[0]['id']
                    pgroup.nodect = self._partitions[pgroup.location[0]].size
                    self.logger.info(
                        "job %s/%s: process group %s created to track script",
                        pgroup.jobid, pgroup.user, pgroup.id)
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        pgroup.jobid)
                    if pgroup.kernel != "default":
                        self.logger.info(
                            "process group %s: job %s/%s using kernel %s",
                            pgroup.id, pgroup.jobid, pgroup.user,
                            pgroup.kernel)
                    script_pgroups.append(pgroup)

        # start up non-script mode jobs
        process_groups = self.process_groups.q_add(other_specs)
        for pgroup in process_groups:
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info(
                "job %s/%s: process group %s created to track mpirun status",
                pgroup.jobid, pgroup.user, pgroup.id)
            try:
                if not pgroup.true_mpi_args:
                    self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag
                # should be added to the process group that wait_process_group uses to determine when a process group is no
                # longer active.  an error message should also be attached to the process group so that cqm can report the
                # problem to the user.
                pgroup.exit_status = 1
                self.logger.info(
                    "process group %s: job %s/%s failed to set the kernel; %s",
                    pgroup.id, pgroup.jobid, pgroup.user, e)
            else:
                if pgroup.kernel != "default" and not pgroup.true_mpi_args:
                    self.logger.info(
                        "process group %s: job %s/%s using kernel %s",
                        pgroup.id, pgroup.jobid, pgroup.user, pgroup.kernel)
                pgroup.start()
Example #8
0
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = sets.Set()
        if self.config_file is not None:
            self.configure(self.config_file)

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 2,
            'config_file': self.config_file,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = sets.Set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def configure(self, config_file):
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def.getiterator("Partition"):
            if not partition_def.get("name").startswith("ANL"):
                continue

            node_list = []
            switch_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            # remove partitions which have less than 512 nodes
            if (NODES_PER_NODECARD * nc_count) < 512:
                continue
            if not wiring_cache.has_key(nc_count):
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append(
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    state="idle",
                ))

        partitions.q_add(tmp_list)

        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = sets.Set(p.switches)
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = sets.Set(other.switches)

                    if s1.intersection(s2):
                        self.logger.info(
                            "found a wiring dep between %s and %s", p.name,
                            other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)
        print "Total partitions: ", len(self._partitions)

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" %
                              (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" %
                              (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" %
                              (name, size))
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                              (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        self._partitions_lock.release()
        # explicitly call this, since the above "busy" is instantaneously available
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def add_process_groups(self, specs):
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        script_specs = []
        other_specs = []
        for spec in specs:
            if spec.get('mode') == "script":
                script_specs.append(spec)
            else:
                other_specs.append(spec)

        # start up script jobs
        new_pgroups = []
        if script_specs:
            try:
                for spec in script_specs:
                    script_pgroup = ComponentProxy("script-manager").add_jobs(
                        [spec])
                    new_pgroup = self.process_groups.q_add([spec])
                    new_pgroup[0].script_id = script_pgroup[0]['id']
                    self.reserve_resources_until(
                        spec['location'],
                        time.time() + 60 * float(spec['walltime']),
                        new_pgroup[0].jobid)
                    new_pgroups.append(new_pgroup[0])
            except (ComponentLookupError, xmlrpclib.Fault):
                raise ProcessGroupCreationError(
                    "system::add_process_groups failed to communicate with script-manager"
                )

        process_groups = self.process_groups.q_add(other_specs)
        for process_group in process_groups:
            self.start(process_group)

        return new_pgroups + process_groups

    add_process_groups = exposed(query(all_fields=True)(add_process_groups))

    def get_process_groups(self, specs):
        """Query process_groups from the simulator."""
        return self.process_groups.q_get(specs)

    get_process_groups = exposed(query(get_process_groups))

    def wait_process_groups(self, specs):
        """get process groups that have finished running."""
        self.logger.info("wait_process_groups(%r)" % (specs))
        process_groups = [
            pg for pg in self.process_groups.q_get(specs)
            if pg.exit_status is not None
        ]
        for process_group in process_groups:
            # jobs that were launched on behalf of the script manager shouldn't release the partition
            if not process_group.true_mpi_args:
                self.reserve_resources_until(process_group.location, None,
                                             process_group.jobid)
            del self.process_groups[process_group.id]
        return process_groups

    wait_process_groups = exposed(query(wait_process_groups))

    def signal_process_groups(self, specs, signame="SIGINT"):
        """Simulate the signaling of a process_group."""
        self.logger.info("signal_process_groups(%r, %r)" % (specs, signame))
        process_groups = self.process_groups.q_get(specs)
        for process_group in process_groups:
            if process_group.mode == "script":
                try:
                    pgroup = ComponentProxy("script-manager").signal_jobs(
                        [{
                            'id': process_group.script_id
                        }], "SIGTERM")
                except (ComponentLookupError, xmlrpclib.Fault):
                    logger.error(
                        "Failed to communicate with script manager when killing job"
                    )
            else:
                process_group.signals.append(signame)
        return process_groups

    signal_process_groups = exposed(query(signal_process_groups))

    def start(self, process_group):
        thread.start_new_thread(self._mpirun, (process_group, ))

    def _mpirun(self, process_group):
        argv = process_group._get_argv()
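        # argv is the simulated mpirun command line; the parsing below only looks
        # for '-partition', '-mode' and '-np'.  Illustrative value (an assumption
        # about what _get_argv() produces):
        #   ['mpirun', '-partition', 'ANL-R00-M0-512', '-mode', 'co', '-np', '512', 'a.out']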
        try:
            stdout = open(process_group.stdout or "/dev/null", "a")
        except:
            stdout = open("/dev/null", "a")
        try:
            stderr = open(process_group.stderr or "/dev/null", "a")
        except:
            stderr = open("/dev/null", "a")

        # resolve the log file name before the try block so the error handler
        # below can always report it
        clfn = process_group.cobalt_log_file or "/dev/null"
        try:
            cobalt_log_file = open(clfn, "a")
            print >> cobalt_log_file, "%s\n" % " ".join(argv[1:])
            cobalt_log_file.close()
        except:
            logger.error("Job %s/%s: unable to open cobaltlog file %s",
                         process_group.id,
                         process_group.user,
                         clfn,
                         exc_info=True)

        try:
            partition = argv[argv.index("-partition") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-partition' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-partition' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        try:
            mode = argv[argv.index("-mode") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-mode' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-mode' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        try:
            size = argv[argv.index("-np") + 1]
        except ValueError:
            print >> stderr, "ERROR: '-np' is a required flag"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        except IndexError:
            print >> stderr, "ERROR: '-np' requires a value"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        try:
            size = int(size)
        except ValueError:
            print >> stderr, "ERROR: '-np' got invalid value %r" % (size)
            print >> stderr, "FE_MPI (Info) : Exit status: 1"

        print >> stdout, "ENVIRONMENT"
        print >> stdout, "-----------"
        for key, value in process_group.env.iteritems():
            print >> stdout, "%s=%s" % (key, value)
        print >> stdout

        print >> stderr, "FE_MPI (Info) : Initializing MPIRUN"
        reserved = self.reserve_partition(partition, size)
        if not reserved:
            print >> stderr, "BE_MPI (ERROR): Failed to run process on partition"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1. ProcessGroup execution failed - unable to reserve partition", partition
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        hardware_failure = False
        for nc in self.partitions[partition].node_cards:
            if nc.id in self.failed_components:
                hardware_failure = True
                break
        for switch in self.partitions[partition].switches:
            if switch in self.failed_components:
                hardware_failure = True
                break

        if hardware_failure:
            excuses = [
                "incorrectly polarized packet accelerator",
                "the Internet is full",
                "side fumbling detected",
                "unilateral phase detractors offline",
            ]
            print >> stderr, "BE_MPI (ERROR): Booting aborted - partition is in DEALLOCATING ('D') state"
            print >> stderr, "BE_MPI (ERROR): Partition has not reached the READY ('I') state"
            print >> stderr, "BE_MPI (Info) : Checking for block error text:"
            print >> stderr, "BE_MPI (ERROR): block error text '%s.'" % random.choice(
                excuses)
            print >> stderr, "BE_MPI (Info) : Starting cleanup sequence"
            time.sleep(20)
            self.release_partition(partition)
            print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
            print >> stderr, "FE_MPI (ERROR): Failure list:"
            print >> stderr, "FE_MPI (ERROR): - 1.", partition, "couldn't boot."
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return

        print >> stderr, "FE_MPI (Info) : process group with id", process_group.id
        print >> stderr, "FE_MPI (Info) : Waiting for process_group to terminate"

        print >> stdout, "Running process_group: %s" % " ".join(argv)

        start_time = time.time()
        run_time = random.randint(self.MIN_RUN_TIME, self.MAX_RUN_TIME)
        my_exit_status = 0

        self.logger.info("process group %d running for about %f seconds",
                         process_group.id, run_time)
        while time.time() < (start_time + run_time):
            if "SIGKILL" in process_group.signals:
                process_group.exit_status = 1
                return
            elif "SIGTERM" in process_group.signals:
                print >> stderr, "FE_MPI (Info) : ProcessGroup got signal SIGTERM"
                my_exit_status = 1
                break
            else:
                time.sleep(1)  # tumblers better than pumpers

        print >> stderr, "FE_MPI (Info) : ProcessGroup", process_group.id, "switched to state TERMINATED ('T')"
        print >> stderr, "FE_MPI (Info) : ProcessGroup sucessfully terminated"
        print >> stderr, "BE_MPI (Info) : Releasing partition", partition
        released = self.release_partition(partition)
        if not released:
            print >> stderr, "BE_MPI (ERROR): Partition", partition, "could not switch to state FREE ('F')"
            print >> stderr, "BE_MPI (Info) : BE completed"
            print >> stderr, "FE_MPI (Info) : FE completed"
            print >> stderr, "FE_MPI (Info) : Exit status: 1"
            process_group.exit_status = 1
            return
        print >> stderr, "BE_MPI (Info) : Partition", partition, "switched to state FREE ('F')"
        print >> stderr, "BE_MPI (Info) : BE completed"
        print >> stderr, "FE_MPI (Info) : FE completed"
        print >> stderr, "FE_MPI (Info) : Exit status:", my_exit_status

        process_group.exit_status = my_exit_status

    def update_partition_state(self):
        # first, set all of the nodecards to not busy
        for nc in self.node_card_cache.values():
            nc.used_by = ''

        self._partitions_lock.acquire()
        try:
            for p in self._partitions.values():
                p._update_node_cards()

            now = time.time()

            # since we don't have the bridge, a partition which isn't busy
            # should be set to idle and then blocked states can be derived
            for p in self._partitions.values():
                if p.state != "busy":
                    p.state = "idle"
                if p.reserved_until and now > p.reserved_until:
                    p.reserved_until = None
                    p.reserved_by = None

            for p in self._partitions.values():
                if p.state == "busy":
                    # when the partition becomes busy, if a script job isn't reserving it, then release the reservation
                    if not p.reserved_by:
                        p.reserved_until = False
                else:
                    if p.reserved_until:
                        p.state = "allocated"
                        for part in p._parents:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                        for part in p._children:
                            if part.state == "idle":
                                part.state = "blocked (%s)" % (p.name, )
                    for diag_part in self.pending_diags:
                        if p.name == diag_part.name or p.name in diag_part.parents or p.name in diag_part.children:
                            p.state = "blocked by pending diags"
                    for nc in p.node_cards:
                        if nc.used_by:
                            p.state = "blocked (%s)" % nc.used_by
                            break
                    for dep_name in p._wiring_conflicts:
                        if self._partitions[dep_name].state in [
                                "allocated", "busy"
                        ]:
                            p.state = "blocked-wiring (%s)" % dep_name
                            break
                    for part_name in self.failed_diags:
                        part = self._partitions[part_name]
                        if p.name == part.name:
                            p.state = "failed diags"
                        elif p.name in part.parents or p.name in part.children:
                            p.state = "blocked by failed diags"
        except:
            self.logger.error("error in update_partition_state", exc_info=True)

        self._partitions_lock.release()

    update_partition_state = automatic(update_partition_state)

    def add_failed_components(self, component_names):
        success = []
        for name in component_names:
            if self.node_card_cache.has_key(name):
                self.failed_components.add(name)
                success.append(name)
            else:
                for p in self._partitions.values():
                    if name in p.switches:
                        self.failed_components.add(name)
                        success.append(name)
                        break
        return success

    add_failed_component = exposed(add_failed_components)

    def del_failed_components(self, component_names):
        success = []
        for name in component_names:
            try:
                self.failed_components.remove(name)
                success.append(name)
            except KeyError:
                pass

        return success

    del_failed_components = exposed(del_failed_components)

    def list_failed_components(self, component_names):
        return list(self.failed_components)

    list_failed_components = exposed(list_failed_components)

    def launch_diags(self, partition, test_name):
        exit_value = 0
        for nc in partition.node_cards:
            if nc.id in self.failed_components:
                exit_value = 1
        for switch in partition.switches:
            if switch in self.failed_components:
                exit_value = 2

        self.finish_diags(partition, test_name, exit_value)
Example #9
0
class Simulator(BGBaseSystem):
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """

    name = "system"
    implementation = "simulator"

    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180

    def __init__(self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000)  #why this magic number?
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = set()
        if self.config_file is not None:
            self.configure(self.config_file)

    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 2,
            'config_file': self.config_file,
            'partition_flags': flags
        }

    def __setstate__(self, state):
        Cobalt.Util.fix_set(state)
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()

    def save_me(self):
        Component.save(self)

    save_me = automatic(save_me)

    def configure(self, config_file):
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        def _get_node_card(name):
            if not self.node_card_cache.has_key(name):
                self.node_card_cache[name] = NodeCard(name)

            return self.node_card_cache[name]

        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" %
                              config_file)
            self.logger.error("exiting...")
            sys.exit(1)

        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" %
                              (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)

        # that 32 is not really constant -- it needs to either be read from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32

        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()

        tmp_list = []

        # this is going to hold partition objects from the bridge (not our own Partition)
        wiring_cache = {}
        bp_cache = {}

        for partition_def in system_def.getiterator("Partition"):
            node_list = []
            switch_list = []

            for nc in partition_def.getiterator("NodeCard"):
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)

            if nc_count not in wiring_cache:
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append(
                dict(
                    name=partition_def.get("name"),
                    queue=partition_def.get("queue", "default"),
                    size=NODES_PER_NODECARD * nc_count,
                    node_cards=node_list,
                    switches=switch_list,
                    state="idle",
                ))

        partitions.q_add(tmp_list)

        # find the wiring deps
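        # (two partitions built from the same number of node cards that share at
        # least one switch are recorded as wiring conflicts of each other)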
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = set(p.switches)
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = set(other.switches)

                    if s1.intersection(s2):
                        self.logger.info(
                            "found a wiring dep between %s and %s", p.name,
                            other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)

        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

    def reserve_partition(self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """

        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" %
                              (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" %
                              (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" %
                              (name, size))
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" %
                              (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        self._partitions_lock.release()
        # call this explicitly so the newly "busy" partition's relatives are blocked right away, rather than waiting for the automatic update
        self.update_partition_state()

        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True

    reserve_partition = exposed(reserve_partition)

    def release_partition(self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        self._partitions_lock.release()

        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True

    release_partition = exposed(release_partition)

    def add_process_groups(self, specs):
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """

        self.logger.info("add_process_groups(%r)" % (specs))

        # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag should be
        # added to the process group that wait_process_group uses to determine when a process group is no longer active.  an
        # error message should also be attached to the process group so that cqm can report the problem to the user.
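        # overall flow: create a tracking process group for each spec, choose a
        # forker component based on the run mode, reserve the partition until
        # starttime + walltime, then start the group; failures either set
        # exit_status to 255 or re-raise so cqm can retry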
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            pgroup.label = "Job %s/%s/%s" % (pgroup.jobid, pgroup.user,
                                             pgroup.id)
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info(
                "%s: process group %s created to track job status",
                pgroup.label, pgroup.id)
            try:
                #TODO: allow the kernel set step to work in the simulator.  For now this doesn't fly.
                pass
                #self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                self.logger.error("%s: failed to set the kernel; %s",
                                  pgroup.label, e)
                pgroup.exit_status = 255
            else:
                if pgroup.kernel != "default":
                    self.logger.info("%s: now using kernel %s", pgroup.label,
                                     pgroup.kernel)
                if pgroup.mode == "script":
                    pgroup.forker = 'user_script_forker'
                else:
                    pgroup.forker = 'bg_mpirun_forker'
                if self.reserve_resources_until(
                        pgroup.location,
                        float(pgroup.starttime) + 60 * float(pgroup.walltime),
                        pgroup.jobid):
                    try:
                        pgroup.start()
                        if pgroup.head_pid is None:
                            self.logger.error(
                                "%s: process group failed to start using the %s component; releasing resources",
                                pgroup.label, pgroup.forker)
                            self.reserve_resources_until(
                                pgroup.location, None, pgroup.jobid)
                            pgroup.exit_status = 255
                    except ComponentLookupError, e:
                        self.logger.error(
                            "%s: failed to contact the %s component",
                            pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except xmlrpclib.Fault, e:
                        self.logger.error(
                            "%s: a fault occurred while attempting to start the process group using the %s "
                            "component", pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except:
Beispiel #10
0
class Simulator (BGBaseSystem):
    
    """Generic system simulator.
    
    Methods:
    configure -- load partitions from an xml file
    reserve_partition -- lock a partition for use by a process_group (exposed)
    release_partition -- release a locked (busy) partition (exposed)
    add_process_groups -- add (start) a process group on the system (exposed, query)
    get_process_groups -- retrieve process groups (exposed, query)
    wait_process_groups -- get process groups that have exited, and remove them from the system (exposed, query)
    signal_process_groups -- send a signal to the head process of the specified process groups (exposed, query)
    update_partition_state -- simulates updating partition state from the bridge API (automatic)
    """
    
    name = "system"
    implementation = "simulator"
    
    logger = logger

    MIN_RUN_TIME = 60
    MAX_RUN_TIME = 180
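    # presumably the bounds, in seconds, on how long a simulated job is allowed
    # to "run" before the simulator finishes it; the code that consumes these
    # values is not part of this excerpt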

    def __init__ (self, *args, **kwargs):
        BGBaseSystem.__init__(self, *args, **kwargs)
        sys.setrecursionlimit(5000) #why this magic number?
        self.process_groups.item_cls = BGSimProcessGroup
        self.config_file = kwargs.get("config_file", None)
        self.failed_components = set()
        if self.config_file is not None:
            self.configure(self.config_file)
    
    def __getstate__(self):
        flags = {}
        for part in self._partitions.values():
            sched = None
            func = None
            queue = None
            if hasattr(part, 'scheduled'):
                sched = part.scheduled
            if hasattr(part, 'functional'):
                func = part.functional
            if hasattr(part, 'queue'):
                queue = part.queue
            flags[part.name] = (sched, func, queue)
        return {
            'managed_partitions': self._managed_partitions,
            'version': 2,
            'config_file': self.config_file,
            'partition_flags': flags
        }
    
    def __setstate__(self, state):
        Cobalt.Util.fix_set(state)
        sys.setrecursionlimit(5000)
        self._managed_partitions = state['managed_partitions']
        self.config_file = state['config_file']
        self._partitions = PartitionDict()
        self.process_groups = BGProcessGroupDict()
        self.process_groups.item_cls = BGSimProcessGroup
        self.node_card_cache = dict()
        self._partitions_lock = thread.allocate_lock()
        self.failed_components = set()
        self.pending_diags = dict()
        self.failed_diags = list()
        self.bridge_in_error = False
        self.cached_partitions = None
        self.offline_partitions = []
        if self.config_file is not None:
            self.configure(self.config_file)

        if 'partition_flags' in state:
            for pname, flags in state['partition_flags'].items():
                if pname in self._partitions:
                    self._partitions[pname].scheduled = flags[0]
                    self._partitions[pname].functional = flags[1]
                    self._partitions[pname].queue = flags[2]
                else:
                    logger.info("Partition %s is no longer defined" % pname)

        self.update_relatives()
        self.lock = threading.Lock()
        self.statistics = Statistics()
        
    def save_me(self):
        Component.save(self)
    save_me = automatic(save_me)


    def configure (self, config_file):
        
        """Configure simulated partitions.
        
        Arguments:
        config_file -- xml configuration file
        """
        
        def _get_node_card(name):
            if name not in self.node_card_cache:
                self.node_card_cache[name] = NodeCard(name)
                
            return self.node_card_cache[name]
            
            
        self.logger.info("configure()")
        try:
            system_doc = ElementTree.parse(config_file)
        except IOError:
            self.logger.error("unable to open file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
        except:
            self.logger.error("problem loading data from file: %r" % config_file)
            self.logger.error("exiting...")
            sys.exit(1)
            
        system_def = system_doc.getroot()
        if system_def.tag != "BG":
            self.logger.error("unexpected root element in %r: %r" % (config_file, system_def.tag))
            self.logger.error("exiting...")
            sys.exit(1)
        
        # that 32 is not really a constant -- it needs to be read either from cobalt.conf or from the bridge API
        NODES_PER_NODECARD = 32
                
        # initialize a new partition dict with all partitions
        #
        partitions = PartitionDict()
        
        tmp_list = []

        # wiring_cache groups partition names by node-card count (used below to find wiring conflicts); bp_cache is unused here
        wiring_cache = {}
        bp_cache = {}
        
        for partition_def in system_def.getiterator("Partition"):
            node_list = []
            switch_list = []
            
            for nc in partition_def.getiterator("NodeCard"): 
                node_list.append(_get_node_card(nc.get("id")))

            nc_count = len(node_list)
            
            if nc_count not in wiring_cache:
                wiring_cache[nc_count] = []
            wiring_cache[nc_count].append(partition_def.get("name"))

            for s in partition_def.getiterator("Switch"):
                switch_list.append(s.get("id"))

            tmp_list.append( dict(
                name = partition_def.get("name"),
                queue = partition_def.get("queue", "default"),
                size = NODES_PER_NODECARD * nc_count,
                node_cards = node_list,
                switches = switch_list,
                state = "idle",
            ))
        
        partitions.q_add(tmp_list)
        
        # find the wiring deps
        for size in wiring_cache:
            for p in wiring_cache[size]:
                p = partitions[p]
                s1 = set( p.switches )
                for other in wiring_cache[size]:
                    other = partitions[other]
                    if (p.name == other.name):
                        continue

                    s2 = set( other.switches )
                    
                    if s1.intersection(s2):
                        self.logger.info("found a wiring dep between %s and %s", p.name, other.name)
                        partitions[p.name]._wiring_conflicts.add(other.name)
        
            
        # update object state
        self._partitions.clear()
        self._partitions.update(partitions)

    
    def reserve_partition (self, name, size=None):
        """Reserve a partition and block all related partitions.
        
        Arguments:
        name -- name of the partition to reserve
        size -- size of the process group reserving the partition (optional)
        """
        
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("reserve_partition(%r, %r) [does not exist]" % (name, size))
            return False
        if partition.state != "allocated":
            self.logger.error("reserve_partition(%r, %r) [%s]" % (name, size, partition.state))
            return False
        if not partition.functional:
            self.logger.error("reserve_partition(%r, %r) [not functional]" % (name, size))
        if size is not None and size > partition.size:
            self.logger.error("reserve_partition(%r, %r) [size mismatch]" % (name, size))
            return False

        self._partitions_lock.acquire()
        try:
            partition.state = "busy"
            partition.reserved_until = False
        except:
            self.logger.error("error in reserve_partition", exc_info=True)
        self._partitions_lock.release()
        # call this explicitly so the newly "busy" partition's relatives are blocked right away, rather than waiting for the automatic update
        self.update_partition_state()
        
        self.logger.info("reserve_partition(%r, %r)" % (name, size))
        return True
    reserve_partition = exposed(reserve_partition)
    
    def release_partition (self, name):
        """Release a reserved partition.
        
        Arguments:
        name -- name of the partition to release
        """
        try:
            partition = self.partitions[name]
        except KeyError:
            self.logger.error("release_partition(%r) [already free]" % (name))
            return False
        if not partition.state == "busy":
            self.logger.info("release_partition(%r) [not busy]" % (name))
            return False
                
        self._partitions_lock.acquire()
        try:
            partition.state = "idle"
        except:
            self.logger.error("error in release_partition", exc_info=True)
        self._partitions_lock.release()
        
        # explicitly unblock the blocked partitions
        self.update_partition_state()

        self.logger.info("release_partition(%r)" % (name))
        return True
    release_partition = exposed(release_partition)
    
    def add_process_groups (self, specs):
        
        """Create a simulated process group.
        
        Arguments:
        specs -- list of dictionaries, each specifying a process group to start
        """
        
        self.logger.info("add_process_groups(%r)" % (specs))

        # FIXME: setting exit_status to signal the job has failed isn't really the right thing to do.  another flag should be
        # added to the process group that wait_process_group uses to determine when a process group is no longer active.  an
        # error message should also be attached to the process group so that cqm can report the problem to the user.
        process_groups = self.process_groups.q_add(specs)
        for pgroup in process_groups:
            pgroup.label = "Job %s/%s/%s" % (pgroup.jobid, pgroup.user, pgroup.id)
            pgroup.nodect = self._partitions[pgroup.location[0]].size
            self.logger.info("%s: process group %s created to track job status", pgroup.label, pgroup.id)
            try:
                #TODO: allow the kernel set step to work in the simulator.  For now this doesn't fly.
                pass
                #self._set_kernel(pgroup.location[0], pgroup.kernel)
            except Exception, e:
                self.logger.error("%s: failed to set the kernel; %s", pgroup.label, e)
                pgroup.exit_status = 255
            else:
                if pgroup.kernel != "default":
                    self.logger.info("%s: now using kernel %s", pgroup.label, pgroup.kernel)
                if pgroup.mode == "script":
                    pgroup.forker = 'user_script_forker'
                else:
                    pgroup.forker = 'bg_mpirun_forker'
                if self.reserve_resources_until(pgroup.location, float(pgroup.starttime) + 60*float(pgroup.walltime), pgroup.jobid):
                    try:
                        pgroup.start()
                        if pgroup.head_pid is None:
                            self.logger.error("%s: process group failed to start using the %s component; releasing resources",
                                pgroup.label, pgroup.forker)
                            self.reserve_resources_until(pgroup.location, None, pgroup.jobid)
                            pgroup.exit_status = 255
                    except ComponentLookupError, e:
                        self.logger.error("%s: failed to contact the %s component", pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except xmlrpclib.Fault, e:
                        self.logger.error("%s: a fault occurred while attempting to start the process group using the %s "
                            "component", pgroup.label, pgroup.forker)
                        # do not release the resources; instead re-raise the exception and allow cqm the opportunity to retry
                        # until the job has exhausted its maximum allotted time
                        del self.process_groups[pgroup.id]
                        raise
                    except: