def __init__(self, *args, **kwargs):
    """Create an empty Breadboard system.

    All positional and keyword arguments are forwarded untouched to
    Component.__init__.  Afterwards the instance owns an empty resource
    dictionary, a process group dictionary that builds BBProcessGroup
    items, and a single "default" queue seeded from the resources.
    """
    Component.__init__(self, *args, **kwargs)

    # Resource bookkeeping.
    self.resources = ResourceDict()

    # Process group bookkeeping; new entries are BBProcessGroup objects.
    pgroups = ProcessGroupDict()
    pgroups.item_cls = BBProcessGroup
    self.process_groups = pgroups

    # queue name -> set of resources assigned to that queue
    self.queue_assignments = {"default": sets.Set(self.resources)}
def __init__(self, *args, **kwargs):
    """Set up an empty cluster system and try to load the hostfile.

    Arguments are forwarded untouched to Component.__init__.  Node
    bookkeeping starts out empty and is filled in by
    self.configure(cluster_hostfile).  A failure to load the hostfile is
    logged and otherwise ignored, so the component still comes up (with
    whatever nodes configure() managed to register).
    """
    Component.__init__(self, *args, **kwargs)
    self.process_groups = ProcessGroupDict()
    self.all_nodes = set()
    self.running_nodes = set()
    self.down_nodes = set()
    self.queue_assignments = {}
    self.node_order = {}
    try:
        self.configure(cluster_hostfile)
    except Exception:
        # Deliberately best effort, but do not swallow SystemExit or
        # KeyboardInterrupt the way a bare "except:" would.
        self.logger.error("unable to load hostfile")
    # Every node known after configuration starts in the default queue.
    self.queue_assignments["default"] = set(self.all_nodes)
    self.alloc_only_nodes = {}  # nodename:starttime
    self.cleaning_processes = []
    # keep track of which jobs still have hosts being cleaned
    self.cleaning_host_count = {}  # jobid:count
    self.locations_by_jobid = {}  # jobid:[locations]
    self.jobid_to_user = {}  # jobid:username

    # NOTE(review): 300 is presumably the fallback used when
    # "allocation_timeout" is not configured -- confirm against
    # get_cluster_system_config.
    self.alloc_timeout = int(get_cluster_system_config("allocation_timeout",
                                                       300))
    self.logger.info("allocation timeout set to %d seconds." %
                     self.alloc_timeout)
def __setstate__(self, state):
    """Restore the component from a saved state dictionary.

    The node list is rebuilt by self.configure().  Saved queue
    assignments and down nodes are then intersected with the nodes that
    currently exist, so that nodes which have disappeared can never be
    scheduled.  Queues left without any node are dropped.  All run-time
    bookkeeping (process groups, running nodes, cleaning state) starts
    out empty.
    """
    Component.__setstate__(self, state)
    self.all_nodes = set()
    self.node_order = {}
    self.configure()

    self.queue_assignments = state.get('queue_assignments', {})
    if self.queue_assignments == {}:
        # Nothing saved: every known node goes into the default queue.
        self.queue_assignments["default"] = set(self.all_nodes)
    else:
        # Trim each queue down to the nodes that still exist and remember
        # the queues that end up empty; they are deleted after the loop
        # because the dictionary must not change size while iterating.
        emptied_queues = []
        for queue_name, members in self.queue_assignments.iteritems():
            surviving = self.all_nodes & set(members)
            if surviving == set():
                emptied_queues.append(queue_name)
            self.queue_assignments[queue_name] = surviving
        for queue_name in emptied_queues:
            del self.queue_assignments[queue_name]

    saved_down = set(state.get('down_nodes', set()))
    self.down_nodes = self.all_nodes & saved_down

    self.process_groups = ProcessGroupDict()
    self.running_nodes = set()
    self.alloc_only_nodes = {}  # nodename:starttime
    if "cleaning_processes" not in state:
        self.cleaning_processes = []
    self.cleaning_host_count = {}  # jobid:count
    self.locations_by_jobid = {}  # jobid:[locations]
    self.jobid_to_user = {}  # jobid:username

    timeout = get_orcm_system_config("allocation_timeout", 300)
    self.alloc_timeout = int(timeout)
    self.logger.info("allocation timeout set to %d seconds." %
                     self.alloc_timeout)
def __init__(self, *args, **kwargs):
    """Set up an empty cluster system and try to load the hostfile.

    Arguments are forwarded untouched to Component.__init__.  The
    hostfile path is read from the "hostfile" option of the
    "cluster_system" section of CP and handed to self.configure().  A
    failure there is logged and otherwise ignored, so the component
    still comes up.
    """
    Component.__init__(self, *args, **kwargs)
    self.process_groups = ProcessGroupDict()
    self.pending_diags = dict()
    self.failed_diags = list()
    self.all_nodes = sets.Set()
    self.running_nodes = sets.Set()
    self.down_nodes = sets.Set()
    self.queue_assignments = {}
    self.node_order = {}
    try:
        self.configure(CP.get("cluster_system", "hostfile"))
    except Exception:
        # Deliberately best effort, but do not swallow SystemExit or
        # KeyboardInterrupt the way a bare "except:" would.
        self.logger.error("unable to load hostfile")
    # Every node known after configuration starts in the default queue.
    self.queue_assignments["default"] = sets.Set(self.all_nodes)
def __setstate__(self, state):
    """Restore the component from a saved state dictionary.

    Only "queue_assignments" and "down_nodes" are taken from state (both
    keys are required; a missing key raises KeyError).  Everything else
    is rebuilt: bookkeeping containers start out empty, the node list is
    reloaded through self.configure(), and a fresh lock and Statistics
    object are created.  A failure to load the hostfile is logged and
    otherwise ignored.
    """
    self.queue_assignments = state["queue_assignments"]
    self.down_nodes = state["down_nodes"]

    self.process_groups = ProcessGroupDict()
    self.pending_diags = dict()
    self.failed_diags = list()
    self.all_nodes = sets.Set()
    self.running_nodes = sets.Set()
    self.node_order = {}
    try:
        self.configure(CP.get("cluster_system", "hostfile"))
    except Exception:
        # Deliberately best effort, but do not swallow SystemExit or
        # KeyboardInterrupt the way a bare "except:" would.
        self.logger.error("unable to load hostfile")
    self.lock = threading.Lock()
    self.statistics = Statistics()
def _common_init_restart(self, state=None):
    '''Shared setup for cold initialization and for reinitialization.

    Input:
        state: [optional] saved state dictionary.  When given, the
               process groups stored under 'process_groups' are reused
               and the id generator is set from 'next_pg_id'.  When
               omitted, an empty process group dictionary is created
               whose items are of type self.pgroup_type.

    '''
    if state is not None:
        # Restart: pick up the saved process groups and log each one.
        restored = state.get('process_groups', ProcessGroupDict())
        self.process_groups = restored
        for pgroup in self.process_groups.values():
            _logger.info('recovering pgroup %s, jobid %s', pgroup.id,
                         pgroup.jobid)
        next_id = int(state['next_pg_id'])
        self.process_groups.id_gen.set(next_id)
    else:
        # Cold start: nothing to recover.
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = self.pgroup_type

    self.process_group_actions = {}
    # list of forker identifiers to use with ComponentProxy
    self.forkers = []
    # dict of forkers and counts of pgs attached
    self.forker_taskcounts = {}
    # dict of forkers a tuple (host, port)
    self.forker_locations = {}
    # list of hosts that qsub -I requires ssh-ing to a forker host
    self.remote_qsub_hosts = []
    self.process_groups_lock = RLock()
    self.update_launchers()
def __setstate__(self, state):
    """Restore the component from a saved state dictionary.

    "queue_assignments" and "down_nodes" are taken from state (both keys
    are required; a missing key raises KeyError).  The node list is
    reloaded through self.configure(cluster_hostfile); a failure there
    is logged and otherwise ignored.  Run-time bookkeeping starts out
    empty.
    """
    Component.__setstate__(self, state)
    self.queue_assignments = state["queue_assignments"]
    self.down_nodes = state["down_nodes"]

    self.process_groups = ProcessGroupDict()
    self.all_nodes = set()
    self.running_nodes = set()
    self.node_order = {}
    try:
        self.configure(cluster_hostfile)
    except Exception:
        # Deliberately best effort, but do not swallow SystemExit or
        # KeyboardInterrupt the way a bare "except:" would.
        self.logger.error("unable to load hostfile")
    self.alloc_only_nodes = {}  # nodename:starttime
    # cleaning_processes is only reset here when the saved state carries
    # none.
    if "cleaning_processes" not in state:
        self.cleaning_processes = []
    self.cleaning_host_count = {}  # jobid:count
    self.locations_by_jobid = {}  # jobid:[locations]
    self.jobid_to_user = {}  # jobid:username

    # NOTE(review): 300 is presumably the fallback used when
    # "allocation_timeout" is not configured -- confirm against
    # get_cluster_system_config.
    self.alloc_timeout = int(get_cluster_system_config("allocation_timeout",
                                                       300))
    self.logger.info("allocation timeout set to %d seconds." %
                     self.alloc_timeout)
def __init__(self):
    """Initialize the instance by delegating to ProcessGroupDict.__init__."""
    ProcessGroupDict.__init__(self)
class BBSystem(Component):
    """Breadboard system component.

    Methods:
    add_process_groups -- allocates nodes
    get_process_groups -- get process groups based on specs
    signal_process_groups -- signal a process group
    wait_process_groups -- removed process groups based on specs
    """

    name = "system"
    implementation = "Breadboard"

    def __init__(self, *args, **kwargs):
        """Create an empty system with a single "default" queue.

        Arguments are forwarded untouched to Component.__init__.
        """
        Component.__init__(self, *args, **kwargs)
        self.resources = ResourceDict()
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = BBProcessGroup
        self.queue_assignments = {}
        self.queue_assignments["default"] = sets.Set(self.resources)

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """Allocate nodes and add the list of those allocated to the PGDict"""
        return self.process_groups.q_add(specs,
                                         lambda x, _: self._start_pg(x))
    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Get a list of existing allocations"""
        # Give every process group a chance to update itself first.
        self._wait()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)
    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        return self.process_groups.q_del(
            specs, lambda x, _: self._release_resources(x))
    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################
    def _start_pg(self, pgp):
        """Starts a process group by initiating building/rebooting nodes"""

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {"user": pgp.user,
                                          "state": "Cobalt",
                                          "comment": "Managed by Cobalt"})
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

        specs = [{"name": name, "attributes": "*"} for name in pgp.location]
        resources = self.get_resources(specs)
        action = "build-%s" % pgp.kernel
        for res in resources:
            # Set build action for each resource
            specs = [{"name": res.name}]
            new_attrs = {"attributes": {"action": action}}
            self.set_attributes(specs, new_attrs)
            mac = res.attributes["mac"]
            linkname = "/tftpboot/pxelinux.cfg/01-%s" \
                % mac.replace(":", "-").lower()
            # Leave the link alone when it already points at the action.
            if os.readlink(linkname) == action:
                continue
            os.unlink(linkname)
            os.symlink(action, linkname)
        for res in resources:
            # Cycle power
            os.system("/usr/sbin/pm -c %s" % res.name)
            # Add resource to list of building nodes
            pgp.building_nodes.append(res.name)

    def _check_builds_done(self):
        """Checks if nodes are done building for each process group and
        scripts can begin running"""
        for pgp in [x for x in self.process_groups.itervalues()
                    if (len(x.building_nodes) > 0 or
                        len(x.pinging_nodes) > 0)]:
            specs = [{"name": name, "attributes": "*"}
                     for name in pgp.building_nodes]
            building = self.get_resources(specs)
            build_action = "build-%s" % pgp.kernel
            # A node whose action is no longer the build action has
            # finished building; it moves on to the ping stage.
            for node in building:
                if node.attributes["action"] != build_action:
                    pgp.building_nodes.remove(node.name)
                    pgp.pinging_nodes.append(node.name)
            # Iterate over a snapshot: removing from the list that is
            # being iterated would skip the node after each removal.
            for nodename in list(pgp.pinging_nodes):
                # Non-zero exit status: node does not answer yet.
                if os.system("/bin/ping -c 1 -W 1 %s > /dev/null"
                             % nodename):
                    continue
                pgp.pinging_nodes.remove(nodename)
            if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0:
                pgp.start()
    _check_builds_done = automatic(_check_builds_done)

    def node_done_building(self, node):
        """Sets a node as done building

        Arguments:
        node -- string name of node that is done building

        Returns: nothing
        """
        specs = [{"name": node, "attributes": "*"}]
        nodedata = self.get_resources(specs)
        if len(nodedata) > 0:
            # Switch the node's action from "build-..." to "boot-...".
            buildimage = nodedata[0].attributes["action"]
            nodedata[0].attributes["action"] = buildimage.replace("build-",
                                                                  "boot-")
    node_done_building = exposed(node_done_building)

    def _wait(self):
        """Calls the process group container's wait() method"""
        for pgp in self.process_groups.itervalues():
            pgp.wait()
    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Releases the resources held by a process group"""
        os.system("/usr/sbin/pm -0 %s" % " ".join(pgp.location))
        specs = [{"name": name} for name in pgp.location]
        new_attrs = {"state": "idle"}
        self.set_attributes(specs, new_attrs)

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {"user": "******"})
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

    ####################################
    # Methods for dealing with resources
    ####################################
    def add_resources(self, specs):
        """Add a resource to this system

        Arguments:
        specs -- A list of dictionaries with the attributes for the resources

        Returns: list of values added, or the string "KeyError" when
        adding raised a KeyError
        """
        try:
            ret = self.resources.q_add(specs)
            for res in ret:
                self.queue_assignments["default"].add(res)
        except KeyError:
            ret = "KeyError"
        return ret
    add_resources = exposed(query(add_resources))

    def remove_resources(self, specs):
        """Remove a resource from this system

        Arguments:
        specs -- A list of dictionaries with the attributes to pick which
                 resources to remove

        Returns: list of resources removed
        """
        ret = self.resources.q_del(specs)
        for res in ret:
            self.queue_assignments["default"].discard(res)
        return ret
    remove_resources = exposed(remove_resources)

    def get_resources(self, specs):
        """Returns a list of all the resources for this system matching the
        given specs (list of dictionaries)"""
        return self.resources.q_get(specs)
    get_resources = exposed(query(get_resources))

    def set_attributes(self, specs, newattrs):
        """Sets an attribute in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        newattrs -- a dictionary with key:val pairs of attributes to set

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs,
            lambda x, y: [set_attr(x, key, val)
                          for key, val in y.iteritems()],
            newattrs)
    set_attributes = exposed(query(set_attributes))

    def remove_attributes(self, specs, attrs):
        """Removes other attributes in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        attrs -- list of names of attributes to remove from
                 resource.attributes

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs, lambda x, y: [rem_attr(x, key) for key in y], attrs)
    remove_attributes = exposed(query(remove_attributes))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    def validate_job(self, spec):
        """Validate a job for submission

        Arguments:
        spec -- job specification dictionary

        Returns: the spec, with "nodecount" converted to an int

        Raises:
        JobValidationError -- when the node count is not an integer or is
                              out of range, the walltime is below 15, the
                              requested kernel image does not exist, or
                              too few nodes match the requested attributes
        """
        max_nodes = len(self.get_resources(
            [{"name": "*", "functional": True, "scheduled": True}]))
        try:
            spec["nodecount"] = int(spec["nodecount"])
        except (TypeError, ValueError):
            # TypeError covers values such as None that int() rejects
            # without raising ValueError.
            raise JobValidationError("Non-integer node count")
        if not 0 < spec["nodecount"] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec["time"]) < 15:
            raise JobValidationError("Walltime less than minimum 15 minutes")
        if "kernel" in spec:
            # Both the build and the boot config must exist for the image.
            build_cfg = "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"]
            boot_cfg = "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"]
            if not (os.path.exists(build_cfg) and os.path.exists(boot_cfg)):
                raise JobValidationError(
                    ("Specified image %s (from -k " +
                     "'kernel' flag) does not exist") % spec["kernel"])
        if "attrs" in spec:
            matched_res = self.resources.get_attr_matched_resources(
                [{"name": "*", "functional": True, "scheduled": True,
                  "attributes": "*"}],
                spec["attrs"])
            if spec["nodecount"] > len(matched_res):
                raise JobValidationError("Not enough nodes exist with the " +
                                         "attributes to match")
        return spec
    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """Makes sure a 'location string' is valid"""
        resources = self.get_resources([{"name": r} for r in location_list])
        return [r.name for r in resources]
    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """Finds and reserves a list of nodes in which the job can run

        Arguments:
        job_location_args -- A list of dictionaries with info about the job
            jobid -- string identifier
            nodes -- int number of nodes
            queue -- string queue name
            required -- ??
            utility_score -- ??
            threshold -- ??
            walltime -- ??
            attrs -- dictionary of attributes to match against
        end_times -- supposed time the job will end

        Returns: Dictionary with list of nodes a job can run on, keyed by
        jobid
        """
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        def namesort(res):
            """Used to sort resources by name"""
            return res.name

        # Note: this sorts the caller's list in place.
        job_location_args.sort(key=jobsort)
        for job in job_location_args:
            specs = [{"name": "*", "functional": True, "scheduled": True,
                      "state": "idle", "attributes": "*"}]
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            resources = self.resources.get_attr_matched_resources(
                specs, job["attrs"])
            if len(resources) < job["nodes"]:
                # Can't schedule job - not enough resources
                continue
            resources.sort(key=namesort)
            used_resources = resources[:job["nodes"]]
            # Mark the chosen nodes busy so later jobs in this pass
            # cannot be given the same nodes.
            for res in used_resources:
                res.state = "busy"
            locations[job["jobid"]] = [r.name for r in used_resources]
        return locations
    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """Finds equivalent queues"""
        equiv = []
        # First pass: group active queues whose resource sets overlap.
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if queue not in active_queue_names:
                continue
            found_a_match = False
            for equ in equiv:
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({'queues': set([queue]),
                              'data': set(self.queue_assignments[queue]),
                              'reservations': set()})
        # Second pass: merge classes that ended up sharing a queue.
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            # Attach every reservation that names a host in this class.
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            # Convert the sets to lists and drop the helper 'data' entry
            # before returning.
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
class BBSystem(Component):
    """Breadboard system component.

    Methods:
    add_process_groups -- allocates nodes
    get_process_groups -- get process groups based on specs
    signal_process_groups -- signal a process group
    wait_process_groups -- removed process groups based on specs
    """

    name = "system"
    implementation = "Breadboard"

    def __init__(self, *args, **kwargs):
        """Create an empty system with a single "default" queue.

        Arguments are forwarded untouched to Component.__init__.
        """
        Component.__init__(self, *args, **kwargs)
        self.resources = ResourceDict()
        self.process_groups = ProcessGroupDict()
        self.process_groups.item_cls = BBProcessGroup
        self.queue_assignments = {}
        self.queue_assignments["default"] = sets.Set(self.resources)

    #####################
    # Main set of methods
    #####################
    def add_process_groups(self, specs):
        """Allocate nodes and add the list of those allocated to the PGDict"""
        return self.process_groups.q_add(specs,
                                         lambda x, _: self._start_pg(x))
    add_process_groups = exposed(query(add_process_groups))

    def get_process_groups(self, specs):
        """Get a list of existing allocations"""
        # Give every process group a chance to update itself first.
        self._wait()
        return self.process_groups.q_get(specs)
    get_process_groups = exposed(query(get_process_groups))

    def signal_process_groups(self, specs, sig):
        """Free the specified process group (set of allocated nodes)"""
        return self.process_groups.q_get(specs, lambda x, y: x.signal(y), sig)
    signal_process_groups = exposed(query(signal_process_groups))

    def wait_process_groups(self, specs):
        """Remove terminated process groups"""
        return self.process_groups.q_del(
            specs, lambda x, _: self._release_resources(x))
    wait_process_groups = exposed(query(wait_process_groups))

    #########################################
    # Methods for dealing with Process Groups
    #########################################
    def _start_pg(self, pgp):
        """Starts a process group by initiating building/rebooting nodes"""

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {
            "user": pgp.user,
            "state": "Cobalt",
            "comment": "Managed by Cobalt"
        })
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

        specs = [{"name": name, "attributes": "*"} for name in pgp.location]
        resources = self.get_resources(specs)
        action = "build-%s" % pgp.kernel
        for res in resources:
            # Set build action for each resource
            specs = [{"name": res.name}]
            new_attrs = {"attributes": {"action": action}}
            self.set_attributes(specs, new_attrs)
            mac = res.attributes["mac"]
            linkname = "/tftpboot/pxelinux.cfg/01-%s" \
                % mac.replace(":", "-").lower()
            # Leave the link alone when it already points at the action.
            if os.readlink(linkname) == action:
                continue
            os.unlink(linkname)
            os.symlink(action, linkname)
        for res in resources:
            # Cycle power
            os.system("/usr/sbin/pm -c %s" % res.name)
            # Add resource to list of building nodes
            pgp.building_nodes.append(res.name)

    def _check_builds_done(self):
        """Checks if nodes are done building for each process group and
        scripts can begin running"""
        for pgp in [
                x for x in self.process_groups.itervalues()
                if (len(x.building_nodes) > 0 or len(x.pinging_nodes) > 0)
        ]:
            specs = [{
                "name": name,
                "attributes": "*"
            } for name in pgp.building_nodes]
            building = self.get_resources(specs)
            build_action = "build-%s" % pgp.kernel
            # A node whose action is no longer the build action has
            # finished building; it moves on to the ping stage.
            for node in building:
                if node.attributes["action"] != build_action:
                    pgp.building_nodes.remove(node.name)
                    pgp.pinging_nodes.append(node.name)
            # Iterate over a snapshot: removing from the list that is
            # being iterated would skip the node after each removal.
            for nodename in list(pgp.pinging_nodes):
                # Non-zero exit status: node does not answer yet.
                if os.system(
                        "/bin/ping -c 1 -W 1 %s > /dev/null" % nodename):
                    continue
                pgp.pinging_nodes.remove(nodename)
            if len(pgp.building_nodes) == 0 and len(pgp.pinging_nodes) == 0:
                pgp.start()
    _check_builds_done = automatic(_check_builds_done)

    def node_done_building(self, node):
        """Sets a node as done building

        Arguments:
        node -- string name of node that is done building

        Returns: nothing
        """
        specs = [{"name": node, "attributes": "*"}]
        nodedata = self.get_resources(specs)
        if len(nodedata) > 0:
            # Switch the node's action from "build-..." to "boot-...".
            buildimage = nodedata[0].attributes["action"]
            nodedata[0].attributes["action"] = buildimage.replace(
                "build-", "boot-")
    node_done_building = exposed(node_done_building)

    def _wait(self):
        """Calls the process group container's wait() method"""
        for pgp in self.process_groups.itervalues():
            pgp.wait()
    _wait = automatic(_wait)

    def _release_resources(self, pgp):
        """Releases the resources held by a process group"""
        os.system("/usr/sbin/pm -0 %s" % " ".join(pgp.location))
        specs = [{"name": name} for name in pgp.location]
        new_attrs = {"state": "idle"}
        self.set_attributes(specs, new_attrs)

        ###########################################
        ### The following is for back-compatibility
        ### with bballoc (bbtools) until breadboard
        ### is switched entirely to run on cobalt
        ###########################################
        bbdata = bblib.BBConfig("/etc/bb.xml")
        bbdata.SetNodeAttr(pgp.location, {"user": "******"})
        bbdata.WriteAndClose()
        ###########################################
        ### End of back-compatibility
        ###########################################

    ####################################
    # Methods for dealing with resources
    ####################################
    def add_resources(self, specs):
        """Add a resource to this system

        Arguments:
        specs -- A list of dictionaries with the attributes for the resources

        Returns: list of values added, or the string "KeyError" when
        adding raised a KeyError
        """
        try:
            ret = self.resources.q_add(specs)
            for res in ret:
                self.queue_assignments["default"].add(res)
        except KeyError:
            ret = "KeyError"
        return ret
    add_resources = exposed(query(add_resources))

    def remove_resources(self, specs):
        """Remove a resource from this system

        Arguments:
        specs -- A list of dictionaries with the attributes to pick which
                 resources to remove

        Returns: list of resources removed
        """
        ret = self.resources.q_del(specs)
        for res in ret:
            self.queue_assignments["default"].discard(res)
        return ret
    remove_resources = exposed(remove_resources)

    def get_resources(self, specs):
        """Returns a list of all the resources for this system matching the
        given specs (list of dictionaries)"""
        return self.resources.q_get(specs)
    get_resources = exposed(query(get_resources))

    def set_attributes(self, specs, newattrs):
        """Sets an attribute in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        newattrs -- a dictionary with key:val pairs of attributes to set

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs,
            lambda x, y: [set_attr(x, key, val)
                          for key, val in y.iteritems()],
            newattrs)
    set_attributes = exposed(query(set_attributes))

    def remove_attributes(self, specs, attrs):
        """Removes other attributes in specified resources

        Arguments:
        specs -- list of dictionaries with resource attributes to match
        attrs -- list of names of attributes to remove from
                 resource.attributes

        Returns: a list of the changed resources
        """
        return self.resources.q_get(
            specs, lambda x, y: [rem_attr(x, key) for key in y], attrs)
    remove_attributes = exposed(query(remove_attributes))

    ##########################################################
    # Methods for interacting with scheduler and queue-manager
    ##########################################################
    def validate_job(self, spec):
        """Validate a job for submission

        Arguments:
        spec -- job specification dictionary

        Returns: the spec, with "nodecount" converted to an int

        Raises:
        JobValidationError -- when the node count is not an integer or is
                              out of range, the walltime is below 15, the
                              requested kernel image does not exist, or
                              too few nodes match the requested attributes
        """
        max_nodes = len(
            self.get_resources([{
                "name": "*",
                "functional": True,
                "scheduled": True
            }]))
        try:
            spec["nodecount"] = int(spec["nodecount"])
        except (TypeError, ValueError):
            # TypeError covers values such as None that int() rejects
            # without raising ValueError.
            raise JobValidationError("Non-integer node count")
        if not 0 < spec["nodecount"] <= max_nodes:
            raise JobValidationError("Node count out of realistic range")
        if float(spec["time"]) < 15:
            raise JobValidationError("Walltime less than minimum 15 minutes")
        if "kernel" in spec:
            # Both the build and the boot config must exist for the image.
            build_cfg = "/tftpboot/pxelinux.cfg/build-%s" % spec["kernel"]
            boot_cfg = "/tftpboot/pxelinux.cfg/boot-%s" % spec["kernel"]
            if not (os.path.exists(build_cfg) and os.path.exists(boot_cfg)):
                raise JobValidationError(
                    ("Specified image %s (from -k " +
                     "'kernel' flag) does not exist") % spec["kernel"])
        if "attrs" in spec:
            matched_res = self.resources.get_attr_matched_resources(
                [{
                    "name": "*",
                    "functional": True,
                    "scheduled": True,
                    "attributes": "*"
                }], spec["attrs"])
            if spec["nodecount"] > len(matched_res):
                raise JobValidationError("Not enough nodes exist with the " +
                                         "attributes to match")
        return spec
    validate_job = exposed(validate_job)

    def verify_locations(self, location_list):
        """Makes sure a 'location string' is valid"""
        resources = self.get_resources([{"name": r} for r in location_list])
        return [r.name for r in resources]
    verify_locations = exposed(verify_locations)

    def find_job_location(self, job_location_args, end_times):
        """Finds and reserves a list of nodes in which the job can run

        Arguments:
        job_location_args -- A list of dictionaries with info about the job
            jobid -- string identifier
            nodes -- int number of nodes
            queue -- string queue name
            required -- ??
            utility_score -- ??
            threshold -- ??
            walltime -- ??
            attrs -- dictionary of attributes to match against
        end_times -- supposed time the job will end

        Returns: Dictionary with list of nodes a job can run on, keyed by
        jobid
        """
        locations = {}

        def jobsort(job):
            """Used to sort job list by utility score"""
            return job["utility_score"]

        def namesort(res):
            """Used to sort resources by name"""
            return res.name

        # Note: this sorts the caller's list in place.
        job_location_args.sort(key=jobsort)
        for job in job_location_args:
            specs = [{
                "name": "*",
                "functional": True,
                "scheduled": True,
                "state": "idle",
                "attributes": "*"
            }]
            if "attrs" not in job or job["attrs"] is None:
                job["attrs"] = {}
            resources = self.resources.get_attr_matched_resources(
                specs, job["attrs"])
            if len(resources) < job["nodes"]:
                # Can't schedule job - not enough resources
                continue
            resources.sort(key=namesort)
            used_resources = resources[:job["nodes"]]
            # Mark the chosen nodes busy so later jobs in this pass
            # cannot be given the same nodes.
            for res in used_resources:
                res.state = "busy"
            locations[job["jobid"]] = [r.name for r in used_resources]
        return locations
    find_job_location = exposed(find_job_location)

    def find_queue_equivalence_classes(self, reservation_dict,
                                       active_queue_names):
        """Finds equivalent queues"""
        equiv = []
        # First pass: group active queues whose resource sets overlap.
        for queue in self.queue_assignments:
            # skip queues that aren't running
            if queue not in active_queue_names:
                continue
            found_a_match = False
            for equ in equiv:
                if equ['data'].intersection(self.queue_assignments[queue]):
                    equ['queues'].add(queue)
                    equ['data'].update(self.queue_assignments[queue])
                    found_a_match = True
                    break
            if not found_a_match:
                equiv.append({
                    'queues': set([queue]),
                    'data': set(self.queue_assignments[queue]),
                    'reservations': set()
                })
        # Second pass: merge classes that ended up sharing a queue.
        real_equiv = []
        for eq_class in equiv:
            found_a_match = False
            for equ in real_equiv:
                if equ['queues'].intersection(eq_class['queues']):
                    equ['queues'].update(eq_class['queues'])
                    equ['data'].update(eq_class['data'])
                    found_a_match = True
                    break
            if not found_a_match:
                real_equiv.append(eq_class)
        equiv = real_equiv
        for eq_class in equiv:
            # Attach every reservation that names a host in this class.
            for res_name in reservation_dict:
                for host_name in reservation_dict[res_name].split(":"):
                    if host_name in eq_class['data']:
                        eq_class['reservations'].add(res_name)
            # Convert the sets to lists and drop the helper 'data' entry
            # before returning.
            for key in eq_class:
                eq_class[key] = list(eq_class[key])
            del eq_class['data']
        return equiv
    find_queue_equivalence_classes = exposed(find_queue_equivalence_classes)
class ProcessGroupManager(object): #degenerate with ProcessMonitor. '''Manager for process groups. These are tasks that Cobalt run on behalf of the user. Typically these are scripts submitted via qsub.''' def __init__(self, pgroup_type=ProcessGroup): '''Initialize process group manager. Input: pgroup_type: [optional] type of process group class to use. Must be compatible with the ProcessGroupDict class. ''' self._init_config_vars() self.pgroup_type = pgroup_type self._common_init_restart() def _common_init_restart(self, state=None): '''common intitialization code for both cold initilaization and reinitialization. ''' if state is None: self.process_groups = ProcessGroupDict() self.process_groups.item_cls = self.pgroup_type else: self.process_groups = state.get('process_groups', ProcessGroupDict()) for pgroup in self.process_groups.values(): _logger.info('recovering pgroup %s, jobid %s', pgroup.id, pgroup.jobid) self.process_groups.id_gen.set(int(state['next_pg_id'])) self.process_group_actions = {} self.forkers = [] #list of forker identifiers to use with ComponentProxy self.forker_taskcounts = {} # dict of forkers and counts of pgs attached self.forker_locations = {} # dict of forkers a tuple (host, port) self.remote_qsub_hosts = [] # list of hosts that qsub -I requires # ssh-ing to a forker host self.process_groups_lock = RLock() self.update_launchers() def _init_config_vars(self): '''Initialize variables from Cobalt's configuration files.''' init_cobalt_config() self.forker_re = re.compile('forker') self.sigkill_timeout = int(get_config_option('system', 'sigkill_timeout', 300)) self.remote_qsub_hosts = get_config_option('system', 'elogin_hosts', '').split(":") _logger.info('REMOTE QSUB HOSTS: %s', ", ".join(self.remote_qsub_hosts)) def __getstate__(self): state = {} state['process_groups'] = self.process_groups state['next_pg_id'] = self.process_groups.id_gen.idnum + 1 return state def __setstate__(self, state): self._init_config_vars() 
self._common_init_restart(state) return self def init_groups(self, specs): '''Add a set of process groups from specs. Generate a unique id. Input: specs - a list of dictionaries that specify process groups for a given system Returns: list of process groups that were just added. ''' # modify the forker in specs to force the job to round-robbin forkers for spec in specs: ordered_forkers = [f[0] for f in sorted(self.forker_taskcounts.items(), key=lambda x:x[1])] if len(ordered_forkers) < 0: raise RuntimeError("No forkers registered!") else: spec['forker'] = ordered_forkers[0] #this is now a tuple self.forker_taskcounts[spec['forker']] += 1 _logger.info("Job %s using forker %s", spec['jobid'], spec['forker']) return self.process_groups.q_add(specs) def signal_groups(self, pgids, signame="SIGINT"): '''Send signal with signame to a list of process groups. Returns: List of signaled process groups ''' signaled_pgs = [] for pgid in pgids: if self.process_groups[pgid].mode == 'interactive': self.process_groups[pgid].interactive_complete = True signaled_pgs.append(self.process_groups[pgid]) elif self.process_groups[pgid].signal(signame): signaled_pgs.append(self.process_groups[pgid]) return signaled_pgs def terminate_groups(self, pgids): '''Send SIGINTs to process groups to allow them to terminate gracefully. Set the time at which a SIGKILL will be send if the process group has not completed. ''' now = int(time.time()) self.signal_groups(pgids) for pg_id in pgids: self.process_groups[pg_id].sigkill_timeout = int(now + self.sigkill_timeout) def start_groups(self, pgids): '''Start process groups. Return groups that succeeded startup. 
''' started = [] for pg_id in pgids: try: self.process_groups[pg_id].start() except ProcessGroupStartupError: _logger.error("%s: Unable to start process group.", self.process_groups[pg_id].label) else: started.append(pg_id) self.process_groups[pg_id].startup_timeout = 0 return started #make automatic get final status of process group def update_groups(self): '''update process groups with information from forkers. This will also trigger information cleanup for terminated processes. If the child data isn't found for a ProcessGroup, and no exit status has been set, then the process group must be terminated and marked as having a lost child. ''' children = {} completed = {} orphaned = [] completed_pgs = [] now = int(time.time()) for forker in self.forkers: completed[forker] = [] try: child_data = ComponentProxy(forker).get_children("process group", None) except ComponentLookupError, e: _logger.error("failed to contact the %s component to obtain a list of children", forker) except: