def get_resources(self):
    """Retrieve the resources for the vm5k_deployment and define
    the list of hosts and ip_mac.
    """
    self.resources = get_oar_job_vm5k_resources([(self.oar_job_id,
                                                  self.frontend)])
    site = get_cluster_site(self.cluster)
    self.hosts = self.resources[site]['hosts']
    self.ip_mac = self.resources[site]['ip_mac']
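# A hedged illustration of the mapping get_oar_job_vm5k_resources is assumed
# to return above; the keys are inferred from the accesses in get_resources,
# and the site/host/ip values are made up:
example_vm5k_resources = {
    'rennes': {
        'hosts': ['paravance-1.rennes.grid5000.fr'],
        'ip_mac': [('10.158.0.1', '00:16:3e:00:00:01')],
    }
}
hosts = example_vm5k_resources['rennes']['hosts']    # what self.hosts gets
ip_mac = example_vm5k_resources['rennes']['ip_mac']  # what self.ip_mac gets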
def run_xp(self):
    """Iterate over the parameters and execute the bench"""
    while len(self.sweeper.get_remaining()) > 0:
        comb = self.sweeper.get_next()
        if comb['n_core'] > get_host_attributes(
                comb['cluster'] + '-1')['architecture']['smt_size'] * self.n_nodes:
            self.sweeper.skip(comb)
            continue
        logger.info('Processing new combination %s' % (comb,))
        site = get_cluster_site(comb['cluster'])
        jobs = oarsub([(OarSubmission(
            resources="{cluster='" + comb['cluster'] + "'}/nodes=" + str(self.n_nodes),
            job_type='allow_classic_ssh',
            walltime='0:10:00'), site)])
        if jobs[0][0]:
            try:
                wait_oar_job_start(*jobs[0])
                nodes = get_oar_job_nodes(*jobs[0])
                bench_cmd = 'mpirun -H %s -n %i %s ~/NPB3.3-MPI/bin/lu.%s.%i' % (
                    ",".join([node.address for node in nodes]),
                    comb['n_core'],
                    get_mpi_opts(comb['cluster']),
                    comb['size'],
                    comb['n_core'])
                lu_bench = SshProcess(bench_cmd, nodes[0])
                lu_bench.stdout_handlers.append(
                    self.result_dir + '/' + slugify(comb) + '.out')
                lu_bench.run()
                if lu_bench.ok:
                    logger.info("comb ok: %s" % (comb,))
                    self.sweeper.done(comb)
                    continue
            finally:
                oardel(jobs)
        logger.info("comb NOT ok: %s" % (comb,))
        self.sweeper.cancel(comb)
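# A minimal sketch of the sweeper wiring assumed by run_xp: execo_engine's
# ParamSweeper persists combination states, so done/skipped/canceled combs
# survive a restart. The parameter values below are illustrative:
from execo_engine import ParamSweeper, sweep

parameters = {'cluster': ['paravance'],
              'n_core': [1, 4, 16],
              'size': ['A', 'B', 'C']}
sweeper = ParamSweeper('sweeps', sweeps=sweep(parameters))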
def get_clusters_interfaces(clusters, extra_cond=lambda nic: True):
    """Returns for each cluster the available cluster interfaces

    Args:
        clusters (list): list of the clusters
        extra_cond (lambda): extra predicate to filter network cards
            retrieved from the API. E.g. lambda nic: not nic['mounted']
            will retrieve all the usable network cards that are not
            mounted by default.

    Returns:
        dict of clusters with their associated nic names

    Examples:
        .. code-block:: python

            # pseudo code
            actual = get_clusters_interfaces(["paravance"])
            expected = {"paravance": ["eth0", "eth1"]}
            assertDictEquals(expected, actual)
    """
    interfaces = {}
    for cluster in clusters:
        site = EX5.get_cluster_site(cluster)
        nics = EX5.get_resource_attributes(
            "/sites/%s/clusters/%s/nodes" % (site, cluster))
        nics = nics['items'][0]['network_adapters']
        nics = [nic['device'] for nic in nics
                if nic['mountable']
                and nic['interface'] == 'Ethernet'
                and not nic['management']
                and extra_cond(nic)]
        nics = sorted(nics)
        interfaces.setdefault(cluster, nics)
    return interfaces
def concretize_resources(resources, gridjob, reservation_type):
    if reservation_type == "oar":
        nodes = ex5.get_oar_job_nodes(gridjob)
    else:
        nodes = ex5.get_oargrid_job_nodes(gridjob)
    concretize_nodes(resources, nodes)

    if reservation_type == "oar":
        # This block is in charge of detecting the site of the oar reservation
        site_candidates = []
        for network_description in resources.get("machines", []):
            cluster = network_description.get("cluster")
            site_candidates += [ex5.get_cluster_site(cluster)]
        for network_description in resources.get("networks", []):
            site_candidates += [network_description.get("site", "unknown")]
        if len(set(site_candidates)) == 1:
            site = site_candidates[0]
        else:
            # raising a bare string is invalid; raise a proper exception
            raise ValueError(
                "Could not detect the g5k site of the oarjob %s" % gridjob)
        job_sites = [(gridjob, site)]
    else:
        job_sites = ex5.get_oargrid_job_oar_jobs(gridjob)

    vlans = []
    for (job_id, site) in job_sites:
        vlan_ids = ex5.get_oar_job_kavlan(job_id, site)
        vlans.extend([{"site": site, "vlan_id": vlan_id}
                      for vlan_id in vlan_ids])
    concretize_networks(resources, vlans)
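# A hedged example of the resource description this function consumes; the
# keys are inferred from the accesses above and the values are made up, as
# the real schema comes from the surrounding provider code:
example_resources = {
    "machines": [{"cluster": "paravance", "nodes": 2}],
    "networks": [{"site": "rennes", "type": "kavlan"}],
}
# concretize_resources(example_resources, gridjob=1234567, reservation_type="oar")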
def get_cluster_interfaces(cluster, extra_cond=lambda nic: True):
    site = ex5.get_cluster_site(cluster)
    nics = ex5.get_resource_attributes(
        "/sites/%s/clusters/%s/nodes" % (site, cluster))
    nics = nics['items'][0]['network_adapters']
    nics = [nic['device'] for nic in nics
            if nic['mountable']
            and nic['interface'] == 'Ethernet'
            and not nic['management']
            and extra_cond(nic)]
    nics = sorted(nics)
    return nics
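# Two example predicates for extra_cond; the 'mounted' and 'rate' keys are
# assumptions about the Grid'5000 reference-API NIC schema, so adjust them
# to the actual fields if they differ:
def only_unmounted(nic):
    return not nic.get('mounted', False)

def only_10g(nic):
    return nic.get('rate', 0) >= 10 ** 10

# nics = get_cluster_interfaces('paravance', extra_cond=only_unmounted)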
def _mount_cluster_nics(self, conf, cluster, nodes, kavlan_nodes, vlans):
    """Get the NIC devices of the reserved cluster.

    :param nodes: list of hostnames unmodified by the vlan
    """
    provider_conf = conf['provider']
    # XXX: this only works if all nodes are on the same cluster,
    # or if nodes from different clusters have the same devices
    site = EX5.get_cluster_site(cluster)
    nics = EX5.get_resource_attributes(
        "/sites/%s/clusters/%s/nodes" % (site, cluster)
    )['items'][0]['network_adapters']

    interfaces = [nic['device'] for nic in nics
                  if nic['mountable'] and nic['interface'] == 'Ethernet']

    network_interface = str(interfaces[0])
    external_interface = None

    if len(interfaces) > 1 and not provider_conf['single_interface']:
        external_interface = str(interfaces[1])
        _, vlan = self._get_primary_vlan(vlans)
        api.set_nodes_vlan(site,
                           map(lambda d: EX.Host(d), nodes),
                           external_interface,
                           vlan)
        self._exec_command_on_nodes(
            kavlan_nodes,
            "ifconfig %s up && dhclient -nw %s" % (
                external_interface, external_interface),
            'mounting secondary interface')
    else:
        # TODO(msimonin) fix the network in this case as well.
        external_interface = 'veth0'
        if provider_conf['single_interface']:
            logging.warning("Forcing the use of a single network interface")
        else:
            logging.warning("%s has only one NIC. The same interface "
                            "will be used for network_interface and "
                            "neutron_external_interface."
                            % conf['resources'].keys()[0])
        self._exec_command_on_nodes(
            kavlan_nodes,
            'ip link show veth0 || ip link add type veth peer',
            'Creating a veth')
    return (network_interface, external_interface)
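# The method returns (network_interface, external_interface); a hedged
# illustration of the two outcomes above (device names are made up):
#   multi-NIC cluster, single_interface False -> ('eth0', 'eth1')
#   single NIC, or single_interface True      -> ('eth0', 'veth0')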
def get_cluster_interfaces(cluster, extra_cond=lambda nic: True):
    site = ex5.get_cluster_site(cluster)
    nics = ex5.get_resource_attributes(
        "/sites/%s/clusters/%s/nodes" % (site, cluster))
    nics = nics['items'][0]['network_adapters']
    # NOTE(msimonin): since 05/18 NICs on G5k nodes have predictable names,
    # but the API description keeps both the legacy name (key 'device') and
    # the new predictable name (key 'name'). The legacy name is still used
    # for API requests to the vlan endpoint. This should be fixed in
    # https://intranet.grid5000.fr/bugzilla/show_bug.cgi?id=9272
    # Once it is fixed we should be able to use only the new predictable name.
    nics = [(nic['device'], nic['name']) for nic in nics
            if nic['mountable']
            and nic['interface'] == 'Ethernet'
            and not nic['management']
            and extra_cond(nic)]
    nics = sorted(nics)
    return nics
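# Unlike the device-only variant above, this version yields
# (legacy_device, predictable_name) pairs, so callers unpack both, e.g.:
# for device, name in get_cluster_interfaces("paravance"):
#     print("vlan-endpoint name: %s / predictable name: %s" % (device, name))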
def get_cpu_topology(cluster, xpdir=None):
    """Determine the CPU topology of a cluster node from virsh
    capabilities, optionally caching the result in xpdir."""
    logger.info('Determining the architecture of cluster '
                + style.emph(cluster))
    root = None
    # Trying to read the topology from a cache directory
    if xpdir:
        fname = xpdir + '/topo_' + cluster + '.xml'
        try:
            tree = parse(fname)
            root = tree.getroot()
        except Exception:
            logger.info('No cache file found, will reserve a node and '
                        'determine topology from virsh capabilities')
    if root is None:
        frontend = get_cluster_site(cluster)
        submission = OarSubmission(
            resources="{cluster='" + cluster + "'}/nodes=1",
            walltime="0:02:00",
            job_type="allow_classic_ssh")
        ((job_id, _), ) = oarsub([(submission, frontend)])
        wait_oar_job_start(job_id, frontend)
        host = get_oar_job_nodes(job_id, frontend)[0]
        capa = SshProcess(
            'virsh capabilities', host,
            connection_params={
                'user': default_frontend_connection_params['user']
            }).run()
        oardel([(job_id, frontend)])
        root = fromstring(capa.stdout)
        if xpdir is not None:
            tree = ElementTree(root)
            tree.write(fname)
    cpu_topology = []
    i_cell = 0
    for cell in root.findall('.//cell'):
        cpu_topology.append([])
        for cpu in cell.findall('.//cpu'):
            cpu_topology[i_cell].append(int(cpu.attrib['id']))
        i_cell += 1
    logger.info(pformat(cpu_topology))
    return cpu_topology
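# Usage sketch: with xpdir set, the first call reserves a node for two
# minutes and caches the topology as topo_<cluster>.xml, so later calls
# read the cache (cluster name and output are illustrative):
# topo = get_cpu_topology('paravance', xpdir='./xp_cache')
# -> [[0, 1, 2, 3], [4, 5, 6, 7]]   one list of core ids per NUMA cell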
def get_nodes(self, comb):
    """Perform a submission for a given combination and retrieve
    the submitted node list.
    """
    logger.info('Performing submission')
    n_core = get_host_attributes(
        comb['cluster'] + '-1')['architecture']['smt_size']
    submission = OarSubmission(
        resources="nodes=%d" % (max(1, comb['cores'] // n_core), ),
        sql_properties="cluster='%s'" % comb['cluster'],
        job_type="besteffort",
        name="l2c_fft_eval")
    self.oar_job_id, self.frontend = oarsub([
        (submission, get_cluster_site(comb['cluster']))])[0]
    logger.info("Waiting for job start")
    wait_oar_job_start(self.oar_job_id, self.frontend)
    logger.info("Retrieving hosts list")
    nodes = get_oar_job_nodes(self.oar_job_id, self.frontend)
    # Repeat each host once per core so the list provides one entry per
    # MPI rank, e.g. 2 hosts x 4 cores -> [h1, h1, h1, h1, h2, h2, h2, h2]
    self.hosts = [host for host in nodes for i in range(n_core)]
def get_cluster_nics(self, cluster):
    site = EX5.get_cluster_site(cluster)
    nics = EX5.get_resource_attributes(
        '/sites/%s/clusters/%s/nodes' % (site, cluster))['items'][0]['network_adapters']
    return [nic['device'] for nic in nics if nic['mountable']]
def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()
    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)
    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None
    try:
        # Creation of the main iterator used for the first control loop.
        # You need to have a method called define_parameters that returns
        # a list of parameter dicts
        self.create_paramsweeper()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts
            # for the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break
            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}
            # Checking that the job is running and not in Error
            while get_oar_job_info(self.oar_job_id,
                                   self.frontend)['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                job_is_dead = False
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if get_oar_job_info(self.oar_job_id,
                                        self.frontend)['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break
                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break
                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]
                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]
                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
            if job_is_dead:
                self.oar_job_id = None
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
def run(self):
    num_total_workers = 0
    sites_clusters_threads = {}  # dict: keys = sites, values =
                                 #   dict: keys = clusters, values =
                                 #     list of threads
    try:
        while True:
            t = Timer()
            clusters_to_submit = set()
            for clusterspec in self.get_clusters():
                cluster, _, site = clusterspec.partition(".")
                if site == "":
                    site = get_cluster_site(cluster)
                clusters_to_submit.add((cluster, site))
            for site in list(sites_clusters_threads.keys()):
                for cluster in list(sites_clusters_threads[site].keys()):
                    sites_clusters_threads[site][cluster] = [
                        th
                        for th in sites_clusters_threads[site][cluster]
                        if th.is_alive()
                    ]
                    if len(sites_clusters_threads[site][cluster]) == 0:
                        del sites_clusters_threads[site][cluster]
                if len(sites_clusters_threads[site]) == 0:
                    del sites_clusters_threads[site]
            all_involved_sites = set(sites_clusters_threads.keys())
            all_involved_sites.update([s for (c, s) in clusters_to_submit])
            no_submissions = True
            for site in all_involved_sites:
                all_involved_clusters = set()
                if site in sites_clusters_threads:
                    all_involved_clusters.update(
                        sites_clusters_threads[site].keys())
                all_involved_clusters.update(
                    [c for (c, s) in clusters_to_submit if s == site])
                for cluster in all_involved_clusters:
                    num_workers = 0
                    num_waiting = 0
                    if (site in sites_clusters_threads
                            and cluster in sites_clusters_threads[site]):
                        num_workers = len(
                            sites_clusters_threads[site][cluster])
                        num_waiting = len([
                            th
                            for th in sites_clusters_threads[site][cluster]
                            if th.waiting
                        ])
                    num_max_new_workers = min(
                        self.options.max_workers - num_workers,
                        self.options.max_waiting - num_waiting)
                    logger.trace(
                        "rescheduling on cluster %s@%s: num_workers = %s / "
                        "num_waiting = %s / num_max_new_workers = %s"
                        % (cluster, site, num_workers, num_waiting,
                           num_max_new_workers))
                    if num_max_new_workers > 0:
                        for worker_index in range(0, num_max_new_workers):
                            jobdata = self.get_job(cluster)
                            if not jobdata:
                                break
                            no_submissions = False
                            logger.detail(
                                "spawning worker %i on %s@%s"
                                % (num_total_workers, cluster, site))
                            (oarsubmission, data) = jobdata
                            th = Thread(target=self.worker_start,
                                        args=(cluster, site, oarsubmission,
                                              data, num_total_workers,))
                            th.waiting = True
                            th.daemon = True
                            th.oarsublock = Lock()
                            th.willterminate = False
                            # record the worker index; the cleanup code in
                            # the finally block logs it
                            th.worker_index = num_total_workers
                            th.start()
                            num_total_workers += 1
                            if site not in sites_clusters_threads:
                                sites_clusters_threads[site] = {}
                            if cluster not in sites_clusters_threads[site]:
                                sites_clusters_threads[site][cluster] = []
                            sites_clusters_threads[site][cluster].append(th)
            if no_submissions and len(sites_clusters_threads) == 0:
                break
            sleep(self.options.schedule_delay)
        logger.detail("no more combinations to explore. exit schedule loop")
    finally:
        for site in list(sites_clusters_threads.keys()):
            for cluster in list(sites_clusters_threads[site].keys()):
                for th in sites_clusters_threads[site][cluster]:
                    with th.oarsublock:
                        th.willterminate = True
                        if th.jobid:
                            logger.detail(
                                "cleaning: delete job %i of worker #%i on %s"
                                % (th.jobid, th.worker_index, site))
                            oardel([(th.jobid, site)])
                            th.jobid = None
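# The scheduler above relies on two hooks supplied by subclasses; their
# contract is inferred from the call sites, not shown in this snippet:
#   get_clusters()   -> iterable of "cluster" or "cluster.site" strings
#   get_job(cluster) -> an (OarSubmission, data) pair, or None/falsy when
#                       there is nothing left to submit on that cluster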
def get_cluster_site(cluster):
    return ex5.get_cluster_site(cluster)
def _get_cluster_nics(self, cluster):
    site = EX5.get_cluster_site(cluster)
    nics = EX5.get_resource_attributes(
        '/sites/%s/clusters/%s/nodes' % (site, cluster))['items'][0]['network_adapters']
    return [nic['device'] for nic in nics if nic['mountable']]
def run(self):
    """Inherited method, put here the code for running the engine"""
    self.define_parameters()
    self.cluster = self.args[0]
    self.site = get_cluster_site(self.cluster)
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None
    try:
        # Creation of the main iterator used for the first control loop.
        # You need to have a method called define_parameters that returns
        # a list of parameter dicts
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts
            # for the experiments
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.hosts = get_oar_job_nodes(self.oar_job_id, self.frontend)
            # Hosts deployment
            deployed, undeployed = deploy(
                Deployment(self.hosts,
                           env_file="/home/mliroz/deploys/hadoop6.env"))
            logger.info("%i deployed, %i undeployed"
                        % (len(deployed), len(undeployed)))
            if len(deployed) == 0:
                break
            # System configuration => look at the execo_g5k.topology module
            attr = get_host_attributes(self.cluster + '-1')
            ## SETUP FINISHED
            # Getting the next combination
            comb = self.sweeper.get_next()
            self.prepare_dataset(comb)
            self.xp(comb)
            # Subloop over the combinations that have the same sizes
            while True:
                newcomb = self.sweeper.get_next(
                    lambda r: filter(
                        lambda subcomb: subcomb['sizes'] == comb['sizes'],
                        r))
                if newcomb:
                    try:
                        self.xp(newcomb)
                    except Exception:
                        break
                else:
                    break
            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
def run(self):
    """The main experimental workflow, as described in
    ``Using the Execo toolkit to perform ...``
    """
    self.force_options()
    # The argument is a cluster
    self.cluster = self.args[0]
    self.frontend = get_cluster_site(self.cluster)
    # Analyzing options
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None
    try:
        # Creation of the main iterator used for the first control loop.
        # You need to have a method called define_parameters that returns
        # a list of parameter dicts
        self.create_paramsweeper()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # If no job, we make a reservation and prepare the hosts
            # for the experiments
            if self.oar_job_id is None:
                self.make_reservation()
            # Retrieving the hosts and subnets parameters
            self.get_resources()
            # Hosts deployment and configuration
            if not self.options.no_hosts_setup:
                self.setup_hosts()
            if len(self.hosts) == 0:
                break
            # Initializing the resources and threads
            available_hosts = list(self.hosts)
            available_ip_mac = list(self.ip_mac)
            threads = {}
            # Checking that the job is running and not in Error
            while self.is_job_alive()['state'] != 'Error' \
                    or len(threads.keys()) > 0:
                job_is_dead = False
                while self.options.n_nodes > len(available_hosts):
                    tmp_threads = dict(threads)
                    for t in tmp_threads:
                        if not t.is_alive():
                            available_hosts.extend(tmp_threads[t]['hosts'])
                            available_ip_mac.extend(tmp_threads[t]['ip_mac'])
                            del threads[t]
                    sleep(5)
                    if self.is_job_alive()['state'] == 'Error':
                        job_is_dead = True
                        break
                if job_is_dead:
                    break
                # Getting the next combination
                comb = self.sweeper.get_next()
                if not comb:
                    while len(threads.keys()) > 0:
                        tmp_threads = dict(threads)
                        for t in tmp_threads:
                            if not t.is_alive():
                                del threads[t]
                        logger.info('Waiting for threads to complete')
                        sleep(20)
                    break
                used_hosts = available_hosts[0:self.options.n_nodes]
                available_hosts = available_hosts[self.options.n_nodes:]
                n_vm = self.comb_nvm(comb)
                used_ip_mac = available_ip_mac[0:n_vm]
                available_ip_mac = available_ip_mac[n_vm:]
                t = Thread(target=self.workflow,
                           args=(comb, used_hosts, used_ip_mac))
                threads[t] = {'hosts': used_hosts, 'ip_mac': used_ip_mac}
                logger.debug('Threads: %s', len(threads))
                t.daemon = True
                t.start()
            if self.is_job_alive()['state'] == 'Error':
                job_is_dead = True
            if job_is_dead:
                self.oar_job_id = None
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')
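# `is_job_alive` is not defined in this snippet; judging from the variant of
# this workflow above, which queries get_oar_job_info directly at the same
# points, a plausible definition (an assumption, not necessarily the
# author's code) is:
def is_job_alive(self):
    return get_oar_job_info(self.oar_job_id, self.frontend)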