def run(self):
    """Main scheduling loop.

    Repeatedly asks self.get_clusters() for (cluster[.site]) combinations,
    and for each one spawns worker threads running self.worker_start, subject
    to two per-(cluster, site) limits from self.options: at most max_workers
    live threads and at most max_waiting of them in the waiting state.  The
    loop exits when no new job was submitted and no worker thread remains
    alive.  On exit (normal or exceptional) every remaining worker is marked
    as terminating and its OAR job, if any, is deleted.
    """
    num_total_workers = 0
    # sites_clusters_threads: dict site -> dict cluster -> list of Thread
    sites_clusters_threads = {}
    try:
        while True:
            # resolve the (cluster, site) combinations to submit to;
            # a bare cluster name is mapped to its site
            clusters_to_submit = set()
            for clusterspec in self.get_clusters():
                cluster, _, site = clusterspec.partition(".")
                if site == "":
                    site = get_cluster_site(cluster)
                clusters_to_submit.add((cluster, site))
            # prune dead threads, then empty cluster and site entries.
            # iterate over snapshots (list(...)) because we delete entries
            # while iterating - deleting during .keys() iteration raises
            # RuntimeError on python 3
            for site in list(sites_clusters_threads):
                for cluster in list(sites_clusters_threads[site]):
                    sites_clusters_threads[site][cluster] = [
                        th for th in sites_clusters_threads[site][cluster]
                        if th.is_alive()
                    ]
                    if not sites_clusters_threads[site][cluster]:
                        del sites_clusters_threads[site][cluster]
                if not sites_clusters_threads[site]:
                    del sites_clusters_threads[site]
            all_involved_sites = set(sites_clusters_threads)
            all_involved_sites.update(s for (c, s) in clusters_to_submit)
            no_submissions = True
            for site in all_involved_sites:
                all_involved_clusters = set()
                # "in" instead of the python-2-only dict.has_key()
                if site in sites_clusters_threads:
                    all_involved_clusters.update(sites_clusters_threads[site])
                all_involved_clusters.update(
                    c for (c, s) in clusters_to_submit if s == site)
                for cluster in all_involved_clusters:
                    num_workers = 0
                    num_waiting = 0
                    if (site in sites_clusters_threads
                            and cluster in sites_clusters_threads[site]):
                        threads = sites_clusters_threads[site][cluster]
                        num_workers = len(threads)
                        num_waiting = len(
                            [th for th in threads if th.waiting])
                    num_max_new_workers = min(
                        self.options.max_workers - num_workers,
                        self.options.max_waiting - num_waiting)
                    logger.trace(
                        "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                        % (cluster, site, num_workers, num_waiting,
                           num_max_new_workers))
                    # range() is empty for num_max_new_workers <= 0, so no
                    # explicit guard is needed
                    for _ in range(num_max_new_workers):
                        jobdata = self.get_job(cluster)
                        if not jobdata:
                            break
                        no_submissions = False
                        logger.detail(
                            "spawning worker %i on %s@%s" %
                            (num_total_workers, cluster, site))
                        (oarsubmission, data) = jobdata
                        th = Thread(target=self.worker_start,
                                    args=(cluster, site, oarsubmission, data,
                                          num_total_workers,))
                        th.waiting = True
                        th.daemon = True
                        th.oarsublock = Lock()
                        th.willterminate = False
                        # initialize the attributes read by the cleanup code
                        # below, so cleanup cannot fail with AttributeError if
                        # it runs before the worker has set them
                        th.jobid = None
                        th.worker_index = num_total_workers
                        th.start()
                        num_total_workers += 1
                        sites_clusters_threads.setdefault(
                            site, {}).setdefault(cluster, []).append(th)
            if no_submissions and not sites_clusters_threads:
                break
            sleep(self.options.schedule_delay)
        logger.detail("no more combinations to explore. exit schedule loop")
    finally:
        # best-effort cleanup: mark every remaining worker as terminating
        # and delete its OAR job if it has one
        for site in sites_clusters_threads:
            for cluster in sites_clusters_threads[site]:
                for th in sites_clusters_threads[site][cluster]:
                    with th.oarsublock:
                        th.willterminate = True
                        if th.jobid:
                            logger.detail(
                                "cleaning: delete job %i of worker #%i on %s"
                                % (th.jobid, th.worker_index, site))
                            oardel([(th.jobid, site)])
                            th.jobid = None
def run(self):
    """Main scheduling loop.

    Repeatedly asks self.get_clusters() for (cluster[.site]) combinations,
    and for each one spawns worker threads running self.worker_start, subject
    to two per-(cluster, site) limits from self.options: at most max_workers
    live threads and at most max_waiting of them in the waiting state.  The
    loop exits when no new job was submitted and no worker thread remains
    alive.  On exit (normal or exceptional) every remaining worker is marked
    as terminating and its OAR job, if any, is deleted.
    """
    num_total_workers = 0
    # sites_clusters_threads: dict site -> dict cluster -> list of Thread
    sites_clusters_threads = {}
    try:
        while True:
            # resolve the (cluster, site) combinations to submit to;
            # a bare cluster name is mapped to its site
            clusters_to_submit = set()
            for clusterspec in self.get_clusters():
                cluster, _, site = clusterspec.partition(".")
                if site == "":
                    site = get_cluster_site(cluster)
                clusters_to_submit.add((cluster, site))
            # prune dead threads, then empty cluster and site entries.
            # iterate over snapshots (list(...)) because we delete entries
            # while iterating - deleting during .keys() iteration raises
            # RuntimeError on python 3
            for site in list(sites_clusters_threads):
                for cluster in list(sites_clusters_threads[site]):
                    sites_clusters_threads[site][cluster] = [
                        th for th in sites_clusters_threads[site][cluster]
                        if th.is_alive()
                    ]
                    if not sites_clusters_threads[site][cluster]:
                        del sites_clusters_threads[site][cluster]
                if not sites_clusters_threads[site]:
                    del sites_clusters_threads[site]
            all_involved_sites = set(sites_clusters_threads)
            all_involved_sites.update(s for (c, s) in clusters_to_submit)
            no_submissions = True
            for site in all_involved_sites:
                all_involved_clusters = set()
                # "in" instead of the python-2-only dict.has_key()
                if site in sites_clusters_threads:
                    all_involved_clusters.update(sites_clusters_threads[site])
                all_involved_clusters.update(
                    c for (c, s) in clusters_to_submit if s == site)
                for cluster in all_involved_clusters:
                    num_workers = 0
                    num_waiting = 0
                    if (site in sites_clusters_threads
                            and cluster in sites_clusters_threads[site]):
                        threads = sites_clusters_threads[site][cluster]
                        num_workers = len(threads)
                        num_waiting = len(
                            [th for th in threads if th.waiting])
                    num_max_new_workers = min(
                        self.options.max_workers - num_workers,
                        self.options.max_waiting - num_waiting)
                    logger.trace(
                        "rescheduling on cluster %s@%s: num_workers = %s / num_waiting = %s / num_max_new_workers = %s"
                        % (cluster, site, num_workers, num_waiting,
                           num_max_new_workers))
                    # range() is empty for num_max_new_workers <= 0, so no
                    # explicit guard is needed
                    for _ in range(num_max_new_workers):
                        jobdata = self.get_job(cluster)
                        if not jobdata:
                            break
                        no_submissions = False
                        logger.detail(
                            "spawning worker %i on %s@%s" %
                            (num_total_workers, cluster, site))
                        (oarsubmission, data) = jobdata
                        th = Thread(target=self.worker_start,
                                    args=(cluster, site, oarsubmission, data,
                                          num_total_workers,))
                        th.waiting = True
                        th.daemon = True
                        th.oarsublock = Lock()
                        th.willterminate = False
                        # initialize the attributes read by the cleanup code
                        # below, so cleanup cannot fail with AttributeError if
                        # it runs before the worker has set them
                        th.jobid = None
                        th.worker_index = num_total_workers
                        th.start()
                        num_total_workers += 1
                        sites_clusters_threads.setdefault(
                            site, {}).setdefault(cluster, []).append(th)
            if no_submissions and not sites_clusters_threads:
                break
            sleep(self.options.schedule_delay)
        logger.detail("no more combinations to explore. exit schedule loop")
    finally:
        # best-effort cleanup: mark every remaining worker as terminating
        # and delete its OAR job if it has one
        for site in sites_clusters_threads:
            for cluster in sites_clusters_threads[site]:
                for th in sites_clusters_threads[site][cluster]:
                    with th.oarsublock:
                        th.willterminate = True
                        if th.jobid:
                            logger.detail(
                                "cleaning: delete job %i of worker #%i on %s"
                                % (th.jobid, th.worker_index, site))
                            oardel([(th.jobid, site)])
                            th.jobid = None