def __init__(self, job_master, appmgr):
    """Initialize the LQCD task scheduler.

    Sets up the task-id counter, the worker initializer, the snapshot-task
    cache that run() rebuilds each scheduling pass, and the lock guarding
    the output area.
    """
    # Must initialize the base scheduler first (mirrors run(), which also
    # calls the base class implementation before its own loop).
    ITaskScheduler.__init__(self, job_master, appmgr)
    self.tid = 0                          # monotonically increasing task id, bumped per scheduled task
    self.worker_init = LQCDWorkerInit()
    # BUGFIX: the original assigned the misspelled 'self.snapshots_tasks';
    # run() reads and rebinds 'self.snapshot_tasks', so the intended
    # attribute was never initialized here. Use the name run() expects.
    self.snapshot_tasks = {}              # snapshot_name -> LQCDTaskIn, rebuilt each pass of run()
    self.output_area_lock = threading.Lock()
    self.errors_found = False             # set True by (currently disabled) consistency checks
def run(self):
    """Main scheduling loop.

    Repeatedly (once per second, while there is work and no stop request):
      * rebuilds the snapshot task table from snapshot files on disk,
      * determines which snapshots are not currently held by an alive worker,
      * prioritizes the unassigned tasks with a configurable comparator,
      * assigns one task to each waiting worker (or re-schedules the
        worker's existing task, bumping its trajectory count).

    All per-pass work is done under self.output_area_lock.
    NOTE(review): this code is Python 2 style (cmp(), comparator-based
    list.sort, dict([...])) -- confirm target interpreter before porting.
    """
    logger.info('calling run()')
    # must call base class implementation first
    ITaskScheduler.run(self)
    once = True  # first-pass flag: log the unassigned-task list only once
    while self.has_more_work() and not self.should_stop():
        try:
            # Serialize scheduling passes against the shared output area.
            self.output_area_lock.acquire()
            # Alive workers that currently hold a snapshot task.
            alive_workers = [w for w in self.job_master.worker_registry.alive_workers.values() if hasattr(w,'snapshot_task') and not w.snapshot_task is None]
            # Rebuild the snapshot task table by scanning the snapshot files
            # in <basedir>/dat; one LQCDTaskIn per matching file, keyed by
            # snapshot name.
            self.snapshot_tasks = dict([(task.snapshot_name,task) for task in [LQCDTaskIn(snapshot_name,ntraj=self.run_data.ntraj) for snapshot_name in glob.glob(os.path.join(self.run_data.basedir,'dat',SNAPSHOT_PREFIX+'*'))]])
            # A task is "unassigned" when no alive worker holds a task with
            # the same (beta, seed) pair.
            unassigned_snapshot_tasks = [task for task in self.snapshot_tasks.values() if (task.beta,task.seed) not in [(w.snapshot_task.beta,w.snapshot_task.seed) for w in alive_workers]]
            # give priority to tasks which have least iterations done so far (ntraj_count)
            # tasks with most number of iterations will appear first on the list
            def equalize(t1,t2):
                # Python 2 cmp-style comparator; negated so that tasks with
                # fewer completed trajectories sort toward the end of the
                # list (tasks are assigned via pop() from the end).
                return -cmp(t1.ntraj_count,t2.ntraj_count)
            # give priority to the tasks closer to the left edge of [m,M] window
            # outside of this window just equalize
            #m = 5.1815
            #M = 5.18525
            def left_priority_window(t1,t2):
                # cmp-style comparator: tasks with beta inside [m,M] outrank
                # tasks outside; among two in-window tasks, the smaller beta
                # wins (sorted last, hence popped first); ties and fully
                # out-of-window pairs fall back to equalize().
                b1,b2 = float(t1.beta) ,float(t2.beta)
                # NOTE(review): compare_params doubles as the (m, M) window
                # bounds here while compare_by (below) selects the comparator
                # by name -- confirm run_data provides both consistently.
                m,M = self.run_data.compare_params
                def in_range(x):
                    return m <= x and x <= M
                if in_range(b1):
                    if in_range(b2):
                        if b1<b2:
                            return 1
                        elif b1>b2:
                            return -1
                        else:
                            return equalize(t1,t2)
                    else:
                        return 1
                else:
                    if in_range(b2):
                        return -1
                    else:
                        return equalize(t1,t2)
            # Pick the comparator ('equalize' or 'left_priority_window') by
            # name from the local namespace, per configuration.
            unassigned_snapshot_tasks.sort(locals()[self.run_data.compare_by])
            # some security checks to make sure that the same snapshot is never done by two workers at the same time
            _check_active_snapshots = [(w.snapshot_task.beta,w.snapshot_task.seed) for w in alive_workers]
            #if len(set(_check_active_snapshots)) != len(_check_active_snapshots):
            #    logger.error("same snapshot assigned to worker more than once (beta,seed,wid):")
            #    logger.error(str([zip(_check_active_snapshots,[w.wid for w in alive_workers])]))
            #    self.errors_found = True
            #    return
            if once:
                logger.info('')
                # NOTE(review): the count logged is the TOTAL number of
                # snapshot tasks, not the unassigned count iterated below --
                # confirm whether len(unassigned_snapshot_tasks) was intended.
                logger.info('unassigned snapshot tasks (%d)',len(self.snapshot_tasks))
                for t in unassigned_snapshot_tasks:
                    logger.info(str(t.snapshot_name))
                once = False
            waiting_workers = self.job_master.worker_registry.waiting_workers.values()
            #logger.info('waiting workers: %s',[w.wid for w in waiting_workers])
            #logger.info('unassigned snapshot tasks: %s',[(t.beta,t.seed) for t in unassigned_snapshot_tasks])
            for w in waiting_workers:
                self.tid += 1
                t = TaskInfo(self.tid)
                if w.snapshot_task is None:
                    # Worker has no task yet: hand it the highest-priority
                    # unassigned snapshot (end of the sorted list).
                    try:
                        w.snapshot_task = unassigned_snapshot_tasks.pop()
                    except IndexError:
                        # more workers than available snapshots
                        break
                else:
                    #FIXME: this is ugly and should be implemented idealy in tasks_completed() but there is currently no way
                    # of assotiating the worker to the completed task in that method...
                    # Worker already holds a task (it just completed a batch):
                    # advance its trajectory counter before re-scheduling it.
                    w.snapshot_task.ntraj_count += w.snapshot_task.ntraj
                    pass
                t.task_input = w.snapshot_task
                logger.info('scheduling (%s,%s,%d) -> %s',t.task_input.beta,t.task_input.seed,t.task_input.ntraj_count,w.wid)
                self.job_master.schedule(w,[t])
        finally:
            # Always release the lock, even if a pass raised.
            self.output_area_lock.release()
        # Throttle the scheduling loop.
        time.sleep(1)