Example #1
0
 def __init__(self,job_master,appmgr):
     """Set up the scheduler state on top of the ITaskScheduler base.

     Arguments are forwarded unchanged to ``ITaskScheduler.__init__``.
     """
     ITaskScheduler.__init__(self,job_master,appmgr)
     # counter used to number the tasks handed out by run()
     self.tid = 0
     self.worker_init = LQCDWorkerInit()
     # run() reads and rebinds ``snapshot_tasks``; initialise that exact
     # name so the attribute exists before the first scheduling pass.
     self.snapshot_tasks = {}
     # historical (misspelled) attribute name kept so any existing reader
     # of ``snapshots_tasks`` keeps working
     self.snapshots_tasks = self.snapshot_tasks
     # held by run() for the duration of each scheduling pass
     self.output_area_lock = threading.Lock()
     self.errors_found = False
Example #2
0
    def run(self):
        """Scheduler main loop: hand snapshot tasks to waiting workers.

        Repeats once per second while ``has_more_work()`` is true and
        ``should_stop()`` is false.  Each pass, while holding
        ``output_area_lock``:

        1. rebuilds ``self.snapshot_tasks`` from the files matching
           ``<basedir>/dat/<SNAPSHOT_PREFIX>*``;
        2. removes the snapshots whose ``(beta, seed)`` is already held by
           an alive worker;
        3. orders the remaining ones with the policy named by
           ``self.run_data.compare_by`` (``'equalize'`` or
           ``'left_priority_window'``) -- the highest-priority task ends up
           LAST in the list because tasks are taken with ``pop()``;
        4. schedules one task on every waiting worker: a worker that already
           owns a snapshot continues with it, a worker without one takes the
           next unassigned snapshot.

        Raises ``KeyError`` if ``compare_by`` names an unknown policy.
        """
        logger.info('calling run()')
        # must call base class implementation first
        ITaskScheduler.run(self)

        # Sort keys (ascending sort, consumer pops from the END of the list).

        def equalize(task):
            # give priority to tasks which have least iterations done so far
            # (ntraj_count): tasks with most iterations sort first, so the
            # one with fewest iterations is popped first
            return -task.ntraj_count

        def left_priority_window(task):
            # give priority to the tasks closer to the left edge of the
            # [m,M] window; outside of this window just equalize.
            # In-window tasks always outrank out-of-window ones.
            m,M = self.run_data.compare_params
            beta = float(task.beta)
            if m <= beta <= M:
                return (1, -beta, -task.ntraj_count)
            return (0, 0.0, -task.ntraj_count)

        # explicit table instead of a locals() lookup; names are the values
        # accepted in run_data.compare_by
        sort_keys = {'equalize': equalize,
                     'left_priority_window': left_priority_window}

        once = True

        while self.has_more_work() and not self.should_stop():
            # acquire outside the try block: if acquire() itself fails the
            # finally clause must not release a lock we do not hold
            self.output_area_lock.acquire()
            try:
                alive_workers = [w for w in self.job_master.worker_registry.alive_workers.values()
                                 if hasattr(w,'snapshot_task') and w.snapshot_task is not None]

                snapshot_pattern = os.path.join(self.run_data.basedir,'dat',SNAPSHOT_PREFIX+'*')
                self.snapshot_tasks = dict((task.snapshot_name,task) for task in
                                           [LQCDTaskIn(snapshot_name,ntraj=self.run_data.ntraj)
                                            for snapshot_name in glob.glob(snapshot_pattern)])

                # (beta,seed) pairs currently being processed; computed once
                # per pass instead of once per candidate task
                active_snapshots = set((w.snapshot_task.beta,w.snapshot_task.seed) for w in alive_workers)

                unassigned_snapshot_tasks = [task for task in self.snapshot_tasks.values()
                                             if (task.beta,task.seed) not in active_snapshots]

                unassigned_snapshot_tasks.sort(key=sort_keys[self.run_data.compare_by])

                # NOTE: a sanity check that no (beta,seed) pair is held by
                # two alive workers at once used to live here (it set
                # self.errors_found and returned); it is currently disabled.

                if once:
                    logger.info('')
                    logger.info('unassigned snapshot tasks (%d)',len(unassigned_snapshot_tasks))
                    for t in unassigned_snapshot_tasks:
                        logger.info(str(t.snapshot_name))

                once = False

                # copy: scheduling may change the registry while we iterate
                waiting_workers = list(self.job_master.worker_registry.waiting_workers.values())

                for w in waiting_workers:
                    # workers may not carry the attribute yet (the
                    # alive_workers filter above guards for the same case)
                    if getattr(w,'snapshot_task',None) is None:
                        if not unassigned_snapshot_tasks:
                            # more workers than available snapshots: skip
                            # only this worker, later waiting workers that
                            # already own a snapshot must still be scheduled
                            continue
                        w.snapshot_task = unassigned_snapshot_tasks.pop()
                    else:
                        #FIXME: this is ugly and should ideally be implemented in tasks_completed() but there is currently no way
                        # of associating the worker to the completed task in that method...
                        w.snapshot_task.ntraj_count += w.snapshot_task.ntraj

                    # allocate a task id only when a task is really scheduled
                    self.tid += 1
                    t = TaskInfo(self.tid)
                    t.task_input = w.snapshot_task

                    logger.info('scheduling (%s,%s,%d) -> %s',t.task_input.beta,t.task_input.seed,t.task_input.ntraj_count,w.wid)
                    self.job_master.schedule(w,[t])
            finally:
                self.output_area_lock.release()
            time.sleep(1)