def initialize(self, init):
    """ This method is called only once per worker agent.
        The init parameters are sent by the master (they are the result of
        calling the make_init() method of the application manager).
    """
    logger.info('*** initialize: worker id=%d', self._agent.wid)
    self.ftc = createFileTransferClient(init.file_server_url, program=self._agent.program)
    self.parameters_template_file = init.parameters_template_file
    ##KUBA: AMD FIX BEGIN
    #global EXE_NAME
    #if file('/proc/cpuinfo').read().find('Opteron') != -1:
    #    EXE_NAME += '.amd_opteron'
    ##KUBA: AMD FIX END
    self.ftc.download_file(TAR_NAME)
    if os.system("tar xfzv %s" % TAR_NAME) != 0:
        raise Exception('cannot extract tarfile %s' % TAR_NAME)
    #chmod_executable(EXE_NAME)
    self.ftc.download_file(init.parameters_template_file)
    return EXE_NAME
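# A minimal sketch, not part of this module: the extraction step above could be
# done with the standard-library tarfile module instead of shelling out to tar,
# which raises a proper exception instead of returning a raw exit code. TAR_NAME
# is the same module-level constant used by initialize(); the helper name is
# made up for illustration.
#
#   import tarfile
#
#   def extract_worker_tarball(tar_name=TAR_NAME):
#       # open the gzip-compressed archive and unpack it into the cwd
#       tf = tarfile.open(tar_name, 'r:gz')
#       try:
#           tf.extractall()
#       finally:
#           tf.close()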
def tasks_completed(self, tasks):
    try:
        self.output_area_lock.acquire()
        bdir = self.run_data.basedir
        for t in tasks:
            elapsed_wallclock_time = t.task_output.info.elapsed_wallclock_time
            try:
                ru1, ru2 = t.task_output.info.rusage1[:], t.task_output.info.rusage2[:]
            except TypeError:
                ru1, ru2 = None, None
            # save resource usage information for further processing
            self.job_master.journal.addEntry('lqcd_task_resources', tid=t.tid, wid=t.details.assigned_wid,
                                             elapsed_wallclock_time=elapsed_wallclock_time,
                                             usage_before=ru1, usage_after=ru2)
            tin0 = t.task_input           # ntraj at task start
            tin1 = t.task_output.task_in  # ntraj at task end
            logger.info('starting moving file set (%s,%s)', tin0.beta, tin0.seed)
            for f in t.task_output.outputs:
                f.dest = os.path.join(bdir, f.dest)  # put the files into the tmp area
                f.write()
            # make a backup of the current snapshot and results
            rename(tin0.dat_fn(SNAPSHOT_PREFIX, bdir), tin0.bak_fn(SNAPSHOT_PREFIX, bdir))
            for fn in ['fort.15']:
                if os.path.exists(tin0.dat_fn(fn, bdir)):
                    copyfile(tin0.dat_fn(fn, bdir), tin0.bak_fn(fn, bdir))
            # move the new snapshot from the tmp to the dat area
            rename(tin1.tmp_fn(SNAPSHOT_PREFIX, bdir), tin1.dat_fn(SNAPSHOT_PREFIX, bdir))
            for fn in ['fort.15']:
                append_to_file(tin0.dat_fn(fn, bdir), tin1.tmp_fn(fn, bdir))
                remove(tin1.tmp_fn(fn, bdir))
                rename(tin0.dat_fn(fn, bdir), tin1.dat_fn(fn, bdir))  # new
            logger.info('ended moving file set (%s,%s)', tin0.beta, tin0.seed)
    finally:
        self.output_area_lock.release()
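# For reference, a self-contained sketch of the backup-then-promote pattern used
# above: keep a backup copy of the current result, then move the freshly
# produced file from the tmp area into place. The paths and the helper name are
# hypothetical; only the os/shutil calls are standard library.
#
#   import os
#   from shutil import copyfile
#
#   def backup_and_promote(dat_path, bak_path, tmp_path):
#       # back up the current result before it is replaced
#       if os.path.exists(dat_path):
#           copyfile(dat_path, bak_path)
#       # promote the new file produced by the completed task
#       os.rename(tmp_path, dat_path)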
def worker_initialized(self, w):
    logger.info('worker wid=%d initialized OK, executable: %s', w.wid, w.init_output)
def run(self):
    logger.info('calling run()')
    # must call base class implementation first
    ITaskScheduler.run(self)
    once = True
    while self.has_more_work() and not self.should_stop():
        try:
            self.output_area_lock.acquire()
            alive_workers = [w for w in self.job_master.worker_registry.alive_workers.values()
                             if hasattr(w, 'snapshot_task') and w.snapshot_task is not None]
            self.snapshot_tasks = dict([(task.snapshot_name, task)
                                        for task in [LQCDTaskIn(snapshot_name, ntraj=self.run_data.ntraj)
                                                     for snapshot_name in glob.glob(os.path.join(self.run_data.basedir, 'dat', SNAPSHOT_PREFIX + '*'))]])
            unassigned_snapshot_tasks = [task for task in self.snapshot_tasks.values()
                                         if (task.beta, task.seed) not in
                                         [(w.snapshot_task.beta, w.snapshot_task.seed) for w in alive_workers]]

            # give priority to the tasks which have the fewest iterations done so far (ntraj_count);
            # tasks with the most iterations will appear first on the list
            def equalize(t1, t2):
                return -cmp(t1.ntraj_count, t2.ntraj_count)

            # give priority to the tasks closer to the left edge of the [m,M] window;
            # outside of this window just equalize
            #m = 5.1815
            #M = 5.18525
            def left_priority_window(t1, t2):
                b1, b2 = float(t1.beta), float(t2.beta)
                m, M = self.run_data.compare_params
                def in_range(x):
                    return m <= x <= M
                if in_range(b1):
                    if in_range(b2):
                        if b1 < b2:
                            return 1
                        elif b1 > b2:
                            return -1
                        else:
                            return equalize(t1, t2)
                    else:
                        return 1
                else:
                    if in_range(b2):
                        return -1
                    else:
                        return equalize(t1, t2)

            # the comparator is selected by name ('equalize' or 'left_priority_window')
            unassigned_snapshot_tasks.sort(locals()[self.run_data.compare_by])

            # security check to make sure that the same snapshot is never done by two workers at the same time
            _check_active_snapshots = [(w.snapshot_task.beta, w.snapshot_task.seed) for w in alive_workers]
            #if len(set(_check_active_snapshots)) != len(_check_active_snapshots):
            #    logger.error("same snapshot assigned to worker more than once (beta,seed,wid):")
            #    logger.error(str([zip(_check_active_snapshots,[w.wid for w in alive_workers])]))
            #    self.errors_found = True
            #    return

            if once:
                logger.info('')
                logger.info('unassigned snapshot tasks (%d)', len(self.snapshot_tasks))
                for t in unassigned_snapshot_tasks:
                    logger.info(str(t.snapshot_name))
                once = False

            waiting_workers = self.job_master.worker_registry.waiting_workers.values()
            #logger.info('waiting workers: %s',[w.wid for w in waiting_workers])
            #logger.info('unassigned snapshot tasks: %s',[(t.beta,t.seed) for t in unassigned_snapshot_tasks])
            for w in waiting_workers:
                self.tid += 1
                t = TaskInfo(self.tid)
                if w.snapshot_task is None:
                    try:
                        w.snapshot_task = unassigned_snapshot_tasks.pop()
                    except IndexError:
                        # more workers than available snapshots
                        break
                else:
                    #FIXME: this is ugly and should ideally be implemented in tasks_completed(), but there
                    # is currently no way of associating the worker with the completed task in that method...
                    w.snapshot_task.ntraj_count += w.snapshot_task.ntraj
                t.task_input = w.snapshot_task
                logger.info('scheduling (%s,%s,%d) -> %s', t.task_input.beta, t.task_input.seed, t.task_input.ntraj_count, w.wid)
                self.job_master.schedule(w, [t])
        finally:
            self.output_area_lock.release()
        time.sleep(1)
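# A standalone illustration (Python 2, since sort() is given a cmp-style
# comparator above) of the left_priority_window ordering: betas inside the
# [m,M] window sort towards the end of the list, smallest beta last, so that
# pop() hands them out first. The equalize() tie-break is omitted here and the
# beta values are made up.
#
#   m, M = 5.18, 5.19
#   def in_range(x): return m <= x <= M
#   def left_priority_window(t1, t2):
#       b1, b2 = t1['beta'], t2['beta']
#       if in_range(b1) and in_range(b2): return cmp(b2, b1)  # smaller beta sorts later
#       if in_range(b1): return 1   # in-window sorts after out-of-window
#       if in_range(b2): return -1
#       return 0
#   tasks = [{'beta': b} for b in (5.17, 5.185, 5.20, 5.182)]
#   tasks.sort(left_priority_window)
#   # tasks.pop() now yields beta=5.182 first, then 5.185: the left edge wins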
def initialize(self, run_data):
    self.run_data = run_data
    self.run_data.basedir = os.path.abspath(run_data.basedir)
    self.worker_init.file_server_url = self.run_data.file_server_url
    self.worker_init.parameters_template_file = self.run_data.parameters_template_file
    logger.info('basedir %s', run_data.basedir)
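# For orientation, the run_data object consumed by this scheduler is assumed to
# carry at least the attributes read above and in run(); a hypothetical
# stand-in for testing could look like this (all values are examples only):
#
#   class RunData(object):
#       basedir = '/data/lqcd/run1'                # working area holding the dat/ snapshots
#       file_server_url = 'http://master:8000'     # passed on to the workers
#       parameters_template_file = 'params.tpl'
#       ntraj = 100                                # trajectories per task
#       compare_by = 'left_priority_window'        # comparator selected in run()
#       compare_params = (5.1815, 5.18525)         # the [m,M] beta window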
def finalize(self, x):
    """ This method is called only once per worker agent. """
    logger.info('*** finalize: worker id=%d', self._agent.wid)