Example #1
    def initialize(self,init):
        """
        This method is called only once per worker agent.
        The init parameters are sent by the master (and they result of calling make_init() method of the application manager).
        """
        logger.info('*** initialize: worker id=%d',self._agent.wid)

        self.ftc = createFileTransferClient(init.file_server_url,program=self._agent.program)

        self.parameters_template_file = init.parameters_template_file

        # KUBA: AMD FIX (disabled): select the Opteron-specific binary when running on AMD
        # global EXE_NAME
        # if open('/proc/cpuinfo').read().find('Opteron') != -1:
        #     EXE_NAME += '.amd_opteron'
        
        self.ftc.download_file(TAR_NAME)

        if os.system("tar xfzv %s" % TAR_NAME) != 0:
            raise Exception('cannot extract tarfile %s' % TAR_NAME)

        #chmod_executable(EXE_NAME)
        
        self.ftc.download_file(init.parameters_template_file)
        return EXE_NAME
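For context, here is a minimal sketch of the master-side make_init() mentioned in the docstring above. The WorkerInit container class and the make_init() signature are assumptions; only the file_server_url and parameters_template_file field names are taken from the examples.

class WorkerInit(object):
    """Hypothetical container for the parameters shipped to each worker."""
    def __init__(self, file_server_url, parameters_template_file):
        self.file_server_url = file_server_url
        self.parameters_template_file = parameters_template_file

def make_init(run_data):
    # build the init object that the master sends to every worker agent;
    # the worker's initialize() above receives it as its 'init' argument
    return WorkerInit(run_data.file_server_url,
                      run_data.parameters_template_file)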
Example #2
    def tasks_completed(self, tasks):
        self.output_area_lock.acquire()
        try:

            bdir = self.run_data.basedir
            for t in tasks:
                elapsed_wallclock_time = t.task_output.info.elapsed_wallclock_time
                try:
                    ru1,ru2 = t.task_output.info.rusage1[:],t.task_output.info.rusage2[:]
                except TypeError:
                    ru1,ru2 = None,None

                # save resource usage information for further processing
                self.job_master.journal.addEntry('lqcd_task_resources', tid=t.tid, wid=t.details.assigned_wid,
                                                 elapsed_wallclock_time=elapsed_wallclock_time,
                                                 usage_before=ru1, usage_after=ru2)

                tin0 = t.task_input # ntraj at task start
                tin1 = t.task_output.task_in # ntraj at task end
                
                logger.info('starting moving file set (%s,%s)',tin0.beta,tin0.seed)
                for f in t.task_output.outputs:
                    f.dest = os.path.join(bdir,f.dest) # put the files into the tmp area
                    f.write()

                # make backup of current snapshot and results
                rename(tin0.dat_fn(SNAPSHOT_PREFIX,bdir), tin0.bak_fn(SNAPSHOT_PREFIX,bdir))
                
                for fn in ['fort.15']:
                    if os.path.exists(tin0.dat_fn(fn,bdir)):
                        copyfile(tin0.dat_fn(fn,bdir), tin0.bak_fn(fn,bdir))

                # move the new snapshot from tmp to dat area
                rename(tin1.tmp_fn(SNAPSHOT_PREFIX,bdir), tin1.dat_fn(SNAPSHOT_PREFIX,bdir))

                for fn in ['fort.15']:
                    append_to_file(tin0.dat_fn(fn,bdir), tin1.tmp_fn(fn,bdir))
                    remove(tin1.tmp_fn(fn,bdir))
                    rename(tin0.dat_fn(fn,bdir), tin1.dat_fn(fn,bdir)) # new
                logger.info('ended moving file set (%s,%s)',tin0.beta,tin0.seed)                
        finally:
            self.output_area_lock.release()
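The dat_fn/tmp_fn/bak_fn helpers used above are not shown in these examples. Below is a minimal sketch of a possible implementation, assuming dat/, tmp/ and bak/ subdirectories of the base directory and a prefix+beta_seed naming scheme; the real LQCDTaskIn (constructed from a snapshot filename in Example #4) is simplified here for illustration.

import os

class LQCDTaskIn(object):
    # only the path helpers are sketched; beta and seed are assumed to
    # identify the snapshot, as they do in tasks_completed() above
    def __init__(self, beta, seed):
        self.beta, self.seed = beta, seed

    def _fn(self, area, prefix, bdir):
        return os.path.join(bdir, area, '%s%s_%s' % (prefix, self.beta, self.seed))

    def dat_fn(self, prefix, bdir):   # current results area
        return self._fn('dat', prefix, bdir)

    def tmp_fn(self, prefix, bdir):   # freshly uploaded worker output
        return self._fn('tmp', prefix, bdir)

    def bak_fn(self, prefix, bdir):   # backup of the previous snapshot
        return self._fn('bak', prefix, bdir)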
Example #3
    def worker_initialized(self,w):
        logger.info('worker wid=%d initialized OK, executable: %s',w.wid,w.init_output)
Example #4
    def run(self):
        logger.info('calling run()')
        # must call base class implementation first
        ITaskScheduler.run(self)

        once = True
        
        while self.has_more_work() and not self.should_stop():
            self.output_area_lock.acquire()
            try:

                alive_workers = [w for w in self.job_master.worker_registry.alive_workers.values() if hasattr(w,'snapshot_task') and w.snapshot_task is not None]

                self.snapshot_tasks = dict([(task.snapshot_name,task) for task in
                                           [LQCDTaskIn(snapshot_name,ntraj=self.run_data.ntraj) for snapshot_name in
                                            glob.glob(os.path.join(self.run_data.basedir,'dat',SNAPSHOT_PREFIX+'*'))]])

                unassigned_snapshot_tasks = [task for task in self.snapshot_tasks.values() if (task.beta,task.seed) not in
                                             [(w.snapshot_task.beta,w.snapshot_task.seed) for w in alive_workers]]


                # give priority to the tasks with the fewest iterations done so far (ntraj_count):
                # tasks with the most iterations will appear first in the list
                def equalize(t1,t2):
                    return -cmp(t1.ntraj_count,t2.ntraj_count)


                # give priority to the tasks closer to the left edge of [m,M] window
                # outside of this window just equalize

                #m = 5.1815
                #M = 5.18525

                def left_priority_window(t1,t2):
                    b1,b2 = float(t1.beta) ,float(t2.beta)

                    m,M = self.run_data.compare_params
                    
                    def in_range(x):
                        return m <= x <= M

                    if in_range(b1):
                        if in_range(b2):
                            if b1<b2:
                                return 1
                            elif b1>b2:
                                return -1
                            else:
                                return equalize(t1,t2)
                        else:
                            return 1
                    else:
                        if in_range(b2):
                            return -1
                        else:
                            return equalize(t1,t2)

                unassigned_snapshot_tasks.sort(locals()[self.run_data.compare_by])

                # security check (currently disabled): make sure that the same snapshot is
                # never processed by two workers at the same time
                _check_active_snapshots = [(w.snapshot_task.beta,w.snapshot_task.seed) for w in alive_workers]
                #if len(set(_check_active_snapshots)) != len(_check_active_snapshots):
                #    logger.error("same snapshot assigned to a worker more than once (beta,seed,wid):")
                #    logger.error(str(zip(_check_active_snapshots,[w.wid for w in alive_workers])))
                #    self.errors_found = True
                #    return
                

                if once:
                    logger.info('')
                    logger.info('unassigned snapshot tasks (%d)',len(unassigned_snapshot_tasks))
                    for t in unassigned_snapshot_tasks:
                        logger.info(str(t.snapshot_name))

                once = False
                
                waiting_workers = self.job_master.worker_registry.waiting_workers.values()

                #logger.info('waiting workers: %s',[w.wid for w in waiting_workers])
                #logger.info('unassigned snapshot tasks: %s',[(t.beta,t.seed) for t in unassigned_snapshot_tasks])
                
                for w in waiting_workers:
                    self.tid += 1
                    t = TaskInfo(self.tid)

                    if w.snapshot_task is None:
                        try:
                            w.snapshot_task = unassigned_snapshot_tasks.pop()
                        except IndexError:
                            # more workers than available snapshots
                            break
                    else:
                        # FIXME: this is ugly and should ideally be implemented in tasks_completed(),
                        # but there is currently no way of associating the worker with the completed
                        # task in that method...
                        w.snapshot_task.ntraj_count += w.snapshot_task.ntraj
                        
                    t.task_input = w.snapshot_task
                    
                    logger.info('scheduling (%s,%s,%d) -> %s',t.task_input.beta,t.task_input.seed,t.task_input.ntraj_count,w.wid)
                    self.job_master.schedule(w,[t])
            finally:
                self.output_area_lock.release()
            time.sleep(1)
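The cmp-style comparators and the list.sort(comparator) call above are Python 2 idioms. Under Python 3 the same ordering can be recovered with functools.cmp_to_key; a minimal, self-contained sketch (the Task stand-in replaces LQCDTaskIn for demonstration only):

import functools
from collections import namedtuple

def cmp(a, b):
    # drop-in replacement for the Python 2 built-in removed in Python 3
    return (a > b) - (a < b)

def equalize(t1, t2):
    # tasks with the most completed iterations sort first,
    # so .pop() hands out the least-advanced task
    return -cmp(t1.ntraj_count, t2.ntraj_count)

Task = namedtuple('Task', 'ntraj_count')        # stand-in for LQCDTaskIn
tasks = [Task(5), Task(1), Task(3)]
tasks.sort(key=functools.cmp_to_key(equalize))  # -> [Task(5), Task(3), Task(1)]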
Example #5
    def initialize(self,run_data):
        self.run_data = run_data
        self.run_data.basedir = os.path.abspath(run_data.basedir)
        self.worker_init.file_server_url = self.run_data.file_server_url
        self.worker_init.parameters_template_file = self.run_data.parameters_template_file
        logger.info('basedir %s',run_data.basedir)
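The run_data object carries the whole run configuration. A sketch of what it could contain, collecting the field names used across the examples; the container class and all values are placeholders (only the compare_params window reuses the values from the commented-out lines in Example #4).

class RunData(object):
    """Hypothetical run configuration assembled from its uses above."""
    def __init__(self):
        self.basedir = 'runs/current'             # working area (dat/tmp/bak live under it)
        self.file_server_url = 'http://fileserver.example.org:8000'
        self.parameters_template_file = 'params.template'
        self.ntraj = 100                          # trajectories per scheduled task
        self.compare_by = 'left_priority_window'  # comparator picked in run()
        self.compare_params = (5.1815, 5.18525)   # [m, M] beta priority window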
Example #6
    def finalize(self,x):
        """ This method is called only once per worker agent.
        """
        logger.info('*** finalize: worker id=%d',self._agent.wid)