def get_task_data(self, worker_uuid):
    """Hand out the next scheduled task to the calling worker.

    Returns a tuple (tid, serialized task_input).  If no task is
    currently scheduled for this worker, raises
    DIANE_CORBA.XRepeatCall(0) so the worker retries later.
    """
    import Queue
    wid = self._resolve_wid(worker_uuid)
    logger.debug('get_task_data %d', wid)
    self.journal.addEntry('get_task_request', wid=wid)
    # Look up the worker and take its lock BEFORE entering the try block:
    # in the original code both calls were inside the try, so a failure in
    # get() left worker_entry undefined in the finally clause (NameError
    # masking the real exception) and a failure in acquire() caused a
    # release() of a lock that was never taken.
    worker_entry = self.worker_registry.get(wid)
    worker_entry.alive_lock.acquire()
    try:
        self.update_contact(wid)
        require_worker_initialized(worker_entry)
        try:
            task_info = worker_entry.scheduled_tasks.get(block=False)
            logger.debug('removed from scheduled_tasks queue:%d,%s', task_info.tid, repr(task_info.task_input))
        except Queue.Empty:
            # Nothing scheduled yet -- tell the worker to call again later.
            self.journal.addEntry('get_task_data_error', wid=wid, msg=1)
            raise DIANE_CORBA.XRepeatCall(0)
        else:
            # Move the task from the scheduled queue to the processing set
            # and stamp its start time before shipping it out.
            worker_entry.processing_tasks[task_info.tid] = task_info
            self.worker_registry.update_cache(worker_entry)
            task_info.details.time_start = time.time()
            self.journal.addEntry('get_task_data', wid=wid, tid=task_info.tid)
            return (task_info.tid, streamer.dumps(task_info.task_input))
    finally:
        worker_entry.alive_lock.release()
def do_work(self, task_data): try: logger.debug("InprocessApplicationProxy.do_work()") task_data = streamer.loads(task_data) task_result = self.app.do_work(task_data) return streamer.dumps(task_result) except Exception, x: handleApplicationFailure(x)
def run(self):
    """Main worker loop.

    Registers with the master, fetches the application boot/init data,
    creates and initializes the application proxy, then repeatedly pulls
    tasks from the master, executes them and pushes the results back,
    emitting MSG monitoring events at each stage.
    """
    import MSGWrap
    from diane.config import log_configuration
    log_configuration(title='initial configuration')
    # Monitoring payload shared by all sendStatus() calls below.
    msg_data = {'_worker_uuid': self.uuid}
    try:
        self.registerToMaster()
        master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop=self.should_stop)
        app_boot, app_init = master.get_init_data(self.uuid)  #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'get_init_data',self.uuid)
        _boot = streamer.loads(app_boot)
        msg_data['_master_uuid'] = _boot.master_uuid
        msg_data['_runid'] = _boot.runid
        import os
        msg_data['ganga_job_uuid'] = self.ganga_job_uuid
        # FIXME: if worker restart enabled, save diane.config.__all_configs and restore it after run has finished
        MSGWrap.sendStatus('_worker_create_application_proxy_start', msg_data)
        self.application = create_application_proxy(app_boot, app_init, agent=self)
        MSGWrap.sendStatus('_worker_create_application_proxy_finish', msg_data)
        # Ensure the application is finalized even on abnormal termination.
        self.program.registerAtExitHandler(self.finalize_application)
        MSGWrap.sendStatus('_worker_initialize_start', msg_data)
        app_init_output = self.application.initialize(app_init)
        MSGWrap.sendStatus('_worker_initialize_finish', msg_data)
        # config may have been updated and the value of config.HEARTBEAT_DELAY may have changed -> need to create the object again
        # FIXME: use a REFERENCE to config.HEARTBEAT_DELAY
        master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop=self.should_stop)
        master.put_init_result(self.uuid, app_init_output, 0)  #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'put_init_result',self.uuid,app_init_output,0)
        # Task pull/push loop: runs until the agent is told to stop.
        while not self.should_stop():
            time.sleep(config.PULL_REQUEST_DELAY)  # PENDING: this parameter should be dynamically controlled by the master
            tid, task_data = master.get_task_data(self.uuid)  #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'get_task_data',self.uuid)
            try:
                msg_data['tid'] = tid
                MSGWrap.sendStatus('_worker_do_work_start', msg_data)
                task_result = self.application.do_work(task_data)
                MSGWrap.sendStatus('_worker_do_work_finish', msg_data)
                error = 0
            except diane.application.ApplicationFailure, x:
                # recoverable problem: ship the serialized failure back to
                # the master with the error flag set
                task_result = streamer.dumps(x)
                error = 1
            # FIXME: reporting failure is not yet well-defined
            master.put_task_result(self.uuid, tid, task_result, error)  #(config.HEARTBEAT_DELAY,-1,self.should_stop,self.master,'put_task_result',self.uuid,tid,task_result,error)
    except diane.application.ApplicationFailure, x:
        # recoverable problem but raised by the application init
        pass
def initialize(self, app_init): try: logger.debug("InprocessApplicationProxy.initialize()") app_init = streamer.loads(app_init) app_init_output = self.app.initialize(app_init) app_init_output = streamer.dumps(app_init_output) return app_init_output except Exception, x: handleApplicationFailure(x)
def get_init_data(self, worker_uuid):
    """Return (application boot data, serialized init input) for a worker.

    During the execution of this method no tasks may be scheduled to
    this worker because it is not yet in the cache waiting list.
    """
    wid = self._resolve_wid(worker_uuid)
    logger.debug('get_init_data %d', wid)
    self.update_contact(wid)
    entry = self.worker_registry.get(wid)
    require_worker_initialized(entry, False)
    # NOTE(review): purpose of self.CNT is not visible in this file --
    # presumably a debug/handshake counter; confirm before relying on it.
    self.CNT = 0
    self.journal.addEntry('get_init_data', wid=wid)
    return (self.app_boot_data, streamer.dumps(entry.init_input))
def startProcessing(self, input):
    """Start the run on the master side.

    Brings up the default file transfer server, journals the master
    start, prepares the serialized application boot data and defines the
    crash handler used to dump state and shut down on unhandled thread
    exceptions.
    """
    import diane.util
    from diane.config import log_configuration
    log_configuration()
    # start the default file server
    import diane.FileTransfer
    self.file_server = diane.FileTransfer.Server.main('FileTransferOID', self.server)
    # FIXME: segmentation fault if object reference is passed directly, workaround via stringified IOR
    self.file_server_ior = self.server.orb.object_to_string(self.file_server)
    import os
    self.journal.addEntry('master_start', runid=self.runid(),
                          application_name=input.application.__name__,
                          name=os.path.basename(input._runfile))
    # prepare application boot
    import application
    boot_data = application.make_boot_data(input)
    boot_data.runid = self.runid()
    boot_data.master_uuid = self.uuid
    self.app_boot_data = streamer.dumps(boot_data)
    # TEST: trigger race condition with registerWorker(): a fast worker registers before this method is completed
    # this problem should be fixed now
    #import time
    #time.sleep(5)
    def thread_crash_handler(t):
        # Delegate to the thread's _run(); on any unhandled exception dump
        # the worker registry for post-mortem analysis and shut the master
        # down.
        try:
            logger.debug('Crash handler started: %s', t.__class__.__name__)
            return t._run()
        except Exception, x:
            logger.exception('Information from crash handler (%s): unhandled exception: %s', t.__class__.__name__, x)
            logger.info('Stopping the RunMaster and dumping the state into the "crash-dump.pickle" file')
            import pickle
            pickle.dump(self.worker_registry, file("crash-dump.pickle", 'w'))
            self.shutdown()