def finalize(self, cookie):
    """Deserialize the finalization cookie and forward it to the wrapped application.

    Any exception raised by the application is routed through
    handleApplicationFailure rather than propagating raw.
    """
    try:
        logger.debug("InprocessApplicationProxy.finalize()")
        self.app.finalize(streamer.loads(cookie))
    except Exception as x:
        handleApplicationFailure(x)
def do_work(self, task_data):
    """Deserialize task_data, execute the wrapped application's do_work
    and return the serialized task result.

    Application exceptions are routed through handleApplicationFailure.
    """
    try:
        logger.debug("InprocessApplicationProxy.do_work()")
        result = self.app.do_work(streamer.loads(task_data))
        return streamer.dumps(result)
    except Exception as x:
        handleApplicationFailure(x)
def run(self):
    """Main worker-agent loop.

    Register with the master, fetch the application boot/init data, create
    and initialize the application proxy, then repeatedly pull tasks from
    the master, execute them and report results until should_stop() is true.
    A status message is published via MSGWrap at each phase transition.
    """
    import MSGWrap
    from diane.config import log_configuration
    log_configuration(title='initial configuration')
    # msg_data is the payload attached to every MSGWrap status message
    msg_data = { '_worker_uuid' : self.uuid }
    try:
        self.registerToMaster()
        # StandingCall wraps calls to the master with heartbeat-paced retries,
        # aborting when should_stop() becomes true
        master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop = self.should_stop)
        app_boot,app_init = master.get_init_data(self.uuid)
        _boot = streamer.loads(app_boot)
        msg_data['_master_uuid'] = _boot.master_uuid
        msg_data['_runid'] = _boot.runid
        import os
        msg_data['ganga_job_uuid'] = self.ganga_job_uuid
        # FIXME: if worker restart enabled, save diane.config.__all_configs and restore it after run has finished
        MSGWrap.sendStatus('_worker_create_application_proxy_start', msg_data)
        self.application = create_application_proxy(app_boot,app_init,agent=self)
        MSGWrap.sendStatus('_worker_create_application_proxy_finish', msg_data)
        # make sure the application is finalized even on abnormal exit
        self.program.registerAtExitHandler(self.finalize_application)
        MSGWrap.sendStatus('_worker_initialize_start', msg_data)
        app_init_output = self.application.initialize(app_init)
        MSGWrap.sendStatus('_worker_initialize_finish', msg_data)
        # config may have been updated and the value of config.HEARTBEAT_DELAY may have changed -> need to create the object again
        # FIXME: use a REFERENCE to config.HEARTBEAT_DELAY
        master = StandingCall(self.master, config.HEARTBEAT_DELAY, should_stop = self.should_stop)
        master.put_init_result(self.uuid,app_init_output,0)
        # task pull/execute/report loop
        while not self.should_stop():
            time.sleep(config.PULL_REQUEST_DELAY) # PENDING: this parameter should be dynamically controlled by the master
            tid,task_data = master.get_task_data(self.uuid)
            try:
                msg_data['tid'] = tid
                MSGWrap.sendStatus('_worker_do_work_start', msg_data)
                task_result = self.application.do_work(task_data)
                MSGWrap.sendStatus('_worker_do_work_finish', msg_data)
                error = 0
            except diane.application.ApplicationFailure,x:
                # recoverable problem: report the serialized failure as an
                # errored task result instead of crashing the worker
                task_result = streamer.dumps(x)
                error = 1
            #FIXME: reporting failure is not yet well-defined
            master.put_task_result(self.uuid,tid,task_result,error)
    except diane.application.ApplicationFailure,x:
        # recoverable problem but raised by the application init
        pass
def initialize(self, app_init):
    """Deserialize app_init, run the wrapped application's initialize()
    and return its serialized output.

    Application exceptions are routed through handleApplicationFailure.
    """
    try:
        logger.debug("InprocessApplicationProxy.initialize()")
        init_data = streamer.loads(app_init)
        return streamer.dumps(self.app.initialize(init_data))
    except Exception as x:
        handleApplicationFailure(x)
def create_application_proxy(boot_msg, app_init, agent, **kwds):
    """Bootstrap the application on the worker node.

    Deserializes the boot message, installs the application's DAR payload
    (a tar.gz archive) onto sys.path if one was shipped, restores the
    master's configuration locally, and runs the application's optional
    setup_application hook.

    boot_msg and app_init are streamer-serialized blobs received from the
    master; agent is the local worker agent object.
    """
    boot = streamer.loads(boot_msg)
    import os
    if boot.darname:
        # fetch the application tarball from the master's file transfer
        # channel and unpack it into a private _python directory
        agent.ftc.download(boot.darname)
        # NOTE(review): extractall() does not sanitize member paths; this
        # assumes the archive comes from a trusted master -- confirm
        dar = tarfile.open(boot.darname, "r:gz")
        try:
            dar.extractall("_python")
        except AttributeError: # python < 2.5
            os.system("mkdir -p _python")
            os.system("cd _python; tar xfzv ../%s" % boot.darname)
        import sys
        app_python_path = os.path.abspath("_python")
        sys.path.insert(0, app_python_path)
    # adopt the configuration shipped by the master, then log it
    diane.config.restore_config(boot.config)
    logger.info("application boot and run data received")
    boot.log()
    diane.config.log_configuration(title="updated configuration")
    boot.agent = agent
    c = diane.config.getConfig("WorkerAgent")
    boot.application_shell_command = c.APPLICATION_SHELL
    boot.application_shell_pre_process = ""
    boot.application_shell_post_process = ""
    # perform a setup action of the application
    setup_application = importName(boot.name, "setup_application")
    if setup_application:
        try:
            r = setup_application(streamer.loads(app_init), agent)
            # the hook may return a (pre_process, post_process) pair of
            # shell snippets wrapped around the application shell command
            if not r is None:
                boot.application_shell_pre_process, boot.application_shell_post_process = r
        except Exception, x:
            handleApplicationFailure(x)
def put_init_result(self, worker_uuid, init_result, error):
    """Record a worker's application-initialization output and notify the scheduler.

    init_result arrives streamer-serialized; error is the worker-reported
    error flag. Scheduler callback failures are logged, not propagated.
    """
    # during the execution of this method the tasks may not be scheduled
    # to this worker because it is not in the cache waiting list
    wid = self._resolve_wid(worker_uuid)
    init_result = streamer.loads(init_result)
    logger.debug('put_init_result %d %s', wid, repr(init_result))
    self.update_contact(wid)
    worker_entry = self.worker_registry.get(wid)
    require_worker_initialized(worker_entry, False)
    worker_entry.init_output = init_result
    self.journal.addEntry('put_init_result', wid=wid)
    try:
        logger.debug('task_scheduler.worker_initialized(w) w.wid=%d w.worker_uuid=%s' % (worker_entry.wid, worker_entry.worker_uuid))
        self.task_scheduler.worker_initialized(worker_entry)
    except Exception:
        logger.exception('Error in TaskScheduler.worker_initialized() callback')
def put_task_result(self,worker_uuid,tid,task_result,error): task_result = streamer.loads(task_result) wid = self._resolve_wid(worker_uuid) logger.debug('put_task_data %d %d %s',wid,tid,repr(task_result)) self.journal.addEntry('put_task_result_request',wid=wid,tid=tid,error=error) try: worker_entry = self.worker_registry.get(wid) worker_entry.alive_lock.acquire() self.update_contact(wid) require_worker_initialized(worker_entry) # protect against multiple calls from the same worker with the same task # this may happen not only because of the login error in the Worker Agent # running lattice qcd application I observed TRANSIENT exception on the worker # but the call apparently made it to the master try: task_info=worker_entry.processing_tasks[tid] except KeyError: logger.debug('ignored multiple call to put_task_data() %d %d',wid,tid) return task_info.details.time_finish = time.time() del worker_entry.processing_tasks[tid] self.journal.addEntry('put_task_result',wid=wid,tid=tid,error=error) if error: task_info.update(TaskStatus.FAILED,task_result) try: logger.debug('task_scheduler.tasks_failed(%s)'%str([task_info.tid])) logger.warning('task %s (%s) failed: %s',task_info.tid,repr(task_info.application_label),task_result) self.task_scheduler.tasks_failed([task_info]) except Exception,x: logger.exception('Error in TaskScheduler.tasks_failed() callback') else: