class AppExecutor(object):
    '''
    Consumes resource and job messages from the app executor queue, assigns new
    jobs to running resources via the Selector, submits them over SSH, and
    reports finished jobs back to their creators.
    '''

    def __init__(self, config, config_section_name):
        logger.debug('Initializing AppExecutor')
        self.config_main_section_name = config_section_name
        self.config_files_section_name = 'files'
        self.queues_section_name = 'queues'
        self.config = config
        self.remote_config_fname = AppFNames.CONFIG
        self.remote_system_output_fname = AppFNames.SYSTEM_OUTPUT
        self.remote_duration_output_fname = AppFNames.DURATION_OUTPUT
        #TODO: retrieve app_output from the remote resource if necessary and send it to the job creator
        self.remote_app_output_fname = AppFNames.APP_OUTPUT
        self.remote_s3cfg_fname = AppFNames.S3CFG
        self.remote_pid_fname = AppFNames.PID
        try:
            self.res_manager = ResourceManager(self.config.get(self.config_main_section_name, 'resources_fpath'))
            self.job_manager = JobManager(self.config.get(self.config_main_section_name, 'jobs_fpath'))
            self.consumer = Consumer(self.config.get(self.queues_section_name, 'app_executor_queue'))
            self.producer_app_monitor = Producer(self.config.get(self.queues_section_name, 'app_monitor_queue'))
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError):
            raise CLAUDEConfigError
        self.state = AppExecutorSM()
        self.selector = Selector(self.res_manager, self.job_manager, self.state, self._submit_job)
        self.ssh_manager = SSHManager()
        # Re-register SSH connections for resources that were already running.
        running_resources = self.res_manager.get_resources_in_state(resource_states.RUNNING)
        for resource in running_resources:
            self.ssh_manager.add_resource(resource.host,
                                          TransportCredentials(resource.credentials.username,
                                                               resource.credentials.password,
                                                               proxy_host=resource.credentials.proxy_host,
                                                               proxy_username=resource.credentials.proxy_username))

    def start(self):
        # Main service loop: drain incoming messages, assign NEW jobs to
        # resources, and report finished jobs, until interrupted.
        if len(self.job_manager.get_jobs_in_state(job_states.RUNNING)) > 0:
            self._msg_create_ask_me()
        if len(self.job_manager.get_jobs_in_state(job_states.NEW)) > 0:
            self.state.set(AppExecutorSM.ASSIGNING)
        try:
            while True:
                while self._check_queue():
                    pass
                if self.state.get() == AppExecutorSM.ASSIGNING:
                    self.selector.update_hardware_resources()
                    self.selector.assign_jobs()
                if self.state.get() == AppExecutorSM.ASSIGNED:
                    self._msg_create_ask_me()
                    self.state.set(AppExecutorSM.WAITING)
                self._process_finished_jobs()
        except KeyboardInterrupt:
            logger.info('Caught control-C')

    def _process_finished_jobs(self):
        # Notify the creators of TERMINATED and FAILED jobs, mark them SENT,
        # then drop jobs that have already been reported.
        terminated_jobs = self.job_manager.get_jobs_in_state(job_states.TERMINATED)
        for job in terminated_jobs:
            self._msg_create_job_finished(job)
            add_timestamp(job, timestamps.SENT)
            job.state = job_states.SENT
            self.job_manager.save()
        failed_jobs = self.job_manager.get_jobs_in_state(job_states.FAILED)
        for job in failed_jobs:
            self._msg_create_job_finished(job)
            add_timestamp(job, timestamps.SENT)
            job.state = job_states.SENT
            self.job_manager.save()
        to_remove = {}
        sent_jobs = self.job_manager.get_jobs_in_state(job_states.SENT)
        for job in sent_jobs:
            to_remove[job.jid] = job
        for job in to_remove.values():
            self.job_manager.delete(job)

    def _msg_create_job_finished(self, job):
        a_msg = claude_msgs_pb2.Msg()
        a_msg.type = 10
        (msg, msg_type_name) = form_message(a_msg)
        msg.jid = job.jid
        if job.state == job_states.FAILED:
            msg.errorcode = 1
        if job.systemoutput:
            msg.systemoutput = job.systemoutput
        if job.durationoutput:
            msg.durationoutput = job.durationoutput
        msg_bytes = a_msg.SerializeToString()
        producer = Producer(job.returnqueue, host=job.returnip)
        producer.put(msg_bytes)

    def _msg_create_job_exists(self, jid, returnqueue, returnip):
        a_msg = claude_msgs_pb2.Msg()
        a_msg.type = 11
        (msg, msg_type_name) = form_message(a_msg)
        msg.jid = jid
        msg_bytes = a_msg.SerializeToString()
        producer = Producer(returnqueue, host=returnip)
        producer.put(msg_bytes)

    def _check_queue(self):
        ret = False
        msg_bytes = self.consumer.get()
        if msg_bytes:
            ret = True
            a_msg = claude_msgs_pb2.Msg()
            a_msg.ParseFromString(msg_bytes)
            (msg, msg_type_name) = form_message(a_msg)
            # Dispatch on the numeric protobuf message type.
            if a_msg.type == 3:
                self._msg_handler_add_resource(msg)
            elif a_msg.type == 4:
                self._msg_handler_add_job(msg)
            elif a_msg.type == 6:
                self._msg_handler_req_apps_to_monitor(msg)
            elif a_msg.type == 8:
                self._msg_handler_finished_apps(msg)
            elif a_msg.type == 9:
                self._msg_handler_kill_job(msg)
        return ret

    def _msg_handler_kill_job(self, msg):
        jid = msg.jid
        try:
            job = self.job_manager.get_job_by_jid(jid)
            if (job.state == job_states.NEW) or (job.state == job_states.RUNNING):
                if job.state == job_states.RUNNING:
                    resource = self.res_manager.get_resource_by_rid(job.rid)
                    self._kill_job(job, resource)
                add_timestamp(job, timestamps.KILLED)
                self.job_manager.save()
                logger.info('Job "%s" was killed' % jid)
            else:
                logger.warning('Cannot kill job "%s" in state %s' % (jid, job.state))
        except CLAUDEJobDoesNotExist:
            pass
        except CLAUDEResourceDoesNotExist:
            logger.error('Something is terribly wrong with the system...', exc_info=True)

    def _msg_handler_finished_apps(self, msg):
        for jid in msg.jids:
            try:
                job = self.job_manager.get_job_by_jid(jid)
                resource = self.res_manager.get_resource_by_rid(job.rid)
                if job.state == job_states.RUNNING:
                    try:
                        self._retrieve_output(job, resource)
                        self._free_job(job, resource)
                        self.state.set(AppExecutorSM.ASSIGNING)
                    except CLAUDEResourceError:
                        logger.warning('Problem with resource %s' % resource.rid)
                    add_timestamp(job, timestamps.TERMINATED)
                    job.state = job_states.TERMINATED
                    self.job_manager.save()
                    logger.info('Job "%s" has terminated on resource "%s"' % (jid, job.rid))
            except (CLAUDEResourceDoesNotExist, CLAUDEJobDoesNotExist):
                logger.error('Something is terribly wrong with the system...', exc_info=True)

    def _msg_handler_req_apps_to_monitor(self, msg):
        running_jobs = self.job_manager.get_jobs_in_state(job_states.RUNNING)
        if len(running_jobs) > 0:
            a_msg = claude_msgs_pb2.Msg()
            a_msg.type = 7
            (msg, msg_type_name) = form_message(a_msg)
            for running_job in running_jobs:
                try:
                    resource = self.res_manager.get_resource_by_rid(running_job.rid)
                    job = msg.jobs.add()
                    job.jid = running_job.jid
                    job.host = resource.host
                    job.credentials.username = resource.credentials.username
                    job.credentials.password = resource.credentials.password
                    job.credentials.proxy_host = resource.credentials.proxy_host
                    job.credentials.proxy_username = resource.credentials.proxy_username
                    job.pid = running_job.pid
                except CLAUDEResourceDoesNotExist:
                    logger.error('Something is terribly wrong with the system...', exc_info=True)
            msg_bytes = a_msg.SerializeToString()
            self.producer_app_monitor.put(msg_bytes)

    def _msg_handler_add_resource(self, msg):
        #TODO: state should be set somewhere else
        state = resource_states.RUNNING
        tresources = HardwareResources(msg.tresources.ram, msg.tresources.cpu, msg.tresources.disk)
        credentials = TransportCredentials(msg.credentials.username, msg.credentials.password,
                                           msg.credentials.proxy_host, msg.credentials.proxy_username)
        params = {}
        for param in msg.params:
            params[param.key] = param.value
        resource = self.res_manager.add_resource(msg.rid, msg.rtype, msg.host, credentials, state,
                                                 tresources, msg.rootdir, params)
        if resource:
            self.selector.update_hardware_resources()
            self.ssh_manager.add_resource(msg.host,
                                          TransportCredentials(resource.credentials.username,
                                                               resource.credentials.password,
                                                               proxy_host=resource.credentials.proxy_host,
                                                               proxy_username=resource.credentials.proxy_username))
            self.state.set(AppExecutorSM.ASSIGNING)

    def _msg_handler_add_job(self, msg):
        working_dir = str(uuid.uuid4())
        state = job_states.NEW
        rresources = HardwareResources(msg.rresources.ram, msg.rresources.cpu, msg.rresources.disk)
        # Optional message fields: fall back to None when the message does not provide them.
        try:
            s3cfg = msg.s3cfg
        except AttributeError:
            s3cfg = None
        try:
            keepworkingdir = msg.keepworkingdir
        except AttributeError:
            keepworkingdir = None
        try:
            chainedjob = msg.chainedjob
        except AttributeError:
            chainedjob = None
        try:
            appoutput = msg.appoutput
        except AttributeError:
            appoutput = None
        params = {}
        for param in msg.params:
            params[param.key] = param.value
        chainedjobparams = {}
        for param in msg.chainedjobparams:
            chainedjobparams[param.key] = param.value
        if self.job_manager.add_job(msg.jid, working_dir, state, rresources, msg.script,
                                    msg.returnip, msg.returnqueue, s3cfg, keepworkingdir,
                                    chainedjob, appoutput, params, chainedjobparams):
            self.state.set(AppExecutorSM.ASSIGNING)
        else:
            self._msg_create_job_exists(msg.jid, msg.returnqueue, msg.returnip)

    def _msg_create_ask_me(self):
        a_msg = claude_msgs_pb2.Msg()
        a_msg.type = 2
        (msg, msg_type_name) = form_message(a_msg)
        msg_bytes = a_msg.SerializeToString()
        self.producer_app_monitor.put(msg_bytes)

    def _kill_job(self, job, resource):
        host = resource.host
        try:
            connection = self.ssh_manager.get_connection(host)
            connection.kill_by_pid(job.pid)
            logger.debug('Process "%s" was killed on host "%s"' % (job.pid, host))
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % host, exc_info=True)
            raise CLAUDEResourceError

    def _retrieve_output(self, job, resource):
        # Copy the system and duration output files from the remote working dir
        # and store their contents on the job.
        try:
            host = resource.host
            connection = self.ssh_manager.get_connection(host)
            with TemporaryDirectory() as tmp_dir:
                local_output_fpath = os.path.join(tmp_dir, 'local_system.tmp')
                remote_output_fpath = os.path.join(os.path.join(resource.rootdir, job.workingdir),
                                                   self.remote_system_output_fname)
                connection.get(remote_output_fpath, local_output_fpath)
                with open(local_output_fpath) as f:
                    job.systemoutput = f.read()
                local_output_fpath = os.path.join(tmp_dir, 'local_duration.tmp')
                remote_output_fpath = os.path.join(os.path.join(resource.rootdir, job.workingdir),
                                                   self.remote_duration_output_fname)
                connection.get(remote_output_fpath, local_output_fpath)
                with open(local_output_fpath) as f:
                    job.durationoutput = f.read()
            self.job_manager.save()
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDEFileContentRetrievingTimeout:
            logger.error('Error retrieving output of job %s from host %s' % (job.jid, host), exc_info=True)

    def _free_job(self, job, resource):
        try:
            host = resource.host
            connection = self.ssh_manager.get_connection(host)
            if not job.keepworkingdir:
                working_dir = os.path.join(resource.rootdir, job.workingdir)
                connection.rmdir(working_dir)
                logger.debug('Working dir "%s" was removed from host "%s"' % (working_dir, host))
        except CLAUDEConnectingError:
            logger.error('Error connecting to host %s' % host, exc_info=True)
            raise CLAUDEResourceError
        except CLAUDENotConnectedError:
            logger.error('No connection to host %s' % host, exc_info=True)
            raise CLAUDEResourceError

    def _submit_job(self, job, resource):
        # Stage the job script, common script, optional s3cfg and job config into
        # a fresh remote working dir, launch the script, and record its PID.
        host = resource.host
        try:
            connection = self.ssh_manager.get_connection(host)
            working_dir = os.path.join(resource.rootdir, job.workingdir)
            if connection.exists(working_dir):
                logger.warning('Working dir "%s" exists on host "%s", removing it first' % (working_dir, host))
                connection.rmdir(working_dir)
            connection.mkdir(working_dir)
            if job.s3cfg:
                local_s3cfg_fpath = self.config.get(self.config_files_section_name, 's3cfg')
                remote_s3cfg_fpath = os.path.join(working_dir, self.remote_s3cfg_fname)
                connection.put(local_s3cfg_fpath, remote_s3cfg_fpath)
            else:
                logger.warning('s3cfg is not in use')
            job_script_name = job.script
            local_script_fpath = self.config.get(self.config_files_section_name, job_script_name)
            script_fname = os.path.basename(local_script_fpath)
            remote_script_fpath = os.path.join(working_dir, script_fname)
            connection.put(local_script_fpath, remote_script_fpath)
            local_common_script_fname = self.config.get(self.config_files_section_name, 'common')
            common_script_fname = os.path.basename(local_common_script_fname)
            remote_common_script_fname = os.path.join(working_dir, common_script_fname)
            connection.put(local_common_script_fname, remote_common_script_fname)
            if job.params or job.chainedjob:
                job_config = self._create_job_config(job)
                remote_job_config_fpath = os.path.join(working_dir, self.remote_config_fname)
                with TemporaryDirectory() as tmp_dir:
                    local_job_config_fpath = os.path.join(tmp_dir, self.remote_config_fname)
                    with open(local_job_config_fpath, 'w') as job_config_file:
                        job_config.write(job_config_file)
                    connection.put(local_job_config_fpath, remote_job_config_fpath)
            connection.launch_app('python %s %s' % (remote_script_fpath, working_dir))
            pid = connection.retrieve_pid(os.path.join(working_dir, self.remote_pid_fname))
            add_timestamp(job, timestamps.SUBMITTED)
            job.state = job_states.RUNNING
            job.rid = resource.rid
            job.pid = pid
            self.job_manager.save()
            logger.info('Job "%s" was submitted to resource "%s"' % (job.jid, resource.rid))
        except (CLAUDENotConnectedError, CLAUDEConnectingError):
            raise CLAUDEResourceError
        except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, IOError,
                CLAUDEFileContentRetrievingTimeout, ValueError, CLAUDEIOError, OSError):
            self._free_job(job, resource)
            raise CLAUDEJobFailed

    def _create_job_config(self, job):
        job_config = ConfigParser.RawConfigParser()
        if job.params:
            job_section = 'job'
            job_config.add_section(job_section)
            for param in job.params:
                job_config.set(job_section, param.key, param.value)
        if job.chainedjob:
            chained_job_section = 'chainedjob'
            job_config.add_section(chained_job_section)
            job_config.set(chained_job_section, 'claude_service_ip', get_interface_ip())
            job_config.set(chained_job_section, 'claude_service_queue',
                           self.config.get(self.queues_section_name, 'app_executor_queue'))
            if job.chainedjobparams:
                for param in job.chainedjobparams:
                    job_config.set(chained_job_section, param.key, param.value)
        return job_config
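
# Usage sketch (not part of the original module): how an AppExecutor might be
# wired up from a service entry point. The config file name, section name and
# all option values below are illustrative assumptions; the only requirement,
# per the get() calls above, is that the main section provides resources_fpath
# and jobs_fpath, [queues] provides app_executor_queue and app_monitor_queue,
# and [files] provides s3cfg, common, and one option per job script name.
#
#     [app_executor]
#     resources_fpath = /path/to/resources
#     jobs_fpath = /path/to/jobs
#
#     [queues]
#     app_executor_queue = app_executor
#     app_monitor_queue = app_monitor
#
#     [files]
#     s3cfg = /path/to/s3cfg
#     common = /path/to/common_script
#
#     import ConfigParser
#     config = ConfigParser.RawConfigParser()
#     config.read('claude.cfg')                       # hypothetical config path
#     executor = AppExecutor(config, 'app_executor')  # hypothetical section name
#     executor.start()                                # runs until KeyboardInterrupt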