def _submit_job(self, job, resource):
     """Stage a job's files on the resource's host and start it there.

     The remote working directory (``resource.rootdir`` joined with
     ``job.workingdir``) is recreated from scratch, the optional s3cfg
     file, the job script, the common script and (when the job has params
     or a chained job) a generated job config are copied into it, and the
     job script is launched with the working directory as its argument.
     On success the job is marked RUNNING, bound to the resource and the
     retrieved pid, and the job manager is saved.

     Raises:
         CLAUDEResourceError: the connection to the host is not available.
         CLAUDEJobFailed: a config lookup, file operation or pid retrieval
             failed; the job is released through ``_free_job`` first.
     """
     host = resource.host

     def configured_path(option):
         # Local path stored under ``option`` in the config-files section.
         return self.config.get(self.config_files_section_name, option)

     try:
         conn = self.ssh_manager.get_connection(host)

         remote_dir = os.path.join(resource.rootdir, job.workingdir)

         def upload_keeping_name(local_path):
             # Copy ``local_path`` into the remote working dir under its own base name.
             remote_path = os.path.join(remote_dir, os.path.basename(local_path))
             conn.put(local_path, remote_path)
             return remote_path

         # Start from an empty working directory.
         if conn.exists(remote_dir):
             logger.warning('Working dir "%s" exists on host "%s", removing it first' % (remote_dir, host))
             conn.rmdir(remote_dir)
         conn.mkdir(remote_dir)

         if not job.s3cfg:
             logger.warning('s3cfg is not in use')
         else:
             s3cfg_source = configured_path('s3cfg')
             s3cfg_target = os.path.join(remote_dir, self.remote_s3cfg_fname)
             conn.put(s3cfg_source, s3cfg_target)

         # ``job.script`` names the config option holding the script's local path.
         remote_script = upload_keeping_name(configured_path(job.script))
         upload_keeping_name(configured_path('common'))

         if job.params or job.chainedjob:
             job_config = self._create_job_config(job)
             remote_config = os.path.join(remote_dir, self.remote_config_fname)

             # The config is written to a throw-away local directory and uploaded from there.
             with TemporaryDirectory() as scratch_dir:
                 staged_config = os.path.join(scratch_dir, self.remote_config_fname)

                 with open(staged_config, 'w') as handle:
                     job_config.write(handle)

                 conn.put(staged_config, remote_config)

         conn.launch_app('python %s %s' % (remote_script, remote_dir))
         pid_file = os.path.join(remote_dir, self.remote_pid_fname)
         pid = conn.retrieve_pid(pid_file)

         add_timestamp(job, timestamps.SUBMITTED)
         job.state = job_states.RUNNING
         job.rid = resource.rid
         job.pid = pid
         self.job_manager.save()

         logger.info('Job "%s" was submitted to resource "%s"' % (job.jid, resource.rid))
     except (CLAUDENotConnectedError, CLAUDEConnectingError):
         # Connection-level trouble: report the resource, not the job, as the problem.
         raise CLAUDEResourceError
     except (ConfigParser.NoSectionError, ConfigParser.NoOptionError, IOError, CLAUDEFileContentRetrievingTimeout, ValueError, CLAUDEIOError, OSError):
         self._free_job(job, resource)
         raise CLAUDEJobFailed
 def _msg_handler_kill_job(self, msg):
     """Handle a request to kill the job identified by ``msg.jid``.

     A RUNNING job is first killed on its resource through ``_kill_job``;
     a NEW job has nothing to kill remotely. In both cases a KILLED
     timestamp is recorded and the job manager is saved. A job in any
     other state is left untouched and a warning is logged.

     A request for an unknown job id is logged as a warning and otherwise
     ignored; a running job that refers to an unknown resource is logged
     as an error.
     """
     jid = msg.jid

     try:
         job = self.job_manager.get_job_by_jid(jid)

         if job.state in (job_states.NEW, job_states.RUNNING):
             if job.state == job_states.RUNNING:
                 resource = self.res_manager.get_resource_by_rid(job.rid)
                 self._kill_job(job, resource)

             # NOTE(review): only the timestamp is recorded here; job.state is
             # not updated in this method -- confirm a killed NEW job cannot
             # still be picked up for assignment afterwards.
             add_timestamp(job, timestamps.KILLED)
             self.job_manager.save()
             logger.info('Job "%s" was killed' % jid)
         else:
             logger.warning('Cannot kill job "%s" in state %s' % (jid, job.state))
     except CLAUDEJobDoesNotExist:
         # Previously swallowed silently; leave a trace so unmatched kill
         # requests can be diagnosed, but keep treating them as harmless.
         logger.warning('Cannot kill job "%s": it does not exist' % jid)
     except CLAUDEResourceDoesNotExist:
         logger.error('Something is terribly wrong with the system...', exc_info=True)
 def _msg_handler_finished_apps(self, msg):
     """Mark every still-running job listed in ``msg.jids`` as TERMINATED.

     For each job id the job and its resource are looked up. Jobs that
     are not RUNNING are skipped. For a RUNNING job the output is
     retrieved, the job is released via ``_free_job`` and the executor
     state is set to ASSIGNING; a ``CLAUDEResourceError`` during those
     steps is only logged. The job is then timestamped, set to TERMINATED
     and the job manager is saved.

     An unknown job or resource is logged as an error and processing
     continues with the next job id.
     """
     for jid in msg.jids:
         try:
             job = self.job_manager.get_job_by_jid(jid)
             resource = self.res_manager.get_resource_by_rid(job.rid)

             if job.state != job_states.RUNNING:
                 continue

             try:
                 self._retrieve_output(job, resource)
                 self._free_job(job, resource)
                 self.state.set(AppExecutorSM.ASSIGNING)
             except CLAUDEResourceError:
                 # The job is still marked TERMINATED below even if cleanup failed.
                 logger.warning('Problem with resource %s' % resource.rid)

             add_timestamp(job, timestamps.TERMINATED)
             job.state = job_states.TERMINATED
             self.job_manager.save()

             logger.info('Job "%s" has terminated on resource "%s"' % (jid, job.rid))
         except (CLAUDEResourceDoesNotExist, CLAUDEJobDoesNotExist):
             logger.error('Something is terribly wrong with the system...', exc_info=True)
 # Example #4
 # 0
 def assign_jobs(self):
     """Try to place every NEW job on a hardware resource with enough capacity.

     The executor state is set to WAITING up front. For each NEW job the
     tracked hardware resources are scanned; the first one whose remaining
     ram, cpu and disk cover the job's requirements and whose ``rtype`` is
     0 is handed to ``submit_job_callback``. On success the job's
     requirements are subtracted from that resource's remaining capacity
     and the state becomes ASSIGNED. A ``CLAUDEResourceError`` moves on to
     the next resource; a ``CLAUDEJobFailed`` marks the job FAILED and
     stops the search for that job.

     Resource usage is printed before and after the pass when there is at
     least one NEW job.
     """
     self.ae_state.set(AppExecutorSM.WAITING)

     new_jobs = self.job_manager.get_jobs_in_state(job_states.NEW)
     if new_jobs:
         self._print_resources_usage()

     for job in new_jobs:
         for rid, capacity in self.hardware_resources.iteritems():
             demand = job.rresources
             fits = (demand.ram <= capacity.ram
                     and demand.cpu <= capacity.cpu
                     and demand.disk <= capacity.disk)
             if not fits:
                 continue

             try:
                 resource = self.res_manager.get_resource_by_rid(rid)

                 if resource.rtype != 0:
                     continue

                 try:
                     self.submit_job_callback(job, resource)

                     # Reserve the capacity the job now occupies.
                     capacity.ram -= job.rresources.ram
                     capacity.cpu -= job.rresources.cpu
                     capacity.disk -= job.rresources.disk

                     self.ae_state.set(AppExecutorSM.ASSIGNED)
                     break
                 except CLAUDEResourceError:
                     # Fall through to the next candidate resource.
                     logger.warning('Selecting  a different resource')
                 except CLAUDEJobFailed:
                     add_timestamp(job, timestamps.FAILED)
                     job.state = job_states.FAILED
                     self.job_manager.save()

                     logger.warning('Failed to submit job "%s"' % job.jid)
                     break
             except CLAUDEResourceDoesNotExist:
                 logger.error('Something is terribly wrong with the system...', exc_info=True)

     if new_jobs:
         self._print_resources_usage()
 def _process_finished_jobs(self):
     """Report finished jobs and then drop every job that has been reported.

     TERMINATED jobs, followed by FAILED jobs, are each passed to
     ``_msg_create_job_finished``, timestamped, moved to the SENT state
     and saved. Afterwards all jobs in the SENT state are deleted from the
     job manager.
     """
     for finished_state in (job_states.TERMINATED, job_states.FAILED):
         for job in self.job_manager.get_jobs_in_state(finished_state):
             self._msg_create_job_finished(job)

             add_timestamp(job, timestamps.SENT)
             job.state = job_states.SENT
             self.job_manager.save()

     # Collect the SENT jobs (keyed by job id) before deleting any of them.
     sent_jobs = self.job_manager.get_jobs_in_state(job_states.SENT)
     doomed = {job.jid: job for job in sent_jobs}

     for job in doomed.values():
         self.job_manager.delete(job)