def __mt_bulk_submit__(self, node_jdls):
    '''submitting jobs in multiple threads'''

    job = self.getJobObject()

    logger.warning('submitting %d subjobs ... it may take a while' % len(node_jdls))

    # the algorithm for submitting a single bulk job
    class MyAlgorithm(Algorithm):

        def __init__(self, cred_req, masterInputWorkspace, ce, delid):
            Algorithm.__init__(self)
            self.inpw = masterInputWorkspace
            self.cred_req = cred_req
            self.ce = ce
            self.delid = delid

        def process(self, jdl_info):
            my_sj_id = jdl_info[0]
            my_sj_jdl = jdl_info[1]

            my_sj_jid = Grid.cream_submit(my_sj_jdl, self.ce, self.delid, self.cred_req)

            if not my_sj_jid:
                return False
            else:
                self.__appendResult__(my_sj_id, my_sj_jid)
                return True

    mt_data = []
    for id, jdl in node_jdls.items():
        mt_data.append((id, jdl))

    myAlg = MyAlgorithm(cred_req=self.credential_requirements,
                        masterInputWorkspace=job.getInputWorkspace(),
                        ce=self.CE,
                        delid=self.delegation_id)
    myData = Data(collection=mt_data)

    runner = MTRunner(name='cream_jsubmit', algorithm=myAlg,
                      data=myData, numThread=config['SubmissionThread'])
    runner.start()
    runner.join(timeout=-1)

    if len(runner.getDoneList()) < len(mt_data):
        # not all bulk jobs are successfully submitted. canceling the
        # submitted jobs on WMS immediately
        logger.error('some bulk jobs not successfully (re)submitted, canceling submitted jobs on WMS')
        Grid.cancel_multiple(runner.getResults().values())
        return None
    else:
        return runner.getResults()
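
# A minimal, self-contained sketch of the Algorithm/Data/MTRunner pattern used by
# __mt_bulk_submit__ above, so the control flow is easy to follow in isolation.
# The import path and the keyword arguments (name, algorithm, data, numThread) are
# assumed from the snippet above and from the usual Ganga layout; the "work" done
# here (squaring numbers) is purely illustrative.
from Ganga.Core.GangaThread.MTRunner import Algorithm, Data, MTRunner


class SquareAlgorithm(Algorithm):
    '''Toy algorithm: each data item is an (id, value) pair; the result is value**2.'''

    def process(self, item):
        item_id, value = item
        # __appendResult__ collects per-item results, later exposed via getResults()
        self.__appendResult__(item_id, value * value)
        return True


def run_square_example():
    collection = [(i, i) for i in range(10)]
    runner = MTRunner(name='square_example',
                      algorithm=SquareAlgorithm(),
                      data=Data(collection=collection),
                      numThread=3)
    runner.start()
    runner.join(timeout=-1)
    # getDoneList() holds the ids of successfully processed items and
    # getResults() the {id: result} mapping, mirroring the bulk-submit code above
    return runner.getResults()
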
def master_bulk_kill(self):
    '''ARC bulk kill: cancel the individually (re)submitted subjobs'''

    job = self.getJobObject()

    # killing the individually re-submitted subjobs
    logger.debug('cancelling running/submitted subjobs.')

    # 1. collect job ids
    ids = []
    for sj in job.subjobs:
        if sj.status in ['submitted', 'running'] and sj.backend.id:
            ids.append(sj.backend.id)

    # 2. cancel the collected jobs
    ck = Grid.arc_cancel_multiple(ids, self.credential_requirements)

    if not ck:
        logger.warning('Job cancellation failed')
        return False
    else:
        for sj in job.subjobs:
            if sj.backend.id in ids:
                sj.updateStatus('killed')
        return True
def __setup_sandboxcache__(self, job):
    '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

    re_token = re.compile('^token:(.*):(.*)$')

    self.sandboxcache.timeout = config['SandboxTransferTimeout']

    if self.sandboxcache._name == 'LCGSandboxCache':
        if not self.sandboxcache.lfc_host:
            self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

        if not self.sandboxcache.se:
            token = ''
            se_host = config['DefaultSE']
            m = re_token.match(se_host)
            if m:
                token = m.group(1)
                se_host = m.group(2)

            self.sandboxcache.se = se_host

            if token:
                self.sandboxcache.srm_token = token

        if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token):
            self.sandboxcache.srm_token = config['DefaultSRMToken']

    return True
def process(self, jdl_info):
    my_sj_id = jdl_info[0]
    my_sj_jdl = jdl_info[1]

    my_sj_jid = Grid.arc_submit(my_sj_jdl, self.ce, self.arcverbose, self.cred_req)

    if not my_sj_jid:
        return False
    else:
        self.__appendResult__(my_sj_id, my_sj_jid)
        return True
def kill(self):
    '''Kill the job'''

    job = self.getJobObject()

    logger.info('Killing job %s' % job.getFQID('.'))

    if not self.id:
        logger.warning('Job %s is not running.' % job.getFQID('.'))
        return False

    return Grid.cream_cancelMultiple([self.id])
def kill(self):
    '''Kill the job'''

    job = self.getJobObject()

    logger.info('Killing job %s' % job.getFQID('.'))

    if not self.id:
        logger.warning('Job %s is not running.' % job.getFQID('.'))
        return False

    return Grid.arc_cancel([self.id], self.credential_requirements)
def process(self, jdl_info):
    my_sj_id = jdl_info[0]
    my_sj_jdl = jdl_info[1]

    my_sj_jid = Grid.cream_submit(my_sj_jdl, self.ce, self.delid)

    if not my_sj_jid:
        return False
    else:
        self.__appendResult__(my_sj_id, my_sj_jid)
        return True
def __setup_sandboxcache__(self, job):
    '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

    re_token = re.compile('^token:(.*):(.*)$')

    self.sandboxcache.vo = config['VirtualOrganisation']
    self.sandboxcache.timeout = config['SandboxTransferTimeout']

    if self.sandboxcache._name == 'LCGSandboxCache':
        if not self.sandboxcache.lfc_host:
            self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

        if not self.sandboxcache.se:
            token = ''
            se_host = config['DefaultSE']
            m = re_token.match(se_host)
            if m:
                token = m.group(1)
                se_host = m.group(2)

            self.sandboxcache.se = se_host

            if token:
                self.sandboxcache.srm_token = token

        if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token):
            self.sandboxcache.srm_token = config['DefaultSRMToken']

    elif self.sandboxcache._name == 'DQ2SandboxCache':

        # generate a new dataset name if not given
        if not self.sandboxcache.dataset_name:
            from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2outputdatasetname
            self.sandboxcache.dataset_name, unused = dq2outputdatasetname(
                "%s.input" % get_uuid(), 0, False, '')

        # subjobs inherit the dataset name from the master job
        for sj in job.subjobs:
            sj.backend.sandboxcache.dataset_name = self.sandboxcache.dataset_name

    elif self.sandboxcache._name == 'GridftpSandboxCache':
        if config['CreamInputSandboxBaseURI']:
            self.sandboxcache.baseURI = config['CreamInputSandboxBaseURI']
        elif self.CE:
            ce_host = re.sub(r'\:[0-9]+', '', self.CE.split('/cream')[0])
            self.sandboxcache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (
                ce_host, self.sandboxcache.vo)
        else:
            logger.error('baseURI not available for GridftpSandboxCache')
            return False

    return True
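
# Worked example of the DefaultSE parsing above: a value of the form
# 'token:<srm_token>:<se_host>' is split by the regex into the space token and the
# SE hostname, while a plain hostname is used as-is.  The sample values below are
# hypothetical and only illustrate the two accepted shapes of the setting.
import re

re_token = re.compile('^token:(.*):(.*)$')

for se_cfg in ['token:MYSCRATCHDISK:srm.example.org', 'plain-se.example.org']:
    m = re_token.match(se_cfg)
    if m:
        srm_token, se_host = m.group(1), m.group(2)
    else:
        srm_token, se_host = '', se_cfg
    print('se=%s token=%s' % (se_host, srm_token))
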
def test_submit_no_proxy(mocker):
    """
    Test that the lack of a proxy object causes the submit to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=False)

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert job_url is None
def submit(self, subjobconfig, master_job_sandbox):
    '''Submit the job to the grid'''
    ick = False

    jdlpath = self.preparejob(subjobconfig, master_job_sandbox)

    if jdlpath:
        self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id)

        if self.id:
            self.actualCE = self.CE
            ick = True

    return ick
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value=' ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1
    assert job_url is None
def submit(self, subjobconfig, master_job_sandbox):
    '''Submit the job to the grid'''
    ick = False

    xrslpath = self.preparejob(subjobconfig, master_job_sandbox)

    if xrslpath:
        self.id = Grid.arc_submit(xrslpath, self.CE, self.verbose, self.credential_requirements)

        if self.id:
            self.actualCE = self.CE
            ick = True

    return ick
def test_submit_expired_proxy(mocker):
    """
    Test that an invalid proxy object causes the submit to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = False

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert job_url is None
def test_submit(mocker):
    """
    Test that a job submit succeeds with valid input
    """
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value=' ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'https://example.com:9000/some_url', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path', cred_req=FakeCred())

    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1
    assert '/some/path' in cmd1.call_args[0][0], 'JDL path was not passed correctly'
    assert job_url == 'https://example.com:9000/some_url'
def process(self, item):
    """
    downloads output of one LCG job
    """

    pps_check = (True, None)

    job = item.jobObj

    # it is very likely that the job's downloading task has been
    # created and assigned in a previous monitoring loop;
    # ignore such cases
    if job.status in ['completing', 'completed', 'failed']:
        return True

    # it can also happen that the job was killed/removed by the user between
    # the downloading task being queued and being picked up by one of the
    # downloading threads; ignore such cases too
    if job.status in ['removed', 'killed']:
        return True

    job.updateStatus('completing')

    outw = job.getOutputWorkspace()

    pps_check = Grid.get_output(job.backend.id, outw.getPath(), job.backend.credential_requirements)

    if pps_check[0]:
        job.updateStatus('completed')
        job.backend.exitcode = 0
    else:
        job.updateStatus('failed')
        # update the backend's reason if the failure was detected in
        # Ganga's post-processing step (pps)
        if pps_check[1] != 0:
            job.backend.reason = 'non-zero app. exit code: %s' % pps_check[1]
            job.backend.exitcode = pps_check[1]

    # needs to update the master job's status to give an up-to-date status
    # of the whole job
    if job.master:
        job.master.updateMasterJobStatus()

    self.__appendResult__(job.getFQID('.'), True)

    return True
def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
    '''Submit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    # finding ARC CE endpoint for job submission
    #allowed_celist = []
    # try:
    #    allowed_celist = self.requirements.getce()
    #    if not self.CE and allowed_celist:
    #        self.CE = allowed_celist[0]
    # except:
    #    logger.warning('ARC CE assigment from ARCRequirements failed.')

    # if self.CE and allowed_celist:
    #    if self.CE not in allowed_celist:
    #        logger.warning('submission to CE not allowed: %s, use %s instead' % ( self.CE, allowed_celist[0] ) )
    #        self.CE = allowed_celist[0]

    # use arc info to check for any endpoints recorded in the config file
    rc, output = Grid.arc_info()

    if not self.CE and rc != 0:
        raise GangaException(
            "ARC CE endpoint not set and no default settings in '%s'. " % config['ArcConfigFile'])
    elif self.CE:
        logger.info('ARC CE endpoint set to: ' + str(self.CE))
    else:
        logger.info("Using ARC CE endpoints defined in '%s'" % config['ArcConfigFile'])

    # doing massive job preparation
    if len(job.subjobs) == 0:
        ick = IBackend.master_submit(self, rjobs, subjobconfigs, masterjobconfig)
    else:
        ick = self.master_bulk_submit(rjobs, subjobconfigs, masterjobconfig)

    profiler.check('==> master_submit() elapsed time')

    return ick
def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
    '''Submit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    # finding CREAM CE endpoint for job submission
    allowed_celist = []
    try:
        allowed_celist = self.requirements.getce()
        if not self.CE and allowed_celist:
            self.CE = allowed_celist[0]
    except:
        logger.warning('CREAM CE assignment from AtlasCREAMRequirements failed.')

    if self.CE and allowed_celist:
        if self.CE not in allowed_celist:
            logger.warning('submission to CE not allowed: %s, use %s instead' % (self.CE, allowed_celist[0]))
            self.CE = allowed_celist[0]

    if not self.CE:
        raise GangaException('CREAM CE endpoint not set')

    # delegate proxy to CREAM CE
    self.delegation_id = Grid.cream_proxy_delegation(
        self.CE, self.delegation_id, self.credential_requirements)
    if not self.delegation_id:
        logger.warning('proxy delegation to %s failed' % self.CE)

    # doing massive job preparation
    if len(job.subjobs) == 0:
        ick = IBackend.master_submit(self, rjobs, subjobconfigs, masterjobconfig)
    else:
        ick = self.master_bulk_submit(rjobs, subjobconfigs, masterjobconfig)

    profiler.check('==> master_submit() elapsed time')

    return ick
def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
    '''Submit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    # finding ARC CE endpoint for job submission
    #allowed_celist = []
    # try:
    #    allowed_celist = self.requirements.getce()
    #    if not self.CE and allowed_celist:
    #        self.CE = allowed_celist[0]
    # except:
    #    logger.warning('ARC CE assigment from ARCRequirements failed.')

    # if self.CE and allowed_celist:
    #    if self.CE not in allowed_celist:
    #        logger.warning('submission to CE not allowed: %s, use %s instead' % ( self.CE, allowed_celist[0] ) )
    #        self.CE = allowed_celist[0]

    # use arc info to check for any endpoints recorded in the config file
    rc, output = Grid.arc_info(self.credential_requirements)

    if not self.CE and rc != 0:
        raise GangaException(
            "ARC CE endpoint not set and no default settings in '%s'. " % config['ArcConfigFile'])
    elif self.CE:
        logger.info('ARC CE endpoint set to: ' + str(self.CE))
    else:
        logger.info("Using ARC CE endpoints defined in '%s'" % config['ArcConfigFile'])

    # doing massive job preparation
    if len(job.subjobs) == 0:
        ick = IBackend.master_submit(self, rjobs, subjobconfigs, masterjobconfig)
    else:
        ick = self.master_bulk_submit(rjobs, subjobconfigs, masterjobconfig)

    profiler.check('==> master_submit() elapsed time')

    return ick
def resubmit(self):
    '''Resubmit the job'''
    ick = False

    job = self.getJobObject()

    jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

    if jdlpath:
        self.id = Grid.arc_submit(jdlpath, self.CE, self.verbose, self.credential_requirements)

        if self.id:
            # refresh the lcg job information
            self.__refresh_jobinfo__(job)
            self.actualCE = self.CE
            ick = True

    return ick
def master_submit(self, rjobs, subjobconfigs, masterjobconfig):
    '''Submit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    # finding CREAM CE endpoint for job submission
    allowed_celist = []
    try:
        allowed_celist = self.requirements.getce()
        if not self.CE and allowed_celist:
            self.CE = allowed_celist[0]
    except:
        logger.warning('CREAM CE assignment from AtlasCREAMRequirements failed.')

    if self.CE and allowed_celist:
        if self.CE not in allowed_celist:
            logger.warning('submission to CE not allowed: %s, use %s instead' % (self.CE, allowed_celist[0]))
            self.CE = allowed_celist[0]

    if not self.CE:
        raise GangaException('CREAM CE endpoint not set')

    # delegate proxy to CREAM CE
    self.delegation_id = Grid.cream_proxy_delegation(self.CE, self.delegation_id)
    if not self.delegation_id:
        logger.warning('proxy delegation to %s failed' % self.CE)

    # doing massive job preparation
    if len(job.subjobs) == 0:
        ick = IBackend.master_submit(self, rjobs, subjobconfigs, masterjobconfig)
    else:
        ick = self.master_bulk_submit(rjobs, subjobconfigs, masterjobconfig)

    profiler.check('==> master_submit() elapsed time')

    return ick
def resubmit(self):
    '''Resubmit the job'''
    ick = False

    job = self.getJobObject()

    jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

    if jdlpath:
        self.id = Grid.arc_submit(jdlpath, self.CE, self.verbose)

        if self.id:
            # refresh the lcg job information
            self.__refresh_jobinfo__(job)
            self.actualCE = self.CE
            ick = True

    return ick
def resubmit(self):
    '''Resubmit the job'''
    ick = False

    job = self.getJobObject()

    jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

    if jdlpath:
        self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id)

        if self.id:
            # refresh the lcg job information
            self.__refresh_jobinfo__(job)
            self.actualCE = self.CE
            ick = True

    return ick
def __setup_sandboxcache__(self, job):
    '''Sets up the sandbox cache object to adopt the runtime configuration of the LCG backend'''

    re_token = re.compile('^token:(.*):(.*)$')

    self.sandboxcache.vo = config['VirtualOrganisation']
    self.sandboxcache.timeout = config['SandboxTransferTimeout']

    if self.sandboxcache._name == 'LCGSandboxCache':
        if not self.sandboxcache.lfc_host:
            self.sandboxcache.lfc_host = Grid.__get_lfc_host__()

        if not self.sandboxcache.se:
            token = ''
            se_host = config['DefaultSE']
            m = re_token.match(se_host)
            if m:
                token = m.group(1)
                se_host = m.group(2)

            self.sandboxcache.se = se_host

            if token:
                self.sandboxcache.srm_token = token

        if (self.sandboxcache.se_type in ['srmv2']) and (not self.sandboxcache.srm_token):
            self.sandboxcache.srm_token = config['DefaultSRMToken']

    elif self.sandboxcache._name == 'GridftpSandboxCache':
        if config['CreamInputSandboxBaseURI']:
            self.sandboxcache.baseURI = config['CreamInputSandboxBaseURI']
        elif self.CE:
            ce_host = re.sub(r'\:[0-9]+', '', self.CE.split('/cream')[0])
            self.sandboxcache.baseURI = 'gsiftp://%s/opt/glite/var/cream_sandbox/%s' % (
                ce_host, self.sandboxcache.vo)
        else:
            logger.error('baseURI not available for GridftpSandboxCache')
            return False

    return True
def resubmit(self):
    '''Resubmit the job'''
    ick = False

    job = self.getJobObject()

    jdlpath = job.getInputWorkspace().getPath("__jdlfile__")

    if jdlpath:
        self.id = Grid.cream_submit(jdlpath, self.CE, self.delegation_id, self.credential_requirements)

        if self.id:
            # refresh the lcg job information
            self.__refresh_jobinfo__(job)
            self.actualCE = self.CE
            ick = True

    return ick
def test_submit_bad_output(mocker):
    """
    Test that the external command returning bad data causes the job to fail
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = True
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value=' ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'some bad output', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1
    assert job_url is None
def test_submit(mocker):
    """
    Test that a job submit succeeds with valid input
    """
    check_proxy = mocker.patch('Ganga.Lib.LCG.Grid.check_proxy', return_value=True)
    credential = mocker.patch('Ganga.Lib.LCG.Grid.credential', return_value=mocker.MagicMock())
    credential.return_value.isValid.return_value = True
    __set_submit_option__ = mocker.patch('Ganga.Lib.LCG.Grid.__set_submit_option__', return_value=' ')
    mocker.patch('Ganga.Lib.LCG.Grid.getShell', return_value=Shell)
    cmd1 = mocker.patch('Ganga.Utility.GridShell.Shell.cmd1', return_value=(0, 'https://example.com:9000/some_url', False))

    from Ganga.Lib.LCG import Grid
    job_url = Grid.submit('/some/path')

    assert check_proxy.call_count == 1
    assert credential.call_count == 1
    assert __set_submit_option__.call_count == 1
    assert cmd1.call_count == 1
    assert '/some/path' in cmd1.call_args[0][0], 'JDL path was not passed correctly'
    assert job_url == 'https://example.com:9000/some_url'
def master_resubmit(self, rjobs):
    '''Resubmit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    ick = False

    # delegate proxy to CREAM CE
    self.delegation_id = Grid.cream_proxy_delegation(
        self.CE, self.delegation_id, self.credential_requirements)
    if not self.delegation_id:
        logger.warning('proxy delegation to %s failed' % self.CE)

    if not job.master and len(job.subjobs) == 0:
        # case 1: master job normal resubmission
        logger.debug('rjobs: %s' % str(rjobs))
        logger.debug('mode: master job normal resubmission')
        ick = IBackend.master_resubmit(self, rjobs)

    elif job.master:
        # case 2: individual subjob resubmission
        logger.debug('mode: individual subjob resubmission')
        ick = IBackend.master_resubmit(self, rjobs)

    else:
        # case 3: master job bulk resubmission
        logger.debug('mode: master job resubmission')
        ick = self.master_bulk_resubmit(rjobs)

    if not ick:
        raise GangaException('CREAM bulk submission failure')

    profiler.check('job re-submission elapsed time')

    return ick
def master_resubmit(self, rjobs):
    '''Resubmit the master job to the grid'''

    profiler = ElapsedTimeProfiler(getLogger(name='Profile.LCG'))
    profiler.start()

    job = self.getJobObject()

    ick = False

    # delegate proxy to CREAM CE
    self.delegation_id = Grid.cream_proxy_delegation(self.CE, self.delegation_id)
    if not self.delegation_id:
        logger.warning('proxy delegation to %s failed' % self.CE)

    if not job.master and len(job.subjobs) == 0:
        # case 1: master job normal resubmission
        logger.debug('rjobs: %s' % str(rjobs))
        logger.debug('mode: master job normal resubmission')
        ick = IBackend.master_resubmit(self, rjobs)

    elif job.master:
        # case 2: individual subjob resubmission
        logger.debug('mode: individual subjob resubmission')
        ick = IBackend.master_resubmit(self, rjobs)

    else:
        # case 3: master job bulk resubmission
        logger.debug('mode: master job resubmission')
        ick = self.master_bulk_resubmit(rjobs)

    if not ick:
        raise GangaException('CREAM bulk submission failure')

    profiler.check('job re-submission elapsed time')

    return ick
def preparejob(self, jobconfig, master_job_sandbox):
    '''Prepare the JDL'''

    script = self.__jobWrapperTemplate__()

    job = self.getJobObject()
    inpw = job.getInputWorkspace()

    wrapperlog = '__jobscript__.log'

    import Ganga.Core.Sandbox as Sandbox

    # FIXME: check what happens if 'stdout','stderr' are specified here
    script = script.replace('###OUTPUTSANDBOX###', repr(jobconfig.outputbox))
    script = script.replace('###APPLICATION_NAME###', getName(job.application))
    script = script.replace('###APPLICATIONEXEC###', repr(jobconfig.getExeString()))
    script = script.replace('###APPLICATIONARGS###', repr(jobconfig.getArguments()))

    from Ganga.GPIDev.Lib.File.OutputFileManager import getWNCodeForOutputPostprocessing, getWNCodeForDownloadingInputFiles

    script = script.replace('###OUTPUTUPLOADSPOSTPROCESSING###', getWNCodeForOutputPostprocessing(job, ' '))
    script = script.replace('###DOWNLOADINPUTFILES###', getWNCodeForDownloadingInputFiles(job, ' '))

    if jobconfig.env:
        script = script.replace('###APPLICATIONENVS###', repr(jobconfig.env))
    else:
        script = script.replace('###APPLICATIONENVS###', repr({}))

    script = script.replace('###WRAPPERLOG###', repr(wrapperlog))

    import inspect
    script = script.replace('###INLINEMODULES###', inspect.getsource(Sandbox.WNSandbox))

    mon = job.getMonitoringService()

    self.monInfo = None

    # set the monitoring file by default to the stdout
    if isinstance(self.monInfo, dict):
        self.monInfo['remotefile'] = 'stdout'

    # try to print out the monitoring service information in debug mode
    try:
        logger.debug('job info of monitoring service: %s' % str(self.monInfo))
    except:
        pass

    # prepare input/output sandboxes
    import Ganga.Utility.files
    from Ganga.GPIDev.Lib.File import File
    from Ganga.Core.Sandbox.WNSandbox import PYTHON_DIR

    fileutils = File(inspect.getsourcefile(Ganga.Utility.files), subdir=PYTHON_DIR)

    packed_files = jobconfig.getSandboxFiles() + [fileutils]
    sandbox_files = job.createPackedInputSandbox(packed_files)

    # sandbox of child jobs should include master's sandbox
    sandbox_files.extend(master_job_sandbox)

    # check the input file size and pre-upload larger inputs to the iocache
    lfc_host = ''

    input_sandbox_uris = []
    input_sandbox_names = []

    ick = True

    max_prestaged_fsize = 0
    for f in sandbox_files:

        idx = self.__check_and_prestage_inputfile__(f)

        if not idx:
            logger.error('input sandbox preparation failed: %s' % f)
            ick = False
            break
        else:
            if idx['lfc_host']:
                lfc_host = idx['lfc_host']

            if idx['remote']:
                abspath = os.path.abspath(f)
                fsize = os.path.getsize(abspath)

                if fsize > max_prestaged_fsize:
                    max_prestaged_fsize = fsize

                input_sandbox_uris.append(idx['remote'][os.path.basename(f)])
                input_sandbox_names.append(os.path.basename(urlparse(f)[2]))

            if idx['local']:
                input_sandbox_uris += idx['local']
                input_sandbox_names.append(os.path.basename(f))

    if not ick:
        logger.error('stop job submission')
        return None

    # determine the lcg-cp timeout according to the max_prestaged_fsize
    # - using the assumption of 1 MB/sec.
    transfer_timeout = config['SandboxTransferTimeout']
    predict_timeout = int(math.ceil(max_prestaged_fsize / 1000000.0))

    if predict_timeout > transfer_timeout:
        transfer_timeout = predict_timeout

    if transfer_timeout < 60:
        transfer_timeout = 60

    script = script.replace('###TRANSFERTIMEOUT###', '%d' % transfer_timeout)

    # update the job wrapper with the inputsandbox list
    script = script.replace('###INPUTSANDBOX###', repr({'remote': {}, 'local': input_sandbox_names}))

    # write out the job wrapper and put job wrapper into job's inputsandbox
    scriptPath = inpw.writefile(
        FileBuffer('__jobscript_%s__' % job.getFQID('.'), script), executable=1)

    input_sandbox = input_sandbox_uris + [scriptPath]

    for isb in input_sandbox:
        logger.debug('ISB URI: %s' % isb)

    # compose output sandbox to include by default the following files:
    # - gzipped stdout (transferred only when the JobLogHandler is WMS)
    # - gzipped stderr (transferred only when the JobLogHandler is WMS)
    # - __jobscript__.log (job wrapper's log)
    output_sandbox = [wrapperlog]

    from Ganga.GPIDev.Lib.File.OutputFileManager import getOutputSandboxPatterns
    for outputSandboxPattern in getOutputSandboxPatterns(job):
        output_sandbox.append(outputSandboxPattern)

    if config['JobLogHandler'] in ['WMS']:
        output_sandbox += ['stdout.gz', 'stderr.gz']

    if len(jobconfig.outputbox):
        output_sandbox += [Sandbox.OUTPUT_TARBALL_NAME]

    # compose ARC XRSL
    xrsl = {
        #'VirtualOrganisation' : config['VirtualOrganisation'],
        'executable': os.path.basename(scriptPath),
        'environment': {'GANGA_LCG_VO': config['VirtualOrganisation'],
                        'GANGA_LOG_HANDLER': config['JobLogHandler'],
                        'LFC_HOST': lfc_host},
        #'stdout' : 'stdout',
        #'stderr' : 'stderr',
        'inputFiles': input_sandbox,
        'outputFiles': output_sandbox,
        #'OutputSandboxBaseDestURI': 'gsiftp://localhost'
    }

    xrsl['environment'].update({'GANGA_LCG_CE': self.CE})

    #xrsl['Requirements'] = self.requirements.merge(jobconfig.requirements).convert()

    # if self.jobtype.upper() in ['NORMAL','MPICH']:
    #    xrsl['JobType'] = self.jobtype.upper()
    #    if self.jobtype.upper() == 'MPICH':
    #        xrsl['Requirements'].append('(other.GlueCEInfoTotalCPUs >= NodeNumber)')
    #        xrsl['Requirements'].append('Member("MPICH",other.GlueHostApplicationSoftwareRunTimeEnvironment)')
    #        xrsl['NodeNumber'] = self.requirements.nodenumber
    # else:
    #    logger.warning('JobType "%s" not supported' % self.jobtype)
    #    return

    # additional settings from the job
    if jobconfig.env:
        xrsl['environment'].update(jobconfig.env)

    xrslText = Grid.expandxrsl(xrsl)

    # append any additional requirements from the requirements object
    xrslText += '\n'.join(self.requirements.other)

    logger.debug('subjob XRSL: %s' % xrslText)
    return inpw.writefile(FileBuffer('__xrslfile__', xrslText))
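
# Quick check of the transfer-timeout heuristic used in preparejob() above
# (predicted transfer time at ~1 MB/s, never less than the configured timeout and
# never below a 60 s floor).  The helper name and the sample sizes are ours, for
# illustration only; they are not part of the backend code.
import math

def sketch_transfer_timeout(max_prestaged_fsize, configured_timeout):
    predicted = int(math.ceil(max_prestaged_fsize / 1000000.0))
    timeout = max(configured_timeout, predicted)
    return max(timeout, 60)

assert sketch_transfer_timeout(0, 300) == 300           # nothing prestaged: keep the configured value
assert sketch_transfer_timeout(250 * 10**6, 60) == 250  # 250 MB at ~1 MB/s dominates
assert sketch_transfer_timeout(10 * 10**6, 5) == 60     # never below the 60 s floor
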
def __check_and_prestage_inputfile__(self, file):
    '''Checks the given input file size and, if it is over "BoundSandboxLimit",
       pre-stages it to a grid SE.

       The argument is the path of a local file.

       It returns a dictionary containing information to refer to the file:

           idx = {'lfc_host': lfc_host,
                  'local': [the local file paths],
                  'remote': {'fname1': 'remote index1', 'fname2': 'remote index2', ...}
                 }

       If prestaging fails, None is returned.

       If the file has been previously uploaded (according to md5sum),
       the prestaging is skipped and the index of the previously uploaded
       file is returned.
    '''

    idx = {'lfc_host': '', 'local': [], 'remote': {}}

    job = self.getJobObject()

    # read-in the previously uploaded files
    uploadedFiles = []

    # getting the uploaded file list from the master job
    if job.master:
        uploadedFiles += job.master.backend.sandboxcache.get_cached_files()

    # set and get the $LFC_HOST for uploading oversized sandbox
    self.__setup_sandboxcache__(job)

    uploadedFiles += self.sandboxcache.get_cached_files()

    lfc_host = None

    # for LCGSandboxCache, take the one specified in the sandboxcache object.
    # the value is exactly the same as the one from the local grid shell env.
    # if it is not specified explicitly.
    if self.sandboxcache._name == 'LCGSandboxCache':
        lfc_host = self.sandboxcache.lfc_host

    # or in general, query it from the Grid object
    if not lfc_host:
        lfc_host = Grid.__get_lfc_host__()

    idx['lfc_host'] = lfc_host

    abspath = os.path.abspath(file)
    fsize = os.path.getsize(abspath)

    if fsize > config['BoundSandboxLimit']:

        md5sum = get_md5sum(abspath, ignoreGzipTimestamp=True)

        doUpload = True
        for uf in uploadedFiles:
            if uf.md5sum == md5sum:
                # the same file has been uploaded to the iocache
                idx['remote'][os.path.basename(file)] = uf.id
                doUpload = False
                break

        if doUpload:

            logger.warning(
                'The size of %s is larger than the sandbox limit (%d byte). Please wait while pre-staging ...' % (file, config['BoundSandboxLimit']))

            if self.sandboxcache.upload([abspath]):
                remote_sandbox = self.sandboxcache.get_cached_files()[-1]
                idx['remote'][remote_sandbox.name] = remote_sandbox.id
            else:
                logger.error('Oversized sandbox not successfully pre-staged')
                return None
    else:
        idx['local'].append(abspath)

    return idx
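
# Example of the index dictionary returned by __check_and_prestage_inputfile__
# for a job with one small local file and one oversized, pre-staged file.  All
# paths, hostnames and identifiers below are made up purely for illustration.
idx_example = {
    'lfc_host': 'lfc.example.org',
    'local': ['/home/user/gangadir/input/small_steering.py'],
    'remote': {'big_input.tar.gz': 'guid:0a1b2c3d-0000-1111-2222-333344445555'},
}
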
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal jobs'''

    import datetime

    backenddict = {}
    jobdict = {}
    for j in jobs:
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            backenddict[j.backend.actualCE] = j

    if len(jobdict.keys()) == 0:
        return

    jobInfoDict = Grid.arc_status(jobdict.keys(), backenddict.keys())
    jidListForPurge = []

    # update job information for those available in jobInfoDict
    for id, info in jobInfoDict.items():

        if info:

            job = jobdict[id]

            if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                job.backend.actualCE = urlparse(id)[1].split(":")[0]

            if job.backend.status != info['State']:

                doStatusUpdate = True

                # no need to update Ganga job status if backend status is
                # not changed
                if info['State'] == job.backend.status:
                    doStatusUpdate = False

                # download output sandboxes if final status is reached
                elif info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:

                    # grab output sandbox
                    if Grid.arc_get_output(job.backend.id, job.getOutputWorkspace(create=True).getPath()):
                        (ick, app_exitcode) = Grid.__get_app_exitcode__(
                            job.getOutputWorkspace(create=True).getPath())
                        job.backend.exitcode = app_exitcode

                        jidListForPurge.append(job.backend.id)

                    else:
                        logger.error('failed to download job output: %s' % jobdict[id].getFQID('.'))

                if doStatusUpdate:
                    job.backend.status = info['State']
                    if 'Exit Code' in info:
                        try:
                            job.backend.exitcode_arc = int(info['Exit Code'])
                        except:
                            job.backend.exitcode_arc = 1

                    if 'Job Error' in info:
                        try:
                            job.backend.reason = info['Job Error']
                        except:
                            pass

                    job.backend.updateGangaJobStatus()
        else:
            logger.warning('failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

    # purge the jobs whose output has been fetched locally
    if jidListForPurge:
        if not Grid.arc_purgeMultiple(jidListForPurge):
            logger.warning("Failed to purge all ARC jobs.")
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal jobs'''

    jobdict = dict([[job.backend.id, job] for job in jobs if job.backend.id])

    jobInfoDict = Grid.cream_status(jobdict.keys())
    jidListForPurge = []

    # update job information for those available in jobInfoDict
    for id, info in jobInfoDict.items():
        if info:

            job = jobdict[id]

            if job.backend.status != info['Current Status'] and ('ExitCode' not in info or ('ExitCode' in info and info['ExitCode'].isdigit())):

                if 'Worker Node' in info:
                    job.backend.workernode = info['Worker Node']

                if 'CREAM ISB URI' in info:
                    job.backend.isbURI = info['CREAM ISB URI']

                if 'CREAM OSB URI' in info:
                    job.backend.osbURI = info['CREAM OSB URI']

                doStatusUpdate = True

                # no need to update Ganga job status if backend status is
                # not changed
                if info['Current Status'] == job.backend.status:
                    doStatusUpdate = False

                # download output sandboxes if final status is reached
                elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                    # resolve output sandbox URIs based on the JDL
                    # information
                    osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                    logger.debug('OSB list:')
                    for f in osbURIList:
                        logger.debug(f)

                    if osbURIList:

                        if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath()):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('failed to download job output: %s' % jobdict[id].getFQID('.'))

                if doStatusUpdate:
                    job.backend.status = info['Current Status']
                    if 'ExitCode' in info and info['ExitCode'] != "W":
                        try:
                            job.backend.exitcode_cream = int(info['ExitCode'])
                        except:
                            job.backend.exitcode_cream = 1

                    if 'FailureReason' in info:
                        try:
                            job.backend.reason = info['FailureReason']
                        except:
                            pass

                    job.backend.updateGangaJobStatus()
        else:
            logger.warning('failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

    # purge the jobs whose output has been fetched locally
    if jidListForPurge:
        Grid.cream_purgeMultiple(jidListForPurge)
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal jobs'''

    import datetime

    ce_list = []  # type: List[str]
    jobdict = {}  # type: Mapping[str, Job]
    for j in jobs:
        if j.backend.id and ((datetime.datetime.utcnow() - j.time.timestamps["submitted"]).seconds > config["ArcWaitTimeBeforeStartingMonitoring"]):
            jobdict[j.backend.id] = j
            ce_list.append(j.backend.actualCE)

    if len(jobdict.keys()) == 0:
        return

    # Group jobs by the backend's credential requirements
    cred_to_backend_id_list = defaultdict(list)  # type: Mapping[ICredentialRequirement, List[str]]
    for jid, job in jobdict.items():
        cred_to_backend_id_list[job.backend.credential_requirements].append(jid)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
        info = Grid.arc_status(job_ids, ce_list, cred_req)
        jobInfoDict.update(info)

    jidListForPurge = []

    # update job information for those available in jobInfoDict
    for id, info in jobInfoDict.items():

        if info:

            job = jobdict[id]

            if job.backend.actualCE != urlparse(id)[1].split(":")[0]:
                job.backend.actualCE = urlparse(id)[1].split(":")[0]

            if job.backend.status != info['State']:

                doStatusUpdate = True

                # no need to update Ganga job status if backend status is
                # not changed
                if info['State'] == job.backend.status:
                    doStatusUpdate = False

                # download output sandboxes if final status is reached
                elif info['State'] in ['Finished', '(FINISHED)', 'Finished (FINISHED)']:

                    # grab output sandbox
                    if Grid.arc_get_output(job.backend.id, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                        (ick, app_exitcode) = Grid.__get_app_exitcode__(
                            job.getOutputWorkspace(create=True).getPath())
                        job.backend.exitcode = app_exitcode

                        jidListForPurge.append(job.backend.id)

                    else:
                        logger.error('failed to download job output: %s' % jobdict[id].getFQID('.'))

                if doStatusUpdate:
                    job.backend.status = info['State']
                    if 'Exit Code' in info:
                        try:
                            job.backend.exitcode_arc = int(info['Exit Code'])
                        except:
                            job.backend.exitcode_arc = 1

                    if 'Job Error' in info:
                        try:
                            job.backend.reason = info['Job Error']
                        except:
                            pass

                    job.backend.updateGangaJobStatus()
        else:
            logger.warning('failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

    # purge the jobs whose output has been fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            if not Grid.arc_purge_multiple(set(job_ids) & set(jidListForPurge), cred_req):
                logger.warning("Failed to purge all ARC jobs.")
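
# Minimal sketch of the "group backend ids by credential requirement" step used in
# the monitoring loops above, with plain tuples instead of Ganga job objects.  The
# job ids and requirement labels are hypothetical; only the defaultdict grouping
# and the per-batch iteration mirror the real code.
from collections import defaultdict

fake_jobs = [('https://ce1.example.org/123', 'voms:/atlas'),
             ('https://ce1.example.org/124', 'voms:/atlas'),
             ('https://ce2.example.org/777', 'voms:/cms')]

cred_to_backend_id_list = defaultdict(list)
for backend_id, cred_req in fake_jobs:
    cred_to_backend_id_list[cred_req].append(backend_id)

# each batch of ids can then be queried with the credential that owns it
for cred_req, job_ids in cred_to_backend_id_list.items():
    print('%s -> %s' % (cred_req, job_ids))
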
def updateMonitoringInformation(jobs):
    '''Monitoring loop for normal jobs'''

    jobdict = dict([(job.backend.id, job) for job in jobs if job.backend.id])

    # Group jobs by the backend's credential requirements
    cred_to_backend_id_list = defaultdict(list)
    for job in jobs:
        cred_to_backend_id_list[job.backend.credential_requirements].append(job.backend.id)

    # Batch the status requests by credential requirement
    jobInfoDict = {}
    for cred_req, job_ids in cred_to_backend_id_list.items():
        # If the credential is not valid or doesn't exist then skip it
        cred = credential_store.get(cred_req)
        if not cred or not cred.is_valid():
            needed_credentials.add(cred_req)
            continue
        # Create a ``Grid`` for each credential requirement and request the relevant jobs through it
        info = Grid.cream_status(job_ids, cred_req)
        jobInfoDict.update(info)

    jidListForPurge = []

    # update job information for those available in jobInfoDict
    for id, info in jobInfoDict.items():
        if info:

            job = jobdict[id]

            if job.backend.status != info['Current Status'] and ('ExitCode' not in info or ('ExitCode' in info and info['ExitCode'].isdigit())):

                if 'Worker Node' in info:
                    job.backend.workernode = info['Worker Node']

                if 'CREAM ISB URI' in info:
                    job.backend.isbURI = info['CREAM ISB URI']

                if 'CREAM OSB URI' in info:
                    job.backend.osbURI = info['CREAM OSB URI']

                doStatusUpdate = True

                # no need to update Ganga job status if backend status is
                # not changed
                if info['Current Status'] == job.backend.status:
                    doStatusUpdate = False

                # download output sandboxes if final status is reached
                elif info['Current Status'] in ['DONE-OK', 'DONE-FAILED']:

                    # resolve output sandbox URIs based on the JDL
                    # information
                    osbURIList = __cream_resolveOSBList__(job, info['JDL'])

                    logger.debug('OSB list:')
                    for f in osbURIList:
                        logger.debug(f)

                    if osbURIList:

                        if Grid.cream_get_output(osbURIList, job.getOutputWorkspace(create=True).getPath(), job.backend.credential_requirements):
                            (ick, app_exitcode) = Grid.__get_app_exitcode__(
                                job.getOutputWorkspace(create=True).getPath())
                            job.backend.exitcode = app_exitcode

                            jidListForPurge.append(job.backend.id)

                        else:
                            logger.error('failed to download job output: %s' % jobdict[id].getFQID('.'))

                if doStatusUpdate:
                    job.backend.status = info['Current Status']
                    if 'ExitCode' in info and info['ExitCode'] != "W":
                        try:
                            job.backend.exitcode_cream = int(info['ExitCode'])
                        except:
                            job.backend.exitcode_cream = 1

                    if 'FailureReason' in info:
                        try:
                            job.backend.reason = info['FailureReason']
                        except:
                            pass

                    job.backend.updateGangaJobStatus()
        else:
            logger.warning('failed to retrieve job information: %s' % jobdict[id].getFQID('.'))

    # purge the jobs whose output has been fetched locally
    if jidListForPurge:
        for cred_req, job_ids in cred_to_backend_id_list.items():
            Grid.cream_purge_multiple(set(job_ids) & set(jidListForPurge), cred_req)