def cancelJobs(self, ids):
    if not len(ids):
        raise StopIteration

    activity = utils.ActivityLog('cancelling jobs')
    proc = utils.LoggedProcess(self.cancelExec, self.getCancelArguments(self._getRawIDs(ids)))
    if proc.wait() != 0:
        for line in proc.getError().splitlines():
            if self.unknownID() not in line:
                utils.eprint(line.strip())
    del activity

    activity = utils.ActivityLog('waiting for jobs to finish')
    time.sleep(5)
    for wmsId, jobNum in ids:
        path = self._getSandbox(wmsId)
        if path is None:
            utils.eprint('Sandbox for job %d with wmsId "%s" could not be found' % (jobNum, wmsId))
            continue
        try:
            shutil.rmtree(path)
        except Exception:
            raise BackendError('Sandbox for job %d with wmsId "%s" could not be deleted' % (jobNum, wmsId))
        yield (jobNum, wmsId)
    del activity
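# A minimal, self-contained sketch of the cancel-then-cleanup pattern above,
# with stand-ins for the WMS machinery (all names below are hypothetical; the
# real method uses LoggedProcess, _getSandbox and shutil.rmtree). It shows the
# generator contract: only jobs whose sandbox could be removed are yielded.
import os
import shutil
import tempfile

def _cleanup_sandboxes(id_pairs, sandbox_lookup):
    """Yield (jobNum, wmsId) for every job whose sandbox could be removed."""
    for wmsId, jobNum in id_pairs:
        path = sandbox_lookup.get(wmsId)
        if path is None:
            continue  # sandbox already gone - nothing to clean up
        shutil.rmtree(path)
        yield (jobNum, wmsId)

# Example: one existing sandbox, one missing one
box = tempfile.mkdtemp()
pairs = [('WMSID.1', 0), ('WMSID.2', 1)]
assert list(_cleanup_sandboxes(pairs, {'WMSID.1': box})) == [(0, 'WMSID.1')]
assert not os.path.exists(box)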
def checkJobs(self, ids):
    if not len(ids):
        raise StopIteration

    activity = utils.ActivityLog('checking job status')
    proc = utils.LoggedProcess(self.statusExec, self.getCheckArguments(self._getRawIDs(ids)))

    tmp = {}
    for data in self.parseStatus(proc.iter()):
        wmsId = self._createId(data['id'])
        tmp[wmsId] = (wmsId, self.parseJobState(data['status']), data)

    for wmsId, jobNum in ids:
        if wmsId not in tmp:
            yield (jobNum, wmsId, Job.DONE, {})
        else:
            yield tuple([jobNum] + list(tmp[wmsId]))

    retCode = proc.wait()
    del activity
    if retCode != 0:
        for line in proc.getError().splitlines():
            if self.unknownID() not in line:
                utils.eprint(line)
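# Self-contained sketch of the status merge above: every queried id that the
# status tool still reports keeps its parsed state; ids that are no longer
# reported are assumed to have finished. 'DONE' stands in for Job.DONE here
# and the sample ids are made up:
reported = {'WMSID.A': ('WMSID.A', 'RUNNING', {'queue': 'short'})}
queried = [('WMSID.A', 0), ('WMSID.B', 1)]

merged = []
for wmsId, jobNum in queried:
    if wmsId not in reported:
        merged.append((jobNum, wmsId, 'DONE', {}))
    else:
        merged.append(tuple([jobNum] + list(reported[wmsId])))

assert merged == [(0, 'WMSID.A', 'RUNNING', {'queue': 'short'}), (1, 'WMSID.B', 'DONE', {})]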
def checkJobs(self, ids):
    if len(ids) == 0:
        raise StopIteration

    jobNumMap = dict(ids)
    jobs = ' '.join(self._getRawIDs(ids))
    log = tempfile.mktemp('.log')

    activity = utils.ActivityLog('checking job status')
    proc = utils.LoggedProcess(self._statusExec, '--level 0 --logfile "%s" %s' % (log, jobs))
    for jobOutput in proc.getOutput().split('******')[1:]:
        data = {}
        for statusRegexLevel0 in self._statusRegexLevel0:
            match = re.match(statusRegexLevel0, jobOutput.replace('\n', ' '))
            if match:
                data = match.groupdict()
                break
        if 'rawId' not in data:  # no status regex matched this output block
            utils.eprint('Unable to parse job status output:\n%s' % jobOutput)
            continue
        data['id'] = self._createId(data['rawId'])
        yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data.get('status', 'DONE-FAILED')], data)

    retCode = proc.wait()
    del activity
    if retCode != 0:
        if self.explainError(proc, retCode):
            pass
        else:
            proc.logError(self.errorLog, log = log, jobs = jobs)
    utils.removeFiles([log])
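# Self-contained sketch of the level-0 output parsing above. The real
# patterns live in self._statusRegexLevel0; the simplified pattern and the
# sample text below are assumptions modelled on glite-wms-job-status output:
import re

sample = '****** Status info for the Job : https://lb.example.org:9000/abc123\nCurrent Status: Done (Success)\n'
pattern = r'.*Status info for the Job : (?P<rawId>\S+).*Current Status:\s+(?P<status>.+?)\s*$'
match = re.match(pattern, sample.replace('\n', ' '))
assert match is not None
assert match.groupdict()['rawId'] == 'https://lb.example.org:9000/abc123'
assert match.groupdict()['status'] == 'Done (Success)'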
def _submitJob(self, jobNum, module):
    activity = utils.ActivityLog('submitting jobs')

    try:
        sandbox = tempfile.mkdtemp('', '%s.%04d.' % (module.taskID, jobNum), self.sandPath)
    except Exception:
        # Note: 'sandbox' is unset when mkdtemp fails, so report the base path
        raise BackendError('Unable to create sandbox directory in "%s"!' % self.sandPath)
    sbPrefix = sandbox.replace(self.sandPath, '').lstrip('/')

    def translateTarget(d, s, t):
        return (d, s, os.path.join(sbPrefix, t))
    self.smSBIn.doTransfer(ismap(translateTarget, self._getSandboxFilesIn(module)))

    self._writeJobConfig(os.path.join(sandbox, '_jobconfig.sh'), jobNum, module, {
        'GC_SANDBOX': sandbox,
        'GC_SCRATCH_SEARCH': str.join(' ', self.scratchPath)})

    reqs = self.brokerSite.brokerAdd(module.getRequirements(jobNum), WMS.SITES)
    reqs = dict(self.brokerQueue.brokerAdd(reqs, WMS.QUEUES))
    # Local jobs need higher (more realistic) memory requirements
    if (self.memory > 0) and (reqs.get(WMS.MEMORY, 0) < self.memory):
        reqs[WMS.MEMORY] = self.memory

    (stdout, stderr) = (os.path.join(sandbox, 'gc.stdout'), os.path.join(sandbox, 'gc.stderr'))
    jobName = module.getDescription(jobNum).jobName
    proc = utils.LoggedProcess(self.submitExec, '%s %s "%s" %s' % (self.submitOpts,
        self.getSubmitArguments(jobNum, jobName, reqs, sandbox, stdout, stderr),
        utils.pathShare('gc-local.sh'), self.getJobArguments(jobNum, sandbox)))
    retCode = proc.wait()
    wmsIdText = proc.getOutput().strip().strip('\n')
    try:
        wmsId = self.parseSubmitOutput(wmsIdText)
    except Exception:
        wmsId = None
    del activity

    if retCode != 0:
        self._log.warning('%s failed:', self.submitExec)
    elif wmsId is None:
        self._log.warning('%s did not yield job id:\n%s', self.submitExec, wmsIdText)
    if wmsId:
        wmsId = self._createId(wmsId)
        open(os.path.join(sandbox, wmsId), 'w').close()  # touch the job id marker file
    else:
        proc.logError(self.errorLog)
    return (jobNum, utils.QM(wmsId, wmsId, None), {'sandbox': sandbox})
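# Sketch of what a parseSubmitOutput implementation used by the method above
# might look like. The regex and the sample line are assumptions (each
# backend parses the output of its own submit tool):
import re

def parse_submit_output(text):
    """Extract the batch system job id from a submit tool's output."""
    match = re.search(r'(\d+)', text)
    if match is None:
        raise ValueError('Unexpected submit output: %r' % text)
    return match.group(1)

assert parse_submit_output('Your job 2349286 ("gc-job") has been submitted') == '2349286'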
def cancelJobs(self, allIds):
    if len(allIds) == 0:
        raise StopIteration

    waitFlag = False
    # Cancel jobs in chunks of self._nJobsPerChunk - with 5 seconds between chunks
    for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
        if waitFlag and not utils.wait(5):
            break
        waitFlag = True

        jobNumMap = dict(ids)
        jobs = ' '.join(self._getRawIDs(ids))
        log = tempfile.mktemp('.log')

        activity = utils.ActivityLog('cancelling jobs')
        proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" %s' % (log, jobs))
        retCode = proc.wait()
        del activity

        # select cancelled jobs
        for rawId in self._getRawIDs(ids):
            deletedWMSId = self._createId(rawId)
            yield (jobNumMap.get(deletedWMSId), deletedWMSId)

        if retCode != 0:
            if self.explainError(proc, retCode):
                pass
            else:
                proc.logError(self.errorLog, log = log)

        purgeLog = tempfile.mktemp('.log')
        purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, jobs))
        retCode = purgeProc.wait()
        if retCode != 0:
            if self.explainError(purgeProc, retCode):
                pass
            else:
                purgeProc.logError(self.errorLog, log = purgeLog, jobs = jobs)
        utils.removeFiles([log, purgeLog])
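# Self-contained sketch of the chunking idiom used above, written with plain
# Python 3 builtins instead of the imap/irange compatibility wrappers (the
# chunk size 3 is an arbitrary example; the method uses self._nJobsPerChunk):
def chunks(seq, size):
    """Yield successive slices of seq with at most size elements each."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

assert list(chunks(list('abcdefg'), 3)) == [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]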
def _getJobsOutput(self, allIds):
    if len(allIds) == 0:
        raise StopIteration

    basePath = os.path.join(self._outputPath, 'tmp')
    try:
        if len(allIds) == 1:
            # For single jobs create single subdir
            basePath = os.path.join(basePath, md5(allIds[0][0]).hexdigest())
        utils.ensureDirExists(basePath)
    except Exception:
        raise BackendError('Temporary path "%s" could not be created.' % basePath, BackendError)

    activity = utils.ActivityLog('retrieving job outputs')
    # Track unretrieved / retrieved jobs across all chunks
    todo = []
    done = []
    for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
        jobNumMap = dict(ids)
        jobs = ' '.join(self._getRawIDs(ids))
        todo.extend(jobNumMap.values())
        log = tempfile.mktemp('.log')
        proc = utils.LoggedProcess(self._outputExec, '--noint --logfile "%s" --dir "%s" %s' % (log, basePath, jobs))

        # yield output dirs
        currentJobNum = None
        for line in imap(str.strip, proc.iter()):
            match = re.match(self._outputRegex, line)
            if match:
                currentJobNum = jobNumMap.get(self._createId(match.groupdict()['rawId']))
                todo.remove(currentJobNum)
                done.append(match.groupdict()['rawId'])
                outputDir = match.groupdict()['outputDir']
                if os.path.exists(outputDir):
                    if 'GC_WC.tar.gz' in os.listdir(outputDir):
                        # Unpack wildcard output tarball, then remove it
                        wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                        try:
                            tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
                            os.unlink(wildcardTar)
                        except Exception:
                            utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
                yield (currentJobNum, outputDir)
                currentJobNum = None
        retCode = proc.wait()

        if retCode != 0:
            if 'Keyboard interrupt raised by user' in proc.getError():
                utils.removeFiles([log, basePath])
                raise StopIteration
            else:
                proc.logError(self.errorLog, log = log)
            utils.eprint('Trying to recover from error ...')
            for dirName in os.listdir(basePath):
                yield (None, os.path.join(basePath, dirName))
    del activity

    # return unretrievable jobs
    for jobNum in todo:
        yield (jobNum, None)

    purgeLog = tempfile.mktemp('.log')
    purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, ' '.join(done)))
    retCode = purgeProc.wait()
    if retCode != 0:
        if self.explainError(purgeProc, retCode):
            pass
        else:
            purgeProc.logError(self.errorLog, log = purgeLog, jobs = done)
    utils.removeFiles([log, purgeLog, basePath])
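# Self-contained sketch of the GC_WC.tar.gz handling above: pack a file into
# a wildcard tarball, then unpack it into the output directory and delete the
# tarball, mirroring the extract-then-unlink step (the paths are temporary
# and made up for the demonstration):
import os
import tarfile
import tempfile

output_dir = tempfile.mkdtemp()
payload = os.path.join(output_dir, 'result.txt')
with open(payload, 'w') as handle:
    handle.write('42\n')

wildcard_tar = os.path.join(output_dir, 'GC_WC.tar.gz')
archive = tarfile.open(wildcard_tar, 'w:gz')
archive.add(payload, arcname='result.txt')
archive.close()
os.unlink(payload)  # simulate output that only exists inside the tarball

# This mirrors the method's unpacking step:
tarfile.TarFile.open(wildcard_tar, 'r:gz').extractall(output_dir)
os.unlink(wildcard_tar)
assert os.path.exists(payload)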