def _saveStateToTar(self, tar, meta, source, sourceLen, message):
	# Write the splitting info grouped into subtarfiles
	activity = utils.ActivityLog(message)
	(jobNum, lastValid, subTar) = (-1, -1, None)
	for jobNum, entry in enumerate(source):
		if not entry.get(DataSplitter.Invalid, False):
			lastValid = jobNum
		if jobNum % self._keySize == 0:
			self._closeSubTar(tar, subTar)
			subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / self._keySize))
			activity.finish()
			activity = utils.ActivityLog('%s [%d / %d]' % (message, jobNum, sourceLen))
		# Determine shortest way to store file list
		tmp = entry.pop(DataSplitter.FileList)
		savelist = self._getReducedFileList(entry, tmp)  # can modify entry
		# Write files with infos / filelist
		data = str.join('', self._fmt.format(entry, fkt = self._formatFileEntry) +
			lmap(lambda fn: '=%s\n' % fn, savelist))
		self._addToSubTar(subTar, '%05d' % jobNum, data)
		# Remove common prefix from info
		if DataSplitter.CommonPrefix in entry:
			entry.pop(DataSplitter.CommonPrefix)
		entry[DataSplitter.FileList] = tmp
	self._closeSubTar(tar, subTar)
	activity.finish()
	# Write metadata to allow reconstruction of data splitter
	meta['MaxJobs'] = lastValid + 1
	for (fn, data) in [('Metadata', self._fmt.format(meta)), ('Version', '2')]:
		self._addToTar(tar, fn, data)

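# The method above follows the ActivityLog progress idiom that recurs throughout
# this section: create an indicator, periodically replace it with an updated
# message, and finish() it once done. Below is a minimal sketch of that idiom,
# assuming only the interface visible in these methods (a constructor taking a
# message string, and finish()); the helper name, the refresh interval, and the
# requirement that `items` be a sequence are illustrative assumptions, not part
# of the original code.
def _iterWithProgress(items, message, interval = 100):
	activity = utils.ActivityLog(message)
	for idx, item in enumerate(items):
		if idx % interval == 0:  # refresh the indicator every `interval` items
			activity.finish()
			activity = utils.ActivityLog('%s [%d / %d]' % (message, idx, len(items)))
		yield item
	activity.finish()  # always retire the indicator once iteration ends
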
def getWMS(self):
	log = utils.ActivityLog('Discovering available WMS services')
	wms_best_list = []
	for wms in self.listWMS_good():
		log = utils.ActivityLog('Discovering available WMS services - pinging %s' % wms)
		if wms is None:
			continue
		ping, pingtime = self.pingDict.get(wms, (None, 0))
		if time.time() - pingtime > 30 * 60:  # check every ~30min
			ping = utils.ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
			self.pingDict[wms] = (ping, time.time() + 10 * 60 * random.random())  # 10 min variation
		if ping is not None:
			wms_best_list.append((wms, ping))
		log.finish()
	log.finish()
	if not wms_best_list:
		return None
	sort_inplace(wms_best_list, key = lambda name_ping: name_ping[1])
	result = choice_exp(wms_best_list)
	if result is not None:
		wms, ping = result
		log = utils.ActivityLog('Discovering available WMS services - using %s' % wms)
		# reduce timeout by 5min for chosen wms => re-ping every 6 submits
		self.pingDict[wms] = (ping, self.pingDict[wms][1] + 5 * 60)
		result = wms
		del log
	self.updateState()
	return result

def readJobs(self, jobLimit):
	try:
		if not os.path.exists(self._dbPath):
			os.mkdir(self._dbPath)
	except Exception:
		raise JobError("Problem creating work directory '%s'" % self._dbPath)
	candidates = []
	for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
		try:  # 2xsplit is faster than regex
			jobNum = int(jobFile.split(".")[0].split("_")[1])
		except Exception:
			continue
		candidates.append((jobNum, jobFile))
	(jobMap, maxJobs) = ({}, len(candidates))
	activity = utils.ActivityLog('Reading job infos ...')
	idx = 0
	for (jobNum, jobFile) in sorted(candidates):
		idx += 1
		if (jobLimit >= 0) and (jobNum >= jobLimit):
			self._log.info('Stopped reading job infos at job #%d out of %d available job files',
				jobNum, len(candidates))
			break
		jobObj = Job.load(os.path.join(self._dbPath, jobFile))
		jobMap[jobNum] = jobObj
		if idx % 100 == 0:
			activity.finish()
			activity = utils.ActivityLog('Reading job infos ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
	activity.finish()
	return jobMap

def cancelJobs(self, ids):
	if not len(ids):
		raise StopIteration
	activity = utils.ActivityLog('cancelling jobs')
	proc = utils.LoggedProcess(self.cancelExec, self.getCancelArguments(self._getRawIDs(ids)))
	if proc.wait() != 0:
		for line in proc.getError().splitlines():
			if not self.unknownID() in line:
				utils.eprint(line.strip())
	del activity
	activity = utils.ActivityLog('waiting for jobs to finish')
	time.sleep(5)
	for wmsId, jobNum in ids:
		path = self._getSandbox(wmsId)
		if path is None:
			utils.eprint('Sandbox for job %d with wmsId "%s" could not be found' % (jobNum, wmsId))
			continue
		try:
			shutil.rmtree(path)
		except Exception:
			raise BackendError('Sandbox for job %d with wmsId "%s" could not be deleted' % (jobNum, wmsId))
		yield (jobNum, wmsId)
	del activity

def submitJobs(self, jobNumList, task):
	requestLen = len(jobNumList)
	activity = utils.ActivityLog('Submitting jobs... (--%)')
	while jobNumList:
		jobSubmitNumList = jobNumList[-self._schedd.getSubmitScale():]
		del jobNumList[-self._schedd.getSubmitScale():]
		activity = utils.ActivityLog('Submitting jobs... (%2d%%)' %
			(100 * (requestLen - len(jobNumList)) / requestLen))
		for jobNum in jobSubmitNumList:
			self._writeJobConfig(self.getJobCfgPath(jobNum)[0], jobNum, task, {})
		rawJobInfoMaps = self._schedd.submitJobs(jobSubmitNumList, task, self._getQueryArgs())
		# Yield (jobNum, wmsId, other data) per job
		jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
		for htcID in jobInfoMaps:
			yield (htcID.gcJobNum, self._createGcId(htcID), jobInfoMaps[htcID])
	del activity

def _saveStateToTar(self, tar, meta, source, sourceLen, message):
	# Write the splitting info grouped into subtarfiles
	activity = utils.ActivityLog(message)
	(jobNum, subTar) = (-1, None)
	for jobNum, entry in enumerate(source):
		if jobNum % 100 == 0:
			self._closeSubTar(tar, subTar)
			subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / 100))
			activity.finish()
			activity = utils.ActivityLog('%s [%d / %d]' % (message, jobNum, sourceLen))
		# Determine shortest way to store file list
		tmp = entry.pop(DataSplitter.FileList)
		savelist = self._getReducedFileList(entry, tmp)  # can modify entry
		# Write files with infos / filelist
		for name, data in [('list', str.join('\n', savelist)),
				('info', self._fmt.format(entry, fkt = self._formatFileEntry))]:
			self._addToSubTar(subTar, os.path.join('%05d' % jobNum, name), data)
		# Remove common prefix from info
		if DataSplitter.CommonPrefix in entry:
			entry.pop(DataSplitter.CommonPrefix)
		entry[DataSplitter.FileList] = tmp
	self._closeSubTar(tar, subTar)
	# Write metadata to allow reconstruction of data splitter
	meta['MaxJobs'] = jobNum + 1
	self._addToTar(tar, 'Metadata', self._fmt.format(meta))
	activity.finish()

def write(cls, fn, pa):
	fp = ZipFile(fn, 'w')
	try:
		keys = sorted(ifilter(lambda p: not p.untracked, pa.getJobKeys()))
		fp.write('# %s\n' % json.dumps(keys))
		maxN = pa.getMaxJobs()
		if maxN:
			activity = utils.ActivityLog('Writing parameter dump')
			for jobNum in irange(maxN):
				activity.finish()
				activity = utils.ActivityLog('Writing parameter dump [%d/%d]' % (jobNum + 1, maxN))
				meta = pa.getJobInfo(jobNum)
				if meta.get(ParameterInfo.ACTIVE, True):
					fp.write('%d\t%s\n' % (jobNum,
						str.join('\t', imap(lambda k: json.dumps(meta.get(k, '')), keys))))
				else:
					fp.write('%d!\t%s\n' % (jobNum,
						str.join('\t', imap(lambda k: json.dumps(meta.get(k, '')), keys))))
			activity.finish()
	finally:
		fp.close()

def __init__(self, config, source):
	self._rawSource = source
	BasicParameterAdapter.__init__(self, config, source)
	self._mapJob2PID = {}
	if not os.path.isdir(config.getWorkPath()):
		os.makedirs(config.getWorkPath())
	self._pathJob2PID = config.getWorkPath('params.map.gz')
	self._pathParams = config.getWorkPath('params.dat.gz')
	# Find out if init should be performed - overrides userResync!
	userInit = config.getState('init', detail = 'parameters')
	needInit = False
	if not (os.path.exists(self._pathParams) and os.path.exists(self._pathJob2PID)):
		needInit = True  # Init needed if no parameter log exists
	if userInit and not needInit and (source.getMaxParameters() is not None):
		utils.eprint('Re-Initialization will overwrite the current mapping between jobs and parameter/dataset content! This can lead to invalid results!')
		if utils.getUserBool('Do you want to perform a synchronization between the current mapping and the new one to avoid this?', True):
			userInit = False
	doInit = userInit or needInit
	# Find out if resync should be performed
	userResync = config.getState('resync', detail = 'parameters')
	config.setState(False, 'resync', detail = 'parameters')
	needResync = False
	pHash = self._rawSource.getHash()
	self.storedHash = config.get('parameter hash', pHash, persistent = True)
	if self.storedHash != pHash:
		needResync = True  # Resync needed if parameters have changed
		self._log.info('Parameter hash has changed')
		self._log.debug('\told hash: %s', self.storedHash)
		self._log.debug('\tnew hash: %s', pHash)
		config.setState(True, 'init', detail = 'config')
	doResync = (userResync or needResync) and not doInit
	if not doResync and not doInit:  # Reuse old mapping
		activity = utils.ActivityLog('Loading cached parameter information')
		self.readJob2PID()
		activity.finish()
		return
	elif doResync:  # Perform sync
		activity = utils.ActivityLog('Synchronizing parameter information')
		self.storedHash = None
		self._resyncState = self.resync()
		activity.finish()
	elif doInit:  # Write current state
		self.writeJob2PID(self._pathJob2PID)
		ParameterSource.getClass('GCDumpParameterSource').write(self._pathParams, self)
	config.set('parameter hash', self._rawSource.getHash())

def _tidyUpWorkingDirectory(self, forceCleanup = False):
	# active remote submission should clean up when no jobs remain
	if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
		self.debugOut("Revising remote working directory for cleanup. Forced CleanUp: %s" % forceCleanup)
		activity = utils.ActivityLog('revising remote work directory')
		# check whether there are any remote working directories remaining
		checkProcess = self.Pool.LoggedExecute('find %s -maxdepth 1 -type d | wc -l' % self.getWorkdirPath())
		try:
			if forceCleanup or (int(checkProcess.getOutput()) <= 1):
				cleanupProcess = self.Pool.LoggedExecute('rm -rf %s' % self.getWorkdirPath())
				if cleanupProcess.wait() != 0:
					if self.explainError(cleanupProcess, cleanupProcess.wait()):
						return
					cleanupProcess.logError(self.errorLog)
					raise BackendError('Cleanup process %s returned: %s' % (cleanupProcess.cmd, cleanupProcess.getMessage()))
		except Exception:
			self._log.warning('There might be some junk data left in: %s @ %s',
				self.getWorkdirPath(), self.Pool.getDomain())
			raise BackendError('Unable to clean up remote working directory')
		activity.finish()

def checkJobs(self, ids):
	if len(ids) == 0:
		raise StopIteration
	jobNumMap = dict(ids)
	jobs = ' '.join(self._getRawIDs(ids))
	log = tempfile.mktemp('.log')
	activity = utils.ActivityLog('checking job status')
	proc = utils.LoggedProcess(self._statusExec, '--level 0 --logfile "%s" %s' % (log, jobs))
	for jobOutput in proc.getOutput().split('******')[1:]:
		data = {}
		for statusRegexLevel0 in self._statusRegexLevel0:
			match = re.match(statusRegexLevel0, jobOutput.replace('\n', ' '))
			if match:
				data = match.groupdict()
				break
		data['id'] = self._createId(data['rawId'])
		yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data.get('status', 'DONE-FAILED')], data)
	retCode = proc.wait()
	del activity
	if retCode != 0:
		if self.explainError(proc, retCode):
			pass
		else:
			proc.logError(self.errorLog, log = log, jobs = jobs)
	utils.removeFiles([log])

def _submitJob(self, jobNum, module):
	activity = utils.ActivityLog('submitting jobs')
	try:
		sandbox = tempfile.mkdtemp('', '%s.%04d.' % (module.taskID, jobNum), self.sandPath)
	except Exception:
		raise BackendError('Unable to create sandbox directory in "%s"!' % self.sandPath)
	sbPrefix = sandbox.replace(self.sandPath, '').lstrip('/')
	def translateTarget(d, s, t):
		return (d, s, os.path.join(sbPrefix, t))
	self.smSBIn.doTransfer(ismap(translateTarget, self._getSandboxFilesIn(module)))
	self._writeJobConfig(os.path.join(sandbox, '_jobconfig.sh'), jobNum, module, {
		'GC_SANDBOX': sandbox, 'GC_SCRATCH_SEARCH': str.join(' ', self.scratchPath)})
	reqs = self.brokerSite.brokerAdd(module.getRequirements(jobNum), WMS.SITES)
	reqs = dict(self.brokerQueue.brokerAdd(reqs, WMS.QUEUES))
	if (self.memory > 0) and (reqs.get(WMS.MEMORY, 0) < self.memory):
		reqs[WMS.MEMORY] = self.memory  # local jobs need higher (more realistic) memory requirements
	(stdout, stderr) = (os.path.join(sandbox, 'gc.stdout'), os.path.join(sandbox, 'gc.stderr'))
	jobName = module.getDescription(jobNum).jobName
	proc = utils.LoggedProcess(self.submitExec, '%s %s "%s" %s' % (self.submitOpts,
		self.getSubmitArguments(jobNum, jobName, reqs, sandbox, stdout, stderr),
		utils.pathShare('gc-local.sh'), self.getJobArguments(jobNum, sandbox)))
	retCode = proc.wait()
	wmsIdText = proc.getOutput().strip().strip('\n')
	try:
		wmsId = self.parseSubmitOutput(wmsIdText)
	except Exception:
		wmsId = None
	del activity
	if retCode != 0:
		self._log.warning('%s failed:', self.submitExec)
	elif wmsId is None:
		self._log.warning('%s did not yield job id:\n%s', self.submitExec, wmsIdText)
	if wmsId:
		wmsId = self._createId(wmsId)
		open(os.path.join(sandbox, wmsId), 'w').close()  # create empty marker file
	else:
		proc.logError(self.errorLog)
	return (jobNum, utils.QM(wmsId, wmsId, None), {'sandbox': sandbox})

def cancelJobs(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	waitFlag = False
	for ids in imap(lambda x: allIds[x:x + 5], irange(0, len(allIds), 5)):
		# Delete jobs in groups of 5 - with 5 seconds between groups
		if waitFlag and not utils.wait(5):
			break
		waitFlag = True
		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)
		activity = utils.ActivityLog('cancelling jobs')
		proc = LocalProcess(self._cancelExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs)
		retCode = proc.status(timeout = 60, terminate = True)
		del activity
		# select cancelled jobs
		for deletedWMSId in ifilter(lambda x: x.startswith('- '), proc.stdout.iter()):
			deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
			yield (jobNumMap.get(deletedWMSId), deletedWMSId)
		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
		utils.removeFiles([jobs])

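# cancelJobs above batches IDs with an imap/irange slicing expression; below is
# a standalone sketch of that chunking idiom using plain builtins (imap/irange
# are the py2/py3 compatibility aliases for map/range used by this codebase).
# The helper name is illustrative, not part of the original code.
def _chunks(allIds, size):
	# Yield consecutive slices of at most `size` elements, preserving order
	for start in range(0, len(allIds), size):
		yield allIds[start:start + size]

# e.g. list(_chunks([1, 2, 3, 4, 5, 6, 7], 5)) == [[1, 2, 3, 4, 5], [6, 7]]
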
def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
	log = utils.ActivityLog('Performing resynchronization of dataset')
	(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
	for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
		sort_inplace(rmBlock[DataProvider.FileList], key = lambda x: x[DataProvider.URL])
	log.finish()
	# User overview and setup starts here
	resultRedo = []
	resultDisable = []
	newSplitPathTMP = newSplitPath + '.tmp'
	resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
	self.savePartitions(newSplitPathTMP, resyncIter, sourceLen = self.getMaxJobs(),
		message = 'Performing resynchronization of dataset map (progress is estimated)')
	if self._interactive:
		# TODO: print info and ask
		if not utils.getUserBool('Do you want to use the new dataset partition?', False):
			return None
	os.rename(newSplitPathTMP, newSplitPath)
	return (resultRedo, resultDisable)

def checkJobs(self, ids):
	if not len(ids):
		raise StopIteration
	activity = utils.ActivityLog('checking job status')
	proc = utils.LoggedProcess(self.statusExec, self.getCheckArguments(self._getRawIDs(ids)))
	tmp = {}
	for data in self.parseStatus(proc.iter()):
		wmsId = self._createId(data['id'])
		tmp[wmsId] = (wmsId, self.parseJobState(data['status']), data)
	for wmsId, jobNum in ids:
		if wmsId not in tmp:
			yield (jobNum, wmsId, Job.DONE, {})
		else:
			yield tuple([jobNum] + list(tmp[wmsId]))
	retCode = proc.wait()
	del activity
	if retCode != 0:
		for line in proc.getError().splitlines():
			if not self.unknownID() in line:
				utils.eprint(line)

def readJobs(self, jobLimit):
	jobMap = {}
	maxJobs = 0
	if os.path.exists(self._dbFile):
		try:
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
		except Exception:  # Try to recover job archive
			utils.eprint('=' * 40 + '\nStarting recovery of broken job database')
			utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
			os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
			os.rename(self._dbFile, self._dbFile + '.broken')
			os.rename(self._dbFile + '.tmp', self._dbFile)
			tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			utils.removeFiles([self._dbFile + '.broken'])
			brokenList = []
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				try:
					fp = tar.open(fnTarInfo)
					try:
						fp.read()
					finally:
						fp.close()
				except Exception:
					brokenList.append(fnTarInfo)  # remember unreadable entries for removal
			for broken in brokenList:
				os.system('zip %s -d %s' % (self._dbFile, broken))
			utils.eprint('Recover completed!')
		activity = utils.ActivityLog('Reading job transactions ...')
		maxJobs = len(tar.namelist())
		tMap = {}
		for idx, fnTarInfo in enumerate(tar.namelist()):
			(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
			if tid < tMap.get(jobNum, 0):
				continue
			data = utils.DictFormat(escapeString = True).parse(tar.open(fnTarInfo).read())
			jobMap[jobNum] = Job.loadData(fnTarInfo, data)
			tMap[jobNum] = tid
			if idx % 100 == 0:
				activity.finish()
				activity = utils.ActivityLog('Reading job transactions ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
		activity.finish()
	self._serial = maxJobs
	return jobMap

def guiWait(timeout):
	onResize(None, None)
	oldHandler = signal.signal(signal.SIGWINCH, onResize)
	result = utils.wait(timeout)
	signal.signal(signal.SIGWINCH, oldHandler)
	if (time.time() - guiWait.lastwait > 10) and not timeout:
		tmp = utils.ActivityLog('')  # force display update
		tmp.finish()
		guiWait.lastwait = time.time()
	return result

def getEntries(self, path, metadata, events, seList, objStore):
	for jobNum in self._selected:
		log = utils.ActivityLog('Reading job logs - [%d / %d]' % (jobNum, self._selected[-1]))
		metadata['GC_JOBNUM'] = jobNum
		objStore.update({'GC_TASK': self._extTask, 'GC_WORKDIR': self._extWorkDir})
		yield (os.path.join(self._extWorkDir, 'output', 'job_%d' % jobNum), metadata, events, seList, objStore)
		log.finish()

def cancelJobs(self, wmsJobIdList):
	if len(wmsJobIdList) == 0:
		raise StopIteration
	self.debugOut("Started canceling: %s" % set(lzip(*wmsJobIdList)[0]))
	self.debugPool()
	wmsIdList = list(self._getRawIDs(wmsJobIdList))
	wmsIdArgument = " ".join(wmsIdList)
	wmsToJobMap = dict(wmsJobIdList)
	activity = utils.ActivityLog('cancelling jobs')
	cancelProcess = self.Pool.LoggedExecute(self.cancelExec, '%(jobIDs)s' % {"jobIDs": wmsIdArgument})
	# check if canceling actually worked
	for cancelReturnLine in cancelProcess.iter():
		if (cancelReturnLine != '\n') and ('marked for removal' in cancelReturnLine):
			try:
				wmsID = cancelReturnLine.split()[1]
				wmsIdList.remove(wmsID)
				wmsID = self._createId(wmsID)
				jobNum = wmsToJobMap[wmsID]
				yield (jobNum, wmsID)
			except KeyError:  # mismatch in GC<->Condor mapping
				self._log.error('Error with canceled condor job %s', wmsID)
				self._log.error('\tCondor IDs: %s', wmsIdList)
				self._log.error('\tProcess message: %s', cancelProcess.getMessage())
				raise BackendError('Error while cancelling job %s' % wmsID)
			# clean up remote work dir
			if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
				cleanupProcess = self.Pool.LoggedExecute('rm -rf %s' % self.getWorkdirPath(jobNum))
				self.debugOut("Cleaning up remote workdir:\n " + cleanupProcess.cmd)
				if cleanupProcess.wait() != 0:
					if self.explainError(cleanupProcess, cleanupProcess.wait()):
						pass
					else:
						cleanupProcess.logError(self.errorLog)
	retCode = cancelProcess.wait()
	if retCode != 0:
		if self.explainError(cancelProcess, retCode):
			pass
		else:
			cancelProcess.logError(self.errorLog)
	# clean up if necessary
	activity.finish()
	self._tidyUpWorkingDirectory()
	self.debugFlush()

def getEntries(self, path, metadata, events, seList, objStore):
	metadata['GC_SOURCE_DIR'] = self._path
	counter = 0
	from grid_control.backends.storage import se_ls
	proc = se_ls(self._path)
	for fn in proc.stdout.iter(timeout = 60):
		log = utils.ActivityLog('Reading source directory - [%d]' % counter)
		yield (os.path.join(self._path, fn.strip()), metadata, events, seList, objStore)
		counter += 1
		log.finish()
	if proc.status(timeout = 0) != 0:
		self._log.log_process(proc)

def cancelJobs(self, wmsJobIdList):
	if not len(wmsJobIdList):
		raise StopIteration
	activity = utils.ActivityLog('Canceling jobs...')
	assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
		self._splitGcRequests(wmsJobIdList))), \
		'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
			lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
				self._splitGcRequests(wmsJobIdList)), self._schedd.getURI())
	canceledJobs = self._schedd.cancelJobs(self._splitGcRequests(wmsJobIdList))
	# Yield (jobNum, wmsID) for canceled jobs
	for htcJobID in canceledJobs:
		yield (htcJobID.gcJobNum, self._createGcId(htcJobID))
	del activity

def _getJobsOutput(self, wmsJobIdList):
	if not len(wmsJobIdList):
		raise StopIteration
	activity = utils.ActivityLog('Fetching jobs...')
	assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
		self._splitGcRequests(wmsJobIdList))), \
		'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
			lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
				self._splitGcRequests(wmsJobIdList)), self._schedd.getURI())
	returnedJobs = self._schedd.getJobsOutput(self._splitGcRequests(wmsJobIdList))
	# Yield (jobNum, outputPath) per retrieved job
	for htcID in returnedJobs:
		yield (htcID.gcJobNum, self.getSandboxPath(htcID.gcJobNum))
	del activity

def _getJobsOutput(self, wmsJobIdList):
	if not len(wmsJobIdList):
		raise StopIteration
	self.debugOut("Started retrieving: %s" % set(lzip(*wmsJobIdList)[0]))
	activity = utils.ActivityLog('retrieving job outputs')
	for wmsId, jobNum in wmsJobIdList:
		sandpath = self.getSandboxPath(jobNum)
		if sandpath is None:
			yield (jobNum, None)
			continue
		if self.remoteType == PoolType.SPOOL:
			# when working with a remote spool schedd, tell condor to return files
			transferProcess = self.Pool.LoggedExecute(self.transferExec,
				'%(jobID)s' % {"jobID": self._splitId(wmsId)})
			if transferProcess.wait() != 0:
				if self.explainError(transferProcess, transferProcess.wait()):
					pass
				else:
					transferProcess.logError(self.errorLog)
		elif self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
			# when working with a remote [gsi]ssh schedd, manually return files
			transferProcess = self.Pool.LoggedCopyFromRemote(self.getWorkdirPath(jobNum), self.getSandboxPath())
			if transferProcess.wait() != 0:
				if self.explainError(transferProcess, transferProcess.wait()):
					pass
				else:
					transferProcess.logError(self.errorLog)
			# clean up remote working directory
			cleanupProcess = self.Pool.LoggedExecute('rm -rf %s' % self.getWorkdirPath(jobNum))
			self.debugOut("Cleaning up remote workdir: JobID %s\n %s" % (jobNum, cleanupProcess.cmd))
			if cleanupProcess.wait() != 0:
				if self.explainError(cleanupProcess, cleanupProcess.wait()):
					pass
				else:
					cleanupProcess.logError(self.errorLog)
		yield (jobNum, sandpath)
	# clean up if necessary
	activity.finish()
	self._tidyUpWorkingDirectory()
	self.debugFlush()

def getBlocks(self, silent = True):
	def prepareBlocks():
		# Validation, Filtering & Naming:
		for block in self.getBlocksInternal():
			assert block[DataProvider.Dataset]
			block.setdefault(DataProvider.BlockName, '0')
			block.setdefault(DataProvider.Provider, self.__class__.__name__)
			block.setdefault(DataProvider.Locations, None)
			if self._datasetID:
				block[DataProvider.DatasetID] = self._datasetID
			events = sum(imap(lambda x: x[DataProvider.NEntries], block[DataProvider.FileList]))
			block.setdefault(DataProvider.NEntries, events)
			if self._datasetNick:
				block[DataProvider.Nickname] = self._datasetNick
			elif self._nickProducer:
				block = self._nickProducer.processBlock(block)
				if not block:
					raise DatasetError('Nickname producer failed!')
			yield block

	if self._cache_block is None:
		log = utils.ActivityLog('Retrieving %s' % self._datasetExpr)
		try:
			if self._passthrough:
				self._cache_block = list(self._stats.process(prepareBlocks()))
			else:
				self._cache_block = list(self._stats.process(self._datasetProcessor.process(prepareBlocks())))
		except Exception:
			raise DatasetError('Unable to retrieve dataset %s' % repr(self._datasetExpr))
		statString = ' * Dataset '
		if self._datasetNick:
			statString += repr(self._datasetNick)
		elif self._datasetExpr:
			statString += repr(self._datasetExpr)
		log.finish()
		statString += '\tcontains %d block(s) with %s' % self._stats.getStats()
		if not silent:
			self._log.info(statString)
	return self._cache_block

def __init__(self, config, jobLimit = -1, jobSelector = None):
	dbPath = config.getWorkPath('jobs')
	self._dbFile = config.getWorkPath('jobs.zip')
	if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
		activity = utils.ActivityLog('Converting job database...')
		self._serial = 0
		try:
			oldDB = JobDB(config)
			oldDB.readJobs(-1)
			for jobNum in oldDB.getJobs():
				self.commit(jobNum, oldDB.get(jobNum))
		except Exception:
			utils.removeFiles([self._dbFile])
			raise
		activity.finish()
	ZippedJobDB.__init__(self, config, jobLimit, jobSelector)

def getEntries(self, path, metadata, events, seList, objStore):
	# lfilter (not ifilter) so the directory list supports len() and enumerate below
	allDirs = lfilter(lambda fn: fn.startswith('job_'), os.listdir(self._extOutputDir))
	for idx, dirName in enumerate(allDirs):
		log = utils.ActivityLog('Reading job logs - [%d / %d]' % (idx, len(allDirs)))
		try:
			metadata['GC_JOBNUM'] = int(dirName.split('_')[1])
			objStore['GC_WORKDIR'] = self._extWorkDir
			if self._selector and self._selector(metadata['GC_JOBNUM'], None):
				yield (os.path.join(self._extOutputDir, dirName), metadata, events, seList, objStore)
		except Exception:
			pass
		log.finish()

def _getJobsOutput(self, ids):
	if not len(ids):
		raise StopIteration
	activity = utils.ActivityLog('retrieving job outputs')
	for wmsId, jobNum in ids:
		path = self._getSandbox(wmsId)
		if path is None:
			yield (jobNum, None)
			continue
		# Cleanup sandbox
		outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
		utils.removeFiles(ifilter(lambda x: x not in outFiles,
			imap(lambda fn: os.path.join(path, fn), os.listdir(path))))
		yield (jobNum, path)
	del activity

def checkJobs(self, wmsJobIdList):
	if not len(wmsJobIdList):
		raise StopIteration
	activity = utils.ActivityLog('Checking jobs...')
	assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
		self._splitGcRequests(wmsJobIdList))), \
		'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
			lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
				self._splitGcRequests(wmsJobIdList)), self._schedd.getURI())
	rawJobInfoMaps = self._schedd.checkJobs(self._splitGcRequests(wmsJobIdList), self._getQueryArgs())
	# Yield (jobNum, wmsId, state, other data) per active job
	jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
	for htcID in jobInfoMaps:
		yield (htcID.gcJobNum, self._createGcId(htcID),
			self._statusMap[jobInfoMaps[htcID]['state']][0], jobInfoMaps[htcID])
	del activity

def __init__(self, path):
	activity = utils.ActivityLog('Reading dataset partition file')
	self._lock = GCLock()
	self._fmt = utils.DictFormat()
	self._tar = tarfile.open(path, 'r:')
	(self._cacheKey, self._cacheTar) = (None, None)

	metadata = self._fmt.parse(self._tar.extractfile('Metadata').readlines(), keyParser = {None: str})
	self.maxJobs = metadata.pop('MaxJobs')
	self.classname = metadata.pop('ClassName')
	self.metadata = {'dataset': dict(ifilter(lambda k_v: not k_v[0].startswith('['), metadata.items()))}
	for (k, v) in ifilter(lambda k_v: k_v[0].startswith('['), metadata.items()):
		self.metadata.setdefault('dataset %s' % k.split(']')[0].lstrip('['), {})[k.split(']')[1].strip()] = v
	activity.finish()

	self._parserMap = {
		None: str,
		DataSplitter.NEntries: int,
		DataSplitter.Skipped: int,
		DataSplitter.DatasetID: int,
		DataSplitter.Invalid: parseBool,
		DataSplitter.Locations: lambda x: parseList(x, ','),
		DataSplitter.MetadataHeader: parseJSON,
		DataSplitter.Metadata: lambda x: parseJSON(x.strip("'")),
	}

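# A hedged sketch of how a parser map like _parserMap above is typically
# applied when partition entries are read back: each key's raw string value is
# run through its registered parser, falling back to the None entry (str).
# The helper below is hypothetical; the actual DictFormat.parse signature and
# its valueParser handling are not shown in this section.
def _applyParsers(rawDict, parserMap):
	parsed = {}
	for key, value in rawDict.items():
		# look up the key-specific parser, defaulting to the catch-all str parser
		parsed[key] = parserMap.get(key, parserMap[None])(value)
	return parsed
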
def matchSites(self, endpoint):
	log = utils.ActivityLog('Discovering available WMS services - testing %s' % endpoint)
	checkArgs = ['-a']
	if endpoint:
		checkArgs.extend(['-e', endpoint])
	checkArgs.append(utils.pathShare('null.jdl'))
	proc = LocalProcess(self._exeGliteWMSJobListMatch, *checkArgs)
	result = []
	for line in proc.stdout.iter(timeout = 3):
		if line.startswith(' - '):
			result.append(line[3:].strip())
	if proc.status(timeout = 0) is None:
		self.wms_timeout[endpoint] = self.wms_timeout.get(endpoint, 0) + 1
		if self.wms_timeout.get(endpoint, 0) > 10:  # remove endpoints after 10 failures
			self.wms_all.remove(endpoint)
		log.finish()
		return []
	log.finish()
	return result

def cancelJobs(self, allIds):
	if len(allIds) == 0:
		raise StopIteration
	waitFlag = False
	for ids in imap(lambda x: allIds[x:x + self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
		# Delete jobs in chunks of _nJobsPerChunk - with 5 seconds between chunks
		if waitFlag and not utils.wait(5):
			break
		waitFlag = True
		jobNumMap = dict(ids)
		jobs = ' '.join(self._getRawIDs(ids))
		log = tempfile.mktemp('.log')
		activity = utils.ActivityLog('cancelling jobs')
		proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" %s' % (log, jobs))
		retCode = proc.wait()
		del activity
		# select cancelled jobs
		for rawId in self._getRawIDs(ids):
			deletedWMSId = self._createId(rawId)
			yield (jobNumMap.get(deletedWMSId), deletedWMSId)
		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log)
		purgeLog = tempfile.mktemp('.log')
		purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, jobs))
		retCode = purgeProc.wait()
		if retCode != 0:
			if self.explainError(purgeProc, retCode):
				pass
			else:
				purgeProc.logError(self.errorLog, log = purgeLog, jobs = jobs)
		utils.removeFiles([log, purgeLog])