Example #1
	def _saveStateToTar(self, tar, meta, source, sourceLen, message):
		# Write the splitting info grouped into subtarfiles
		activity = utils.ActivityLog(message)
		(jobNum, lastValid, subTar) = (-1, -1, None)
		for jobNum, entry in enumerate(source):
			if not entry.get(DataSplitter.Invalid, False):
				lastValid = jobNum
			if jobNum % self._keySize == 0:
				self._closeSubTar(tar, subTar)
				subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / self._keySize))
				activity.finish()
				activity = utils.ActivityLog('%s [%d / %d]' % (message, jobNum, sourceLen))
			# Determine shortest way to store file list
			tmp = entry.pop(DataSplitter.FileList)
			savelist = self._getReducedFileList(entry, tmp) # can modify entry
			# Write files with infos / filelist
			data = str.join('', self._fmt.format(entry, fkt = self._formatFileEntry) + lmap(lambda fn: '=%s\n' % fn, savelist))
			self._addToSubTar(subTar, '%05d' % jobNum, data)
			# Remove common prefix from info
			if DataSplitter.CommonPrefix in entry:
				entry.pop(DataSplitter.CommonPrefix)
			entry[DataSplitter.FileList] = tmp
		self._closeSubTar(tar, subTar)
		activity.finish()
		# Write metadata to allow reconstruction of data splitter
		meta['MaxJobs'] = lastValid + 1
		for (fn, data) in [('Metadata', self._fmt.format(meta)), ('Version', '2')]:
			self._addToTar(tar, fn, data)
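Most of the examples on this page share one progress-reporting idiom: an ActivityLog is finished and re-created every N iterations so the displayed message can carry an updated counter. The stand-alone sketch below shows just that pattern; the ActivityLog class in it is a hypothetical stand-in for grid-control's utils.ActivityLog, not its real implementation.

import sys

class ActivityLog:
    """Hypothetical stand-in: the real class manages a transient terminal status line."""
    def __init__(self, message):
        sys.stderr.write(message + '\n')
    def finish(self):
        pass  # placeholder; the real class retires its status output here

def process(entries, chunk_size=100):
    activity = ActivityLog('Processing entries')
    for idx, entry in enumerate(entries):
        if idx % chunk_size == 0:
            activity.finish()  # retire the old message ...
            activity = ActivityLog('Processing entries [%d / %d]' % (idx, len(entries)))
        # ... per-entry work goes here ...
    activity.finish()

process(list(range(250)))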
Example #2
	def getWMS(self):
		log = utils.ActivityLog('Discovering available WMS services')
		wms_best_list = []
		for wms in self.listWMS_good():
			log = utils.ActivityLog('Discovering available WMS services - pinging %s' % wms)
			if wms is None:
				continue
			ping, pingtime = self.pingDict.get(wms, (None, 0))
			if time.time() - pingtime > 30 * 60: # check every ~30min
				ping = utils.ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
				self.pingDict[wms] = (ping, time.time() + 10 * 60 * random.random()) # 10 min variation
			if ping is not None:
				wms_best_list.append((wms, ping))
			log.finish()
		log.finish()
		if not wms_best_list:
			return None
		sort_inplace(wms_best_list, key = lambda name_ping: name_ping[1])
		result = choice_exp(wms_best_list)
		log = utils.ActivityLog('Discovering available WMS services - using %s' % wms)
		if result is not None:
			wms, ping = result # reduce timeout by 5min for chosen wms => re-ping every 6 submits
			self.pingDict[wms] = (ping, self.pingDict[wms][1] + 5*60)
			result = wms
		self.updateState()
		del log
		return result
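getWMS only re-pings an endpoint when its cached result is older than roughly 30 minutes, and it stores the timestamp with up to 10 minutes of random offset so that cached entries do not all expire in the same iteration. The same caching idea in isolation; the class and names are illustrative, not part of grid-control.

import random
import time

class JitteredTTLCache:
    """Cache expensive lookups for ttl seconds, spreading expiry with random jitter."""
    def __init__(self, ttl=30 * 60, jitter=10 * 60):
        (self._ttl, self._jitter, self._data) = (ttl, jitter, {})

    def get(self, key, refresh):
        value, stored_at = self._data.get(key, (None, 0))
        if time.time() - stored_at > self._ttl:
            value = refresh(key)  # the expensive operation, e.g. a ping
            # record a slightly "future" timestamp so entries expire at different times
            self._data[key] = (value, time.time() + self._jitter * random.random())
        return value

cache = JitteredTTLCache()
print(cache.get('wms.example.org', refresh=lambda host: 0.042))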
Example #3
    def readJobs(self, jobLimit):
        try:
            if not os.path.exists(self._dbPath):
                os.mkdir(self._dbPath)
        except Exception:
            raise JobError("Problem creating work directory '%s'" %
                           self._dbPath)

        candidates = []
        for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
            try:  # 2xsplit is faster than regex
                jobNum = int(jobFile.split(".")[0].split("_")[1])
            except Exception:
                continue
            candidates.append((jobNum, jobFile))

        (jobMap, maxJobs) = ({}, len(candidates))
        activity = utils.ActivityLog('Reading job infos ...')
        idx = 0
        for (jobNum, jobFile) in sorted(candidates):
            idx += 1
            if (jobLimit >= 0) and (jobNum >= jobLimit):
                self._log.info(
                    'Stopped reading job infos at job #%d out of %d available job files',
                    jobNum, len(candidates))
                break
            jobObj = Job.load(os.path.join(self._dbPath, jobFile))
            jobMap[jobNum] = jobObj
            if idx % 100 == 0:
                activity.finish()
                activity = utils.ActivityLog(
                    'Reading job infos ... %d [%d%%]' %
                    (idx, (100.0 * idx) / maxJobs))
        activity.finish()
        return jobMap
Example #4
	def cancelJobs(self, ids):
		if not len(ids):
			raise StopIteration

		activity = utils.ActivityLog('cancelling jobs')
		proc = utils.LoggedProcess(self.cancelExec, self.getCancelArguments(self._getRawIDs(ids)))
		if proc.wait() != 0:
			for line in proc.getError().splitlines():
				if not self.unknownID() in line:
					utils.eprint(line.strip())
		del activity

		activity = utils.ActivityLog('waiting for jobs to finish')
		time.sleep(5)
		for wmsId, jobNum in ids:
			path = self._getSandbox(wmsId)
			if path is None:
				utils.eprint('Sandbox for job %d with wmsId "%s" could not be found' % (jobNum, wmsId))
				continue
			try:
				shutil.rmtree(path)
			except Exception:
				raise BackendError('Sandbox for job %d with wmsId "%s" could not be deleted' % (jobNum, wmsId))
			yield (jobNum, wmsId)
		del activity
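cancelJobs is a generator and signals "nothing to cancel" with raise StopIteration. That idiom only works on older interpreters: since PEP 479 (the default from Python 3.7 on), a StopIteration raised inside a generator body is converted into a RuntimeError, so a bare return is the portable way to end a generator early. A minimal sketch of the same guard:

def cancel_jobs(ids):
    if not ids:
        return  # ends the generator immediately; safe on all Python versions
    for wms_id, job_num in ids:
        # ... cancellation work would happen here ...
        yield (job_num, wms_id)

print(list(cancel_jobs([])))          # -> []
print(list(cancel_jobs([('a', 1)])))  # -> [(1, 'a')]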
Example #5
	def submitJobs(self, jobNumList, task):
		requestLen = len(jobNumList)
		activity = utils.ActivityLog('Submitting jobs... (--%)')
		while jobNumList:
			jobSubmitNumList = jobNumList[-self._schedd.getSubmitScale():]
			del(jobNumList[-self._schedd.getSubmitScale():])
			activity = utils.ActivityLog('Submitting jobs... (%2d%%)'%(100*(requestLen-len(jobNumList))/requestLen))
			for jobNum in jobSubmitNumList:
				self._writeJobConfig(
					self.getJobCfgPath(jobNum)[0],
					jobNum,
					task, {}
					)
			rawJobInfoMaps = self._schedd.submitJobs(
				jobSubmitNumList, 
				task,
				self._getQueryArgs()
				)
			# Yield (jobNum, wmsId, other data) per job
			jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
			for htcID in jobInfoMaps:
				yield (
					htcID.gcJobNum,
					self._createGcId(htcID),
					jobInfoMaps[htcID]
					)
		del(activity)
Example #6
	def _saveStateToTar(self, tar, meta, source, sourceLen, message):
		# Write the splitting info grouped into subtarfiles
		activity = utils.ActivityLog(message)
		(jobNum, subTar) = (-1, None)
		for jobNum, entry in enumerate(source):
			if jobNum % 100 == 0:
				self._closeSubTar(tar, subTar)
				subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / 100))
				activity.finish()
				activity = utils.ActivityLog('%s [%d / %d]' % (message, jobNum, sourceLen))
			# Determine shortest way to store file list
			tmp = entry.pop(DataSplitter.FileList)
			savelist = self._getReducedFileList(entry, tmp) # can modify entry
			# Write files with infos / filelist
			for name, data in [('list', str.join('\n', savelist)), ('info', self._fmt.format(entry, fkt = self._formatFileEntry))]:
				self._addToSubTar(subTar, os.path.join('%05d' % jobNum, name), data)
			# Remove common prefix from info
			if DataSplitter.CommonPrefix in entry:
				entry.pop(DataSplitter.CommonPrefix)
			entry[DataSplitter.FileList] = tmp
		self._closeSubTar(tar, subTar)
		# Write metadata to allow reconstruction of data splitter
		meta['MaxJobs'] = jobNum + 1
		self._addToTar(tar, 'Metadata', self._fmt.format(meta))
		activity.finish()
Example #7
 def write(cls, fn, pa):
     fp = ZipFile(fn, 'w')
     try:
         keys = sorted(ifilter(lambda p: not p.untracked, pa.getJobKeys()))
         fp.write('# %s\n' % json.dumps(keys))
         maxN = pa.getMaxJobs()
         if maxN:
             activity = utils.ActivityLog('Writing parameter dump')
             for jobNum in irange(maxN):
                 activity.finish()
                 activity = utils.ActivityLog(
                     'Writing parameter dump [%d/%d]' % (jobNum + 1, maxN))
                 meta = pa.getJobInfo(jobNum)
                 valueText = str.join('\t', imap(lambda k: json.dumps(meta.get(k, '')), keys))
                 if meta.get(ParameterInfo.ACTIVE, True):
                     fp.write('%d\t%s\n' % (jobNum, valueText))
                 else:
                     fp.write('%d!\t%s\n' % (jobNum, valueText))
             activity.finish()
     finally:
         fp.close()
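The dump format written here is compact: a header comment containing the JSON-encoded key list, then one tab-separated row per job whose cells are JSON-encoded values, with a '!' after the job number marking inactive jobs. The sketch below reproduces only that row format with plain file I/O; it deliberately ignores the ZipFile object and ActivityLog handling from the example, and its function and parameter names are illustrative.

import json

def write_parameter_dump(path, keys, job_infos):
    """job_infos: iterable of (job_num, active, metadata_dict) tuples."""
    with open(path, 'w') as fp:
        fp.write('# %s\n' % json.dumps(keys))
        for job_num, active, meta in job_infos:
            marker = '' if active else '!'
            cells = '\t'.join(json.dumps(meta.get(k, '')) for k in keys)
            fp.write('%d%s\t%s\n' % (job_num, marker, cells))

write_parameter_dump('params.tsv', ['SEED'], [(0, True, {'SEED': 42}), (1, False, {})])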
Example #8
    def __init__(self, config, source):
        self._rawSource = source
        BasicParameterAdapter.__init__(self, config, source)
        self._mapJob2PID = {}
        if not os.path.isdir(config.getWorkPath()):
            os.makedirs(config.getWorkPath())
        self._pathJob2PID = config.getWorkPath('params.map.gz')
        self._pathParams = config.getWorkPath('params.dat.gz')

        # Find out if init should be performed - overrides userResync!
        userInit = config.getState('init', detail='parameters')
        needInit = False
        if not (os.path.exists(self._pathParams)
                and os.path.exists(self._pathJob2PID)):
            needInit = True  # Init needed if no parameter log exists
        if userInit and not needInit and (source.getMaxParameters()
                                          is not None):
            utils.eprint(
                'Re-Initialization will overwrite the current mapping between jobs and parameter/dataset content! This can lead to invalid results!'
            )
            if utils.getUserBool(
                    'Do you want to perform a synchronization between the current mapping and the new one to avoid this?',
                    True):
                userInit = False
        doInit = userInit or needInit

        # Find out if resync should be performed
        userResync = config.getState('resync', detail='parameters')
        config.setState(False, 'resync', detail='parameters')
        needResync = False
        pHash = self._rawSource.getHash()
        self.storedHash = config.get('parameter hash', pHash, persistent=True)
        if self.storedHash != pHash:
            needResync = True  # Resync needed if parameters have changed
            self._log.info('Parameter hash has changed')
            self._log.debug('\told hash: %s', self.storedHash)
            self._log.debug('\tnew hash: %s', pHash)
            config.setState(True, 'init', detail='config')
        doResync = (userResync or needResync) and not doInit

        if not doResync and not doInit:  # Reuse old mapping
            activity = utils.ActivityLog(
                'Loading cached parameter information')
            self.readJob2PID()
            activity.finish()
            return
        elif doResync:  # Perform sync
            activity = utils.ActivityLog('Synchronizing parameter information')
            self.storedHash = None
            self._resyncState = self.resync()
            activity.finish()
        elif doInit:  # Write current state
            self.writeJob2PID(self._pathJob2PID)
            ParameterSource.getClass('GCDumpParameterSource').write(
                self._pathParams, self)
        config.set('parameter hash', self._rawSource.getHash())
Example #9
 def _tidyUpWorkingDirectory(self, forceCleanup=False):
     # active remote submission should clean up when no jobs remain
     if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
         self.debugOut(
             "Revising remote working directory for cleanup. Forced CleanUp: %s"
             % forceCleanup)
         activity = utils.ActivityLog('revising remote work directory')
         # check whether there are any remote working directories remaining
         checkProcess = self.Pool.LoggedExecute(
             'find %s -maxdepth 1 -type d | wc -l' % self.getWorkdirPath())
         try:
             if forceCleanup or (int(checkProcess.getOutput()) <= 1):
                 cleanupProcess = self.Pool.LoggedExecute(
                     'rm -rf %s' % self.getWorkdirPath())
                 if cleanupProcess.wait() != 0:
                     if self.explainError(cleanupProcess,
                                          cleanupProcess.wait()):
                         return
                     cleanupProcess.logError(self.errorLog)
                     raise BackendError(
                         'Cleanup process %s returned: %s' %
                         (cleanupProcess.cmd, cleanupProcess.getMessage()))
         except Exception:
             self._log.warning(
                 'There might be some junk data left in: %s @ %s',
                 self.getWorkdirPath(), self.Pool.getDomain())
             raise BackendError(
                 'Unable to clean up remote working directory')
         activity.finish()
Example #10
	def checkJobs(self, ids):
		if len(ids) == 0:
			raise StopIteration

		jobNumMap = dict(ids)
		jobs = ' '.join(self._getRawIDs(ids))
		log = tempfile.mktemp('.log')

		activity = utils.ActivityLog('checking job status')
		proc = utils.LoggedProcess(self._statusExec, '--level 0 --logfile "%s" %s' % (log, jobs))
		for jobOutput in proc.getOutput().split('******')[1:]:
			data = {}
			for statusRegexLevel0 in self._statusRegexLevel0:
				match = re.match(statusRegexLevel0, jobOutput.replace('\n', ' '))
				if match:
					data = match.groupdict()
					break
			data['id'] = self._createId(data['rawId'])
			yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data.get('status', 'DONE-FAILED')], data)
		
		retCode = proc.wait()
		del activity

		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log, jobs = jobs)
		
		utils.removeFiles([log])
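This example (like example #30 further down) obtains the log file name from tempfile.mktemp, which only returns a path and is documented as insecure because another process can create that file before it is used. When the caller is going to create the file anyway, tempfile.mkstemp is the safer choice; a small sketch of the substitution:

import os
import tempfile

# mkstemp atomically creates the file and returns an open descriptor plus its path
fd, log_path = tempfile.mkstemp(suffix='.log')
os.close(fd)  # the external tool reopens the file by path, so the descriptor is not needed
try:
    pass  # run the command that writes to log_path here
finally:
    os.unlink(log_path)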
Example #11
    def _submitJob(self, jobNum, module):
        activity = utils.ActivityLog('submitting jobs')

        try:
            sandbox = tempfile.mkdtemp('',
                                       '%s.%04d.' % (module.taskID, jobNum),
                                       self.sandPath)
        except Exception:
            raise BackendError('Unable to create sandbox directory "%s"!' %
                               sandbox)
        sbPrefix = sandbox.replace(self.sandPath, '').lstrip('/')

        def translateTarget(d, s, t):
            return (d, s, os.path.join(sbPrefix, t))

        self.smSBIn.doTransfer(
            ismap(translateTarget, self._getSandboxFilesIn(module)))

        self._writeJobConfig(
            os.path.join(sandbox, '_jobconfig.sh'), jobNum, module, {
                'GC_SANDBOX': sandbox,
                'GC_SCRATCH_SEARCH': str.join(' ', self.scratchPath)
            })
        reqs = self.brokerSite.brokerAdd(module.getRequirements(jobNum),
                                         WMS.SITES)
        reqs = dict(self.brokerQueue.brokerAdd(reqs, WMS.QUEUES))
        if (self.memory > 0) and (reqs.get(WMS.MEMORY, 0) < self.memory):
            # local jobs need higher (more realistic) memory requirements
            reqs[WMS.MEMORY] = self.memory

        (stdout, stderr) = (os.path.join(sandbox, 'gc.stdout'),
                            os.path.join(sandbox, 'gc.stderr'))
        jobName = module.getDescription(jobNum).jobName
        proc = utils.LoggedProcess(
            self.submitExec, '%s %s "%s" %s' %
            (self.submitOpts,
             self.getSubmitArguments(jobNum, jobName, reqs, sandbox, stdout,
                                     stderr), utils.pathShare('gc-local.sh'),
             self.getJobArguments(jobNum, sandbox)))
        retCode = proc.wait()
        wmsIdText = proc.getOutput().strip().strip('\n')
        try:
            wmsId = self.parseSubmitOutput(wmsIdText)
        except Exception:
            wmsId = None

        del activity

        if retCode != 0:
            self._log.warning('%s failed:', self.submitExec)
        elif wmsId is None:
            self._log.warning('%s did not yield job id:\n%s', self.submitExec,
                              wmsIdText)
        if wmsId:
            wmsId = self._createId(wmsId)
            open(os.path.join(sandbox, wmsId), 'w')
        else:
            proc.logError(self.errorLog)
        return (jobNum, utils.QM(wmsId, wmsId, None), {'sandbox': sandbox})
Example #12
    def cancelJobs(self, allIds):
        if len(allIds) == 0:
            raise StopIteration

        waitFlag = False
        for ids in imap(lambda x: allIds[x:x + 5], irange(0, len(allIds), 5)):
            # Delete jobs in groups of 5 - with 5 seconds between groups
            if waitFlag and not utils.wait(5):
                break
            waitFlag = True

            jobNumMap = dict(ids)
            jobs = self.writeWMSIds(ids)

            activity = utils.ActivityLog('cancelling jobs')
            proc = LocalProcess(self._cancelExec, '--noint', '--logfile',
                                '/dev/stderr', '-i', jobs)
            retCode = proc.status(timeout=60, terminate=True)
            del activity

            # select cancelled jobs
            for deletedWMSId in ifilter(lambda x: x.startswith('- '),
                                        proc.stdout.iter()):
                deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
                yield (jobNumMap.get(deletedWMSId), deletedWMSId)

            if retCode != 0:
                if self.explainError(proc, retCode):
                    pass
                else:
                    self._log.log_process(proc,
                                          files={'jobs': utils.safeRead(jobs)})
            utils.removeFiles([jobs])
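The expression imap(lambda x: allIds[x:x + 5], irange(0, len(allIds), 5)) is grid-control's Python 2/3 compatible way of walking a list in fixed-size chunks. The same helper written in plain modern Python, with the chunk size as a parameter:

def chunks(items, size):
    """Yield successive slices of at most `size` elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

print(list(chunks(['a', 'b', 'c', 'd', 'e', 'f', 'g'], 5)))
# -> [['a', 'b', 'c', 'd', 'e'], ['f', 'g']]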
Example #13
    def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
        log = utils.ActivityLog('Performing resynchronization of dataset')
        (blocksAdded, blocksMissing,
         blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
        for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
            sort_inplace(rmBlock[DataProvider.FileList],
                         key=lambda x: x[DataProvider.URL])
        log.finish()

        # User overview and setup starts here
        resultRedo = []
        resultDisable = []
        newSplitPathTMP = newSplitPath + '.tmp'
        resyncIter = self._resyncIterator(resultRedo, resultDisable,
                                          blocksAdded, blocksMissing,
                                          blocksMatching)
        self.savePartitions(newSplitPathTMP, resyncIter, sourceLen=self.getMaxJobs(),
            message='Performing resynchronization of dataset map (progress is estimated)')

        if self._interactive:
            # TODO: print info and ask
            if not utils.getUserBool(
                    'Do you want to use the new dataset partition?', False):
                return None
        os.rename(newSplitPathTMP, newSplitPath)

        return (resultRedo, resultDisable)
Example #14
	def checkJobs(self, ids):
		if not len(ids):
			raise StopIteration

		activity = utils.ActivityLog('checking job status')
		proc = utils.LoggedProcess(self.statusExec, self.getCheckArguments(self._getRawIDs(ids)))

		tmp = {}
		for data in self.parseStatus(proc.iter()):
			wmsId = self._createId(data['id'])
			tmp[wmsId] = (wmsId, self.parseJobState(data['status']), data)

		for wmsId, jobNum in ids:
			if wmsId not in tmp:
				yield (jobNum, wmsId, Job.DONE, {})
			else:
				yield tuple([jobNum] + list(tmp[wmsId]))

		retCode = proc.wait()
		del activity

		if retCode != 0:
			for line in proc.getError().splitlines():
				if not self.unknownID() in line:
					utils.eprint(line)
Example #15
	def readJobs(self, jobLimit):
		jobMap = {}
		maxJobs = 0
		if os.path.exists(self._dbFile):
			try:
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			except Exception: # Try to recover job archive
				utils.eprint('=' * 40 + '\nStarting recovery of broken job database')
				utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
				os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
				os.rename(self._dbFile, self._dbFile + '.broken')
				os.rename(self._dbFile + '.tmp', self._dbFile)
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
				utils.removeFiles([self._dbFile + '.broken'])
				brokenList = []
				for idx, fnTarInfo in enumerate(tar.namelist()):
					(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
					try:
						fp = tar.open(fnTarInfo)
						try:
							fp.read()
						finally:
							fp.close()
					except Exception: # member could not be read - mark it for removal
						brokenList.append(fnTarInfo)
				for broken in brokenList:
					os.system('zip %s -d %s' % (self._dbFile, broken))
				utils.eprint('Recover completed!')
			activity = utils.ActivityLog('Reading job transactions ...')
			maxJobs = len(tar.namelist())
			tMap = {}
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				if tid < tMap.get(jobNum, 0):
					continue
				data = utils.DictFormat(escapeString = True).parse(tar.open(fnTarInfo).read())
				jobMap[jobNum] = Job.loadData(fnTarInfo, data)
				tMap[jobNum] = tid
				if idx % 100 == 0:
					activity.finish()
					activity = utils.ActivityLog('Reading job transactions ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))

		self._serial = maxJobs
		return jobMap
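The recovery branch above reads every member of the archive to find unreadable ones. A self-contained version of that scan using only the standard library is shown below; for a quick yes/no check, zipfile.ZipFile.testzip() also reads all members and returns the name of the first corrupt one (or None). This is a simplification and does not replace the external 'zip -FF' repair step the example relies on.

import zipfile

def find_broken_members(path):
    """Return the names of archive members that cannot be read back."""
    broken = []
    with zipfile.ZipFile(path, 'r') as archive:
        for name in archive.namelist():
            try:
                with archive.open(name) as member:
                    member.read()
            except Exception:
                broken.append(name)
    return broken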
Example #16
			def guiWait(timeout):
				onResize(None, None)
				oldHandler = signal.signal(signal.SIGWINCH, onResize)
				result = utils.wait(timeout)
				signal.signal(signal.SIGWINCH, oldHandler)
				if (time.time() - getattr(guiWait, 'lastwait', 0) > 10) and not timeout:
					tmp = utils.ActivityLog('') # force display update
					tmp.finish()
				guiWait.lastwait = time.time()
				return result
Example #17
 def getEntries(self, path, metadata, events, seList, objStore):
     for jobNum in self._selected:
         log = utils.ActivityLog('Reading job logs - [%d / %d]' %
                                 (jobNum, self._selected[-1]))
         metadata['GC_JOBNUM'] = jobNum
         objStore.update({
             'GC_TASK': self._extTask,
             'GC_WORKDIR': self._extWorkDir
         })
         yield (os.path.join(self._extWorkDir, 'output', 'job_%d' % jobNum),
                metadata, events, seList, objStore)
         log.finish()
Example #18
    def cancelJobs(self, wmsJobIdList):
        if len(wmsJobIdList) == 0:
            raise StopIteration
        self.debugOut("Started canceling: %s" % set(lzip(*wmsJobIdList)[0]))
        self.debugPool()

        wmsIdList = list(self._getRawIDs(wmsJobIdList))
        wmsIdArgument = " ".join(wmsIdList)
        wmsToJobMap = dict(wmsJobIdList)

        activity = utils.ActivityLog('cancelling jobs')
        cancelProcess = self.Pool.LoggedExecute(
            self.cancelExec, '%(jobIDs)s' % {"jobIDs": wmsIdArgument})

        # check if canceling actually worked
        for cancelReturnLine in cancelProcess.iter():
            if (cancelReturnLine != '\n') and ('marked for removal'
                                               in cancelReturnLine):
                try:
                    wmsID = cancelReturnLine.split()[1]
                    wmsIdList.remove(wmsID)
                    wmsID = self._createId(wmsID)
                    jobNum = wmsToJobMap[wmsID]
                    yield (jobNum, wmsID)
                except KeyError:  # mismatch in GC<->Condor mapping
                    self._log.error('Error with canceled condor job %s', wmsID)
                    self._log.error('\tCondor IDs: %s', wmsIdList)
                    self._log.error('\tProcess message: %s',
                                    cancelProcess.getMessage())
                    raise BackendError('Error while cancelling job %s' % wmsID)
            # clean up remote work dir
            if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
                cleanupProcess = self.Pool.LoggedExecute(
                    'rm -rf %s' % self.getWorkdirPath(jobNum))
                self.debugOut("Cleaning up remote workdir:\n	" +
                              cleanupProcess.cmd)
                if cleanupProcess.wait() != 0:
                    if self.explainError(cleanupProcess,
                                         cleanupProcess.wait()):
                        pass
                    else:
                        cleanupProcess.logError(self.errorLog)

        retCode = cancelProcess.wait()
        if retCode != 0:
            if self.explainError(cancelProcess, retCode):
                pass
            else:
                cancelProcess.logError(self.errorLog)
        # clean up if necessary
        activity.finish()
        self._tidyUpWorkingDirectory()
        self.debugFlush()
Example #19
 def getEntries(self, path, metadata, events, seList, objStore):
     metadata['GC_SOURCE_DIR'] = self._path
     counter = 0
     from grid_control.backends.storage import se_ls
     proc = se_ls(self._path)
     for fn in proc.stdout.iter(timeout=60):
         log = utils.ActivityLog('Reading source directory - [%d]' %
                                 counter)
         yield (os.path.join(self._path, fn.strip()), metadata, events,
                seList, objStore)
         counter += 1
         log.finish()
     if proc.status(timeout=0) != 0:
         self._log.log_process(proc)
Example #20
	def cancelJobs(self, wmsJobIdList):
		if not len(wmsJobIdList):
			raise StopIteration
		activity = utils.ActivityLog('Canceling jobs...')
		foreignJobs = lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(), self._splitGcRequests(wmsJobIdList))
		assert not foreignJobs, 'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (foreignJobs, self._schedd.getURI())
		canceledJobs = self._schedd.cancelJobs(
			self._splitGcRequests(wmsJobIdList)
			)
		# Yield ( jobNum, wmsID) for canceled jobs
		for htcJobID in canceledJobs:
			yield (
				htcJobID.gcJobNum,
				self._createGcId(htcJobID)
				)
		del activity
Example #21
	def _getJobsOutput(self, wmsJobIdList):
		if not len(wmsJobIdList):
			raise StopIteration
		activity = utils.ActivityLog('Fetching jobs...')
		foreignJobs = lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(), self._splitGcRequests(wmsJobIdList))
		assert not foreignJobs, 'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (foreignJobs, self._schedd.getURI())
		returnedJobs = self._schedd.getJobsOutput(
			self._splitGcRequests(wmsJobIdList)
			)
		# Yield (jobNum, outputPath) per retrieved job
		for htcID in returnedJobs:
			yield (
				htcID.gcJobNum,
				self.getSandboxPath(htcID.gcJobNum)
				)
		del activity
Example #22
    def _getJobsOutput(self, wmsJobIdList):
        if not len(wmsJobIdList):
            raise StopIteration
        self.debugOut("Started retrieving: %s" % set(lzip(*wmsJobIdList)[0]))

        activity = utils.ActivityLog('retrieving job outputs')
        for wmsId, jobNum in wmsJobIdList:
            sandpath = self.getSandboxPath(jobNum)
            if sandpath is None:
                yield (jobNum, None)
                continue
            # when working with a remote spool schedd, tell condor to return files
            if self.remoteType == PoolType.SPOOL:
                transferProcess = self.Pool.LoggedExecute(
                    self.transferExec,
                    '%(jobID)s' % {"jobID": self._splitId(wmsId)})
                if transferProcess.wait() != 0:
                    if self.explainError(transferProcess,
                                         transferProcess.wait()):
                        pass
                    else:
                        transferProcess.logError(self.errorLog)
            # when working with a remote [gsi]ssh schedd, manually return files
            elif self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
                transferProcess = self.Pool.LoggedCopyFromRemote(
                    self.getWorkdirPath(jobNum), self.getSandboxPath())
                if transferProcess.wait() != 0:
                    if self.explainError(transferProcess,
                                         transferProcess.wait()):
                        pass
                    else:
                        transferProcess.logError(self.errorLog)
                # clean up remote working directory
                cleanupProcess = self.Pool.LoggedExecute(
                    'rm -rf %s' % self.getWorkdirPath(jobNum))
                self.debugOut("Cleaning up remote workdir: JobID %s\n	%s" %
                              (jobNum, cleanupProcess.cmd))
                if cleanupProcess.wait() != 0:
                    if self.explainError(cleanupProcess,
                                         cleanupProcess.wait()):
                        pass
                    else:
                        cleanupProcess.logError(self.errorLog)
            yield (jobNum, sandpath)
        # clean up if necessary
        activity.finish()
        self._tidyUpWorkingDirectory()
        self.debugFlush()
Example #23
    def getBlocks(self, silent=True):
        def prepareBlocks():
            # Validation, Filtering & Naming:
            for block in self.getBlocksInternal():
                assert (block[DataProvider.Dataset])
                block.setdefault(DataProvider.BlockName, '0')
                block.setdefault(DataProvider.Provider,
                                 self.__class__.__name__)
                block.setdefault(DataProvider.Locations, None)
                if self._datasetID:
                    block[DataProvider.DatasetID] = self._datasetID
                events = sum(
                    imap(lambda x: x[DataProvider.NEntries],
                         block[DataProvider.FileList]))
                block.setdefault(DataProvider.NEntries, events)
                if self._datasetNick:
                    block[DataProvider.Nickname] = self._datasetNick
                elif self._nickProducer:
                    block = self._nickProducer.processBlock(block)
                    if not block:
                        raise DatasetError('Nickname producer failed!')
                yield block

        if self._cache_block is None:
            log = utils.ActivityLog('Retrieving %s' % self._datasetExpr)
            try:
                if self._passthrough:
                    self._cache_block = list(
                        self._stats.process(prepareBlocks()))
                else:
                    self._cache_block = list(
                        self._stats.process(
                            self._datasetProcessor.process(prepareBlocks())))
            except Exception:
                raise DatasetError('Unable to retrieve dataset %s' %
                                   repr(self._datasetExpr))
            statString = ' * Dataset '
            if self._datasetNick:
                statString += repr(self._datasetNick)
            elif self._datasetExpr:
                statString += repr(self._datasetExpr)
            log.finish()
            statString += '\tcontains %d block(s) with %s' % self._stats.getStats()
            if not silent:
                self._log.info(statString)
        return self._cache_block
Example #24
	def __init__(self, config, jobLimit = -1, jobSelector = None):
		dbPath = config.getWorkPath('jobs')
		self._dbFile = config.getWorkPath('jobs.zip')
		if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
			activity = utils.ActivityLog('Converting job database...')
			self._serial = 0
			try:
				oldDB = JobDB(config)
				oldDB.readJobs(-1)
				for jobNum in oldDB.getJobs():
					self.commit(jobNum, oldDB.get(jobNum))
			except Exception:
				utils.removeFiles([self._dbFile])
				raise
			activity.finish()

		ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #25
 def getEntries(self, path, metadata, events, seList, objStore):
     allDirs = lfilter(lambda fn: fn.startswith('job_'),
                       os.listdir(self._extOutputDir))  # a list is needed, len(allDirs) is used below
     for idx, dirName in enumerate(allDirs):
         log = utils.ActivityLog('Reading job logs - [%d / %d]' %
                                 (idx, len(allDirs)))
         try:
             metadata['GC_JOBNUM'] = int(dirName.split('_')[1])
             objStore['GC_WORKDIR'] = self._extWorkDir
             log.finish()
             if self._selector and self._selector(metadata['GC_JOBNUM'],
                                                  None):
                 yield (os.path.join(self._extOutputDir, dirName), metadata,
                        events, seList, objStore)
         except Exception:
             pass
         log.finish()
Example #26
	def _getJobsOutput(self, ids):
		if not len(ids):
			raise StopIteration

		activity = utils.ActivityLog('retrieving job outputs')
		for wmsId, jobNum in ids:
			path = self._getSandbox(wmsId)
			if path is None:
				yield (jobNum, None)
				continue

			# Cleanup sandbox
			outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
			utils.removeFiles(ifilter(lambda x: x not in outFiles, imap(lambda fn: os.path.join(path, fn), os.listdir(path))))

			yield (jobNum, path)
		del activity
Example #27
	def checkJobs(self, wmsJobIdList):
		if not len(wmsJobIdList):
			raise StopIteration
		activity = utils.ActivityLog('Checking jobs...')
		foreignJobs = lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(), self._splitGcRequests(wmsJobIdList))
		assert not foreignJobs, 'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (foreignJobs, self._schedd.getURI())
		rawJobInfoMaps = self._schedd.checkJobs(
			self._splitGcRequests(wmsJobIdList),
			self._getQueryArgs()
			)
		# Yield (jobNum, wmsId, state, other data) per active jobs
		jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
		for htcID in jobInfoMaps:
			yield (
				htcID.gcJobNum,
				self._createGcId(htcID),
				self._statusMap[jobInfoMaps[htcID]['state']][0],
				jobInfoMaps[htcID]
				)
		del(activity)
Example #28
	def __init__(self, path):
		activity = utils.ActivityLog('Reading dataset partition file')
		self._lock = GCLock()
		self._fmt = utils.DictFormat()
		self._tar = tarfile.open(path, 'r:')
		(self._cacheKey, self._cacheTar) = (None, None)

		metadata = self._fmt.parse(self._tar.extractfile('Metadata').readlines(), keyParser = {None: str})
		self.maxJobs = metadata.pop('MaxJobs')
		self.classname = metadata.pop('ClassName')
		self.metadata = {'dataset': dict(ifilter(lambda k_v: not k_v[0].startswith('['), metadata.items()))}
		for (k, v) in ifilter(lambda k_v: k_v[0].startswith('['), metadata.items()):
			self.metadata.setdefault('dataset %s' % k.split(']')[0].lstrip('['), {})[k.split(']')[1].strip()] = v
		activity.finish()

		self._parserMap = { None: str, DataSplitter.NEntries: int, DataSplitter.Skipped: int,
			DataSplitter.DatasetID: int, DataSplitter.Invalid: parseBool,
			DataSplitter.Locations: lambda x: parseList(x, ','),
			DataSplitter.MetadataHeader: parseJSON,
			DataSplitter.Metadata: lambda x: parseJSON(x.strip("'")) }
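The constructor extracts a single 'Metadata' member from the partition tarball and feeds its lines to grid-control's DictFormat parser. Stripped of that parser, the tar access itself is ordinary standard-library tarfile usage; a sketch (the member name is taken from the example, everything else is illustrative):

import tarfile

def read_member_lines(path, member='Metadata'):
    """Return the decoded text lines of one regular member of a tar archive."""
    with tarfile.open(path, 'r:') as tar:
        fileobj = tar.extractfile(member)  # raises KeyError if the member is missing
        try:
            return [line.decode('utf-8').rstrip('\n') for line in fileobj.readlines()]
        finally:
            fileobj.close()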
Example #29
	def matchSites(self, endpoint):
		log = utils.ActivityLog('Discovering available WMS services - testing %s' % endpoint)
		checkArgs = ['-a']
		if endpoint:
			checkArgs.extend(['-e', endpoint])
		checkArgs.append(utils.pathShare('null.jdl'))

		proc = LocalProcess(self._exeGliteWMSJobListMatch, *checkArgs)
		result = []
		for line in proc.stdout.iter(timeout = 3):
			if line.startswith(' - '):
				result.append(line[3:].strip())
		if proc.status(timeout = 0) is None:
			self.wms_timeout[endpoint] = self.wms_timeout.get(endpoint, 0) + 1
			if self.wms_timeout.get(endpoint, 0) > 10: # remove endpoints after 10 failures
				self.wms_all.remove(endpoint)
			log.finish()
			return []
		log.finish()
		return result
Example #30
	def cancelJobs(self, allIds):
		if len(allIds) == 0:
			raise StopIteration

		waitFlag = False
		for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
			# Delete jobs in chunks of self._nJobsPerChunk - with 5 seconds between chunks
			if waitFlag and not utils.wait(5):
				break
			waitFlag = True

			jobNumMap = dict(ids)
			jobs = ' '.join(self._getRawIDs(ids))
			log = tempfile.mktemp('.log')

			activity = utils.ActivityLog('cancelling jobs')
			proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" %s' % (log, jobs))
			retCode = proc.wait()
			del activity

			# select cancelled jobs
			for rawId in self._getRawIDs(ids):
				deletedWMSId = self._createId(rawId)
				yield (jobNumMap.get(deletedWMSId), deletedWMSId)

			if retCode != 0:
				if self.explainError(proc, retCode):
					pass
				else:
					proc.logError(self.errorLog, log = log)
		
			purgeLog = tempfile.mktemp('.log')
			purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, jobs))
			retCode = purgeProc.wait()
			if retCode != 0:
				if self.explainError(purgeProc, retCode):
					pass
				else:
					purgeProc.logError(self.errorLog, log = purgeLog, jobs = jobs)
			
			utils.removeFiles([log, purgeLog])