Example #1
0
	def _submitJob(self, jobNum, module):
		fd, jdl = tempfile.mkstemp('.jdl')
		try:
			data = self.makeJDL(jobNum, module)
			utils.safeWrite(os.fdopen(fd, 'w'), data)
		except Exception:
			utils.removeFiles([jdl])
			raise BackendError('Could not write jdl data to %s.' % jdl)

		log = tempfile.mktemp('.log') # created before the try block so the finally clause can always remove it
		try:
			tmp = utils.filterDict(self._submitParams, vF = lambda v: v)
			params = str.join(' ', map(lambda kv: '%s %s' % kv, tmp.items()))

			activity = utils.ActivityLog('submitting jobs')
			proc = utils.LoggedProcess(self._submitExec, '%s --nomsg --noint --logfile "%s" "%s"' % (params, log, jdl))

			wmsId = None
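			# the submission tool is expected to print the assigned WMS job URL as a line starting with 'http'; keep the last such line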
			for line in filter(lambda x: x.startswith('http'), map(str.strip, proc.iter())):
				wmsId = line
			retCode = proc.wait()
			del activity

			if (retCode != 0) or (wmsId is None):
				if self.explainError(proc, retCode):
					pass
				else:
					proc.logError(self.errorLog, log = log, jdl = jdl)
		finally:
			utils.removeFiles([log, jdl])
		return (jobNum, utils.QM(wmsId, self._createId(wmsId), None), {'jdl': str.join('', data)})
Example #2
0
	def _submitJob(self, jobNum, module):
		fd, jdl = tempfile.mkstemp('.jdl')
		try:
			jdlData = self.makeJDL(jobNum, module)
			utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
		except Exception:
			utils.removeFiles([jdl])
			raise BackendError('Could not write jdl data to %s.' % jdl)

		try:
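			# flatten the non-empty submission parameters into alternating key/value arguments and append the JDL path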
			submitArgs = []
			for key_value in utils.filterDict(self._submitParams, vF = lambda v: v).items():
				submitArgs.extend(key_value)
			submitArgs.append(jdl)

			activity = Activity('submitting job %d' % jobNum)
			proc = LocalProcess(self._submitExec, '--nomsg', '--noint', '--logfile', '/dev/stderr', *submitArgs)

			gcID = None
			for line in ifilter(lambda x: x.startswith('http'), imap(str.strip, proc.stdout.iter(timeout = 60))):
				gcID = line
			retCode = proc.status(timeout = 0, terminate = True)

			activity.finish()

			if (retCode != 0) or (gcID is None):
				if self.explainError(proc, retCode):
					pass
				else:
					self._log.log_process(proc, files = {'jdl': SafeFile(jdl).read()})
		finally:
			utils.removeFiles([jdl])
		return (jobNum, utils.QM(gcID, self._createId(gcID), None), {'jdl': str.join('', jdlData)})
Example #3
0
	def cancelJobs(self, allIds):
		if len(allIds) == 0:
			raise StopIteration

		waitFlag = False
		for ids in map(lambda x: allIds[x:x+5], range(0, len(allIds), 5)):
			# Delete jobs in groups of 5 - with 5 seconds between groups
			if waitFlag and not utils.wait(5):
				break
			waitFlag = True

			jobNumMap = dict(ids)
			jobs = self.writeWMSIds(ids)
			log = tempfile.mktemp('.log')

			activity = utils.ActivityLog('cancelling jobs')
			proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" -i "%s"' % (log, jobs))
			retCode = proc.wait()
			del activity

			# select cancelled jobs
			for deletedWMSId in filter(lambda x: x.startswith('- '), proc.iter()):
				deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
				yield (jobNumMap.get(deletedWMSId), deletedWMSId)

			if retCode != 0:
				if self.explainError(proc, retCode):
					pass
				else:
					proc.logError(self.errorLog, log = log)
			utils.removeFiles([log, jobs])
Example #4
0
	def checkJobs(self, ids):
		if len(ids) == 0:
			raise StopIteration

		jobNumMap = dict(ids)
		jobs = ' '.join(self._getRawIDs(ids))
		log = tempfile.mktemp('.log')

		activity = utils.ActivityLog('checking job status')
		proc = utils.LoggedProcess(self._statusExec, '--level 0 --logfile "%s" %s' % (log, jobs))
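		# the status output is expected to contain one block per job, separated by '******'; parse each block with the level-0 regexes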
		for jobOutput in proc.getOutput().split('******')[1:]:
			data = {}
			for statusRegexLevel0 in self._statusRegexLevel0:
				match = re.match(statusRegexLevel0, jobOutput.replace('\n', ' '))
				if match:
					data = match.groupdict()
					break
			data['id'] = self._createId(data['rawId'])
			yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data.get('status', 'DONE-FAILED')], data)
		
		retCode = proc.wait()
		del activity

		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log, jobs = jobs)
		
		utils.removeFiles([log])
Example #5
0
	def cancelJobs(self, allIds):
		if len(allIds) == 0:
			raise StopIteration

		waitFlag = False
		for ids in imap(lambda x: allIds[x:x+5], irange(0, len(allIds), 5)):
			# Delete jobs in groups of 5 - with 5 seconds between groups
			if waitFlag and not utils.wait(5):
				break
			waitFlag = True

			jobNumMap = dict(ids)
			jobs = self.writeWMSIds(ids)

			activity = utils.ActivityLog('cancelling jobs')
			proc = LocalProcess(self._cancelExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs)
			retCode = proc.status(timeout = 60, terminate = True)
			del activity

			# select cancelled jobs
			for deletedWMSId in ifilter(lambda x: x.startswith('- '), proc.stdout.iter()):
				deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
				yield (jobNumMap.get(deletedWMSId), deletedWMSId)

			if retCode != 0:
				if self.explainError(proc, retCode):
					pass
				else:
					self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
			utils.removeFiles([jobs])
Example #6
0
    def cancelJobs(self, allIds):
        if len(allIds) == 0:
            raise StopIteration

        waitFlag = False
        for ids in imap(lambda x: allIds[x:x + 5], irange(0, len(allIds), 5)):
            # Delete jobs in groups of 5 - with 5 seconds between groups
            if waitFlag and not utils.wait(5):
                break
            waitFlag = True

            jobNumMap = dict(ids)
            jobs = self.writeWMSIds(ids)

            activity = utils.ActivityLog('cancelling jobs')
            proc = LocalProcess(self._cancelExec, '--noint', '--logfile',
                                '/dev/stderr', '-i', jobs)
            retCode = proc.status(timeout=60, terminate=True)
            del activity

            # select cancelled jobs
            for deletedWMSId in ifilter(lambda x: x.startswith('- '),
                                        proc.stdout.iter()):
                deletedWMSId = self._createId(deletedWMSId.strip('- \n'))
                yield (jobNumMap.get(deletedWMSId), deletedWMSId)

            if retCode != 0:
                if self.explainError(proc, retCode):
                    pass
                else:
                    self._log.log_process(proc,
                                          files={'jobs': utils.safeRead(jobs)})
            utils.removeFiles([jobs])
Example #7
0
	def _getJobsOutput(self, ids):
		if len(ids) == 0:
			raise StopIteration

		basePath = os.path.join(self._outputPath, 'tmp')
		try:
			if len(ids) == 1:
				# For single jobs create single subdir
				tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
			else:
				tmpPath = basePath
			utils.ensureDirExists(tmpPath)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)

		activity = Activity('retrieving %d job outputs' % len(ids))
		proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmpPath)

		# yield output dirs
		todo = list(jobNumMap.values())
		currentJobNum = None
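		# the output tool is expected to announce each job id on one line and print the retrieved output directory (below tmpPath) on a later line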
		for line in imap(str.strip, proc.stdout.iter(timeout = 60)):
			if line.startswith(tmpPath):
				todo.remove(currentJobNum)
				outputDir = line.strip()
				if os.path.exists(outputDir):
					if 'GC_WC.tar.gz' in os.listdir(outputDir):
						wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
						try:
							tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
							os.unlink(wildcardTar)
						except Exception:
							self._log.error('Can\'t unpack output files contained in %s', wildcardTar)
				yield (currentJobNum, line.strip())
				currentJobNum = None
			else:
				currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
		retCode = proc.status(timeout = 0, terminate = True)
		activity.finish()

		if retCode != 0:
			if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout = 0):
				utils.removeFiles([jobs, basePath])
				raise StopIteration
			else:
				self._log.log_process(proc, files = {'jobs': SafeFile(jobs).read()})
			self._log.error('Trying to recover from error ...')
			for dirName in os.listdir(basePath):
				yield (None, os.path.join(basePath, dirName))

		# return unretrievable jobs
		for jobNum in todo:
			yield (jobNum, None)

		utils.removeFiles([jobs, basePath])
Example #8
0
 def _readJobs(self, jobLimit):
     jobMap = {}
     maxJobs = 0
     if os.path.exists(self._dbFile):
         try:
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
         except Exception:  # Try to recover job archive
             self._log.warning(
                 '=' * 40 +
                 '\nStarting recovery of broken job database => Answer "y" if asked "Is this a single-disk archive?"!\n'
                 + '=' * 40)
             os.system('zip -FF %s --out %s.tmp 2> /dev/null' %
                       (self._dbFile, self._dbFile))
             os.rename(self._dbFile, self._dbFile + '.broken')
             os.rename(self._dbFile + '.tmp', self._dbFile)
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
             removeFiles([self._dbFile + '.broken'])
             brokenList = []
             for idx, fnTarInfo in enumerate(tar.namelist()):
                 (jobNum, tid) = tuple(
                     imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                 try:
                     fp = tar.open(fnTarInfo)
                     try:
                         fp.read()
                     finally:
                         fp.close()
                 except Exception:
                     brokenList.append(fnTarInfo)
                     clear_current_exception()
             for broken in brokenList:
                 os.system('zip %s -d %s' % (self._dbFile, broken))
             self._log.info('Recover completed!')
         activity = Activity('Reading job transactions')
         maxJobs = len(tar.namelist())
         tMap = {}
         for idx, fnTarInfo in enumerate(tar.namelist()):
             (jobNum, tid) = tuple(
                 imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
             if tid < tMap.get(jobNum, 0):
                 continue
             try:
                 data = self._fmt.parse(tar.open(fnTarInfo).read())
             except Exception:
                 continue
             jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
             tMap[jobNum] = tid
             if idx % 100 == 0:
                 activity.update('Reading job transactions %d [%d%%]' %
                                 (idx, (100.0 * idx) / maxJobs))
         activity.finish()
     self._serial = maxJobs
     return jobMap
Example #9
0
	def image(self):
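		# build a graphviz DOT description of the splitter graph and render it to PNG with 'neato'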
		cherrypy.response.headers['Content-Type'] = 'image/png'
		nodes = ["MetadataSplitter", "RunSplitter"]
		edges = [("MetadataSplitter", "RunSplitter")]
		nodeStr = str.join('', map(lambda x: '%s [label="%s", fillcolor="/set312/1", style="filled"]\n' % (x, x), nodes))
		edgeStr = str.join('; ', map(lambda x: '%s -> %s' % x, edges))
		inp = "digraph mygraph { overlap=False; ranksep=1.5; %s; %s; }" % (nodeStr, edgeStr)

		fd, fn = tempfile.mkstemp()
		fp = os.fdopen(fd, 'w')
		fp.write(inp)
		fp.close() # ensure the graph description is flushed to disk before neato reads it
		proc = utils.LoggedProcess('neato', '%s -Tpng' % fn)
		result = proc.getOutput()
		utils.removeFiles([fn])
		return result
Example #10
0
	def __init__(self, config, jobLimit = -1, jobSelector = None):
		dbPath = config.getWorkPath('jobs')
		self._dbFile = config.getWorkPath('jobs.zip')
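		# migrate an existing text-file job database to the zip format if no zip file exists yet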
		if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
			activity = Activity('Converting job database')
			self._serial = 0
			try:
				oldDB = TextFileJobDB(config)
				for jobNum in oldDB.getJobs():
					self.commit(jobNum, oldDB.get(jobNum))
			except Exception:
				removeFiles([self._dbFile])
				raise
			activity.finish()
		ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #11
0
 def __init__(self, config, jobLimit=-1, jobSelector=None):
     dbPath = config.getWorkPath('jobs')
     self._dbFile = config.getWorkPath('jobs.zip')
     if os.path.exists(dbPath) and os.path.isdir(
             dbPath) and not os.path.exists(self._dbFile):
         activity = Activity('Converting job database')
         self._serial = 0
         try:
             oldDB = TextFileJobDB(config)
             for jobNum in oldDB.getJobs():
                 self.commit(jobNum, oldDB.get(jobNum))
         except Exception:
             removeFiles([self._dbFile])
             raise
         activity.finish()
     ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #12
0
	def _getJobsOutput(self, ids):
		if not len(ids):
			raise StopIteration

		activity = Activity('retrieving %d job outputs' % len(ids))
		for gcID, jobNum in ids:
			path = self._sandbox_helper.get_sandbox(gcID)
			if path is None:
				yield (jobNum, None)
				continue

			# Cleanup sandbox
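			# keep only files matching the configured output patterns and remove everything else from the sandbox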
			outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
			utils.removeFiles(ifilter(lambda x: x not in outFiles, imap(lambda fn: os.path.join(path, fn), os.listdir(path))))

			yield (jobNum, path)
		activity.finish()
Example #13
0
	def _getJobsOutput(self, ids):
		if not len(ids):
			raise StopIteration

		activity = utils.ActivityLog('retrieving job outputs')
		for wmsId, jobNum in ids:
			path = self._getSandbox(wmsId)
			if path is None:
				yield (jobNum, None)
				continue

			# Cleanup sandbox
			outFiles = utils.listMapReduce(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles)
			utils.removeFiles(filter(lambda x: x not in outFiles, map(lambda fn: os.path.join(path, fn), os.listdir(path))))

			yield (jobNum, path)
		del activity
Example #14
0
    def __init__(self, config, jobLimit=-1, jobSelector=None):
        dbPath = config.getWorkPath("jobs")
        self._dbFile = config.getWorkPath("jobs.zip")
        if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
            log = utils.ActivityLog("Converting job database...")
            self._serial = 0
            try:
                oldDB = JobDB(config)
                oldDB.readJobs(-1)
                for jobNum in oldDB.getJobs():
                    self.commit(jobNum, oldDB.get(jobNum))
            except Exception:
                utils.removeFiles([self._dbFile])
                raise
            del log

        ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #15
0
	def _getJobsOutput(self, ids):
		if not len(ids):
			raise StopIteration

		activity = utils.ActivityLog('retrieving job outputs')
		for wmsId, jobNum in ids:
			path = self._getSandbox(wmsId)
			if path is None:
				yield (jobNum, None)
				continue

			# Cleanup sandbox
			outFiles = lchain(imap(lambda pat: glob.glob(os.path.join(path, pat)), self.outputFiles))
			utils.removeFiles(ifilter(lambda x: x not in outFiles, imap(lambda fn: os.path.join(path, fn), os.listdir(path))))

			yield (jobNum, path)
		del activity
Example #16
0
	def _readJobs(self, jobLimit):
		jobMap = {}
		maxJobs = 0
		if os.path.exists(self._dbFile):
			try:
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			except Exception: # Try to recover job archive
				self._log.warning('=' * 40 + '\nStarting recovery of broken job database => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
				os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
				os.rename(self._dbFile, self._dbFile + '.broken')
				os.rename(self._dbFile + '.tmp', self._dbFile)
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
				removeFiles([self._dbFile + '.broken'])
				brokenList = []
				for idx, fnTarInfo in enumerate(tar.namelist()):
					(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
					try:
						fp = tar.open(fnTarInfo)
						try:
							fp.read()
						finally:
							fp.close()
					except Exception:
						brokenList.append(fnTarInfo)
						clear_current_exception()
				for broken in brokenList:
					os.system('zip %s -d %s' % (self._dbFile, broken))
				self._log.info('Recover completed!')
			activity = Activity('Reading job transactions')
			maxJobs = len(tar.namelist())
			tMap = {}
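			# tMap tracks the highest transaction id seen per job, so only the newest entry for each job is kept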
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				if tid < tMap.get(jobNum, 0):
					continue
				try:
					data = self._fmt.parse(tar.open(fnTarInfo).read())
				except Exception:
					continue
				jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
				tMap[jobNum] = tid
				if idx % 100 == 0:
					activity.update('Reading job transactions %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
			activity.finish()
		self._serial = maxJobs
		return jobMap
Example #17
0
	def readJobs(self, jobLimit):
		jobMap = {}
		maxJobs = 0
		if os.path.exists(self._dbFile):
			try:
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
			except Exception: # Try to recover job archive
				utils.eprint('=' * 40 + '\nStarting recovery of broken job database')
				utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + '=' * 40)
				os.system('zip -FF %s --out %s.tmp 2> /dev/null' % (self._dbFile, self._dbFile))
				os.rename(self._dbFile, self._dbFile + '.broken')
				os.rename(self._dbFile + '.tmp', self._dbFile)
				tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
				utils.removeFiles([self._dbFile + '.broken'])
				brokenList = []
				for idx, fnTarInfo in enumerate(tar.namelist()):
					(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
					try:
						fp = tar.open(fnTarInfo)
						try:
							fp.read()
						finally:
							fp.close()
					except Exception:
						brokenList.append(fnTarInfo)
				for broken in brokenList:
					os.system('zip %s -d %s' % (self._dbFile, broken))
				utils.eprint('Recover completed!')
			activity = utils.ActivityLog('Reading job transactions ...')
			maxJobs = len(tar.namelist())
			tMap = {}
			for idx, fnTarInfo in enumerate(tar.namelist()):
				(jobNum, tid) = tuple(imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
				if tid < tMap.get(jobNum, 0):
					continue
				data = utils.DictFormat(escapeString = True).parse(tar.open(fnTarInfo).read())
				jobMap[jobNum] = Job.loadData(fnTarInfo, data)
				tMap[jobNum] = tid
				if idx % 100 == 0:
					activity.finish()
					activity = utils.ActivityLog('Reading job transactions ... %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))

		self._serial = maxJobs
		return jobMap
Example #18
0
	def cancelJobs(self, allIds):
		if len(allIds) == 0:
			raise StopIteration

		waitFlag = False
		for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
			# Delete jobs in chunks of self._nJobsPerChunk - with 5 seconds between chunks
			if waitFlag and not utils.wait(5):
				break
			waitFlag = True

			jobNumMap = dict(ids)
			jobs = ' '.join(self._getRawIDs(ids))
			log = tempfile.mktemp('.log')

			activity = utils.ActivityLog('cancelling jobs')
			proc = utils.LoggedProcess(self._cancelExec, '--noint --logfile "%s" %s' % (log, jobs))
			retCode = proc.wait()
			del activity

			# select cancelled jobs
			for rawId in self._getRawIDs(ids):
				deletedWMSId = self._createId(rawId)
				yield (jobNumMap.get(deletedWMSId), deletedWMSId)

			if retCode != 0:
				if self.explainError(proc, retCode):
					pass
				else:
					proc.logError(self.errorLog, log = log)
		
			purgeLog = tempfile.mktemp('.log')
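			# afterwards purge the cancelled jobs from the WMS bookkeeping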
			purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, jobs))
			retCode = purgeProc.wait()
			if retCode != 0:
				if self.explainError(purgeProc, retCode):
					pass
				else:
					purgeProc.logError(self.errorLog, log = purgeLog, jobs = jobs)
			
			utils.removeFiles([log, purgeLog])
Example #19
0
	def checkJobs(self, ids):
		if len(ids) == 0:
			raise StopIteration

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)

		activity = utils.ActivityLog('checking job status')
		proc = LocalProcess(self._statusExec, '--verbosity', 1, '--noint', '--logfile', '/dev/stderr', '-i', jobs)
		for data in self._parseStatus(proc.stdout.iter(timeout = 60)):
			data['id'] = self._createId(data['id'])
			yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data['status']], data)
		retCode = proc.status(timeout = 0, terminate = True)
		del activity

		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				self._log.log_process(proc, files = {'jobs': utils.safeRead(jobs)})
		utils.removeFiles([jobs])
Example #20
0
    def readJobs(self, jobLimit):
        jobMap = {}
        maxJobs = 0
        if os.path.exists(self._dbFile):
            try:
                tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED)
            except Exception:  # Try to recover job archive
                utils.eprint("=" * 40 + "\nStarting recovery of broken job database")
                utils.eprint(' => Answer "y" if asked "Is this a single-disk archive?"!\n' + "=" * 40)
                os.system("zip -FF %s --out %s.tmp 2> /dev/null" % (self._dbFile, self._dbFile))
                os.rename(self._dbFile, self._dbFile + ".broken")
                os.rename(self._dbFile + ".tmp", self._dbFile)
                tar = zipfile.ZipFile(self._dbFile, "r", zipfile.ZIP_DEFLATED)
                utils.removeFiles([self._dbFile + ".broken"])
                brokenList = []
                for idx, fnTarInfo in enumerate(tar.namelist()):
                    (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1)))
                    try:
                        rawData = tar.open(fnTarInfo).read()
                    except Exception:
                        brokenList.append(fnTarInfo)
                for broken in brokenList:
                    os.system("zip %s -d %s" % (self._dbFile, broken))
                utils.eprint("Recover completed!")
            log = None
            maxJobs = len(tar.namelist())
            tMap = {}
            for idx, fnTarInfo in enumerate(tar.namelist()):
                (jobNum, tid) = tuple(map(lambda s: int(s[1:]), fnTarInfo.split("_", 1)))
                if tid < tMap.get(jobNum, 0):
                    continue
                data = utils.DictFormat(escapeString=True).parse(tar.open(fnTarInfo).read())
                jobMap[jobNum] = Job.loadData(fnTarInfo, data)
                tMap[jobNum] = tid
                if idx % 100 == 0:
                    del log
                    log = utils.ActivityLog("Reading job transactions ... %d [%d%%]" % (idx, (100.0 * idx) / maxJobs))

        self._serial = maxJobs
        return jobMap
Example #21
0
	def checkJobs(self, ids):
		if len(ids) == 0:
			raise StopIteration

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)
		log = tempfile.mktemp('.log')

		activity = utils.ActivityLog('checking job status')
		proc = utils.LoggedProcess(self._statusExec, '--verbosity 1 --noint --logfile "%s" -i "%s"' % (log, jobs))
		for data in self._parseStatus(proc.iter()):
			data['id'] = self._createId(data['id'])
			yield (jobNumMap.get(data['id']), data['id'], self._statusMap[data['status']], data)
		retCode = proc.wait()
		del activity

		if retCode != 0:
			if self.explainError(proc, retCode):
				pass
			else:
				proc.logError(self.errorLog, log = log, jobs = jobs)
		utils.removeFiles([log, jobs])
Example #22
0
    def _submitJob(self, jobNum, module):
        fd, jdl = tempfile.mkstemp('.jdl')
        try:
            jdlData = self.makeJDL(jobNum, module)
            utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
        except Exception:
            utils.removeFiles([jdl])
            raise BackendError('Could not write jdl data to %s.' % jdl)

        try:
            submitArgs = []
            for key_value in utils.filterDict(self._submitParams,
                                              vF=lambda v: v).items():
                submitArgs.extend(key_value)
            submitArgs.append(jdl)

            activity = Activity('submitting job %d' % jobNum)
            proc = LocalProcess(self._submitExec, '--nomsg', '--noint',
                                '--logfile', '/dev/stderr', *submitArgs)

            gcID = None
            for line in ifilter(lambda x: x.startswith('http'),
                                imap(str.strip, proc.stdout.iter(timeout=60))):
                gcID = line
            retCode = proc.status(timeout=0, terminate=True)

            activity.finish()

            if (retCode != 0) or (gcID is None):
                if self.explainError(proc, retCode):
                    pass
                else:
                    self._log.log_process(proc,
                                          files={'jdl': SafeFile(jdl).read()})
        finally:
            utils.removeFiles([jdl])
        return (jobNum, utils.QM(gcID, self._createId(gcID), None), {
            'jdl': str.join('', jdlData)
        })
Example #23
0
	def bulkSubmissionBegin(self):
		self._submitParams.update({ '-d': None })
		if self._discovery_module:
			self._submitParams.update({ '-e': self._discovery_module.getWMS() })
		if not self._useDelegate:
			self._submitParams.update({ '-a': ' ' })
			return True
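		# create a delegation id and register the proxy with the WMS; on success, submissions reuse it via the '-d' parameter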
		log = tempfile.mktemp('.log')
		try:
			dID = 'GCD' + md5(str(time.time())).hexdigest()[:10]
			activity = utils.ActivityLog('creating delegate proxy for job submission')
			proc = utils.LoggedProcess(self._delegateExec, '%s -d %s --noint --logfile "%s"' %
				(utils.QM(self._configVO, '--config "%s"' % self._configVO, ''), dID, log))

			output = proc.getOutput(wait = True)
			if ('glite-wms-job-delegate-proxy Success' in output) and (dID in output):
				self._submitParams.update({ '-d': dID })
			del activity

			if proc.wait() != 0:
				proc.logError(self.errorLog, log = log)
			return (self._submitParams.get('-d', None) is not None)
		finally:
			utils.removeFiles([log])
Example #24
0
    def checkJobs(self, ids):
        if len(ids) == 0:
            raise StopIteration

        jobNumMap = dict(ids)
        jobs = self.writeWMSIds(ids)

        activity = utils.ActivityLog('checking job status')
        proc = LocalProcess(self._statusExec, '--verbosity', 1, '--noint',
                            '--logfile', '/dev/stderr', '-i', jobs)
        for data in self._parseStatus(proc.stdout.iter(timeout=60)):
            data['id'] = self._createId(data['id'])
            yield (jobNumMap.get(data['id']), data['id'],
                   self._statusMap[data['status']], data)
        retCode = proc.status(timeout=0, terminate=True)
        del activity

        if retCode != 0:
            if self.explainError(proc, retCode):
                pass
            else:
                self._log.log_process(proc,
                                      files={'jobs': utils.safeRead(jobs)})
        utils.removeFiles([jobs])
Example #25
0
	def _getJobsOutput(self, ids):
		if len(ids) == 0:
			raise StopIteration

		basePath = os.path.join(self._outputPath, 'tmp')
		try:
			if len(ids) == 1:
				# For single jobs create single subdir
				tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
			else:
				tmpPath = basePath
			utils.ensureDirExists(tmpPath)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmpPath, RuntimeError)

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)
		log = tempfile.mktemp('.log')

		activity = utils.ActivityLog('retrieving job outputs')
		proc = utils.LoggedProcess(self._outputExec,
			'--noint --logfile "%s" -i "%s" --dir "%s"' % (log, jobs, tmpPath))

		# yield output dirs
		todo = jobNumMap.values()
		currentJobNum = None
		for line in map(str.strip, proc.iter()):
			if line.startswith(tmpPath):
				todo.remove(currentJobNum)
				outputDir = line.strip()
				if os.path.exists(outputDir):
					if 'GC_WC.tar.gz' in os.listdir(outputDir):
						wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
						try:
							tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
							os.unlink(wildcardTar)
						except Exception:
							utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
				yield (currentJobNum, line.strip())
				currentJobNum = None
			else:
				currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
		retCode = proc.wait()
		del activity

		if retCode != 0:
			if 'Keyboard interrupt raised by user' in proc.getError():
				utils.removeFiles([log, jobs, basePath])
				raise StopIteration
			else:
				proc.logError(self.errorLog, log = log)
			utils.eprint('Trying to recover from error ...')
			for dirName in os.listdir(basePath):
				yield (None, os.path.join(basePath, dirName))

		# return unretrievable jobs
		for jobNum in todo:
			yield (jobNum, None)

		utils.removeFiles([log, jobs, basePath])
Example #26
0
	def _getJobsOutput(self, allIds):
		if len(allIds) == 0:
			raise StopIteration

		basePath = os.path.join(self._outputPath, 'tmp')
		try:
			if len(allIds) == 1:
				# For single jobs create single subdir
				basePath = os.path.join(basePath, md5(allIds[0][0]).hexdigest())
			utils.ensureDirExists(basePath)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % basePath, BackendError)
		
		activity = utils.ActivityLog('retrieving job outputs')
		for ids in imap(lambda x: allIds[x:x+self._nJobsPerChunk], irange(0, len(allIds), self._nJobsPerChunk)):
			jobNumMap = dict(ids)
			jobs = ' '.join(self._getRawIDs(ids))
			log = tempfile.mktemp('.log')

			proc = utils.LoggedProcess(self._outputExec,
				'--noint --logfile "%s" --dir "%s" %s' % (log, basePath, jobs))

			# yield output dirs
			todo = jobNumMap.values()
			done = []
			currentJobNum = None
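			# match each output line against the configured regex to extract the raw job id and its output directory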
			for line in imap(str.strip, proc.iter()):
				match = re.match(self._outputRegex, line)
				if match:
					currentJobNum = jobNumMap.get(self._createId(match.groupdict()['rawId']))
					todo.remove(currentJobNum)
					done.append(match.groupdict()['rawId'])
					outputDir = match.groupdict()['outputDir']
					if os.path.exists(outputDir):
						if 'GC_WC.tar.gz' in os.listdir(outputDir):
							wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
							try:
								tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
								os.unlink(wildcardTar)
							except Exception:
								utils.eprint("Can't unpack output files contained in %s" % wildcardTar)
					yield (currentJobNum, outputDir)
					currentJobNum = None
			retCode = proc.wait()

			if retCode != 0:
				if 'Keyboard interrupt raised by user' in proc.getError():
					utils.removeFiles([log, basePath])
					raise StopIteration
				else:
					proc.logError(self.errorLog, log = log)
				utils.eprint('Trying to recover from error ...')
				for dirName in os.listdir(basePath):
					yield (None, os.path.join(basePath, dirName))
		del activity

		# return unretrievable jobs
		for jobNum in todo:
			yield (jobNum, None)
		
		purgeLog = tempfile.mktemp('.log')
		purgeProc = utils.LoggedProcess(self._purgeExec, '--noint --logfile "%s" %s' % (purgeLog, " ".join(done)))
		retCode = purgeProc.wait()
		if retCode != 0:
			if self.explainError(purgeProc, retCode):
				pass
			else:
				purgeProc.logError(self.errorLog, log = purgeLog, jobs = done)
		utils.removeFiles([log, purgeLog, basePath])
Example #27
0
    def _getJobsOutput(self, ids):
        if len(ids) == 0:
            raise StopIteration

        basePath = os.path.join(self._outputPath, 'tmp')
        try:
            if len(ids) == 1:
                # For single jobs create single subdir
                tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
            else:
                tmpPath = basePath
            utils.ensureDirExists(tmpPath)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmpPath,
                BackendError)

        jobNumMap = dict(ids)
        jobs = self.writeWMSIds(ids)

        activity = Activity('retrieving %d job outputs' % len(ids))
        proc = LocalProcess(self._outputExec, '--noint', '--logfile',
                            '/dev/stderr', '-i', jobs, '--dir', tmpPath)

        # yield output dirs
        todo = list(jobNumMap.values())
        currentJobNum = None
        for line in imap(str.strip, proc.stdout.iter(timeout=60)):
            if line.startswith(tmpPath):
                todo.remove(currentJobNum)
                outputDir = line.strip()
                if os.path.exists(outputDir):
                    if 'GC_WC.tar.gz' in os.listdir(outputDir):
                        wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                        try:
                            tarfile.TarFile.open(wildcardTar,
                                                 'r:gz').extractall(outputDir)
                            os.unlink(wildcardTar)
                        except Exception:
                            self._log.error(
                                'Can\'t unpack output files contained in %s',
                                wildcardTar)
                yield (currentJobNum, line.strip())
                currentJobNum = None
            else:
                currentJobNum = jobNumMap.get(self._createId(line),
                                              currentJobNum)
        retCode = proc.status(timeout=0, terminate=True)
        activity.finish()

        if retCode != 0:
            if 'Keyboard interrupt raised by user' in proc.stderr.read(
                    timeout=0):
                utils.removeFiles([jobs, basePath])
                raise StopIteration
            else:
                self._log.log_process(proc,
                                      files={'jobs': SafeFile(jobs).read()})
            self._log.error('Trying to recover from error ...')
            for dirName in os.listdir(basePath):
                yield (None, os.path.join(basePath, dirName))

        # return unretrievable jobs
        for jobNum in todo:
            yield (jobNum, None)

        utils.removeFiles([jobs, basePath])
Example #28
0
	def submitJobs(self, jobNumListFull, module):
		submitBatch = 25
		for index in range(0, len(jobNumListFull), submitBatch):
			jobNumList = jobNumListFull[index:index + submitBatch]
			self.debugOut("\nStarted submitting: %s" % jobNumList)
			self.debugPool()
			# get the full job config path and basename
			def _getJobCFG(jobNum):
				return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' % jobNum), 'job_%d.var' % jobNum
			activity = utils.ActivityLog('preparing jobs')
			# construct a temporary JDL for this batch of jobs
			jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
			jdlSubmitPath = jdlFilePath
			self.debugOut("Writing temporary jdl to: "+jdlSubmitPath)
			try:
				data = self.makeJDLdata(jobNumList, module)
				utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
			except Exception:
				utils.removeFiles([jdlFilePath])
				raise BackendError('Could not write jdl data to %s.' % jdlFilePath)

			# create the _jobconfig.sh file containing the actual data
			for jobNum in jobNumList:
				try:
					self._writeJobConfig(_getJobCFG(jobNum)[0], jobNum, module)
				except Exception:
					raise BackendError('Could not write _jobconfig data for %s.' % jobNum)

			self.debugOut("Copying to remote")
			# copy infiles to ssh/gsissh remote pool if required
			if self.remoteType == poolType.SSH or self.remoteType == poolType.GSISSH:
				activity = utils.ActivityLog('preparing remote scheduler')
				self.debugOut("Copying to sandbox")
				workdirBase = self.getWorkdirPath()
				# TODO: check whether shared remote files already exist and copy otherwise
				for fileDescr, fileSource, fileTarget in self._getSandboxFilesIn(module):
					copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(workdirBase, fileTarget))
					if copyProcess.wait() != 0:
						if self.explainError(copyProcess, copyProcess.wait()):
							pass
						else:
							copyProcess.logError(self.errorLog, brief=True)
					self.debugFlush()
				# copy job config files
				self.debugOut("Copying job configs")
				for jobNum in jobNumList:
					fileSource, fileTarget = _getJobCFG(jobNum)
					copyProcess = self.Pool.LoggedCopyToRemote(fileSource, os.path.join(self.getWorkdirPath(jobNum), fileTarget))
					if copyProcess.wait() != 0:
						if self.explainError(copyProcess, copyProcess.wait()):
							pass
						else:
							copyProcess.logError(self.errorLog, brief=True)
					self.debugFlush()
				# copy jdl
				self.debugOut("Copying jdl")
				jdlSubmitPath = os.path.join(workdirBase, os.path.basename(jdlFilePath))
				copyProcess = self.Pool.LoggedCopyToRemote(jdlFilePath, jdlSubmitPath )
				if copyProcess.wait() != 0:
					if self.explainError(copyProcess, copyProcess.wait()):
						pass
					else:
						copyProcess.logError(self.errorLog, brief=True)
				self.debugFlush()
				# copy proxy
				for authFile in self.proxy.getAuthFiles():
					self.debugOut("Copying proxy")
					copyProcess = self.Pool.LoggedCopyToRemote(authFile, os.path.join(self.getWorkdirPath(), os.path.basename(authFile)))
					if copyProcess.wait() != 0:
						if self.explainError(copyProcess, copyProcess.wait()):
							pass
						else:
							copyProcess.logError(self.errorLog, brief=True)
					self.debugFlush()


			self.debugOut("Starting jobs")
			try:
				# submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
				activity = utils.ActivityLog('queuing jobs at scheduler')
				proc = self.Pool.LoggedProcess(self.submitExec, ' -verbose %(JDL)s' % { "JDL": jdlSubmitPath })

				self.debugOut("AAAAA")
				# extract the Condor ID (WMS ID) of the jobs from output ClassAds
				wmsJobIdList = []
				for line in proc.iter():
					if "GridControl_GCIDtoWMSID" in line:
						GCWMSID = line.split('=')[1].strip(' "\n').split('@')
						GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
						# Condor creates a default job then overwrites settings on any subsequent job - i.e. skip every second, but better be sure
						if (not wmsJobIdList) or (GCID not in zip(*wmsJobIdList)[0]):
							wmsJobIdList.append((self._createId(WMSID), GCID))
					if "GridControl_GCtoWMSID" in line:
						self.debugOut("o : %s" % line)
						self.debugOut("o : %s" % wmsJobIdList)

				retCode = proc.wait()
				if (retCode != 0) or ( len(wmsJobIdList) < len(jobNumList) ):
					if self.explainError(proc, retCode):
						pass
					else:
						utils.eprint("Submitted %4d jobs of %4d expected" % (len(wmsJobIdList),len(jobNumList)))
						proc.logError(self.errorLog, jdl = jdlFilePath)
			finally:
				utils.removeFiles([jdlFilePath])
			self.debugOut("Done Submitting")

			# yield the (jobNum, WMS ID, other data) of each job successively
			for index in range(len(wmsJobIdList)):
				yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {} )
			self.debugOut("Yielded submitted job")
			self.debugFlush()
Example #29
0
    def submitJobs(self, jobNumListFull, module):
        submitBatch = 25
        for index in irange(0, len(jobNumListFull), submitBatch):
            jobNumList = jobNumListFull[index:index + submitBatch]
            self.debugOut("\nStarted submitting: %s" % jobNumList)
            self.debugPool()

            # get the full job config path and basename
            def _getJobCFG(jobNum):
                return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' %
                                    jobNum), 'job_%d.var' % jobNum

            activity = utils.ActivityLog('preparing jobs')
            # construct a temporary JDL for this batch of jobs
            jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
            jdlSubmitPath = jdlFilePath
            self.debugOut("Writing temporary jdl to: " + jdlSubmitPath)
            try:
                data = self.makeJDLdata(jobNumList, module)
                utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
            except Exception:
                utils.removeFiles([jdlFilePath])
                raise BackendError('Could not write jdl data to %s.' %
                                   jdlFilePath)

            # create the _jobconfig.sh file containing the actual data
            for jobNum in jobNumList:
                try:
                    self._writeJobConfig(
                        _getJobCFG(jobNum)[0], jobNum, module, {})
                except Exception:
                    raise BackendError(
                        'Could not write _jobconfig data for %s.' % jobNum)

            self.debugOut("Copying to remote")
            # copy infiles to ssh/gsissh remote pool if required
            if self.remoteType == PoolType.SSH or self.remoteType == PoolType.GSISSH:
                activity = utils.ActivityLog('preparing remote scheduler')
                self.debugOut("Copying to sandbox")
                workdirBase = self.getWorkdirPath()
                # TODO: check whether shared remote files already exist and copy otherwise
                for _, fileSource, fileTarget in self._getSandboxFilesIn(
                        module):
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        fileSource, os.path.join(workdirBase, fileTarget))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()
                # copy job config files
                self.debugOut("Copying job configs")
                for jobNum in jobNumList:
                    fileSource, fileTarget = _getJobCFG(jobNum)
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        fileSource,
                        os.path.join(self.getWorkdirPath(jobNum), fileTarget))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()
                # copy jdl
                self.debugOut("Copying jdl")
                jdlSubmitPath = os.path.join(workdirBase,
                                             os.path.basename(jdlFilePath))
                copyProcess = self.Pool.LoggedCopyToRemote(
                    jdlFilePath, jdlSubmitPath)
                if copyProcess.wait() != 0:
                    if self.explainError(copyProcess, copyProcess.wait()):
                        pass
                    else:
                        copyProcess.logError(self.errorLog, brief=True)
                self.debugFlush()
                # copy proxy
                for authFile in self._token.getAuthFiles():
                    self.debugOut("Copying proxy")
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        authFile,
                        os.path.join(self.getWorkdirPath(),
                                     os.path.basename(authFile)))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()

            self.debugOut("Starting jobs")
            try:
                # submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
                activity = utils.ActivityLog('queuing jobs at scheduler')
                proc = self.Pool.LoggedExecute(
                    self.submitExec,
                    ' -verbose %(JDL)s' % {"JDL": jdlSubmitPath})

                self.debugOut("AAAAA")
                # extract the Condor ID (WMS ID) of the jobs from output ClassAds
                wmsJobIdList = []
                for line in proc.iter():
                    if "GridControl_GCIDtoWMSID" in line:
                        GCWMSID = line.split('=')[1].strip(' "\n').split('@')
                        GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
                        # Condor creates a default job then overwrites settings on any subsequent job - i.e. skip every second, but better be sure
                        if (not wmsJobIdList) or (GCID not in lzip(
                                *wmsJobIdList)[0]):
                            wmsJobIdList.append((self._createId(WMSID), GCID))
                    if "GridControl_GCtoWMSID" in line:
                        self.debugOut("o : %s" % line)
                        self.debugOut("o : %s" % wmsJobIdList)

                retCode = proc.wait()
                activity.finish()
                if (retCode != 0) or (len(wmsJobIdList) < len(jobNumList)):
                    if self.explainError(proc, retCode):
                        pass
                    else:
                        utils.eprint("Submitted %4d jobs of %4d expected" %
                                     (len(wmsJobIdList), len(jobNumList)))
                        proc.logError(self.errorLog, jdl=jdlFilePath)
            finally:
                utils.removeFiles([jdlFilePath])
            self.debugOut("Done Submitting")

            # yield the (jobNum, WMS ID, other data) of each job successively
            for index in irange(len(wmsJobIdList)):
                yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {})
            self.debugOut("Yielded submitted job")
            self.debugFlush()