Example #1
	def _saveStateToTar(self, tar, meta, source, sourceLen, message):
		# Write the splitting info grouped into subtarfiles
		activity = Activity(message)
		(jobNum, lastValid, subTar) = (-1, -1, None)
		for jobNum, entry in enumerate(source):
			if not entry.get(DataSplitter.Invalid, False):
				lastValid = jobNum
			if jobNum % self._keySize == 0:
				self._closeSubTar(tar, subTar)
				subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / self._keySize))
				activity.update('%s [%d / %d]' % (message, jobNum, sourceLen))
			# Determine shortest way to store file list
			tmp = entry.pop(DataSplitter.FileList)
			savelist = self._getReducedFileList(entry, tmp) # can modify entry
			# Write files with infos / filelist
			data = str.join('', self._fmt.format(entry, fkt = self._formatFileEntry) + lmap(lambda fn: '=%s\n' % fn, savelist))
			self._addToSubTar(subTar, '%05d' % jobNum, data)
			# Remove common prefix from info
			if DataSplitter.CommonPrefix in entry:
				entry.pop(DataSplitter.CommonPrefix)
			entry[DataSplitter.FileList] = tmp
		self._closeSubTar(tar, subTar)
		activity.finish()
		# Write metadata to allow reconstruction of data splitter
		meta['MaxJobs'] = lastValid + 1
		for (fn, data) in [('Metadata', self._fmt.format(meta)), ('Version', '2')]:
			self._addToTar(tar, fn, data)
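All examples on this page follow the same Activity lifecycle: construct it with a message, optionally call update() while the work progresses, and call finish() once the work is done. A minimal sketch of that pattern is shown below; the import path is an assumption (it varies between grid-control versions) and process_item is a hypothetical placeholder.

from grid_control.utils.activity import Activity  # assumed import path; may differ between versions

def process_all(items):
	activity = Activity('Processing items')  # show the progress message while this block runs
	for idx, item in enumerate(items):
		activity.update('Processing items [%d / %d]' % (idx + 1, len(items)))  # refresh progress text
		process_item(item)  # hypothetical per-item work
	activity.finish()  # remove the progress message again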
Example #2
	def execute(self, wmsIDs, wmsName): # yields list of (wmsID,)
		marked_wmsIDs = lmap(lambda result: result[0], self._cancel_executor.execute(wmsIDs, wmsName))
		time.sleep(5)
		activity = Activity('Purging jobs')
		for result in self._purge_executor.execute(marked_wmsIDs, wmsName):
			yield result
		activity.finish()
Example #3
def hash_verify(opts, status_mon, local_se_path, jobnum, fi_idx, fi):
	if not opts.verify_md5:
		return status_mon.register_file_result(jobnum, fi_idx, 'Download successful',
			FileDownloadStatus.FILE_OK)
	# Verify => compute md5hash
	remote_hash = fi[FileInfo.Hash]
	activity = Activity('Verifying checksum')
	try:
		local_hash = ignore_exception(Exception, None, hash_calc, local_se_path.replace('file://', ''))
		if local_hash is None:
			return status_mon.register_file_result(jobnum, fi_idx, 'Unable to calculate checksum',
				FileDownloadStatus.FILE_HASH_FAILED)
	finally:
		activity.finish()
	hash_match = fi[FileInfo.Hash] == local_hash
	match_map = {True: 'MATCH', False: 'FAIL'}
	if ANSI is not None:
		match_map = {True: ANSI.reset + ANSI.color_green + 'MATCH' + ANSI.reset,
			False: ANSI.reset + ANSI.color_red + 'FAIL' + ANSI.reset}
	msg = '\tLocal  hash: %s\n' % local_hash + \
		log_intro(jobnum, fi_idx) + '\tRemote hash: %s\n' % remote_hash + \
		log_intro(jobnum, fi_idx) + 'Checksum comparison: ' + match_map[hash_match]
	if hash_match:
		return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_OK)
	return status_mon.register_file_result(jobnum, fi_idx, msg, FileDownloadStatus.FILE_HASH_FAILED)
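hash_verify wraps the checksum calculation in try/finally so that activity.finish() runs even when hash_calc fails. The same guarantee can be written once as a context manager; the helper below is only a sketch and not part of grid-control.

from contextlib import contextmanager

@contextmanager
def activity_scope(message):
	# Hypothetical helper: guarantees Activity.finish() even if the wrapped code raises,
	# mirroring the try/finally used in hash_verify above
	activity = Activity(message)
	try:
		yield activity
	finally:
		activity.finish()

# Possible use at the checksum step above:
# with activity_scope('Verifying checksum'):
#	local_hash = ignore_exception(Exception, None, hash_calc, local_se_path.replace('file://', ''))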
Example #4
	def submit_jobs(self, jobnum_list, task):
		requestLen = len(jobnum_list)
		activity = Activity('Submitting jobs (--%)')
		while jobnum_list:
			jobSubmitNumList = jobnum_list[-self._schedd.getSubmitScale():]
			del jobnum_list[-self._schedd.getSubmitScale():]
			activity = Activity('Submitting jobs (%2d%%)'%(100*(requestLen-len(jobnum_list))/requestLen))
			for jobnum in jobSubmitNumList:
				self._write_job_config(
					self.getJobCfgPath(jobnum)[0],
					jobnum,
					task, {}
					)
			rawJobInfoMaps = self._schedd.submit_jobs(
				jobSubmitNumList, 
				task,
				self._getQueryArgs()
				)
			# Yield (jobnum, gc_id, other data) per job
			jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
			for htcID in jobInfoMaps:
				yield (
					htcID.gcJobNum,
					self._createGcId(htcID),
					jobInfoMaps[htcID]
					)
		activity.finish()
Example #5
	def _resync_psrc(self):
		activity = Activity('Performing resync of datasource %r' % self.get_datasource_name())
		# Get old and new dataset information
		provider_old = DataProvider.load_from_file(self._get_data_path('cache.dat'))
		block_list_old = provider_old.get_block_list_cached(show_stats=False)
		self._provider.clear_cache()
		block_list_new = self._provider.get_block_list_cached(show_stats=False)
		self._provider.save_to_file(self._get_data_path('cache-new.dat'), block_list_new)

		# Use old splitting information to synchronize with new dataset infos
		partition_len_old = self.get_parameter_len()
		partition_changes = self._resync_partitions(
			self._get_data_path('map-new.tar'), block_list_old, block_list_new)
		activity.finish()
		if partition_changes is not None:
			# Move current splitting to backup and use the new splitting from now on
			def _rename_with_backup(new, cur, old):
				if self._keep_old:
					os.rename(self._get_data_path(cur), self._get_data_path(old))
				os.rename(self._get_data_path(new), self._get_data_path(cur))
			_rename_with_backup('map-new.tar', 'map.tar', 'map-old-%d.tar' % time.time())
			_rename_with_backup('cache-new.dat', 'cache.dat', 'cache-old-%d.dat' % time.time())
			self._set_reader(DataSplitter.load_partitions(self._get_data_path('map.tar')))
			self._log.debug('Dataset resync finished: %d -> %d partitions', partition_len_old, self._len)
			(pnum_list_redo, pnum_list_disable) = partition_changes
			return (set(pnum_list_redo), set(pnum_list_disable), partition_len_old != self._len)
Example #6
	def _resync(self):
		if self._data_provider:
			activity = Activity('Performing resync of datasource %r' % self._name)
			# Get old and new dataset information
			ds_old = DataProvider.loadFromFile(self._getDataPath('cache.dat')).getBlocks(show_stats = False)
			self._data_provider.clearCache()
			ds_new = self._data_provider.getBlocks(show_stats = False)
			self._data_provider.saveToFile(self._getDataPath('cache-new.dat'), ds_new)

			# Use old splitting information to synchronize with new dataset infos
			old_maxN = self._data_splitter.getMaxJobs()
			jobChanges = self._data_splitter.resyncMapping(self._getDataPath('map-new.tar'), ds_old, ds_new)
			activity.finish()
			if jobChanges is not None:
				# Move current splitting to backup and use the new splitting from now on
				def backupRename(old, cur, new):
					if self._keepOld:
						os.rename(self._getDataPath(cur), self._getDataPath(old))
					os.rename(self._getDataPath(new), self._getDataPath(cur))
				backupRename(  'map-old-%d.tar' % time.time(),   'map.tar',   'map-new.tar')
				backupRename('cache-old-%d.dat' % time.time(), 'cache.dat', 'cache-new.dat')
				self._data_splitter.importPartitions(self._getDataPath('map.tar'))
				self._maxN = self._data_splitter.getMaxJobs()
				self._log.debug('Dataset resync finished: %d -> %d partitions', old_maxN, self._maxN)
				return (set(jobChanges[0]), set(jobChanges[1]), old_maxN != self._maxN)
Example #7
	def _submit_jobs(self, jobnum_list, task):
		# submit_jobs: Submit a number of jobs and yield (jobnum, WMS ID, other data) sequentially
		# >>jobnum: internal ID of the Job
		# JobNum is linked to the actual *task* here
		(jdl_fn, submit_jdl_fn) = self._submit_jobs_prepare(jobnum_list, task)
		try:
			# submit all jobs simultaneously and temporarily store verbose (ClassAd) output
			activity = Activity('queuing jobs at scheduler')
			submit_args = ' -verbose -batch-name ' + task.get_description().task_name + ' ' + submit_jdl_fn
			proc = self._proc_factory.logged_execute(self._submit_exec, submit_args)

			# extract the Condor ID (WMS ID) of the jobs from output ClassAds
			jobnum_gc_id_list = []
			for line in proc.iter():
				if 'GridControl_GCIDtoWMSID' in line:
					jobnum_wms_id = line.split('=')[1].strip(' "\n').split('@')
					jobnum, wms_id = int(jobnum_wms_id[0]), jobnum_wms_id[1].strip()
					# Condor creates a default job then overwrites settings on any subsequent job
					# i.e. skip every second, but better be sure
					if (not jobnum_gc_id_list) or (jobnum not in lzip(*jobnum_gc_id_list)[0]):
						jobnum_gc_id_list.append((jobnum, self._create_gc_id(wms_id)))

			exit_code = proc.wait()
			activity.finish()
			if (exit_code != 0) or (len(jobnum_gc_id_list) < len(jobnum_list)):
				if not self._explain_error(proc, exit_code):
					self._log.error('Submitted %4d jobs of %4d expected',
						len(jobnum_gc_id_list), len(jobnum_list))
					proc.log_error(self._error_log_fn, jdl=jdl_fn)
		finally:
			remove_files([jdl_fn])

		for (jobnum, gc_id) in jobnum_gc_id_list:
			yield (jobnum, gc_id, {})
Example #8
	def _submitJob(self, jobNum, module):
		fd, jdl = tempfile.mkstemp('.jdl')
		try:
			jdlData = self.makeJDL(jobNum, module)
			utils.safeWrite(os.fdopen(fd, 'w'), jdlData)
		except Exception:
			utils.removeFiles([jdl])
			raise BackendError('Could not write jdl data to %s.' % jdl)

		try:
			submitArgs = []
			for key_value in utils.filterDict(self._submitParams, vF = lambda v: v).items():
				submitArgs.extend(key_value)
			submitArgs.append(jdl)

			activity = Activity('submitting job %d' % jobNum)
			proc = LocalProcess(self._submitExec, '--nomsg', '--noint', '--logfile', '/dev/stderr', *submitArgs)

			gcID = None
			for line in ifilter(lambda x: x.startswith('http'), imap(str.strip, proc.stdout.iter(timeout = 60))):
				gcID = line
			retCode = proc.status(timeout = 0, terminate = True)

			activity.finish()

			if (retCode != 0) or (gcID is None):
				if self.explainError(proc, retCode):
					pass
				else:
					self._log.log_process(proc, files = {'jdl': SafeFile(jdl).read()})
		finally:
			utils.removeFiles([jdl])
		return (jobNum, utils.QM(gcID, self._createId(gcID), None), {'jdl': str.join('', jdlData)})
Example #9
	def __init__(self, block_list_old, block_list_new):
		activity = Activity('Performing resynchronization of dataset')
		block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
		(self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
		for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
			sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
		activity.finish()
Example #10
	def _get_jobs_output(self, gc_id_jobnum_list):
		# retrieve task output files from sandbox directory
		if not len(gc_id_jobnum_list):
			raise StopIteration

		activity = Activity('retrieving job outputs')
		for gc_id, jobnum in gc_id_jobnum_list:
			sandpath = self._get_sandbox_dn(jobnum)
			if sandpath is None:
				yield (jobnum, None)
				continue
			# when working with a remote spool schedd, tell condor to return files
			if self._remote_type == PoolType.SPOOL:
				self._check_and_log_proc(self._proc_factory.logged_execute(
					self._transfer_exec, self._split_gc_id(gc_id)[1]))
			# when working with a remote [gsi]ssh schedd, manually return files
			elif self._remote_type in (PoolType.SSH, PoolType.GSISSH):
				self._check_and_log_proc(self._proc_factory.logged_copy_from_remote(
					self._get_remote_output_dn(jobnum), self._get_sandbox_dn()))
				# clean up remote working directory
				self._check_and_log_proc(self._proc_factory.logged_execute(
					'rm -rf %s' % self._get_remote_output_dn(jobnum)))
			# if necessary, extract wildcard output files from the tarball
			unpack_wildcard_tar(self._log, sandpath)
			yield (jobnum, sandpath)
		# clean up if necessary
		activity.finish()
		self._cleanup_remote_output_dn()
Example #11
	def _readJobs(self, jobLimit):
		utils.ensureDirExists(self._dbPath, 'job database directory', JobError)

		candidates = []
		for jobFile in fnmatch.filter(os.listdir(self._dbPath), 'job_*.txt'):
			try: # 2xsplit is faster than regex
				jobNum = int(jobFile.split(".")[0].split("_")[1])
			except Exception:
				continue
			candidates.append((jobNum, jobFile))

		(jobMap, maxJobs) = ({}, len(candidates))
		activity = Activity('Reading job infos')
		idx = 0
		for (jobNum, jobFile) in sorted(candidates):
			idx += 1
			if (jobLimit >= 0) and (jobNum >= jobLimit):
				self._log.info('Stopped reading job infos at job #%d out of %d available job files, since the limit of %d jobs is reached',
					jobNum, len(candidates), jobLimit)
				break
			jobObj = self._load_job(os.path.join(self._dbPath, jobFile))
			jobMap[jobNum] = jobObj
			if idx % 100 == 0:
				activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / maxJobs))
		activity.finish()
		return jobMap
Example #12
	def _read_jobs(self, job_limit):
		ensure_dir_exists(self._path_db, 'job database directory', JobError)

		candidates = []
		for job_fn in fnmatch.filter(os.listdir(self._path_db), 'job_*.txt'):
			try:  # 2xsplit is faster than regex
				jobnum = int(job_fn.split(".")[0].split("_")[1])
			except Exception:
				clear_current_exception()
				continue
			candidates.append((jobnum, job_fn))

		(job_map, max_job_len) = ({}, len(candidates))
		activity = Activity('Reading job infos')
		idx = 0
		for (jobnum, job_fn) in sorted(candidates):
			idx += 1
			if jobnum >= job_limit >= 0:
				self._log.info('Stopped reading job infos at job #%d out of %d available job files, ' +
					'since the limit of %d jobs is reached', jobnum, len(candidates), job_limit)
				break
			try:
				job_fn_full = os.path.join(self._path_db, job_fn)
				data = self._fmt.parse(SafeFile(job_fn_full).iter_close())
				job_obj = self._create_job_obj(job_fn_full, data)
			except Exception:
				raise JobError('Unable to process job file %r' % job_fn_full)
			job_map[jobnum] = job_obj
			activity.update('Reading job infos %d [%d%%]' % (idx, (100.0 * idx) / max_job_len))
		activity.finish()
		return job_map
Example #13
def create_tarball(match_info_iter, **kwargs):
	tar = tarfile.open(mode='w:gz', **kwargs)
	activity = Activity('Generating tarball')
	for match_info in match_info_iter:
		if isinstance(match_info, tuple):
			(path_source, path_target) = match_info
		else:
			(path_source, path_target) = (match_info, None)
		if isinstance(path_source, str):
			if not os.path.exists(path_source):
				raise PathError('File %s does not exist!' % path_source)
			tar.add(path_source, path_target or os.path.basename(path_source), recursive=False)
		elif path_source is None:  # Update activity
			activity.update('Generating tarball: %s' % path_target)
		else:  # File handle
			info, handle = path_source.get_tar_info()
			if path_target:
				info.name = path_target
			info.mtime = time.time()
			info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
			if info.name.endswith('.sh') or info.name.endswith('.py'):
				info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
			tar.addfile(info, handle)
			handle.close()
	activity.finish()
	tar.close()
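A hypothetical call of create_tarball, showing the three kinds of items it accepts (all file names are made up); keyword arguments are passed straight through to tarfile.open.

create_tarball([
	'config/settings.conf',            # plain path: stored under its basename
	('local/run.sh', 'bin/run.sh'),    # (source, target) tuple: renamed inside the archive
	(None, 'adding generated files'),  # (None, text): only updates the Activity message
], name='sandbox.tar.gz')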
Example #14
	def __init__(self, lockfile):
		self._lockfile = lockfile
		activity = Activity('Trying to acquire lock file %s ...' % lockfile)
		while os.path.exists(self._lockfile):
			time.sleep(0.2)
		activity.finish()
		self._fd = open(self._lockfile, 'w')
		fcntl.flock(self._fd, fcntl.LOCK_EX)
Example #15
	def _getJobsOutput(self, ids):
		if len(ids) == 0:
			raise StopIteration

		basePath = os.path.join(self._outputPath, 'tmp')
		try:
			if len(ids) == 1:
				# For single jobs create single subdir
				tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
			else:
				tmpPath = basePath
			utils.ensureDirExists(tmpPath)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmpPath, BackendError)

		jobNumMap = dict(ids)
		jobs = self.writeWMSIds(ids)

		activity = Activity('retrieving %d job outputs' % len(ids))
		proc = LocalProcess(self._outputExec, '--noint', '--logfile', '/dev/stderr', '-i', jobs, '--dir', tmpPath)

		# yield output dirs
		todo = jobNumMap.values()
		currentJobNum = None
		for line in imap(str.strip, proc.stdout.iter(timeout = 60)):
			if line.startswith(tmpPath):
				todo.remove(currentJobNum)
				outputDir = line.strip()
				if os.path.exists(outputDir):
					if 'GC_WC.tar.gz' in os.listdir(outputDir):
						wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
						try:
							tarfile.TarFile.open(wildcardTar, 'r:gz').extractall(outputDir)
							os.unlink(wildcardTar)
						except Exception:
							self._log.error('Can\'t unpack output files contained in %s', wildcardTar)
				yield (currentJobNum, line.strip())
				currentJobNum = None
			else:
				currentJobNum = jobNumMap.get(self._createId(line), currentJobNum)
		retCode = proc.status(timeout = 0, terminate = True)
		activity.finish()

		if retCode != 0:
			if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout = 0):
				utils.removeFiles([jobs, basePath])
				raise StopIteration
			else:
				self._log.log_process(proc, files = {'jobs': SafeFile(jobs).read()})
			self._log.error('Trying to recover from error ...')
			for dirName in os.listdir(basePath):
				yield (None, os.path.join(basePath, dirName))

		# return unretrievable jobs
		for jobNum in todo:
			yield (jobNum, None)

		utils.removeFiles([jobs, basePath])
Example #16
		def _delete(file_se_path, where, what):
			if se_exists(file_se_path).status(timeout=10, terminate=True) == 0:
				activity = Activity('Deleting file %s from %s' % (fi[FileInfo.NameDest], where))
				rm_proc = se_rm(file_se_path)
				if rm_proc.status(timeout=60, terminate=True) == 0:
					log.info(log_intro(jobnum, fi_idx) + 'Deleted file %s', file_se_path)
				else:
					log.log_process(rm_proc, msg=log_intro(jobnum, fi_idx) + 'Unable to remove %s' % what)
				activity.finish()
Example #17
	def _get_phedex_replica_list(self, block_path, replicas_dict):
		activity_fi = Activity('Getting file replica information from PhEDex')
		# Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
		replicas_dict[block_path] = []
		for phedex_block in self._pjrc.get(params={'block': block_path})['phedex']['block']:
			for replica in phedex_block['replica']:
				replica_info = (replica['node'], replica.get('se'), replica['complete'] == 'y')
				replicas_dict[block_path].append(replica_info)
		activity_fi.finish()
Example #18
def wait(timeout):
	activity = Activity('Waiting', parent='root')
	for remaining in irange(timeout, 0, -1):
		if abort():
			return False
		if (remaining == timeout) or (remaining < 5) or (remaining % 5 == 0):
			activity.update('Waiting for %d seconds' % remaining)
		time.sleep(1)
	activity.finish()
	return True
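wait() returns False as soon as abort() is set, so it doubles as a delay and an abort check. The helper below is a hypothetical call site built on top of it; check_done is a placeholder.

def poll_until_done(check_done):
	# Hypothetical helper: poll every 30 seconds until check_done() succeeds or the user aborts
	while not check_done():
		if not wait(30):
			return False  # abort() was triggered during the wait
	return True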
Example #19
	def get_dataset_name_list(self):
		if self._cache_dataset is None:
			self._cache_dataset = [self._dataset_path]
			if '*' in self._dataset_path:
				activity = Activity('Getting dataset list for %s' % self._dataset_path)
				self._cache_dataset = list(self._get_cms_dataset_list(self._dataset_path))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._dataset_path)
				activity.finish()
		return self._cache_dataset
Example #20
	def get_endpoint(self):
		activity = Activity('Discovering available WMS services')
		wms_best_list = []
		for wms in self._list_endpoint_good():
			activity_wms = Activity('pinging WMS %s' % wms)
			if wms is None:
				continue
			ping, pingtime = self._ping_dict.get(wms, (None, 0))
			if time.time() - pingtime > 30 * 60:  # check every ~30min
				ping = ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
				self._ping_dict[wms] = (ping, time.time() + 10 * 60 * random.random())  # 10 min variation
			if ping is not None:
				wms_best_list.append((wms, ping))
			activity_wms.finish()
		activity.finish()
		if not wms_best_list:
			return None
		sort_inplace(wms_best_list, key=lambda name_ping: name_ping[1])
		result = _choice_exp(wms_best_list)
		if result is not None:
			activity = Activity('selecting WMS %s' % result)
			wms, ping = result  # reduce timeout by 5min for chosen wms => re-ping every 6 submits
			self._ping_dict[wms] = (ping, self._ping_dict[wms][1] + 5 * 60)
			result = wms
			activity.finish()
		self._update_state()
		return result
Example #21
	def __init__(self, config, source):
		self._psrc_raw = source
		BasicParameterAdapter.__init__(self, config, source)
		self._map_jobnum2pnum = {}
		ensure_dir_exists(config.get_work_path(), 'parameter storage directory', ParameterError)
		self._path_jobnum2pnum = config.get_work_path('params.map.gz')
		self._path_params = config.get_work_path('params.dat.gz')

		# Find out if init should be performed - overrides resync_requested!
		init_requested = config.get_state('init', detail='parameters')
		init_needed = False
		if not (os.path.exists(self._path_params) and os.path.exists(self._path_jobnum2pnum)):
			init_needed = True  # Init needed if no parameter log exists
		if init_requested and not init_needed and (source.get_parameter_len() is not None):
			self._log.warning('Re-Initialization will overwrite the current mapping ' +
				'between jobs and parameter/dataset content! This can lead to invalid results!')
			user_msg = ('Do you want to perform a synchronization between ' +
				'the current mapping and the new one to avoid this?')
			if UserInputInterface().prompt_bool(user_msg, True):
				init_requested = False
		do_init = init_requested or init_needed

		# Find out if resync should be performed
		resync_by_user = config.get_state('resync', detail='parameters')
		config.set_state(False, 'resync', detail='parameters')
		psrc_hash = self._psrc_raw.get_psrc_hash()
		self._psrc_hash_stored = config.get('parameter hash', psrc_hash, persistent=True)
		psrc_hash_changed = self._psrc_hash_stored != psrc_hash  # Resync if parameters have changed
		resync_by_psrc = self._psrc_raw.get_resync_request()

		if do_init:  # Write current state
			self._write_jobnum2pnum(self._path_jobnum2pnum)
			ParameterSource.get_class('GCDumpParameterSource').write(self._path_params,
				self.get_job_len(), self.get_job_metadata(), self.iter_jobs())
		elif resync_by_user or resync_by_psrc or psrc_hash_changed:  # Perform sync
			if psrc_hash_changed:
				self._log.info('Parameter hash has changed')
				self._log.debug('\told hash: %s', self._psrc_hash_stored)
				self._log.debug('\tnew hash: %s', psrc_hash)
				self._log.log(logging.DEBUG1, '\tnew src: %s', self._psrc_raw)
				config.set_state(True, 'init', detail='config')
			elif resync_by_psrc:
				self._log.info('Parameter source requested resync')
				self._log.debug('\t%r', str.join(', ', imap(repr, resync_by_psrc)))
			elif resync_by_user:
				self._log.info('User requested resync')
			self._psrc_hash_stored = None
			self._resync_state = self.resync(force=True)
		else:  # Reuse old mapping
			activity = Activity('Loading cached parameter information')
			self._read_jobnum2pnum()
			activity.finish()
			return  # do not set parameter hash in config
		config.set('parameter hash', self._psrc_raw.get_psrc_hash())
Example #22
	def execute(self, wms_id_list, wms_name):  # yields list of purged (wms_id,)
		activity = Activity('waiting for jobs to finish')
		time.sleep(5)
		for wms_id in wms_id_list:
			path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wms_name, wms_id))
			if path is None:
				self._log.warning('Sandbox for job %r could not be found', wms_id)
				continue
			with_lock(LocalPurgeJobs.purge_lock, _purge_directory, self._log, path, wms_id)
			yield (wms_id,)
		activity.finish()
Example #23
	def _get_jobs_output(self, gc_id_jobnum_list):
		# Get output of jobs and yield output dirs
		if len(gc_id_jobnum_list) == 0:
			raise StopIteration

		root_dn = os.path.join(self._path_output, 'tmp')
		try:
			if len(gc_id_jobnum_list) == 1:
				# For single jobs create single subdir
				tmp_dn = os.path.join(root_dn, md5_hex(gc_id_jobnum_list[0][0]))
			else:
				tmp_dn = root_dn
			ensure_dir_exists(tmp_dn)
		except Exception:
			raise BackendError('Temporary path "%s" could not be created.' % tmp_dn, BackendError)

		map_gc_id2jobnum = dict(gc_id_jobnum_list)
		jobs = self._write_wms_id_list(gc_id_jobnum_list)

		activity = Activity('retrieving %d job outputs' % len(gc_id_jobnum_list))
		proc = LocalProcess(self._output_exec, '--noint',
			'--logfile', '/dev/stderr', '-i', jobs, '--dir', tmp_dn)

		# yield output dirs
		todo = map_gc_id2jobnum.values()
		current_jobnum = None
		for line in imap(str.strip, proc.stdout.iter(timeout=60)):
			if line.startswith(tmp_dn):
				todo.remove(current_jobnum)
				output_dn = line.strip()
				unpack_wildcard_tar(self._log, output_dn)
				yield (current_jobnum, output_dn)
				current_jobnum = None
			else:
				current_jobnum = map_gc_id2jobnum.get(self._create_gc_id(line), current_jobnum)
		exit_code = proc.status(timeout=0, terminate=True)
		activity.finish()

		if exit_code != 0:
			if 'Keyboard interrupt raised by user' in proc.stderr.read(timeout=0):
				remove_files([jobs, root_dn])
				raise StopIteration
			else:
				self._log.log_process(proc, files={'jobs': SafeFile(jobs).read()})
			self._log.error('Trying to recover from error ...')
			for dn in os.listdir(root_dn):
				yield (None, os.path.join(root_dn, dn))

		# return unretrievable jobs
		for jobnum in todo:
			yield (jobnum, None)

		remove_files([jobs, tmp_dn])
Example #24
	def _run_executor(self, desc, executor, fmt, gcIDs, *args):
		# Perform some action with the executor, translate wmsID -> gcID and format the result
		activity = Activity(desc)
		wmsID_gcID_Map = self._get_map_wmsID_gcID(gcIDs)
		wmsIDs = sorted(wmsID_gcID_Map.keys())

		for result in executor.execute(wmsIDs, *args):
			wmsID = result[0] # result[0] is the wmsID by convention
			gcID = wmsID_gcID_Map.pop(wmsID, None)
			if gcID is not None:
				yield fmt((gcID,) + result[1:])
			else:
				self._log.debug('unable to find gcID for wmsID %r', wmsID)
		activity.finish()
Example #25
	def execute(self, wmsIDs, wmsName): # yields list of purged (wmsID,)
		activity = Activity('waiting for jobs to finish')
		time.sleep(5)
		for wmsID in wmsIDs:
			path = self._sandbox_helper.get_sandbox('WMSID.%s.%s' % (wmsName, wmsID))
			if path is None:
				self._log.warning('Sandbox for job %r could not be found', wmsID)
				continue
			try:
				shutil.rmtree(path)
			except Exception:
				raise BackendError('Sandbox for job %r could not be deleted' % wmsID)
			yield (wmsID,)
		activity.finish()
Example #26
	def _run_executor(self, desc, executor, fmt, gc_id_list, *args):
		# Perform some action with the executor, translate wms_id -> gc_id and format the result
		activity = Activity(desc)
		map_wms_id2gc_id = self._get_map_wms_id2gc_id(gc_id_list)
		wms_id_list = sorted(map_wms_id2gc_id.keys())

		for result in executor.execute(wms_id_list, *args):
			wms_id = result[0]  # result[0] is the wms_id by convention
			gc_id = map_wms_id2gc_id.pop(wms_id, None)
			if gc_id is not None:
				yield fmt((gc_id,) + result[1:])
			else:
				self._log.debug('unable to find gc_id for wms_id %r', wms_id)
		activity.finish()
Example #27
	def __init__(self, config, jobLimit = -1, jobSelector = None):
		dbPath = config.getWorkPath('jobs')
		self._dbFile = config.getWorkPath('jobs.zip')
		if os.path.exists(dbPath) and os.path.isdir(dbPath) and not os.path.exists(self._dbFile):
			activity = Activity('Converting job database')
			self._serial = 0
			try:
				oldDB = TextFileJobDB(config)
				for jobNum in oldDB.getJobs():
					self.commit(jobNum, oldDB.get(jobNum))
			except Exception:
				removeFiles([self._dbFile])
				raise
			activity.finish()
		ZippedJobDB.__init__(self, config, jobLimit, jobSelector)
Example #28
	def resync(self, force = False): # Do not overwrite resync results - eg. from external or init trigger
		source_hash = self._source.getHash()
		if (self._resync_state == ParameterSource.EmptyResyncResult()) and ((source_hash != self._source_hash) or force):
			activity = Activity('Synchronizing parameter information')
			t_start = time.time()
			try:
				self._resync_state = self._resync()
			except Exception:
				raise ParameterError('Unable to resync parameters!')
			self._source_hash = self._source.getHash()
			activity.finish()
			self._log.log(logging.INFO, 'Finished resync of parameter source (%s)', strTimeShort(time.time() - t_start))
		result = self._resync_state
		self._resync_state = ParameterSource.EmptyResyncResult()
		return result
Example #29
	def cancel_jobs(self, gc_id_jobnum_list):
		if not len(gc_id_jobnum_list):
			raise StopIteration
		activity   = Activity('Canceling jobs')
		assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
			self._splitGcRequests(gc_id_jobnum_list))), \
			'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
				lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
					self._splitGcRequests(gc_id_jobnum_list)),
				self._schedd.getURI())
		canceledJobs = self._schedd.cancel_jobs(
			self._splitGcRequests(gc_id_jobnum_list)
			)
		# Yield ( jobnum, wms_id) for canceled jobs
		for htcJobID in canceledJobs:
			yield (
				htcJobID.gcJobNum,
				self._createGcId(htcJobID)
				)
		activity.finish()
Example #30
	def _get_jobs_output(self, gc_id_jobnum_list):
		if not len(gc_id_jobnum_list):
			raise StopIteration
		activity   = Activity('Fetching jobs')
		assert not bool(lfilter(lambda htcid: htcid.scheddURI != self._schedd.getURI(),
			self._splitGcRequests(gc_id_jobnum_list))), \
			'Bug! Got jobs at Schedds %s, but servicing only Schedd %s' % (
				lfilter(lambda itr: itr.scheddURI != self._schedd.getURI(),
					self._splitGcRequests(gc_id_jobnum_list)),
				self._schedd.getURI())
		returnedJobs = self._schedd.getJobsOutput(
			self._splitGcRequests(gc_id_jobnum_list)
			)
		# Yield (jobnum, path_output) per retrieved job
		for htcID in returnedJobs:
			yield (
				htcID.gcJobNum,
				self.getSandboxPath(htcID.gcJobNum)
				)
		activity.finish()
Example #31
    def _get_jobs_output(self, gc_id_jobnum_list):
        if not len(gc_id_jobnum_list):
            raise StopIteration

        activity = Activity('retrieving %d job outputs' %
                            len(gc_id_jobnum_list))
        for gc_id, jobnum in gc_id_jobnum_list:
            path = self._sandbox_helper.get_sandbox(gc_id)
            if path is None:
                yield (jobnum, None)
                continue

            # Cleanup sandbox
            output_fn_list = lchain(
                imap(lambda pat: glob.glob(os.path.join(path, pat)),
                     self._output_fn_list))
            remove_files(
                ifilter(
                    lambda x: x not in output_fn_list,
                    imap(lambda fn: os.path.join(path, fn), os.listdir(path))))

            yield (jobnum, path)
        activity.finish()
Example #32
    def _resync_psrc(self):
        activity = Activity('Performing resync of datasource %r' %
                            self.get_datasource_name())
        # Get old and new dataset information
        provider_old = DataProvider.load_from_file(
            self._get_data_path('cache.dat'))
        block_list_old = provider_old.get_block_list_cached(show_stats=False)
        self._provider.clear_cache()
        block_list_new = self._provider.get_block_list_cached(show_stats=False)
        self._provider.save_to_file(self._get_data_path('cache-new.dat'),
                                    block_list_new)

        # Use old splitting information to synchronize with new dataset infos
        partition_len_old = self.get_parameter_len()
        partition_changes = self._resync_partitions(
            self._get_data_path('map-new.tar'), block_list_old, block_list_new)
        activity.finish()
        if partition_changes is not None:
            # Move current splitting to backup and use the new splitting from now on
            def _rename_with_backup(new, cur, old):
                if self._keep_old:
                    os.rename(self._get_data_path(cur),
                              self._get_data_path(old))
                os.rename(self._get_data_path(new), self._get_data_path(cur))

            _rename_with_backup('map-new.tar', 'map.tar',
                                'map-old-%d.tar' % time.time())
            _rename_with_backup('cache-new.dat', 'cache.dat',
                                'cache-old-%d.dat' % time.time())
            self._set_reader(
                DataSplitter.load_partitions(self._get_data_path('map.tar')))
            self._log.debug('Dataset resync finished: %d -> %d partitions',
                            partition_len_old, self._len)
            (pnum_list_redo, pnum_list_disable) = partition_changes
            return (set(pnum_list_redo), set(pnum_list_disable),
                    partition_len_old != self._len)
Example #33
    def _begin_bulk_submission(self):
        self._submit_args_dict.update({'-d': None})
        if self._discovery_plugin:
            self._submit_args_dict.update(
                {'-e': self._discovery_plugin.get_endpoint()})
        if self._use_delegate is False:
            self._submit_args_dict.update({'-a': ' '})
            return True
        delegate_id = 'GCD' + md5_hex(str(time.time()))[:10]
        activity = Activity('creating delegate proxy for job submission')
        delegate_arg_list = []
        if self._config_fn:
            delegate_arg_list.extend(['--config', self._config_fn])
        proc = LocalProcess(self._delegate_exec, '-d', delegate_id, '--noint',
                            '--logfile', '/dev/stderr', *delegate_arg_list)
        output = proc.get_output(timeout=10, raise_errors=False)
        if ('glite-wms-job-delegate-proxy Success' in output) and (delegate_id
                                                                   in output):
            self._submit_args_dict.update({'-d': delegate_id})
        activity.finish()

        if proc.status(timeout=0, terminate=True) != 0:
            self._log.log_process(proc)
        return self._submit_args_dict.get('-d') is not None
Example #34
    def _get_jobs_output(self, gc_id_jobnum_list):
        # Get output of jobs and yield output dirs
        if len(gc_id_jobnum_list) == 0:
            raise StopIteration

        tmp_dn = os.path.join(self._path_output, 'tmp')
        try:
            if len(gc_id_jobnum_list) == 1:
                # For single jobs create single subdir
                tmp_dn = os.path.join(tmp_dn, md5_hex(gc_id_jobnum_list[0][0]))
            ensure_dir_exists(tmp_dn)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmp_dn,
                BackendError)

        map_gc_id2jobnum = dict(gc_id_jobnum_list)
        jobnum_list_todo = list(map_gc_id2jobnum.values())
        wms_id_list_done = []
        activity = Activity('retrieving %d job outputs' %
                            len(gc_id_jobnum_list))
        chunk_pos_iter = irange(0, len(gc_id_jobnum_list), self._chunk_size)
        for ids in imap(lambda x: gc_id_jobnum_list[x:x + self._chunk_size],
                        chunk_pos_iter):
            for (current_jobnum, output_dn) in self.get_jobs_output_chunk(
                    tmp_dn, ids, wms_id_list_done):
                unpack_wildcard_tar(self._log, output_dn)
                jobnum_list_todo.remove(current_jobnum)
                yield (current_jobnum, output_dn)
        activity.finish()

        # return unretrievable jobs
        for jobnum in jobnum_list_todo:
            yield (jobnum, None)
        self._purge_done_jobs(wms_id_list_done)
        remove_files([tmp_dn])
Example #35
    def _resync(self):
        if self._data_provider:
            activity = Activity('Performing resync of datasource %r' %
                                self._name)
            # Get old and new dataset information
            ds_old = DataProvider.loadFromFile(
                self._getDataPath('cache.dat')).getBlocks(show_stats=False)
            self._data_provider.clearCache()
            ds_new = self._data_provider.getBlocks(show_stats=False)
            self._data_provider.saveToFile(self._getDataPath('cache-new.dat'),
                                           ds_new)

            # Use old splitting information to synchronize with new dataset infos
            old_maxN = self._data_splitter.getMaxJobs()
            jobChanges = self._data_splitter.resyncMapping(
                self._getDataPath('map-new.tar'), ds_old, ds_new)
            activity.finish()
            if jobChanges is not None:
                # Move current splitting to backup and use the new splitting from now on
                def backupRename(old, cur, new):
                    if self._keepOld:
                        os.rename(self._getDataPath(cur),
                                  self._getDataPath(old))
                    os.rename(self._getDataPath(new), self._getDataPath(cur))

                backupRename('map-old-%d.tar' % time.time(), 'map.tar',
                             'map-new.tar')
                backupRename('cache-old-%d.dat' % time.time(), 'cache.dat',
                             'cache-new.dat')
                self._data_splitter.importPartitions(
                    self._getDataPath('map.tar'))
                self._maxN = self._data_splitter.getMaxJobs()
                self._log.debug('Dataset resync finished: %d -> %d partitions',
                                old_maxN, self._maxN)
                return (set(jobChanges[0]), set(jobChanges[1]),
                        old_maxN != self._maxN)
Example #36
def wait(timeout):
    activity = Activity('Waiting', parent='root')
    for remaining in irange(timeout, 0, -1):
        if abort():
            return False
        if (remaining == timeout) or (remaining < 5) or (remaining % 5 == 0):
            activity.update('Waiting for %d seconds' % remaining)
        time.sleep(1)
    activity.finish()
    return True
Example #37
 def getEntries(self, path, metadata, events, seList, objStore):
     metadata['GC_SOURCE_DIR'] = self._path
     counter = 0
     activity = Activity('Reading source directory')
     for fn in self._iter_path():
         activity.update('Reading source directory - [%d]' % counter)
         yield (os.path.join(self._path, fn.strip()), metadata, events,
                seList, objStore)
         counter += 1
     activity.finish()
Example #38
 def _readJobs(self, jobLimit):
     jobMap = {}
     maxJobs = 0
     if os.path.exists(self._dbFile):
         try:
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
         except Exception:  # Try to recover job archive
             self._log.warning(
                 '=' * 40 +
                 '\nStarting recovery of broken job database => Answer "y" if asked "Is this a single-disk archive?"!\n'
                 + '=' * 40)
             os.system('zip -FF %s --out %s.tmp 2> /dev/null' %
                       (self._dbFile, self._dbFile))
             os.rename(self._dbFile, self._dbFile + '.broken')
             os.rename(self._dbFile + '.tmp', self._dbFile)
             tar = zipfile.ZipFile(self._dbFile, 'r', zipfile.ZIP_DEFLATED)
             removeFiles([self._dbFile + '.broken'])
             brokenList = []
             for idx, fnTarInfo in enumerate(tar.namelist()):
                 (jobNum, tid) = tuple(
                     imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
                 try:
                     fp = tar.open(fnTarInfo)
                     try:
                         fp.read()
                     finally:
                         fp.close()
                 except Exception:
                     clear_current_exception()
                     brokenList.append(fnTarInfo)  # record unreadable entry so it is removed below
             for broken in brokenList:
                 os.system('zip %s -d %s' % (self._dbFile, broken))
             self._log.info('Recover completed!')
         activity = Activity('Reading job transactions')
         maxJobs = len(tar.namelist())
         tMap = {}
         for idx, fnTarInfo in enumerate(tar.namelist()):
             (jobNum, tid) = tuple(
                 imap(lambda s: int(s[1:]), fnTarInfo.split('_', 1)))
             if tid < tMap.get(jobNum, 0):
                 continue
             try:
                 data = self._fmt.parse(tar.open(fnTarInfo).read())
             except Exception:
                 continue
             jobMap[jobNum] = self._create_job_obj(fnTarInfo, data)
             tMap[jobNum] = tid
             if idx % 100 == 0:
                 activity.update('Reading job transactions %d [%d%%]' %
                                 (idx, (100.0 * idx) / maxJobs))
         activity.finish()
     self._serial = maxJobs
     return jobMap
Example #39
 def getEntries(self, path, metadata, events, seList, objStore):
     activity = Activity('Reading job logs')
     for jobNum in self._selected:
         activity.update('Reading job logs - [%d / %d]' %
                         (jobNum, self._selected[-1]))
         metadata['GC_JOBNUM'] = jobNum
         objStore.update({
             'GC_TASK': self._extTask,
             'GC_WORKDIR': self._extWorkDir
         })
         yield (os.path.join(self._extWorkDir, 'output', 'job_%d' % jobNum),
                metadata, events, seList, objStore)
     activity.finish()
Example #40
	def _submit_jobs_prepare(self, jobnum_list, task):
		activity = Activity('preparing jobs')
		jdl_fn = self._write_jdl(jobnum_list, task)

		# create the _jobconfig.sh file containing the actual data
		for jobnum in jobnum_list:
			try:
				job_var_fn = os.path.join(self._get_sandbox_dn(jobnum), 'job_%d.var' % jobnum)
				self._write_job_config(job_var_fn, jobnum, task, {})
			except Exception:
				raise BackendError('Could not write _jobconfig data for %s.' % jobnum)

		# copy infiles to ssh/gsissh remote pool if required
		submit_jdl_fn = jdl_fn
		if self._remote_type in (PoolType.SSH, PoolType.GSISSH):
			activity_remote = Activity('preparing remote scheduler')
			remote_output_dn = self._get_remote_output_dn()
			# TODO: check whether shared remote files already exist and copy otherwise
			for _, source_fn, target_fn in self._get_in_transfer_info_list(task):
				self._check_and_log_proc(self._proc_factory.logged_copy_to_remote(source_fn,
					os.path.join(remote_output_dn, target_fn)))
			# copy job config files
			for jobnum in jobnum_list:
				self._check_and_log_proc(self._proc_factory.logged_copy_to_remote(
					os.path.join(self._get_sandbox_dn(jobnum), 'job_%d.var' % jobnum),
					os.path.join(self._get_remote_output_dn(jobnum), 'job_%d.var' % jobnum)))
			# copy jdl
			submit_jdl_fn = os.path.join(remote_output_dn, os.path.basename(jdl_fn))
			self._check_and_log_proc(self._proc_factory.logged_copy_to_remote(jdl_fn, submit_jdl_fn))
			# copy proxy
			for auth_fn in self._token.get_auth_fn_list():
				self._check_and_log_proc(self._proc_factory.logged_copy_to_remote(auth_fn,
					os.path.join(self._get_remote_output_dn(), os.path.basename(auth_fn))))
			activity_remote.finish()
		activity.finish()
		return (jdl_fn, submit_jdl_fn)
Example #41
 def getEntries(self, path, metadata, events, seList, objStore):
     allDirs = lfilter(lambda fn: fn.startswith('job_'),
                       os.listdir(self._extOutputDir))
     activity = Activity('Reading job logs')
     for idx, dirName in enumerate(allDirs):
         activity.update('Reading job logs - [%d / %d]' %
                         (idx, len(allDirs)))
         try:
             metadata['GC_JOBNUM'] = int(dirName.split('_')[1])
         except Exception:
             continue
         objStore['GC_WORKDIR'] = self._extWorkDir
         if self._selector and not self._selector(metadata['GC_JOBNUM'],
                                                  None):
             continue
         yield (os.path.join(self._extOutputDir,
                             dirName), metadata, events, seList, objStore)
     activity.finish()
Example #42
 def submitJobs(self, jobNumList, task):
     requestLen = len(jobNumList)
     activity = Activity('Submitting jobs (--%)')
     while jobNumList:
         jobSubmitNumList = jobNumList[-self._schedd.getSubmitScale():]
         del jobNumList[-self._schedd.getSubmitScale():]
         activity = Activity('Submitting jobs (%2d%%)' %
                             (100 *
                              (requestLen - len(jobNumList)) / requestLen))
         for jobNum in jobSubmitNumList:
             self._writeJobConfig(
                 self.getJobCfgPath(jobNum)[0], jobNum, task, {})
         rawJobInfoMaps = self._schedd.submitJobs(jobSubmitNumList, task,
                                                  self._getQueryArgs())
         # Yield (jobNum, gcID, other data) per job
         jobInfoMaps = self._digestQueueInfoMaps(rawJobInfoMaps)
         for htcID in jobInfoMaps:
             yield (htcID.gcJobNum, self._createGcId(htcID),
                    jobInfoMaps[htcID])
     activity.finish()
Example #43
def genTarball(outFile, fileList):
    tar = tarfile.open(outFile, 'w:gz')
    activity = Activity('Generating tarball')
    for (pathAbs, pathRel, pathStatus) in fileList:
        if pathStatus is True:  # Existing file
            tar.add(pathAbs, pathRel, recursive=False)
        elif pathStatus is False:  # File that must exist - verify before adding
            if not os.path.exists(pathAbs):
                raise UserError('File %s does not exist!' % pathRel)
            tar.add(pathAbs, pathRel, recursive=False)
        elif pathStatus is None:  # Directory
            activity.update('Generating tarball: %s' % pathRel)
        else:  # File handle
            info, handle = pathStatus.getTarInfo()
            info.mtime = time.time()
            info.mode = stat.S_IRUSR + stat.S_IWUSR + stat.S_IRGRP + stat.S_IROTH
            if info.name.endswith('.sh') or info.name.endswith('.py'):
                info.mode += stat.S_IXUSR + stat.S_IXGRP + stat.S_IXOTH
            tar.addfile(info, handle)
            handle.close()
    activity.finish()
    tar.close()
Example #44
 def write(cls, fn, pa):
     fp = ZipFile(fn, 'w')
     try:
         keys = sorted(ifilter(lambda p: not p.untracked, pa.getJobKeys()))
         fp.write('# %s\n' % json.dumps(keys))
         maxN = pa.getMaxJobs()
         if maxN:
             activity = Activity('Writing parameter dump')
             for jobNum in irange(maxN):
                 activity.update('Writing parameter dump [%d/%d]' %
                                 (jobNum + 1, maxN))
                 meta = pa.getJobInfo(jobNum)
                 meta_str = str.join(
                     '\t', imap(lambda k: json.dumps(meta.get(k, '')),
                                keys))
                 if meta.get(ParameterInfo.ACTIVE, True):
                     fp.write('%d\t%s\n' % (jobNum, meta_str))
                 else:
                     fp.write('%d!\t%s\n' % (jobNum, meta_str))
             activity.finish()
     finally:
         fp.close()
Example #45
	def _saveStateToTar(self, tar, meta, source, sourceLen, message):
		# Write the splitting info grouped into subtarfiles
		activity = Activity(message)
		(jobNum, subTar) = (-1, None)
		for jobNum, entry in enumerate(source):
			if jobNum % 100 == 0:
				self._closeSubTar(tar, subTar)
				subTar = self._createSubTar('%03dXX.tgz' % int(jobNum / 100))
				activity.update('%s [%d / %d]' % (message, jobNum, sourceLen))
			# Determine shortest way to store file list
			tmp = entry.pop(DataSplitter.FileList)
			savelist = self._getReducedFileList(entry, tmp) # can modify entry
			# Write files with infos / filelist
			for name, data in [('list', str.join('\n', savelist)), ('info', self._fmt.format(entry, fkt = self._formatFileEntry))]:
				self._addToSubTar(subTar, os.path.join('%05d' % jobNum, name), data)
			# Remove common prefix from info
			if DataSplitter.CommonPrefix in entry:
				entry.pop(DataSplitter.CommonPrefix)
			entry[DataSplitter.FileList] = tmp
		self._closeSubTar(tar, subTar)
		# Write metadata to allow reconstruction of data splitter
		meta['MaxJobs'] = jobNum + 1
		self._addToTar(tar, 'Metadata', self._fmt.format(meta))
		activity.finish()
Example #46
    def submit_jobs(self, jobnum_list, task):
        import os

        activity = Activity("Waiting for lock to be released...")
        while os.path.isfile(self._lock_filename):
            time.sleep(2)
        file = open(self._lock_filename, "w+")
        activity.finish()
        activity = Activity("Lock acquired:" + self._lock_filename)
        activity.finish()

        t = self._begin_bulk_submission()
        while not t:
            activity = Activity(
                'waiting before trying to delegate proxy again...')
            time.sleep(900)
            activity.finish()
            activity = Activity('re-attempting to delegate proxy...')
            t = self._begin_bulk_submission()
            activity.finish()
        '''
		if not self._begin_bulk_submission():  # Trying to delegate proxy failed
			self._log.error('Unable to delegate proxy! Continue with automatic delegation...')
			self._submit_args_dict.update({'-a': ' '})
			self._use_delegate = False
		'''

        count_submitted = 0
        for result in GridWMS.submit_jobs(self, jobnum_list, task):
            count_submitted += 1
            yield result
        file.close()
        self._log.info('count_submitted: %d' % count_submitted)
        count_submitted = int(count_submitted * 0.2)

        x = threading.Thread(target=self.delfile,
                             args=(self._lock_filename, count_submitted,
                                   self._log))
        x.start()
Example #47
    def _getJobsOutput(self, allIds):
        if len(allIds) == 0:
            raise StopIteration

        basePath = os.path.join(self._outputPath, 'tmp')
        try:
            if len(allIds) == 1:
                # For single jobs create single subdir
                basePath = os.path.join(basePath,
                                        md5(allIds[0][0]).hexdigest())
            utils.ensureDirExists(basePath)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % basePath,
                BackendError)

        activity = Activity('retrieving %d job outputs' % len(allIds))
        for ids in imap(lambda x: allIds[x:x + self._nJobsPerChunk],
                        irange(0, len(allIds), self._nJobsPerChunk)):
            jobNumMap = dict(ids)
            jobs = ' '.join(self._getRawIDs(ids))
            log = tempfile.mktemp('.log')

            proc = LoggedProcess(
                self._outputExec,
                '--noint --logfile "%s" --dir "%s" %s' % (log, basePath, jobs))

            # yield output dirs
            todo = jobNumMap.values()
            done = []
            currentJobNum = None
            for line in imap(str.strip, proc.iter()):
                match = re.match(self._outputRegex, line)
                if match:
                    currentJobNum = jobNumMap.get(
                        self._createId(match.groupdict()['rawId']))
                    todo.remove(currentJobNum)
                    done.append(match.groupdict()['rawId'])
                    outputDir = match.groupdict()['outputDir']
                    if os.path.exists(outputDir):
                        if 'GC_WC.tar.gz' in os.listdir(outputDir):
                            wildcardTar = os.path.join(outputDir,
                                                       'GC_WC.tar.gz')
                            try:
                                tarfile.TarFile.open(
                                    wildcardTar, 'r:gz').extractall(outputDir)
                                os.unlink(wildcardTar)
                            except Exception:
                                self._log.error(
                                    'Can\'t unpack output files contained in %s',
                                    wildcardTar)
                    yield (currentJobNum, outputDir)
                    currentJobNum = None
            retCode = proc.wait()

            if retCode != 0:
                if 'Keyboard interrupt raised by user' in proc.getError():
                    utils.removeFiles([log, basePath])
                    raise StopIteration
                else:
                    proc.logError(self.errorLog, log=log)
                self._log.error('Trying to recover from error ...')
                for dirName in os.listdir(basePath):
                    yield (None, os.path.join(basePath, dirName))
        activity.finish()

        # return unretrievable jobs
        for jobNum in todo:
            yield (jobNum, None)

        purgeLog = tempfile.mktemp('.log')
        purgeProc = LoggedProcess(
            utils.resolveInstallPath('glite-ce-job-purge'),
            '--noint --logfile "%s" %s' % (purgeLog, str.join(' ', done)))
        retCode = purgeProc.wait()
        if retCode != 0:
            if self.explainError(purgeProc, retCode):
                pass
            else:
                proc.logError(self.errorLog, log=purgeLog, jobs=done)
        utils.removeFiles([log, purgeLog, basePath])
Example #48
def handle_abort_interrupt(signum, frame):
    utils.abort(True)
    handle_abort_interrupt.log = Activity(
        'Quitting grid-control! (This can take a few seconds...)',
        parent='root')
    signal.signal(signum, signal.SIG_DFL)
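A hypothetical registration of the handler above, using only the standard library; SIGINT is chosen as an example signal.

import signal

signal.signal(signal.SIGINT, handle_abort_interrupt)  # first Ctrl-C sets the abort flag, then default handling is restored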
Example #49
    def _getJobsOutput(self, ids):
        if len(ids) == 0:
            raise StopIteration

        basePath = os.path.join(self._outputPath, 'tmp')
        try:
            if len(ids) == 1:
                # For single jobs create single subdir
                tmpPath = os.path.join(basePath, md5(ids[0][0]).hexdigest())
            else:
                tmpPath = basePath
            utils.ensureDirExists(tmpPath)
        except Exception:
            raise BackendError(
                'Temporary path "%s" could not be created.' % tmpPath,
                BackendError)

        jobNumMap = dict(ids)
        jobs = self.writeWMSIds(ids)

        activity = Activity('retrieving %d job outputs' % len(ids))
        proc = LocalProcess(self._outputExec, '--noint', '--logfile',
                            '/dev/stderr', '-i', jobs, '--dir', tmpPath)

        # yield output dirs
        todo = jobNumMap.values()
        currentJobNum = None
        for line in imap(str.strip, proc.stdout.iter(timeout=60)):
            if line.startswith(tmpPath):
                todo.remove(currentJobNum)
                outputDir = line.strip()
                if os.path.exists(outputDir):
                    if 'GC_WC.tar.gz' in os.listdir(outputDir):
                        wildcardTar = os.path.join(outputDir, 'GC_WC.tar.gz')
                        try:
                            tarfile.TarFile.open(wildcardTar,
                                                 'r:gz').extractall(outputDir)
                            os.unlink(wildcardTar)
                        except Exception:
                            self._log.error(
                                'Can\'t unpack output files contained in %s',
                                wildcardTar)
                yield (currentJobNum, line.strip())
                currentJobNum = None
            else:
                currentJobNum = jobNumMap.get(self._createId(line),
                                              currentJobNum)
        retCode = proc.status(timeout=0, terminate=True)
        activity.finish()

        if retCode != 0:
            if 'Keyboard interrupt raised by user' in proc.stderr.read(
                    timeout=0):
                utils.removeFiles([jobs, basePath])
                raise StopIteration
            else:
                self._log.log_process(proc,
                                      files={'jobs': SafeFile(jobs).read()})
            self._log.error('Trying to recover from error ...')
            for dirName in os.listdir(basePath):
                yield (None, os.path.join(basePath, dirName))

        # return unretrievable jobs
        for jobNum in todo:
            yield (jobNum, None)

        utils.removeFiles([jobs, basePath])
Example #50
    def submitJobs(self, jobNumListFull, module):
        submitBatch = 25
        for index in irange(0, len(jobNumListFull), submitBatch):
            jobNumList = jobNumListFull[index:index + submitBatch]
            self.debugOut("\nStarted submitting: %s" % jobNumList)
            self.debugPool()

            # get the full job config path and basename
            def _getJobCFG(jobNum):
                return os.path.join(self.getSandboxPath(jobNum), 'job_%d.var' %
                                    jobNum), 'job_%d.var' % jobNum

            activity = Activity('preparing jobs')
            # construct a temporary JDL for this batch of jobs
            jdlDescriptor, jdlFilePath = tempfile.mkstemp(suffix='.jdl')
            jdlSubmitPath = jdlFilePath
            self.debugOut("Writing temporary jdl to: " + jdlSubmitPath)
            try:
                data = self.makeJDLdata(jobNumList, module)
                utils.safeWrite(os.fdopen(jdlDescriptor, 'w'), data)
            except Exception:
                utils.removeFiles([jdlFilePath])
                raise BackendError('Could not write jdl data to %s.' %
                                   jdlFilePath)

            # create the _jobconfig.sh file containing the actual data
            for jobNum in jobNumList:
                try:
                    self._writeJobConfig(
                        _getJobCFG(jobNum)[0], jobNum, module, {})
                except Exception:
                    raise BackendError(
                        'Could not write _jobconfig data for %s.' % jobNum)

            self.debugOut("Copying to remote")
            # copy infiles to ssh/gsissh remote pool if required
            if self.remoteType in (PoolType.SSH, PoolType.GSISSH):
                activity = Activity('preparing remote scheduler')
                self.debugOut("Copying to sandbox")
                workdirBase = self.getWorkdirPath()
                # TODO: check whether shared remote files already exist and copy otherwise
                for _, fileSource, fileTarget in self._getSandboxFilesIn(
                        module):
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        fileSource, os.path.join(workdirBase, fileTarget))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()
                # copy job config files
                self.debugOut("Copying job configs")
                for jobNum in jobNumList:
                    fileSource, fileTarget = _getJobCFG(jobNum)
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        fileSource,
                        os.path.join(self.getWorkdirPath(jobNum), fileTarget))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()
                # copy jdl
                self.debugOut("Copying jdl")
                jdlSubmitPath = os.path.join(workdirBase,
                                             os.path.basename(jdlFilePath))
                copyProcess = self.Pool.LoggedCopyToRemote(
                    jdlFilePath, jdlSubmitPath)
                if copyProcess.wait() != 0:
                    if self.explainError(copyProcess, copyProcess.wait()):
                        pass
                    else:
                        copyProcess.logError(self.errorLog, brief=True)
                self.debugFlush()
                # copy proxy
                for authFile in self._token.getAuthFiles():
                    self.debugOut("Copying proxy")
                    copyProcess = self.Pool.LoggedCopyToRemote(
                        authFile,
                        os.path.join(self.getWorkdirPath(),
                                     os.path.basename(authFile)))
                    if copyProcess.wait() != 0:
                        if self.explainError(copyProcess, copyProcess.wait()):
                            pass
                        else:
                            copyProcess.logError(self.errorLog, brief=True)
                    self.debugFlush()

            self.debugOut("Starting jobs")
            try:
                # submit all jobs simultaneously and temporarily store verbose (ClassAdd) output
                activity = Activity('queuing jobs at scheduler')
                proc = self.Pool.LoggedExecute(
                    self.submitExec,
                    ' -verbose %(JDL)s' % {"JDL": jdlSubmitPath})

                self.debugOut("AAAAA")
                # extract the Condor ID (WMS ID) of the jobs from output ClassAds
                wmsJobIdList = []
                for line in proc.iter():
                    if "GridControl_GCIDtoWMSID" in line:
                        GCWMSID = line.split('=')[1].strip(' "\n').split('@')
                        GCID, WMSID = int(GCWMSID[0]), GCWMSID[1].strip()
                        # Condor creates a default job and overrides its settings for each subsequent one - deduplicate by GC ID to be safe
                        if (not wmsJobIdList) or (GCID not in lzip(
                                *wmsJobIdList)[0]):
                            wmsJobIdList.append((self._createId(WMSID), GCID))
                    if "GridControl_GCtoWMSID" in line:
                        self.debugOut("o : %s" % line)
                        self.debugOut("o : %s" % wmsJobIdList)

                retCode = proc.wait()
                activity.finish()
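                # a non-zero exit code or fewer reported IDs than requested jobs indicates a (partial) submission failure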
                if (retCode != 0) or (len(wmsJobIdList) < len(jobNumList)):
                    if self.explainError(proc, retCode):
                        pass
                    else:
                        self._log.error('Submitted %4d jobs of %4d expected',
                                        len(wmsJobIdList), len(jobNumList))
                        proc.logError(self.errorLog, jdl=jdlFilePath)
            finally:
                utils.removeFiles([jdlFilePath])
            self.debugOut("Done Submitting")

            # yield the (jobNum, WMS ID, other data) of each job successively
            for index in irange(len(wmsJobIdList)):
                yield (wmsJobIdList[index][1], wmsJobIdList[index][0], {})
            self.debugOut("Yielded submitted job")
            self.debugFlush()
Ejemplo n.º 51
0
def download_single_file(opts, jobnum, fi_idx, fi, status_mon):
    (source_se_path, target_se_path,
     local_se_path) = get_fi_path_tuple(opts, fi)
    show_file_info(jobnum, fi_idx, fi)

    # Copy files to local folder
    if not accepted_se(opts, fi):
        return status_mon.register_file_result(
            jobnum, fi_idx, 'skipping file on blacklisted SE',
            FileDownloadStatus.FILE_SE_BLACKLIST)
    activity_check = Activity('Checking file existence')
    try:
        if opts.skip_existing and (se_exists(target_se_path).status(
                timeout=10, terminate=True) == 0):
            return status_mon.register_file_result(
                jobnum, fi_idx, 'skipping already existing file',
                FileDownloadStatus.FILE_EXISTS)
    finally:
        activity_check.finish()
    if se_exists(os.path.dirname(target_se_path)).status(timeout=10,
                                                         terminate=True) != 0:
        activity = Activity('Creating target directory')
        try:
            mkdir_proc = se_mkdir(os.path.dirname(target_se_path))
            if mkdir_proc.status(timeout=10, terminate=True) != 0:
                return status_mon.register_file_result(
                    jobnum,
                    fi_idx,
                    'unable to create target dir',
                    FileDownloadStatus.FILE_MKDIR_FAILED,
                    proc=mkdir_proc)
        finally:
            activity.finish()

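    # if the target itself is local (file://), use the target path directly as the local path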
    if 'file://' in target_se_path:
        local_se_path = target_se_path
    copy_timeout_event = GCEvent()
    copy_ended_event = GCEvent()
    monitor_thread = start_daemon('Download monitor %s' % jobnum,
                                  download_monitor, jobnum, fi_idx, fi,
                                  local_se_path, copy_ended_event,
                                  copy_timeout_event)

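    # start the copy and poll until it finishes or the download monitor sets the timeout event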
    cp_proc = se_copy(source_se_path, target_se_path, tmp=local_se_path)
    while (cp_proc.status(timeout=0) is
           None) and not copy_timeout_event.wait(timeout=0.1):
        pass
    copy_ended_event.set()
    monitor_thread.join()

    if copy_timeout_event.is_set():
        cp_proc.terminate(timeout=1)
        return status_mon.register_file_result(jobnum, fi_idx,
                                               'Transfer timeout',
                                               FileDownloadStatus.FILE_TIMEOUT)
    elif cp_proc.status(timeout=0, terminate=True) != 0:
        return status_mon.register_file_result(jobnum,
                                               fi_idx,
                                               'Transfer error',
                                               FileDownloadStatus.FILE_TIMEOUT,
                                               proc=cp_proc)
    return hash_verify(opts, status_mon, local_se_path, jobnum, fi_idx, fi)
Ejemplo n.º 52
0
	def getWMS(self):
		activity = Activity('Discovering available WMS services')
		wms_best_list = []
		for wms in self.listWMS_good():
			if wms is None:
				continue
			activity_wms = Activity('pinging WMS %s' % wms)
			ping, pingtime = self.pingDict.get(wms, (None, 0))
			if time.time() - pingtime > 30 * 60: # check every ~30min
				ping = utils.ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
				self.pingDict[wms] = (ping, time.time() + 10 * 60 * random.random()) # 10 min variation
			if ping is not None:
				wms_best_list.append((wms, ping))
			activity_wms.finish()
		activity.finish()
		if not wms_best_list:
			return None
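		# rank reachable endpoints by ping time before picking one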
		sort_inplace(wms_best_list, key = lambda name_ping: name_ping[1])
		result = choice_exp(wms_best_list)
		if result is not None:
			activity = Activity('selecting WMS %s' % result)
			wms, ping = result # reduce timeout by 5min for chosen wms => re-ping every 6 submits
			self.pingDict[wms] = (ping, self.pingDict[wms][1] + 5*60)
			result = wms
			activity.finish()
		self.updateState()
		return result
Ejemplo n.º 53
0
def merge_batch(args):

    from gcSettings import Settings
    cfg = Settings()
    cfg.workflow.task = 'UserTask'
    cfg.workflow.backend = 'local'
    cfg.workflow.duration = '-1'

    cfg.jobs.wall_time = '3:00:00'
    cfg.jobs.memory = "6000"
    cfg.jobs.max_retry = 1

    cfg.usertask.executable = 'Artus/Utility/scripts/artus_userjob_epilog.sh'
    cmssw_base = os.getenv("CMSSW_BASE") + "/src/"
    executable = 'artusMergeOutputs.py '
    cfg.usertask.input_files = [
        cmssw_base + "Artus/Configuration/scripts/artusMergeOutputs.py"
    ]

    project_dirs = "-i " + " ".join(args.project_dir)
    outputs_per_nick = folders_to_merge(args)
    # extract nicks that should be run on
    cfg.parameters.parameters = ["NICK"]
    nicks_to_process = list(outputs_per_nick.keys()) if (
        args.project_subdir is None) else [args.project_subdir]  # keep only a single path
    input_dirs = []
    for project_dir in args.project_dir:
        for nick in nicks_to_process:
            input_dir = os.path.join(project_dir, nick)
            if os.path.exists(input_dir):
                input_dirs.append(input_dir)
            input_dir = os.path.join(project_dir, "output", nick)
            if os.path.exists(input_dir):
                input_dirs.append(input_dir)

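    # reserve twice the size of the largest input folder plus ~100 MiB of headroom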
    required_scratch_space = max(map(get_folder_size,
                                     input_dirs)) * 2 + 100 * 1024 * 1024
    cfg.backend.submit_options = "-l h_fsize=" + str(
        required_scratch_space // 1024 // 1024 // 1024) + "G"
    cfg.parameters.NICK = nicks_to_process
    cfg.jobs.jobs = len(nicks_to_process)

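    # assemble the command line for the epilog script: CMSSW base, merge script, project dirs and the @NICK@ placeholder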
    arguments = cmssw_base
    arguments = arguments + " " + executable
    arguments = arguments + " ".join(args.project_dir)
    arguments = arguments + " --project-subdir @NICK@ "
    if args.output_dir is not None:
        arguments = arguments + " --output-dir " + args.output_dir

    cfg.usertask.arguments = "%s" % arguments
    merged_directory = os.path.join(
        args.project_dir[0] if args.output_dir is None else args.output_dir,
        "merged")
    cfg.storage.se_path = merged_directory
    cfg.storage.scratch_space_used = required_scratch_space // 1024 // 1024
    cfg.storage.se_output_files = "merged.root"
    cfg.storage.se_output_pattern = "@NICK@/@[email protected]"
    cfg.GLOBAL.workdir = os.path.join(
        args.project_dir[0] if args.output_dir is None else args.output_dir,
        "workdir_merge")
    from grid_control.utils.activity import Activity
    Activity.root = Activity('Running grid-control', name='root')
    from gcTool import gc_create_workflow, gc_create_config
    config = gc_create_config(configDict=Settings.getConfigDict())

    workflow = gc_create_workflow(config)
    # activate for higher verbosity
    # logging.getLogger('process').setLevel(logging.DEBUG1)
    workflow.run()
Ejemplo n.º 54
0
	def splitDataset(self, path, blocks):
		activity = Activity('Splitting dataset into jobs')
		self.savePartitions(path, self.splitDatasetInternal(blocks))
		self.importPartitions(path)
		activity.finish()
Ejemplo n.º 55
0
    def _begin_bulk_submission(self):
        self._set_proxy_lifetime()
        if self._end_of_proxy_lifetime is None:
            raise Exception("_end_of_proxy_lifetime is not set")

        if self._delegated_proxy_filename is None:
            raise Exception("_delegated_proxy_filename is not set")

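        # proxy already expired: remove the lock and the stored delegation in the background and force a renewal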
        if self._end_of_proxy_lifetime <= time.time():
            self._log.info(
                "proxy renewal is necessary: %s <= %s" %
                (str(self._end_of_proxy_lifetime), str(time.time())))
            x = threading.Thread(target=CreamWMS.delfile,
                                 args=(self._lock_filename, 0, self._log))
            x.start()
            y = threading.Thread(target=CreamWMS.delfile,
                                 args=(self._delegated_proxy_filename, 0,
                                       self._log))
            y.start()
            raise Exception("proxy renewal is necessary")

        elif self._submit_args_dict.get('-D') is not None:
            try:
                left_time_str = timedelta(
                    seconds=self._end_of_proxy_lifetime - time.time())
            except Exception:
                left_time_str = str(
                    self._end_of_proxy_lifetime - time.time()) + ' sec.'
            self._log.info(
                "Proxy delegation is not re-issued since the existing one is expected to still be valid, time left: %s"
                % left_time_str)

        else:
            if os.path.isfile(self._delegated_proxy_filename):
                with open(self._delegated_proxy_filename, "r") as delegation_file:
                    delegate_id = delegation_file.read()
                # empty file -> another process might still be editing it
                # if delegate_id is None or delegate_id == '': return False
                if delegate_id is not None and delegate_id != "":
                    self._submit_args_dict.update({'-D': delegate_id})
                self._log.info('Proxy delegation read from file: %s' % delegate_id)

            elif not os.path.isfile(self._delegated_proxy_lock):
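                # no delegation stored yet and no other process holds the lock: create a fresh delegation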
                file_lock = open(self._delegated_proxy_lock, "w+")
                delegation_file = open(self._delegated_proxy_filename, "w+")

                activity = Activity('Delegating proxy for job submission')
                self._submit_args_dict.update({'-D': None})
                #if self._use_delegate is False:
                #	self._submit_args_dict.update({'-a': ' '})
                #	return True
                t = time.time()
                thehex = md5_hex(str(t))
                self._log.info('Proxy delegation full hex: %s at time %s' %
                               (thehex, str(t)))
                delegate_id = 'GCD' + thehex[:15]
                delegate_arg_list = ['-e', self._ce[:self._ce.rfind("/")]]
                if self._config_fn:
                    delegate_arg_list.extend(['--config', self._config_fn])
                proc = LocalProcess(self._delegate_exec, '-d', delegate_id,
                                    '--logfile', '/dev/stderr',
                                    *delegate_arg_list)
                output = proc.get_output(timeout=10, raise_errors=False)
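                # the match string below is kept exactly as the delegation command prints it (including the spelling)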
                if ('succesfully delegated to endpoint'
                        in output) and (delegate_id in output):
                    self._submit_args_dict.update({'-D': delegate_id})
                activity.finish()

                if proc.status(timeout=0, terminate=True) != 0:
                    self._log.log_process(proc)

                delegation_file.write(delegate_id)
                delegation_file.close()
                file_lock.close()
                y = threading.Thread(target=CreamWMS.delfile,
                                     args=(self._delegated_proxy_lock, 0,
                                           self._log))
                y.start()

        return self._submit_args_dict.get('-D') is not None