Ejemplo n.º 1
0
	def setupJobParameters(self, config, pm):
		"""Set up dataset-driven job parameters from the 'dataset' config section.

		Returns early (leaving dataSplitter/dataRefresh as None) when no dataset
		is configured.  Otherwise creates the data provider and splitter, registers
		a DataParameterSource under the key 'data', configures periodic resyncing,
		and installs a SIGUSR2 handler for externally triggered resyncs.
		Raises UserError if the splitter produces zero jobs.
		"""
		# Scope the config view to the 'dataset' section tagged with this task
		config = config.addSections(['dataset']).addTags([self])
		self.dataSplitter = None
		self.dataRefresh = None
		self.dataset = config.get('dataset', '').strip()
		if self.dataset == '':
			return
		# Dataset defaults only apply when a dataset is actually configured;
		# override = False keeps any user-supplied values
		config.set('se output pattern', '@NICK@_job_@MY_JOBID@_@X@', override = False)
		config.set('default lookup', 'DATASETNICK', override = False)

		defaultProvider = config.get('dataset provider', 'ListProvider')
		dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
		splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
		# The provider may veto splitters that are incompatible with its blocks
		splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
		self.dataSplitter = splitterClass(config)
		self.checkSE = config.getBool('dataset storage check', True, onChange = None)

		# Create and register dataset parameter plugin
		paramSource = DataParameterSource(config.getWorkPath(), 'data',
			dataProvider, self.dataSplitter, self.initDataProcessor())
		DataParameterSource.datasetsAvailable['data'] = paramSource

		# Select dataset refresh rate
		self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
		if self.dataRefresh > 0:
			# Never poll faster than the provider's own query limit allows
			paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
			utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
		else:
			paramSource.resyncSetup(interval = 0)
		# SIGUSR2 forces an immediate dataset resync on the next opportunity
		def externalRefresh(sig, frame):
			paramSource.resyncSetup(force = True)
		signal.signal(signal.SIGUSR2, externalRefresh)

		if self.dataSplitter.getMaxJobs() == 0:
			raise UserError('There are no events to process')
Ejemplo n.º 2
0
	def getEntries(self, path, metadata, events, seList, objStore):
		"""Yield the entry tuple after adding parent dataset paths to metadata['PARENT_PATH']."""
		# Fall back to the work-dir dataset cache when no explicit source is set
		cachePath = os.path.join(objStore.get('GC_WORKDIR', ''), 'datacache.dat')
		useCache = (self.source == '') and os.path.exists(cachePath)
		source = utils.QM(useCache, cachePath, self.source)
		if source and (source not in self.lfnMap):
			# First use of this source: build its LFN -> dataset name lookup table
			blockSource = DataProvider.create(createConfigFactory().getConfig(), source, 'ListProvider')
			lookup = self.lfnMap.setdefault(source, {})
			for block in blockSource.getBlocks():
				dsName = block[DataProvider.Dataset]
				for fileInfo in block[DataProvider.FileList]:
					lookup[self.lfnTrans(fileInfo[DataProvider.URL])] = dsName
		# Collect the (possibly missing -> None) parent entries for all parent keys
		parents = set()
		for key in self.parentKeys:
			if key in metadata:
				for pPath in metadata[key]:
					parents.add(self.lfnMap.get(source, {}).get(self.lfnTrans(pPath)))
		# Drop unresolved (falsy) lookups before storing
		metadata['PARENT_PATH'] = [entry for entry in parents if entry]
		yield (path, metadata, events, seList, objStore)
Ejemplo n.º 3
0
	def setupJobParameters(self, config, pm):
		"""Set up dataset-driven job parameters from the 'dataset' config section.

		Returns early (leaving dataSplitter/dataRefresh as None) when no dataset
		is configured.  Otherwise creates the data provider, splitter and split
		processor, registers a DataParameterSource under the key 'data',
		configures periodic resyncing, and installs a SIGUSR2 handler for
		externally triggered resyncs.  Raises UserError if the splitter
		produces zero jobs.
		"""
		# Scope the config view to the 'dataset' section tagged with this task
		config = config.changeView(viewClass = TaggedConfigView, addSections = ['dataset'], addTags = [self])
		self.dataSplitter = None
		self.dataRefresh = None
		self._forceRefresh = config.getState('resync', detail = 'dataset', default = False)
		# onChange hook: a changed dataset expression forces a resync; attaching
		# a dataset to a previously dataset-free task is not supported
		def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
			if ((old_obj == '') and (cur_obj != '')):
				raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
			self._forceRefresh = True
			return cur_obj
		self.dataset = config.get('dataset', '', onChange = userRefresh).strip()
		if self.dataset == '':
			return
		# Dataset defaults only apply when a dataset is actually configured
		config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
		config.set('default lookup', 'DATASETNICK')

		defaultProvider = config.get('dataset provider', 'ListProvider')
		dataProvider = DataProvider.create(config, self.dataset, defaultProvider)
		splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
		# The provider may veto splitters that are incompatible with its blocks
		splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
		self.dataSplitter = splitterClass(config)

		# Create and register dataset parameter source
		paramSplitProcessor = config.getCompositePlugin('dataset processor',
			'BasicDataSplitProcessor SECheckSplitProcessor', 'MultiDataSplitProcessor',
			cls = DataSplitProcessor).getInstance(config)
		paramSource = DataParameterSource(config.getWorkPath(), 'data',
			dataProvider, self.dataSplitter, paramSplitProcessor)
		DataParameterSource.datasetsAvailable['data'] = paramSource

		# Select dataset refresh rate
		self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
		if self.dataRefresh > 0:
			# Never poll faster than the provider's own query limit allows
			paramSource.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
			utils.vprint('Dataset source will be queried every %s' % utils.strTime(self.dataRefresh), -1)
		else:
			paramSource.resyncSetup(interval = 0)
		# Honor resync requests recorded in config state or by the onChange hook
		if self._forceRefresh:
			paramSource.resyncSetup(force = True)
		# SIGUSR2 forces an immediate dataset resync on the next opportunity
		def externalRefresh(sig, frame):
			paramSource.resyncSetup(force = True)
		signal.signal(signal.SIGUSR2, externalRefresh)

		if self.dataSplitter.getMaxJobs() == 0:
			raise UserError('There are no events to process')
Ejemplo n.º 4
0
	def __init__(self, config):
		"""Create a list-based data provider for the configured 'source dataset path'."""
		dsPath = config.get('source dataset path')
		# NOTE(review): this DataProvider.create signature takes 4 args
		# (config, section, expr, provider) unlike the 3-arg calls elsewhere
		# in this collection — presumably a different API version; confirm.
		self.source = DataProvider.create(config, None, dsPath, 'ListProvider')
Ejemplo n.º 5
0
def main():
	dataset = args[0].strip()
	cfgSettings = {'dbs blacklist T1': 'False', 'remove empty blocks': 'False',
		'remove empty files': 'False', 'location format': opts.locationfmt,
		'nickname check collision': 'False'}
	if opts.metadata or opts.blockmetadata:
		cfgSettings['lumi filter'] = '-'
		cfgSettings['keep lumi metadata'] = 'True'
	section = 'dataset'

	fillerList = [DefaultFilesConfigFiller()]
	if opts.settings:
		fillerList.append(FileConfigFiller([opts.settings]))
		tmpCfg = Config(fillerList, opts.settings)
		section = tmpCfg.get('global', ['task', 'module'])

	dummyConfig = Config(fillerList + [DictConfigFiller({section: cfgSettings})], opts.settings)
	dummyConfig.opts = opts
	dummyConfig = dummyConfig.addSections(['dataset'])

	if os.path.exists(dataset):
		provider = DataProvider.loadState(dataset, dummyConfig)
	else:
		provider = DataProvider.create(dummyConfig, dataset, opts.provider)
	blocks = provider.getBlocks()
	if len(blocks) == 0:
		raise DatasetError('No blocks!')

	datasets = set(map(lambda x: x[DataProvider.Dataset], blocks))
	if len(datasets) > 1 or opts.info:
		headerbase = [(DataProvider.Dataset, 'Dataset')]
	else:
		print 'Dataset: %s' % blocks[0][DataProvider.Dataset]
		headerbase = []

	if opts.configentry:
		print
		print 'dataset ='
		infos = {}
		order = []
		maxnick = 5
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = dict([(DataProvider.Dataset, dsName)])
				if DataProvider.Nickname not in block and opts.confignick:
					try:
						if '/' in dsName: 
							block[DataProvider.Nickname] = dsName.lstrip('/').split('/')[1]
						else:
							block[DataProvider.Nickname] = dsName
					except:
						pass
				if DataProvider.Nickname not in block and opts.confignick:
					block[DataProvider.Nickname] = np.getName(None, dsName, block)
				if DataProvider.Nickname in block:
					nick = block[DataProvider.Nickname]
					infos[dsName][DataProvider.Nickname] = nick
					maxnick = max(maxnick, len(nick))
				if len(block[DataProvider.FileList]):
					infos[dsName][DataProvider.URL] = block[DataProvider.FileList][0][DataProvider.URL]
		for dsID, dsName in enumerate(order):
			info = infos[dsName]
			short = DataProvider.providers.get(provider.__class__.__name__, provider.__class__.__name__)
			print '', info.get(DataProvider.Nickname, 'nick%d' % dsID).rjust(maxnick), ':', short, ':',
			print '%s%s' % (provider._datasetExpr, QM(short == 'list', ' %% %s' % info[DataProvider.Dataset], ''))


	if opts.listdatasets:
		# Add some enums for consistent access to info dicts
		DataProvider.NFiles = -1
		DataProvider.NBlocks = -2

		print
		infos = {}
		order = []
		infosum = {DataProvider.Dataset : 'Sum'}
		for block in blocks:
			dsName = block.get(DataProvider.Dataset, '')
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = {DataProvider.Dataset: block[DataProvider.Dataset]}
			def updateInfos(target):
				target[DataProvider.NBlocks]  = target.get(DataProvider.NBlocks, 0) + 1
				target[DataProvider.NFiles]   = target.get(DataProvider.NFiles, 0) + len(block[DataProvider.FileList])
				target[DataProvider.NEntries] = target.get(DataProvider.NEntries, 0) + block[DataProvider.NEntries]
			updateInfos(infos[dsName])
			updateInfos(infosum)
		head = [(DataProvider.Dataset, 'Dataset'), (DataProvider.NEntries, '#Events'),
			(DataProvider.NBlocks, '#Blocks'), (DataProvider.NFiles, '#Files')]
		utils.printTabular(head, map(lambda x: infos[x], order) + ["=", infosum])

	if opts.listblocks:
		print
		utils.printTabular(headerbase + [(DataProvider.BlockName, 'Block'), (DataProvider.NEntries, 'Events')], blocks)

	if opts.listfiles:
		print
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			utils.printTabular([(DataProvider.URL, 'Filename'), (DataProvider.NEntries, 'Events')], block[DataProvider.FileList])
			print

	def printMetadata(src, maxlen):
		for (mk, mv) in src:
			if len(str(mv)) > 200:
				mv = '<metadata entry size: %s> %s...' % (len(str(mv)), repr(mv)[:200])
			print '\t%s: %s' % (mk.rjust(maxlen), mv)
		if src:
			print

	if opts.metadata and not opts.save:
		print
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			mk_len = max(map(len, block.get(DataProvider.Metadata, [''])))
			for f in block[DataProvider.FileList]:
				print '%s [%d events]' % (f[DataProvider.URL], f[DataProvider.NEntries])
				printMetadata(zip(block.get(DataProvider.Metadata, []), f.get(DataProvider.Metadata, [])), mk_len)
			print

	if opts.blockmetadata and not opts.save:
		for block in blocks:
			if len(datasets) > 1:
				print 'Dataset: %s' % block[DataProvider.Dataset]
			print 'Blockname: %s' % block[DataProvider.BlockName]
			mkdict = lambda x: dict(zip(block[DataProvider.Metadata], x[DataProvider.Metadata]))
			metadata = QM(block[DataProvider.FileList], mkdict(block[DataProvider.FileList][0]), {})
			for fileInfo in block[DataProvider.FileList]:
				utils.intersectDict(metadata, mkdict(fileInfo))
			printMetadata(metadata.items(), max(map(len, metadata.keys())))

	if opts.liststorage:
		print
		infos = {}
		print 'Storage elements:'
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if len(headerbase) > 0:
				print 'Dataset: %s' % dsName
			if block.get(DataProvider.BlockName, None):
				print 'Blockname: %s' % block[DataProvider.BlockName]
			if block[DataProvider.Locations] == None:
				print '\tNo location contraint specified'
			elif block[DataProvider.Locations] == []:
				print '\tNot located at anywhere'
			else:
				for se in block[DataProvider.Locations]:
					print '\t%s' % se
			print

	if opts.info:
		evSum = 0
		for block in blocks:
			print block.get(DataProvider.Dataset, '-'),
			print block.get(DataProvider.BlockName, '-'),
			if block.get(DataProvider.Locations, None):
				print str.join(',', block.get(DataProvider.Locations, '-')),
			else:
				print '-',
			print block.get(DataProvider.NEntries, 0),
			evSum += block.get(DataProvider.NEntries, 0)
			print evSum

	if opts.save:
		print
		blocks = provider.getBlocks()
		if opts.sort:
			blocks.sort(key = lambda b: b[DataProvider.Dataset] + '#' + b[DataProvider.BlockName])
			for b in blocks:
				b[DataProvider.FileList].sort(key = lambda fi: fi[DataProvider.URL])
		provider.saveState(opts.save, blocks)
		print 'Dataset information saved to ./%s' % opts.save
Ejemplo n.º 6
0
def main():
	"""Inspect a dataset and print the information selected via command line options.

	Relies on the module-level 'args' (positional arguments) and 'opts'
	(parsed options) provided by the surrounding script.  Depending on the
	selected options it prints config entries, dataset/block/file listings,
	metadata, storage locations or summary info, and can save the dataset
	state to a file.  Raises DatasetError when the provider yields no blocks.
	"""
	dataset = args[0].strip()
	# Baseline settings that disable dataset filtering/cleanup for inspection
	cfgSettings = {'dbs blacklist T1 *': 'False', 'remove empty blocks *': 'False',
		'remove empty files *': 'False', 'location format *': opts.locationfmt,
		'nickname check collision *': 'False'}
	if opts.metadata or opts.blockmetadata:
		cfgSettings['lumi filter *'] = '-'
		cfgSettings['keep lumi metadata *'] = 'True'

	config = getConfig(configFile = opts.settings, configDict = {'dataset': cfgSettings})

	# A path argument means a previously saved dataset state; otherwise query a provider
	if os.path.exists(dataset):
		provider = DataProvider.getInstance('ListProvider', config, dataset, None)
	else:
		provider = DataProvider.create(config, dataset, opts.provider)
	blocks = provider.getBlocks()
	if len(blocks) == 0:
		raise DatasetError('No blocks!')

	datasets = set(map(lambda x: x[DataProvider.Dataset], blocks))
	if len(datasets) > 1 or opts.info:
		headerbase = [(DataProvider.Dataset, 'Dataset')]
	else:
		print('Dataset: %s' % blocks[0][DataProvider.Dataset])
		headerbase = []

	if opts.configentry:
		# Print a ready-to-paste 'dataset =' config entry per dataset
		print('')
		print('dataset =')
		infos = {}
		order = []
		maxnick = 5
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = dict([(DataProvider.Dataset, dsName)])
				if DataProvider.Nickname not in block and opts.confignick:
					try:
						# Derive a nickname from the dataset path when possible
						if '/' in dsName:
							block[DataProvider.Nickname] = dsName.lstrip('/').split('/')[1]
						else:
							block[DataProvider.Nickname] = dsName
					except Exception:
						pass
				if DataProvider.Nickname not in block and opts.confignick:
					block[DataProvider.Nickname] = np.getName(None, dsName, block)
				if DataProvider.Nickname in block:
					nick = block[DataProvider.Nickname]
					infos[dsName][DataProvider.Nickname] = nick
					maxnick = max(maxnick, len(nick))
				if len(block[DataProvider.FileList]):
					infos[dsName][DataProvider.URL] = block[DataProvider.FileList][0][DataProvider.URL]
		for dsID, dsName in enumerate(order):
			info = infos[dsName]
			short = DataProvider.providers.get(provider.__class__.__name__, provider.__class__.__name__)
			nickname = info.get(DataProvider.Nickname, 'nick%d' % dsID).rjust(maxnick)
			filterExpr = utils.QM(short == 'list', ' %% %s' % info[DataProvider.Dataset], '')
			print('\t%s : %s : %s%s' % (nickname, short, provider._datasetExpr, filterExpr))


	if opts.listdatasets:
		# Add some enums for consistent access to info dicts
		DataProvider.NFiles = -1
		DataProvider.NBlocks = -2

		print('')
		infos = {}
		order = []
		infosum = {DataProvider.Dataset : 'Sum'}
		for block in blocks:
			dsName = block.get(DataProvider.Dataset, '')
			if not infos.get(dsName, None):
				order.append(dsName)
				infos[dsName] = {DataProvider.Dataset: block[DataProvider.Dataset]}
			# Accumulate block/file/event counts for one dataset and the grand total
			def updateInfos(target):
				target[DataProvider.NBlocks]  = target.get(DataProvider.NBlocks, 0) + 1
				target[DataProvider.NFiles]   = target.get(DataProvider.NFiles, 0) + len(block[DataProvider.FileList])
				target[DataProvider.NEntries] = target.get(DataProvider.NEntries, 0) + block[DataProvider.NEntries]
			updateInfos(infos[dsName])
			updateInfos(infosum)
		head = [(DataProvider.Dataset, 'Dataset'), (DataProvider.NEntries, '#Events'),
			(DataProvider.NBlocks, '#Blocks'), (DataProvider.NFiles, '#Files')]
		# list(...) is required: under Python 3 map() returns an iterator,
		# and 'map(...) + list' raises TypeError
		utils.printTabular(head, list(map(lambda x: infos[x], order)) + ['=', infosum])

	if opts.listblocks:
		print('')
		utils.printTabular(headerbase + [(DataProvider.BlockName, 'Block'), (DataProvider.NEntries, 'Events')], blocks)

	if opts.listfiles:
		print('')
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			utils.printTabular([(DataProvider.URL, 'Filename'), (DataProvider.NEntries, 'Events')], block[DataProvider.FileList])
			print('')

	def printMetadata(src, maxlen):
		"""Print (key, value) metadata pairs, truncating oversized values."""
		for (mk, mv) in src:
			if len(str(mv)) > 200:
				mv = '<metadata entry size: %s> %s...' % (len(str(mv)), repr(mv)[:200])
			print('\t%s: %s' % (mk.rjust(maxlen), mv))
		if src:
			print('')

	if opts.metadata and not opts.save:
		print('')
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			mk_len = max(map(len, block.get(DataProvider.Metadata, [''])))
			for f in block[DataProvider.FileList]:
				print('%s [%d events]' % (f[DataProvider.URL], f[DataProvider.NEntries]))
				printMetadata(zip(block.get(DataProvider.Metadata, []), f.get(DataProvider.Metadata, [])), mk_len)
			print('')

	if opts.blockmetadata and not opts.save:
		for block in blocks:
			if len(datasets) > 1:
				print('Dataset: %s' % block[DataProvider.Dataset])
			print('Blockname: %s' % block[DataProvider.BlockName])
			# Intersect per-file metadata to find values common to the whole block
			mkdict = lambda x: dict(zip(block[DataProvider.Metadata], x[DataProvider.Metadata]))
			metadata = utils.QM(block[DataProvider.FileList], mkdict(block[DataProvider.FileList][0]), {})
			for fileInfo in block[DataProvider.FileList]:
				utils.intersectDict(metadata, mkdict(fileInfo))
			printMetadata(metadata.items(), max(map(len, metadata.keys())))

	if opts.liststorage:
		print('')
		infos = {}
		print('Storage elements:')
		for block in blocks:
			dsName = block[DataProvider.Dataset]
			if len(headerbase) > 0:
				print('Dataset: %s' % dsName)
			if block.get(DataProvider.BlockName, None):
				print('Blockname: %s' % block[DataProvider.BlockName])
			if block[DataProvider.Locations] is None:  # was '== None'
				print('\tNo location constraint specified')
			elif block[DataProvider.Locations] == []:
				print('\tNot located at anywhere')
			else:
				for se in block[DataProvider.Locations]:
					print('\t%s' % se)
			print('')

	if opts.info:
		evSum = 0
		for block in blocks:
			blockId = '%s %s' % (block.get(DataProvider.Dataset, '-'), block.get(DataProvider.BlockName, '-'))
			blockStorage = '-'
			if block.get(DataProvider.Locations, None):
				blockStorage = str.join(',', block.get(DataProvider.Locations, '-'))
			evSum += block.get(DataProvider.NEntries, 0)
			print('%s %s %d %d' % (blockId, blockStorage, block.get(DataProvider.NEntries, 0), evSum))

	if opts.save:
		print('')
		blocks = provider.getBlocks()
		if opts.sort:
			# Stable, reproducible output: sort blocks and their file lists
			blocks.sort(key = lambda b: b[DataProvider.Dataset] + '#' + b[DataProvider.BlockName])
			for b in blocks:
				b[DataProvider.FileList].sort(key = lambda fi: fi[DataProvider.URL])
		provider.saveState(opts.save, blocks)
		print('Dataset information saved to ./%s' % opts.save)