def __init__(self, url):
    """Set up the DBS3 service endpoints and a proxy-authenticated JSON client."""
    # Each DBS3 service lives under its own path below the common base URL.
    for (attr_name, service_name) in [('_reader_url', 'DBSReader'),
            ('_writer_url', 'DBSWriter'), ('_migrate_url', 'DBSMigrate')]:
        setattr(self, attr_name, '%s/%s' % (url, service_name))
    self._proxy_path = os.environ.get('X509_USER_PROXY', '')
    # DBS3 queries require a valid VOMS proxy certificate file on disk.
    if not os.path.exists(self._proxy_path):
        raise UserError(
            'VOMS proxy needed to query DBS3! Environment variable X509_USER_PROXY is "%s"'
            % self._proxy_path)
    self._jrc = JSONRestClient(cert=self._proxy_path)
# Example #2 (score: 0)
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        """Read all CMS dataset options and parse the dataset expression.

        dataset_expr has the form '<dataset path>[@<instance>][#<block>]'.
        NOTE(review): statement order looks significant - the config.set calls
        run before DataProvider.__init__, which presumably instantiates the
        processors they configure; confirm before reordering.
        """
        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        # Lumi filter is looked up under a generic and a datasource-specific key
        self._lumi_filter = dataset_config.get_lookup(
            ['lumi filter', '%s lumi filter' % datasource_name],
            default={},
            parser=parse_lumi_filter,
            strfun=str_lumi)
        if not self._lumi_filter.empty():
            config.set('%s processor' % datasource_name, 'LumiDataProcessor',
                       '+=')
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = dataset_config.get_bool(
            ['lumi metadata',
             '%s lumi metadata' % datasource_name],
            default=not self._lumi_filter.empty())
        config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
        # PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
        self._phedex_filter = dataset_config.get_filter(
            'phedex sites',
            '-* T1_*_Disk T2_* T3_*',
            default_matcher='BlackWhiteMatcher',
            default_filter='StrictListFilter')
        self._only_complete = dataset_config.get_bool('only complete sites',
                                                      True)
        self._only_valid = dataset_config.get_bool('only valid', True)
        self._allow_phedex = dataset_config.get_bool('allow phedex', True)
        self._location_format = dataset_config.get_enum(
            'location format', CMSLocationFormat, CMSLocationFormat.hostname)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        # Split '<path>[@<instance>][#<block>]' into its three components
        dataset_expr_parts = split_opt(dataset_expr, '@#')
        (self._dataset_path, self._dataset_instance,
         self._dataset_block_selector) = dataset_expr_parts
        instance_default = dataset_config.get('dbs instance', '')
        self._dataset_instance = self._dataset_instance or instance_default
        # Missing/partial instance defaults to the production global DBS
        if not self._dataset_instance:
            self._dataset_instance = 'prod/global'
        elif '/' not in self._dataset_instance:
            self._dataset_instance = 'prod/%s' % self._dataset_instance
        self._dataset_block_selector = self._dataset_block_selector or 'all'
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		"""Read all CMS dataset options and parse the dataset expression.

		dataset_expr has the form '<dataset path>[@<instance>][#<block>]'.
		NOTE(review): statement order looks significant - config.set runs before
		DataProvider.__init__, which presumably instantiates the configured
		processors; confirm before reordering.
		"""
		dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
		self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
			default={}, parser=parse_lumi_filter, strfun=str_lumi)
		if not self._lumi_filter.empty():
			config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = dataset_config.get_bool(
			['lumi metadata', '%s lumi metadata' % datasource_name], default=not self._lumi_filter.empty())
		config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
		# PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
		self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
		self._only_complete = dataset_config.get_bool('only complete sites', True)
		self._only_valid = dataset_config.get_bool('only valid', True)
		self._location_format = dataset_config.get_enum('location format',
			CMSLocationFormat, CMSLocationFormat.hostname)
		self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		# Split '<path>[@<instance>][#<block>]' into its three components
		dataset_expr_parts = split_opt(dataset_expr, '@#')
		(self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
		instance_default = dataset_config.get('dbs instance', '')
		self._dataset_instance = self._dataset_instance or instance_default
		# Missing/partial instance defaults to the production global DBS
		if not self._dataset_instance:
			self._dataset_instance = 'prod/global'
		elif '/' not in self._dataset_instance:
			self._dataset_instance = 'prod/%s' % self._dataset_instance
		self._dataset_block_selector = self._dataset_block_selector or 'all'
# Example #4 (score: 0)
	def __init__(self, config, datasetExpr, datasetNick = None):
		"""Read all CMS dataset options and parse the dataset expression.

		datasetExpr appears to have the form '<path>[@<instance>][#<block>]'
		(split on '@#' below) - TODO confirm against optSplit.
		"""
		self._changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = self._changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = self._changeTrigger)
		config.set('phedex sites matcher mode', 'shell', '?=')
		# PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			defaultMatcher = 'blackwhite', defaultFilter = 'strict', onChange = self._changeTrigger)
		self._onlyComplete = config.getBool('only complete sites', True, onChange = self._changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = self._changeTrigger)
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		(self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
		instance_default = config.get('dbs instance', '', onChange = self._changeTrigger)
		self._datasetInstance = self._datasetInstance or instance_default
		# Missing/partial instance defaults to the production global DBS
		if not self._datasetInstance:
			self._datasetInstance = 'prod/global'
		elif '/' not in self._datasetInstance:
			self._datasetInstance = 'prod/%s' % self._datasetInstance
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = self._changeTrigger)
# Example #5 (score: 0)
    def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
        """Read all CMS dataset options and parse the dataset expression.

        datasetExpr appears to have the form '<path>[@<url>][#<block>]'
        (split on '@#' below) - TODO confirm against optSplit.
        """
        changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick,
                              datasetID)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=changeTrigger)
        # PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-T3_US_FNALLPC',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='weak',
                                              onChange=changeTrigger)
        self._phedexT1Filter = config.getFilter('phedex t1 accept',
                                                'T1_DE_KIT T1_US_FNAL',
                                                defaultMatcher='blackwhite',
                                                defaultFilter='weak',
                                                onChange=changeTrigger)
        self._phedexT1Mode = config.getEnum('phedex t1 mode',
                                            PhedexT1Mode,
                                            PhedexT1Mode.disk,
                                            onChange=changeTrigger)
        self.onlyComplete = config.getBool('only complete sites',
                                           True,
                                           onChange=changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=changeTrigger)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )

        (self._datasetPath, self._url,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        self._url = self._url or config.get('dbs instance', '')
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=changeTrigger)
# Example #6 (score: 0)
    def __init__(self, config, datasetExpr, datasetNick=None):
        """Read all CMS dataset options and parse the dataset expression.

        datasetExpr appears to have the form '<path>[@<instance>][#<block>]'
        (split on '@#' below) - TODO confirm against optSplit.
        """
        self._changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=self._changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=self._changeTrigger)
        config.set('phedex sites matcher mode', 'shell', '?=')
        # PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-* T1_*_Disk T2_* T3_*',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='strict',
                                              onChange=self._changeTrigger)
        self._onlyComplete = config.getBool('only complete sites',
                                            True,
                                            onChange=self._changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=self._changeTrigger)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        (self._datasetPath, self._datasetInstance,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        instance_default = config.get('dbs instance',
                                      '',
                                      onChange=self._changeTrigger)
        self._datasetInstance = self._datasetInstance or instance_default
        # Missing/partial instance defaults to the production global DBS
        if not self._datasetInstance:
            self._datasetInstance = 'prod/global'
        elif '/' not in self._datasetInstance:
            self._datasetInstance = 'prod/%s' % self._datasetInstance
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=self._changeTrigger)
# Example #7 (score: 0)
def _lfn2pfn(node, lfn, prot='srmv2'):
    """Map a logical file name to physical file names via the PhEDEx data service."""
    query = {'node': node, 'protocol': prot, 'lfn': lfn}
    response = JSONRestClient().get(
        url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/lfn2pfn',
        params=query)
    return response['phedex']['mapping']
class DBS3LiteClient(object):
    """Minimal DBS3 REST client covering the reader, writer and migration services."""

    def __init__(self, url):
        # Each DBS3 service lives under its own path below the common base URL.
        for (attr_name, service_name) in [('_reader_url', 'DBSReader'),
                ('_writer_url', 'DBSWriter'), ('_migrate_url', 'DBSMigrate')]:
            setattr(self, attr_name, '%s/%s' % (url, service_name))
        self._proxy_path = os.environ.get('X509_USER_PROXY', '')
        # DBS3 queries require a valid VOMS proxy certificate file on disk.
        if not os.path.exists(self._proxy_path):
            raise UserError(
                'VOMS proxy needed to query DBS3! Environment variable X509_USER_PROXY is "%s"'
                % self._proxy_path)
        self._jrc = JSONRestClient(cert=self._proxy_path)

    def listBlocks(self, **kwargs):
        """Query block metadata from the DBS reader service."""
        return self._jrc.get(url=self._reader_url, api='blocks', params=kwargs)

    def listFiles(self, **kwargs):
        """Query file metadata from the DBS reader service."""
        return self._jrc.get(url=self._reader_url, api='files', params=kwargs)

    def listFileParents(self, **kwargs):
        """Query file parentage information from the DBS reader service."""
        return self._jrc.get(url=self._reader_url, api='fileparents', params=kwargs)

    def insertBulkBlock(self, data):
        """Insert a complete block via the DBS writer service."""
        return self._jrc.post(url=self._writer_url, api='bulkblocks', data=data)

    def migrateSubmit(self, data):
        """Submit a migration request to the DBS migration service."""
        return self._jrc.post(url=self._migrate_url, api='submit', data=data)

    def migrateStatus(self, **kwargs):
        """Query the status of a previously submitted migration request."""
        return self._jrc.get(url=self._migrate_url, api='status', params=kwargs)
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		"""Read all CMS dataset options and parse the dataset expression.

		datasetExpr appears to have the form '<path>[@<url>][#<block>]'
		(split on '@#' below) - TODO confirm against optSplit.
		"""
		changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = changeTrigger)
		# PhEDEx blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange = changeTrigger)
		self.onlyComplete = config.getBool('only complete sites', True, onChange = changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = changeTrigger)
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')

		(self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
		self._url = self._url or config.get('dbs instance', '')
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = changeTrigger)
# Example #10 (score: 0)
class CMSBaseProvider(DataProvider):
	"""Base class for CMS dataset providers querying DBS and PhEDEx."""

	def __init__(self, config, datasetExpr, datasetNick = None):
		"""Read all CMS dataset options and parse the dataset expression.

		datasetExpr appears to have the form '<path>[@<instance>][#<block>]'
		(split on '@#' below) - TODO confirm against optSplit.
		"""
		self._changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = self._changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = self._changeTrigger)
		config.set('phedex sites matcher mode', 'shell', '?=')
		# PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
		self._phedexFilter = config.getFilter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			defaultMatcher = 'blackwhite', defaultFilter = 'strict', onChange = self._changeTrigger)
		self._onlyComplete = config.getBool('only complete sites', True, onChange = self._changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = self._changeTrigger)
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		(self._datasetPath, self._datasetInstance, self._datasetBlock) = optSplit(datasetExpr, '@#')
		instance_default = config.get('dbs instance', '', onChange = self._changeTrigger)
		self._datasetInstance = self._datasetInstance or instance_default
		# Missing/partial instance defaults to the production global DBS
		if not self._datasetInstance:
			self._datasetInstance = 'prod/global'
		elif '/' not in self._datasetInstance:
			self._datasetInstance = 'prod/%s' % self._datasetInstance
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = self._changeTrigger)


	# Define how often the dataprovider can be queried automatically
	def queryLimit(self):
		return 2 * 60 * 60 # 2 hour delay minimum


	# Check if splitterClass is valid
	def checkSplitter(self, splitterClass):
		"""Return splitterClass, or HybridSplitter if a lumi filter forbids it."""
		if (DataSplitter.Skipped in splitterClass.neededEnums()) and not self._lumi_filter.empty():
			self._log.debug('Selected splitter %s is not compatible with active lumi filter!', splitterClass.__name__)
			self._log.warning('Active lumi section filter forced selection of HybridSplitter')
			return HybridSplitter
		return splitterClass


	def _replicaLocation(self, replica_info):
		"""Yield location strings for one (node, hostname, complete) replica tuple."""
		(name_node, name_hostname, _) = replica_info
		if self._locationFormat == CMSLocationFormat.siteDB:
			yield name_node
		else:
			# Fall back to a SiteDB lookup when no hostname was supplied
			if name_hostname is not None:
				name_hostnames = [name_hostname]
			else:
				name_hostnames = self._sitedb.cms_name_to_se(name_node)
			for name_hostname in name_hostnames:
				if self._locationFormat == CMSLocationFormat.hostname:
					yield name_hostname
				else:
					yield '%s/%s' % (name_node, name_hostname)


	def _fmtLocations(self, replica_infos):
		"""Yield display strings for replicas; incomplete ones are parenthesized."""
		for replica_info in replica_infos:
			(_, _, completed) = replica_info
			if completed:
				for entry in self._replicaLocation(replica_info):
					yield entry
			else:
				for entry in self._replicaLocation(replica_info):
					yield '(%s)' % entry


	def _processReplicas(self, blockPath, replica_infos):
		"""Filter replicas by site selection/completeness; return location list (empty on warning)."""
		def empty_with_warning(*args):
			self._log.warning(*args)
			return []
		def expanded_replica_locations(replica_infos):
			for replica_info in replica_infos:
				for entry in self._replicaLocation(replica_info):
					yield entry

		if not replica_infos:
			return empty_with_warning('Dataset block %r has no replica information!', blockPath)
		replica_infos_selected = self._phedexFilter.filterList(replica_infos, key = itemgetter(0))
		if not replica_infos_selected:
			return empty_with_warning('Dataset block %r is not available at the selected locations!\nAvailable locations: %s', blockPath,
				str.join(', ', self._fmtLocations(replica_infos)))
		if not self._onlyComplete:
			return list(expanded_replica_locations(replica_infos_selected))
		replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
		if not replica_infos_complete:
			return empty_with_warning('Dataset block %r is not completely available at the selected locations!\nAvailable locations: %s', blockPath,
				str.join(', ', self._fmtLocations(replica_infos)))
		return list(expanded_replica_locations(replica_infos_complete))


	# Get dataset se list from PhEDex (perhaps concurrent with listFiles)
	def _getPhedexReplicas(self, blockPath, dictReplicas):
		"""Fill dictReplicas[blockPath] with (node, se, complete) tuples from PhEDEx."""
		dictReplicas[blockPath] = []
		for phedexBlock in self._pjrc.get(params = {'block': blockPath})['phedex']['block']:
			for replica in phedexBlock['replica']:
				dictReplicas[blockPath].append((replica['node'], replica.get('se'), replica['complete'] == 'y'))


	def getDatasets(self):
		"""Return (and cache) the dataset paths selected by the dataset expression."""
		if self._cache_dataset is None:
			self._cache_dataset = [self._datasetPath]
			# Wildcards require an explicit dataset query against DBS
			if '*' in self._datasetPath:
				self._cache_dataset = list(self._getCMSDatasets(self._datasetPath))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._datasetPath)
		return self._cache_dataset


	def _getCMSBlocks(self, datasetPath, getSites):
		"""Yield (blockname, se list) pairs passing the block selector; raise if none match."""
		iter_blockname_selist = self._getCMSBlocksImpl(datasetPath, getSites)
		n_blocks = 0
		selected_blocks = False
		for (blockname, selist) in iter_blockname_selist:
			n_blocks += 1
			if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
				continue
			selected_blocks = True
			yield (blockname, selist)
		if (n_blocks > 0) and not selected_blocks:
			raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (datasetPath, n_blocks, self._datasetBlock))


	def _fillCMSFiles(self, block, blockPath):
		"""Fill the block dictionary with file entries and optional run/lumi metadata."""
		lumi_used = False
		lumiDict = {}
		if self._lumi_query: # central lumi query
			lumiDict = self._getCMSLumisImpl(blockPath)
		fileList = []
		for (fileInfo, listLumi) in self._getCMSFilesImpl(blockPath, self.onlyValid, self._lumi_query):
			if lumiDict and not listLumi:
				listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
			if listLumi:
				# Flatten (run, [lumi, ...]) pairs into two parallel metadata lists
				(listLumiExt_Run, listLumiExt_Lumi) = ([], [])
				for (run, lumi_list) in sorted(listLumi):
					listLumiExt_Run.extend([run] * len(lumi_list))
					listLumiExt_Lumi.extend(lumi_list)
				fileInfo[DataProvider.Metadata] = [listLumiExt_Run, listLumiExt_Lumi]
				lumi_used = True
			fileList.append(fileInfo)
		if lumi_used:
			block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
		block[DataProvider.FileList] = fileList


	def _getCMSLumisImpl(self, blockPath):
		# Hook for subclasses: return {file url: [(run, [lumi, ...]), ...]} or None
		return None


	def _getGCBlocks(self, usePhedex):
		"""Yield grid-control block dictionaries for all selected datasets and blocks."""
		for datasetPath in self.getDatasets():
			counter = 0
			for (blockPath, replica_infos) in self._getCMSBlocks(datasetPath, getSites = not usePhedex):
				result = {}
				result[DataProvider.Dataset] = blockPath.split('#')[0]
				result[DataProvider.BlockName] = blockPath.split('#')[1]

				if usePhedex: # Start parallel phedex query
					# PhEDEx lookup runs in a thread while files are queried
					dictReplicas = {}
					tPhedex = start_thread('Query phedex site info for %s' % blockPath, self._getPhedexReplicas, blockPath, dictReplicas)
					self._fillCMSFiles(result, blockPath)
					tPhedex.join()
					replica_infos = dictReplicas.get(blockPath)
				else:
					self._fillCMSFiles(result, blockPath)
				result[DataProvider.Locations] = self._processReplicas(blockPath, replica_infos)

				# Only blocks that still contain files are reported
				if len(result[DataProvider.FileList]):
					counter += 1
					yield result

			if counter == 0:
				raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)
# Example #11 (score: 0)
tau-cream.hep.tau.ac.il
tech-crm.hep.technion.ac.il
top.ucr.edu
umiss001.hep.olemiss.edu
uosaf0008.sscc.uos.ac.kr
uscms1.fltech-grid3.fit.edu
v6ce00.grid.hep.ph.ic.ac.uk
vserv13.hep.phy.cam.ac.uk
wipp-crm.weizmann.ac.il
"""

    import sys, time
    from grid_control.utils.webservice import JSONRestClient
    from python_compat import set, imap, lmap, lfilter, sorted

    jrc = JSONRestClient(
        url='http://maps.googleapis.com/maps/api/geocode/json')

    def geocode(loc):
        """Resolve a hostname to geographic coordinates via the Google geocoding API.

        Queries with the domain part of *loc* (everything after the first two
        host labels). If the response contains 'Placemark' entries, returns a
        list of (address, (lat, lon)) tuples; otherwise returns the raw response.
        """
        result = jrc.get(params={
            'address': str.join('.',
                                loc.split('.')[2:]),
            'sensor': 'false'
        })
        if 'Placemark' in result:
            # coordinates come as (lon, lat, ...); reversed() yields (lat, lon)
            result = lmap(
                lambda x:
                (x['address'], tuple(reversed(x['Point']['coordinates'][:2]))),
                result['Placemark'])
        return result

    counter = 0
# Example #12 (score: 0)
class CMSBaseProvider(DataProvider):
    # required format: <dataset path>[@<instance>][#<block>]
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        """Read all CMS dataset options and parse the dataset expression.

        dataset_expr has the form '<dataset path>[@<instance>][#<block>]'.
        NOTE(review): statement order looks significant - the config.set calls
        run before DataProvider.__init__, which presumably instantiates the
        processors they configure; confirm before reordering.
        """
        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        # Lumi filter is looked up under a generic and a datasource-specific key
        self._lumi_filter = dataset_config.get_lookup(
            ['lumi filter', '%s lumi filter' % datasource_name],
            default={},
            parser=parse_lumi_filter,
            strfun=str_lumi)
        if not self._lumi_filter.empty():
            config.set('%s processor' % datasource_name, 'LumiDataProcessor',
                       '+=')
        DataProvider.__init__(self, config, datasource_name, dataset_expr,
                              dataset_nick, dataset_proc)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = dataset_config.get_bool(
            ['lumi metadata',
             '%s lumi metadata' % datasource_name],
            default=not self._lumi_filter.empty())
        config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
        # PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1s don't!
        self._phedex_filter = dataset_config.get_filter(
            'phedex sites',
            '-* T1_*_Disk T2_* T3_*',
            default_matcher='BlackWhiteMatcher',
            default_filter='StrictListFilter')
        self._only_complete = dataset_config.get_bool('only complete sites',
                                                      True)
        self._only_valid = dataset_config.get_bool('only valid', True)
        self._allow_phedex = dataset_config.get_bool('allow phedex', True)
        self._location_format = dataset_config.get_enum(
            'location format', CMSLocationFormat, CMSLocationFormat.hostname)
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        self._sitedb = SiteDB()

        # Split '<path>[@<instance>][#<block>]' into its three components
        dataset_expr_parts = split_opt(dataset_expr, '@#')
        (self._dataset_path, self._dataset_instance,
         self._dataset_block_selector) = dataset_expr_parts
        instance_default = dataset_config.get('dbs instance', '')
        self._dataset_instance = self._dataset_instance or instance_default
        # Missing/partial instance defaults to the production global DBS
        if not self._dataset_instance:
            self._dataset_instance = 'prod/global'
        elif '/' not in self._dataset_instance:
            self._dataset_instance = 'prod/%s' % self._dataset_instance
        self._dataset_block_selector = self._dataset_block_selector or 'all'

    def check_splitter(self, splitter):
        """Return a splitter compatible with the current lumi filter settings."""
        lumi_filter_active = not self._lumi_filter.empty()
        needs_skip_info = DataSplitter.Skipped in splitter.get_needed_enums()
        if needs_skip_info and lumi_filter_active:
            # Skipped-event accounting does not combine with an active lumi filter
            self._log.debug(
                'Selected splitter %s is not compatible with active lumi filter!',
                splitter.__name__)
            self._log.warning(
                'Active lumi section filter forced selection of HybridSplitter'
            )
            return HybridSplitter
        return splitter

    def get_dataset_name_list(self):
        """Return the (cached) list of dataset paths selected by the dataset expression."""
        if self._cache_dataset is not None:
            return self._cache_dataset
        self._cache_dataset = [self._dataset_path]
        if '*' in self._dataset_path:
            # Wildcards require an explicit dataset query against DBS
            activity = Activity('Getting dataset list for %s' %
                                self._dataset_path)
            self._cache_dataset = list(
                self._get_cms_dataset_list(self._dataset_path))
            if not self._cache_dataset:
                raise DatasetError('No datasets selected by DBS wildcard %s !' %
                                   self._dataset_path)
            activity.finish()
        return self._cache_dataset

    def get_query_interval(self):
        """Return the minimum delay in seconds between automatic dataset queries."""
        return 60 * 60 * 2  # 2 hour delay minimum

    def _fill_cms_fi_list(self, block, block_path):
        """Fill the block dictionary with file entries and optional run/lumi metadata."""
        activity_fi = Activity('Getting file information')
        lumi_used = False
        lumi_info_dict = {}
        if self._lumi_query:  # central lumi query
            lumi_info_dict = self._get_cms_lumi_dict(block_path)
        fi_list = []
        for (fi,
             lumi_info_list) in self._iter_cms_files(block_path,
                                                     self._only_valid,
                                                     self._lumi_query):
            self._raise_on_abort()
            # Fall back to the centrally queried lumi info when available
            if lumi_info_dict and not lumi_info_list:
                lumi_info_list = lumi_info_dict.get(fi[DataProvider.URL], [])
            if lumi_info_list:
                # Flatten (run, [lumi, ...]) pairs into two parallel metadata lists
                (run_list_result, lumi_list_result) = ([], [])
                for (run, lumi_list) in sorted(lumi_info_list):
                    run_list_result.extend([run] * len(lumi_list))
                    lumi_list_result.extend(lumi_list)
                assert len(run_list_result) == len(lumi_list_result)
                fi[DataProvider.Metadata] = [run_list_result, lumi_list_result]
                lumi_used = True
            fi_list.append(fi)
        if lumi_used:
            block.setdefault(DataProvider.Metadata,
                             []).extend(['Runs', 'Lumi'])
        block[DataProvider.FileList] = fi_list
        activity_fi.finish()

    def _filter_cms_blockinfo_list(self, dataset_path, do_query_sites):
        """Yield (block name, location list) pairs that pass the block selector.

        Raises DatasetError when the dataset has blocks but none match.
        """
        n_blocks = 0
        selected_blocks = False
        for (dataset_block_name, selist) in self._iter_cms_blocks(
                dataset_path, do_query_sites):
            n_blocks += 1
            block_name = dataset_block_name.split('#')[1]
            select_all = (self._dataset_block_selector == 'all')
            if not select_all and (block_name != self._dataset_block_selector):
                continue
            selected_blocks = True
            yield (dataset_block_name, selist)
        if n_blocks and not selected_blocks:
            raise DatasetError(
                'Dataset %r contains %d blocks, but none were selected by %r' %
                (dataset_path, n_blocks, self._dataset_block_selector))

    def _get_cms_dataset_list(self, dataset_path):
        # Hook for subclasses: resolve a dataset path wildcard into dataset paths
        raise AbstractError

    def _get_cms_lumi_dict(self, block_path):
        # Hook for subclasses: return per-file lumi info for a block, or None
        return None

    def _get_gc_block_list(self, use_phedex):
        """Yield grid-control block dictionaries for all selected datasets and blocks."""
        dataset_name_list = self.get_dataset_name_list()
        progress_ds = ProgressActivity('Getting dataset',
                                       len(dataset_name_list))
        for dataset_idx, dataset_path in enumerate(dataset_name_list):
            progress_ds.update_progress(dataset_idx,
                                        msg='Getting dataset %s' %
                                        dataset_path)
            counter = 0
            blockinfo_list = list(
                self._filter_cms_blockinfo_list(dataset_path, not use_phedex))
            progress_block = ProgressActivity('Getting block information',
                                              len(blockinfo_list))
            for (block_path, replica_infos) in blockinfo_list:
                result = {}
                result[DataProvider.Dataset] = block_path.split('#')[0]
                result[DataProvider.BlockName] = block_path.split('#')[1]
                progress_block.update_progress(
                    counter,
                    msg='Getting block information for ' +
                    result[DataProvider.BlockName])

                if use_phedex and self._allow_phedex:  # Start parallel phedex query
                    # PhEDEx lookup runs in a thread while files are queried
                    replicas_dict = {}
                    phedex_thread = start_thread(
                        'Query phedex site info for %s' % block_path,
                        self._get_phedex_replica_list, block_path,
                        replicas_dict)
                    self._fill_cms_fi_list(result, block_path)
                    phedex_thread.join()
                    replica_infos = replicas_dict.get(block_path)
                else:
                    self._fill_cms_fi_list(result, block_path)
                result[DataProvider.Locations] = self._process_replica_list(
                    block_path, replica_infos)

                # Only blocks that still contain files are reported
                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result
            progress_block.finish()

            if counter == 0:
                raise DatasetError(
                    'Dataset %s does not contain any valid blocks!' %
                    dataset_path)
        progress_ds.finish()

    def _get_phedex_replica_list(self, block_path, replicas_dict):
        """Fill replicas_dict[block_path] with (node, SE, is_complete) tuples.

        Queries the PhEDEx data service; designed to run in a worker thread
        concurrently with the file list query.
        """
        activity = Activity('Getting file replica information from PhEDex')
        # Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
        block_replicas = []
        replicas_dict[block_path] = block_replicas
        phedex_answer = self._pjrc.get(params={'block': block_path})
        for phedex_block in phedex_answer['phedex']['block']:
            for replica in phedex_block['replica']:
                block_replicas.append(
                    (replica['node'], replica.get('se'), replica['complete'] == 'y'))
        activity.finish()

    def _iter_cms_blocks(self, dataset_path, do_query_sites):
        """Iterate (block name, storage element list) pairs - abstract."""
        raise AbstractError

    def _iter_cms_files(self, block_path, query_only_valid, query_lumi):
        """Iterate (file info, lumi info list) pairs for a block - abstract."""
        raise AbstractError

    def _iter_formatted_locations(self, replica_infos):
        for replica_info in replica_infos:
            (_, _, completed) = replica_info
            if completed:
                for entry in self._iter_replica_locations(replica_info):
                    yield entry
            else:
                for entry in self._iter_replica_locations(replica_info):
                    yield '(%s)' % entry

    def _iter_replica_locations(self, replica_info):
        """Yield location strings for one replica in the configured format."""
        (site_name, hostname, _) = replica_info
        if self._location_format == CMSLocationFormat.siteDB:
            yield site_name
            return
        if hostname is not None:
            hostname_list = [hostname]
        else:
            # no SE hostname given - translate the CMS site name via SiteDB
            hostname_list = self._sitedb.cms_name_to_se(site_name)
        for hostname in hostname_list:
            if self._location_format == CMSLocationFormat.hostname:
                yield hostname
            else:
                # combined format: '<site name>/<SE hostname>'
                yield '%s/%s' % (site_name, hostname)

    def _process_replica_list(self, block_path, replica_infos):
        def _empty_with_warning(error_msg, *args):
            self._log.warning('Dataset block %r ' + error_msg, block_path,
                              *args)
            return []

        def _expanded_replica_locations(replica_infos):
            for replica_info in replica_infos:
                for entry in self._iter_replica_locations(replica_info):
                    yield entry

        if not replica_infos:
            return _empty_with_warning('has no replica information!')
        replica_infos_selected = self._phedex_filter.filter_list(
            replica_infos, key=itemgetter(0))
        if not replica_infos_selected:
            return _empty_with_warning(
                'is not available at the selected locations!\n' +
                'Available locations: %s',
                str.join(', ', self._iter_formatted_locations(replica_infos)))
        if not self._only_complete:
            return list(_expanded_replica_locations(replica_infos_selected))
        replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2],
                                         replica_infos_selected)
        if not replica_infos_complete:
            return _empty_with_warning(
                'is not completely available at the selected locations!\n' +
                'Available locations: %s',
                str.join(', ', self._iter_formatted_locations(replica_infos)))
        return list(_expanded_replica_locations(replica_infos_complete))
Example #13
0
 def __init__(self):
     """Create a JSON REST client for the Google Maps geocoding endpoint.

     NOTE(review): keyless access to this endpoint is deprecated by Google -
     confirm the service still answers without an API key.
     """
     from grid_control.utils.webservice import JSONRestClient
     self._jrc = JSONRestClient(
         url='http://maps.googleapis.com/maps/api/geocode/json')
Example #14
0
class GeoResolver(object):
    """Helper tool to geocode grid CE hostnames via the Google Maps API.

    Reads a hard-coded list of computing element hostnames, reports entries
    without a match in the module-level _GEO_DICT to stderr (together with a
    geocoding lookup of their domain) and prints a regenerated, sorted
    _GEO_DICT literal to stdout.
    """
    def __init__(self):
        # REST client for the (keyless) Google Maps geocoding endpoint
        from grid_control.utils.webservice import JSONRestClient
        self._jrc = JSONRestClient(
            url='http://maps.googleapis.com/maps/api/geocode/json')

    def run(self):
        """Check every CE hostname against _GEO_DICT and regenerate the dict."""
        # output of lcg-infosites ce | while read X X X X X CE; do echo $CE; done
        #   | cut -d "/" -f 1 | cut -d ":" -f 1 | sort | uniq
        ce_list_str = """alcyone-cms.grid.helsinki.fi
			alice23.spbu.ru
			arc-ce01.gridpp.rl.ac.uk
			arc-ce02.gridpp.rl.ac.uk
			arc-ce03.gridpp.rl.ac.uk
			argoce01.na.infn.it
			atlasce1.lnf.infn.it
			atlasce2.lnf.infn.it
			atlasce3.lnf.infn.it
			atlas-cream01.na.infn.it
			atlas-cream02.na.infn.it
			boce.bo.infn.it
			bonner-grid.rice.edu
			brux3.hep.brown.edu
			cale.uniandes.edu.co
			carter-osg.rcac.purdue.edu
			cccreamceli07.in2p3.fr
			cccreamceli08.in2p3.fr
			cce.ihep.ac.cn
			ce0002.m45.ihep.su
			ce0004.m45.ihep.su
			ce01.cmsaf.mit.edu
			ce01.jinr-t1.ru
			ce01-lcg.cr.cnaf.infn.it
			ce-01.roma3.infn.it
			ce01.tier2.hep.manchester.ac.uk
			ce02.cmsaf.mit.edu
			ce02.jinr-t1.ru
			ce02.ngcc.acad.bg
			ce02.tier2.hep.manchester.ac.uk
			ce04-lcg.cr.cnaf.infn.it
			ce05.esc.qmul.ac.uk
			ce05-lcg.cr.cnaf.infn.it
			ce05.ncg.ingrid.pt
			ce06.esc.qmul.ac.uk
			ce06-lcg.cr.cnaf.infn.it
			ce07.esc.qmul.ac.uk
			ce07-lcg.cr.cnaf.infn.it
			ce08-lcg.cr.cnaf.infn.it
			ce101.grid.ucy.ac.cy
			ce1.accre.vanderbilt.edu
			ce1.dur.scotgrid.ac.uk
			ce1.grid.lebedev.ru
			ce1.ts.infn.it
			ce201.cern.ch
			ce202.cern.ch
			ce203.cern.ch
			ce204.cern.ch
			ce205.cern.ch
			ce206.cern.ch
			ce207.cern.ch
			ce208.cern.ch
			ce2.accre.vanderbilt.edu
			ce2.particles.ipm.ac.ir
			ce301.cern.ch
			ce302.cern.ch
			ce3.ppgrid1.rhul.ac.uk
			ce401.cern.ch
			ce402.cern.ch
			ce403.cern.ch
			ce404.cern.ch
			ce405.cern.ch
			ce406.cern.ch
			ce407.cern.ch
			ce408.cern.ch
			ce64.ipb.ac.rs
			ce6.grid.icm.edu.pl
			ce7.glite.ecdf.ed.ac.uk
			ce9.grid.icm.edu.pl
			cebo-t3-01.cr.cnaf.infn.it
			cebo-t3-02.cr.cnaf.infn.it
			ce.cis.gov.pl
			cecream.ca.infn.it
			ce.fesb.egi.cro-ngi.hr
			ce.grid.unesp.br
			ce.irb.egi.cro-ngi.hr
			ceprod05.grid.hep.ph.ic.ac.uk
			ceprod06.grid.hep.ph.ic.ac.uk
			ceprod07.grid.hep.ph.ic.ac.uk
			ceprod08.grid.hep.ph.ic.ac.uk
			cert-37.pd.infn.it
			ce.scope.unina.it
			ce.srce.egi.cro-ngi.hr
			cetest01.grid.hep.ph.ic.ac.uk
			cetest02.grid.hep.ph.ic.ac.uk
			ce.ulakbim.gov.tr
			cit-gatekeeper2.ultralight.org
			cit-gatekeeper.ultralight.org
			cluster118.knu.ac.kr
			cluster50.knu.ac.kr
			cms-0.mps.ohio-state.edu
			cmsce01.na.infn.it
			cmsgrid01.hep.wisc.edu
			cmsgrid02.hep.wisc.edu
			cms-grid0.hep.uprm.edu
			cmsosgce2.fnal.gov
			cmsosgce4.fnal.gov
			cmsosgce.fnal.gov
			cmsrm-cream01.roma1.infn.it
			cmsrm-cream02.roma1.infn.it
			cmsrm-cream03.roma1.infn.it
			cmstest1.rcac.purdue.edu
			cms.tier3.ucdavis.edu
			conte-osg.rcac.purdue.edu
			cox01.grid.metu.edu.tr
			cr1.ipp.acad.bg
			cream01.grid.auth.gr
			cream01.grid.sinica.edu.tw
			cream01.grid.uoi.gr
			cream01.kallisto.hellasgrid.gr
			cream01.lcg.cscs.ch
			cream02.grid.cyf-kr.edu.pl
			cream02.iihe.ac.be
			cream02.lcg.cscs.ch
			cream03.lcg.cscs.ch
			cream04.grid.sinica.edu.tw
			cream04.lcg.cscs.ch
			cream05.grid.sinica.edu.tw
			cream2.ppgrid1.rhul.ac.uk
			cream3.hep.kbfi.ee
			cream4.hep.kbfi.ee
			cream.afroditi.hellasgrid.gr
			cream-ce01.ariagni.hellasgrid.gr
			cream-ce01.indiacms.res.in
			cream-ce01.marie.hellasgrid.gr
			cream-ce02.cat.cbpf.br
			creamce02.ciemat.es
			cream-ce02.marie.hellasgrid.gr
			creamce03.ciemat.es
			creamce1.itep.ru
			cream-ce-2.ba.infn.it
			cream-ce-4.ba.infn.it
			cream-ce.cat.cbpf.br
			cream-ce.grid.atomki.hu
			creamce.hephy.oeaw.ac.at
			creamce.inula.man.poznan.pl
			cream-ce.kipt.kharkov.ua
			cream-ce.pg.infn.it
			creamce.reef.man.poznan.pl
			cream.grid.cyf-kr.edu.pl
			cream.ipb.ac.rs
			dc2-grid-66.brunel.ac.uk
			dc2-grid-68.brunel.ac.uk
			dc2-grid-70.brunel.ac.uk
			dwarf.wcss.wroc.pl
			earth.crc.nd.edu
			epgr02.ph.bham.ac.uk
			erbium.lsr.nectec.or.th
			f-cream01.grid.sinica.edu.tw
			f-cream04.grid.sinica.edu.tw
			fiupg.hep.fiu.edu
			foam.grid.kiae.ru
			fornax-ce2.itwm.fhg.de
			fornax-ce.itwm.fhg.de
			grcreamce01.inr.troitsk.ru
			grid001.ics.forth.gr
			grid002.jet.efda.org
			grid012.ct.infn.it
			grid01.physics.uoi.gr
			grid0.fe.infn.it
			grid106.kfki.hu
			grid107.kfki.hu
			grid109.kfki.hu
			grid129.sinp.msu.ru
			grid36.lal.in2p3.fr
			grid72.phy.ncu.edu.tw
			gridce01.ifca.es
			gridce03.ifca.es
			gridce0.pi.infn.it
			gridce1.pi.infn.it
			grid-ce2.physik.rwth-aachen.de
			gridce2.pi.infn.it
			gridce3.pi.infn.it
			gridce4.pi.infn.it
			gridce.ilc.cnr.it
			grid-ce.physik.rwth-aachen.de
			grid-cr0.desy.de
			grid-cr1.desy.de
			grid-cr2.desy.de
			grid-cr3.desy.de
			grid-cr4.desy.de
			gridgk01.racf.bnl.gov
			gridgk02.racf.bnl.gov
			gridgk03.racf.bnl.gov
			gridgk04.racf.bnl.gov
			gridgk05.racf.bnl.gov
			gridgk06.racf.bnl.gov
			gridgk08.racf.bnl.gov
			gridtest02.racf.bnl.gov
			gridvm03.roma2.infn.it
			grisuce.scope.unina.it
			gt3.pnpi.nw.ru
			hansen-osg.rcac.purdue.edu
			hepcms-0.umd.edu
			hepgrid10.ph.liv.ac.uk
			hepgrid5.ph.liv.ac.uk
			hepgrid6.ph.liv.ac.uk
			hepgrid97.ph.liv.ac.uk
			hephygr.oeaw.ac.at
			heposg01.colorado.edu
			hurr.tamu.edu
			ingrid.cism.ucl.ac.be
			jade-cms.hip.fi
			juk.nikhef.nl
			kalkan1.ulakbim.gov.tr
			khaldun.biruni.upm.my
			klomp.nikhef.nl
			kodiak-ce.baylor.edu
			lcg18.sinp.msu.ru
			lcg52.sinp.msu.ru
			lcgce01.phy.bris.ac.uk
			lcgce03.phy.bris.ac.uk
			lcgce04.phy.bris.ac.uk
			lcgce12.jinr.ru
			lcgce1.shef.ac.uk
			lcgce21.jinr.ru
			lcgce2.shef.ac.uk
			lcg-cream.ifh.de
			llrcream.in2p3.fr
			lpnhe-cream.in2p3.fr
			lyogrid07.in2p3.fr
			magic.cse.buffalo.edu
			mwt2-gk.campuscluster.illinois.edu
			ndcms.crc.nd.edu
			node01-03.usm.renam.md
			node01-04.grid.renam.md
			node05-02.imi.renam.md
			node74.datagrid.cea.fr
			nodeslab-0002.nlab.tb.hiit.fi
			ntugrid2.phys.ntu.edu.tw
			ntugrid5.phys.ntu.edu.tw
			nys1.cac.cornell.edu
			osgce.hepgrid.uerj.br
			osg-ce.sprace.org.br
			osg-gk.mwt2.org
			osg-gw-6.t2.ucsd.edu
			osg-gw-7.t2.ucsd.edu
			osg.hpc.ufl.edu
			osg-nemo-ce.phys.uwm.edu
			osg.rcac.purdue.edu
			osgserv01.slac.stanford.edu
			osgserv02.slac.stanford.edu
			ouhep0.nhn.ou.edu
			pamelace01.na.infn.it
			pcncp04.ncp.edu.pk
			pcncp05.ncp.edu.pk
			pre7230.datagrid.cea.fr
			prod-ce-01.pd.infn.it
			razi.biruni.upm.my
			recasce01.na.infn.it
			red-gw1.unl.edu
			red-gw2.unl.edu
			red.unl.edu
			rossmann-osg.rcac.purdue.edu
			sbgce2.in2p3.fr
			snf-189278.vm.okeanos.grnet.gr
			snf-458754.vm.okeanos.grnet.gr
			spacina-ce.scope.unina.it
			svr009.gla.scotgrid.ac.uk
			svr010.gla.scotgrid.ac.uk
			svr011.gla.scotgrid.ac.uk
			svr014.gla.scotgrid.ac.uk
			t2arc01.physics.ox.ac.uk
			t2-ce-01.lnl.infn.it
			t2-ce-01.to.infn.it
			t2-ce-02.lnl.infn.it
			t2ce02.physics.ox.ac.uk
			t2-ce-03.lnl.infn.it
			t2-ce-04.lnl.infn.it
			t2-ce-04.mi.infn.it
			t2ce04.physics.ox.ac.uk
			t2-ce-05.mi.infn.it
			t2-ce-06.lnl.infn.it
			t2ce06.physics.ox.ac.uk
			t3serv007.mit.edu
			tau-cream.hep.tau.ac.il
			tech-crm.hep.technion.ac.il
			top.ucr.edu
			umiss001.hep.olemiss.edu
			uosaf0008.sscc.uos.ac.kr
			uscms1.fltech-grid3.fit.edu
			v6ce00.grid.hep.ph.ic.ac.uk
			vserv13.hep.phy.cam.ac.uk
			wipp-crm.weizmann.ac.il
		"""
        import sys, time
        from python_compat import set, imap, lfilter, sorted

        counter = 0
        used = set()
        # NOTE(review): get_geo_match and _GEO_DICT are module-level names
        # defined outside this view - confirm their contract
        for line in imap(str.strip, ce_list_str.splitlines()):
            time.sleep(0.2)  # throttle requests to the geocoding service
            match = get_geo_match(line)
            if not match:
                counter += 1
                sys.stderr.write('\t%r: %r\n' % (line, self._geocode(line)))
            else:
                used.add(match)
        sys.stderr.write('%s unmatched entries\n' % counter)
        sys.stderr.write('unused entries:\n%s\n' %
                         repr(lfilter(lambda x: x not in used, _GEO_DICT)))

        # emit regenerated _GEO_DICT sorted by reversed domain components
        sys.stdout.write('_GEO_DICT = {\n')
        geo_dict_key_list = sorted(
            _GEO_DICT.keys(),
            key=lambda x: str.join('.', reversed(x.split('.'))))
        for entry in geo_dict_key_list:
            sys.stdout.write('\t%r: (%.6f, %.6f),\n' %
                             (entry, _GEO_DICT[entry][0], _GEO_DICT[entry][1]))
        sys.stdout.write('}\n')

    def _geocode(self, loc):
        """Geocode the domain part of *loc* (hostname with first 2 labels dropped).

        Returns a list of (address, (lat, lon)) tuples when the legacy
        'Placemark' answer format is returned, otherwise the raw response.
        """
        result = self._jrc.get(params={
            'address': str.join('.',
                                loc.split('.')[2:]),
            'sensor': 'false'
        })
        if 'Placemark' in result:  # unfold Placemark entries
            place_list = []
            for entry in result['Placemark']:
                place_list.append(
                    (entry['address'],
                     tuple(reversed(entry['Point']['coordinates'][:2]))))
            return place_list
        return result
Example #15
0
class GeoResolver(object):
	"""Helper tool to geocode grid CE hostnames via the Google Maps API.

	Reads a hard-coded list of computing element hostnames, reports entries
	without a match in the module-level _GEO_DICT to stderr (together with a
	geocoding lookup of their domain) and prints a regenerated, sorted
	_GEO_DICT literal to stdout.
	"""
	def __init__(self):
		# REST client for the (keyless) Google Maps geocoding endpoint
		from grid_control.utils.webservice import JSONRestClient
		self._jrc = JSONRestClient(url='http://maps.googleapis.com/maps/api/geocode/json')

	def run(self):
		"""Check every CE hostname against _GEO_DICT and regenerate the dict."""
		# output of lcg-infosites ce | while read X X X X X CE; do echo $CE; done
		#   | cut -d "/" -f 1 | cut -d ":" -f 1 | sort | uniq
		ce_list_str = """alcyone-cms.grid.helsinki.fi
			alice23.spbu.ru
			arc-ce01.gridpp.rl.ac.uk
			arc-ce02.gridpp.rl.ac.uk
			arc-ce03.gridpp.rl.ac.uk
			argoce01.na.infn.it
			atlasce1.lnf.infn.it
			atlasce2.lnf.infn.it
			atlasce3.lnf.infn.it
			atlas-cream01.na.infn.it
			atlas-cream02.na.infn.it
			boce.bo.infn.it
			bonner-grid.rice.edu
			brux3.hep.brown.edu
			cale.uniandes.edu.co
			carter-osg.rcac.purdue.edu
			cccreamceli07.in2p3.fr
			cccreamceli08.in2p3.fr
			cce.ihep.ac.cn
			ce0002.m45.ihep.su
			ce0004.m45.ihep.su
			ce01.cmsaf.mit.edu
			ce01.jinr-t1.ru
			ce01-lcg.cr.cnaf.infn.it
			ce-01.roma3.infn.it
			ce01.tier2.hep.manchester.ac.uk
			ce02.cmsaf.mit.edu
			ce02.jinr-t1.ru
			ce02.ngcc.acad.bg
			ce02.tier2.hep.manchester.ac.uk
			ce04-lcg.cr.cnaf.infn.it
			ce05.esc.qmul.ac.uk
			ce05-lcg.cr.cnaf.infn.it
			ce05.ncg.ingrid.pt
			ce06.esc.qmul.ac.uk
			ce06-lcg.cr.cnaf.infn.it
			ce07.esc.qmul.ac.uk
			ce07-lcg.cr.cnaf.infn.it
			ce08-lcg.cr.cnaf.infn.it
			ce101.grid.ucy.ac.cy
			ce1.accre.vanderbilt.edu
			ce1.dur.scotgrid.ac.uk
			ce1.grid.lebedev.ru
			ce1.ts.infn.it
			ce201.cern.ch
			ce202.cern.ch
			ce203.cern.ch
			ce204.cern.ch
			ce205.cern.ch
			ce206.cern.ch
			ce207.cern.ch
			ce208.cern.ch
			ce2.accre.vanderbilt.edu
			ce2.particles.ipm.ac.ir
			ce301.cern.ch
			ce302.cern.ch
			ce3.ppgrid1.rhul.ac.uk
			ce401.cern.ch
			ce402.cern.ch
			ce403.cern.ch
			ce404.cern.ch
			ce405.cern.ch
			ce406.cern.ch
			ce407.cern.ch
			ce408.cern.ch
			ce64.ipb.ac.rs
			ce6.grid.icm.edu.pl
			ce7.glite.ecdf.ed.ac.uk
			ce9.grid.icm.edu.pl
			cebo-t3-01.cr.cnaf.infn.it
			cebo-t3-02.cr.cnaf.infn.it
			ce.cis.gov.pl
			cecream.ca.infn.it
			ce.fesb.egi.cro-ngi.hr
			ce.grid.unesp.br
			ce.irb.egi.cro-ngi.hr
			ceprod05.grid.hep.ph.ic.ac.uk
			ceprod06.grid.hep.ph.ic.ac.uk
			ceprod07.grid.hep.ph.ic.ac.uk
			ceprod08.grid.hep.ph.ic.ac.uk
			cert-37.pd.infn.it
			ce.scope.unina.it
			ce.srce.egi.cro-ngi.hr
			cetest01.grid.hep.ph.ic.ac.uk
			cetest02.grid.hep.ph.ic.ac.uk
			ce.ulakbim.gov.tr
			cit-gatekeeper2.ultralight.org
			cit-gatekeeper.ultralight.org
			cluster118.knu.ac.kr
			cluster50.knu.ac.kr
			cms-0.mps.ohio-state.edu
			cmsce01.na.infn.it
			cmsgrid01.hep.wisc.edu
			cmsgrid02.hep.wisc.edu
			cms-grid0.hep.uprm.edu
			cmsosgce2.fnal.gov
			cmsosgce4.fnal.gov
			cmsosgce.fnal.gov
			cmsrm-cream01.roma1.infn.it
			cmsrm-cream02.roma1.infn.it
			cmsrm-cream03.roma1.infn.it
			cmstest1.rcac.purdue.edu
			cms.tier3.ucdavis.edu
			conte-osg.rcac.purdue.edu
			cox01.grid.metu.edu.tr
			cr1.ipp.acad.bg
			cream01.grid.auth.gr
			cream01.grid.sinica.edu.tw
			cream01.grid.uoi.gr
			cream01.kallisto.hellasgrid.gr
			cream01.lcg.cscs.ch
			cream02.grid.cyf-kr.edu.pl
			cream02.iihe.ac.be
			cream02.lcg.cscs.ch
			cream03.lcg.cscs.ch
			cream04.grid.sinica.edu.tw
			cream04.lcg.cscs.ch
			cream05.grid.sinica.edu.tw
			cream2.ppgrid1.rhul.ac.uk
			cream3.hep.kbfi.ee
			cream4.hep.kbfi.ee
			cream.afroditi.hellasgrid.gr
			cream-ce01.ariagni.hellasgrid.gr
			cream-ce01.indiacms.res.in
			cream-ce01.marie.hellasgrid.gr
			cream-ce02.cat.cbpf.br
			creamce02.ciemat.es
			cream-ce02.marie.hellasgrid.gr
			creamce03.ciemat.es
			creamce1.itep.ru
			cream-ce-2.ba.infn.it
			cream-ce-4.ba.infn.it
			cream-ce.cat.cbpf.br
			cream-ce.grid.atomki.hu
			creamce.hephy.oeaw.ac.at
			creamce.inula.man.poznan.pl
			cream-ce.kipt.kharkov.ua
			cream-ce.pg.infn.it
			creamce.reef.man.poznan.pl
			cream.grid.cyf-kr.edu.pl
			cream.ipb.ac.rs
			dc2-grid-66.brunel.ac.uk
			dc2-grid-68.brunel.ac.uk
			dc2-grid-70.brunel.ac.uk
			dwarf.wcss.wroc.pl
			earth.crc.nd.edu
			epgr02.ph.bham.ac.uk
			erbium.lsr.nectec.or.th
			f-cream01.grid.sinica.edu.tw
			f-cream04.grid.sinica.edu.tw
			fiupg.hep.fiu.edu
			foam.grid.kiae.ru
			fornax-ce2.itwm.fhg.de
			fornax-ce.itwm.fhg.de
			grcreamce01.inr.troitsk.ru
			grid001.ics.forth.gr
			grid002.jet.efda.org
			grid012.ct.infn.it
			grid01.physics.uoi.gr
			grid0.fe.infn.it
			grid106.kfki.hu
			grid107.kfki.hu
			grid109.kfki.hu
			grid129.sinp.msu.ru
			grid36.lal.in2p3.fr
			grid72.phy.ncu.edu.tw
			gridce01.ifca.es
			gridce03.ifca.es
			gridce0.pi.infn.it
			gridce1.pi.infn.it
			grid-ce2.physik.rwth-aachen.de
			gridce2.pi.infn.it
			gridce3.pi.infn.it
			gridce4.pi.infn.it
			gridce.ilc.cnr.it
			grid-ce.physik.rwth-aachen.de
			grid-cr0.desy.de
			grid-cr1.desy.de
			grid-cr2.desy.de
			grid-cr3.desy.de
			grid-cr4.desy.de
			gridgk01.racf.bnl.gov
			gridgk02.racf.bnl.gov
			gridgk03.racf.bnl.gov
			gridgk04.racf.bnl.gov
			gridgk05.racf.bnl.gov
			gridgk06.racf.bnl.gov
			gridgk08.racf.bnl.gov
			gridtest02.racf.bnl.gov
			gridvm03.roma2.infn.it
			grisuce.scope.unina.it
			gt3.pnpi.nw.ru
			hansen-osg.rcac.purdue.edu
			hepcms-0.umd.edu
			hepgrid10.ph.liv.ac.uk
			hepgrid5.ph.liv.ac.uk
			hepgrid6.ph.liv.ac.uk
			hepgrid97.ph.liv.ac.uk
			hephygr.oeaw.ac.at
			heposg01.colorado.edu
			hurr.tamu.edu
			ingrid.cism.ucl.ac.be
			jade-cms.hip.fi
			juk.nikhef.nl
			kalkan1.ulakbim.gov.tr
			khaldun.biruni.upm.my
			klomp.nikhef.nl
			kodiak-ce.baylor.edu
			lcg18.sinp.msu.ru
			lcg52.sinp.msu.ru
			lcgce01.phy.bris.ac.uk
			lcgce03.phy.bris.ac.uk
			lcgce04.phy.bris.ac.uk
			lcgce12.jinr.ru
			lcgce1.shef.ac.uk
			lcgce21.jinr.ru
			lcgce2.shef.ac.uk
			lcg-cream.ifh.de
			llrcream.in2p3.fr
			lpnhe-cream.in2p3.fr
			lyogrid07.in2p3.fr
			magic.cse.buffalo.edu
			mwt2-gk.campuscluster.illinois.edu
			ndcms.crc.nd.edu
			node01-03.usm.renam.md
			node01-04.grid.renam.md
			node05-02.imi.renam.md
			node74.datagrid.cea.fr
			nodeslab-0002.nlab.tb.hiit.fi
			ntugrid2.phys.ntu.edu.tw
			ntugrid5.phys.ntu.edu.tw
			nys1.cac.cornell.edu
			osgce.hepgrid.uerj.br
			osg-ce.sprace.org.br
			osg-gk.mwt2.org
			osg-gw-6.t2.ucsd.edu
			osg-gw-7.t2.ucsd.edu
			osg.hpc.ufl.edu
			osg-nemo-ce.phys.uwm.edu
			osg.rcac.purdue.edu
			osgserv01.slac.stanford.edu
			osgserv02.slac.stanford.edu
			ouhep0.nhn.ou.edu
			pamelace01.na.infn.it
			pcncp04.ncp.edu.pk
			pcncp05.ncp.edu.pk
			pre7230.datagrid.cea.fr
			prod-ce-01.pd.infn.it
			razi.biruni.upm.my
			recasce01.na.infn.it
			red-gw1.unl.edu
			red-gw2.unl.edu
			red.unl.edu
			rossmann-osg.rcac.purdue.edu
			sbgce2.in2p3.fr
			snf-189278.vm.okeanos.grnet.gr
			snf-458754.vm.okeanos.grnet.gr
			spacina-ce.scope.unina.it
			svr009.gla.scotgrid.ac.uk
			svr010.gla.scotgrid.ac.uk
			svr011.gla.scotgrid.ac.uk
			svr014.gla.scotgrid.ac.uk
			t2arc01.physics.ox.ac.uk
			t2-ce-01.lnl.infn.it
			t2-ce-01.to.infn.it
			t2-ce-02.lnl.infn.it
			t2ce02.physics.ox.ac.uk
			t2-ce-03.lnl.infn.it
			t2-ce-04.lnl.infn.it
			t2-ce-04.mi.infn.it
			t2ce04.physics.ox.ac.uk
			t2-ce-05.mi.infn.it
			t2-ce-06.lnl.infn.it
			t2ce06.physics.ox.ac.uk
			t3serv007.mit.edu
			tau-cream.hep.tau.ac.il
			tech-crm.hep.technion.ac.il
			top.ucr.edu
			umiss001.hep.olemiss.edu
			uosaf0008.sscc.uos.ac.kr
			uscms1.fltech-grid3.fit.edu
			v6ce00.grid.hep.ph.ic.ac.uk
			vserv13.hep.phy.cam.ac.uk
			wipp-crm.weizmann.ac.il
		"""
		import sys, time
		from python_compat import set, imap, lfilter, sorted

		counter = 0
		used = set()
		# NOTE(review): get_geo_match and _GEO_DICT are module-level names
		# defined outside this view - confirm their contract
		for line in imap(str.strip, ce_list_str.splitlines()):
			time.sleep(0.2)  # throttle requests to the geocoding service
			match = get_geo_match(line)
			if not match:
				counter += 1
				sys.stderr.write('\t%r: %r\n' % (line, self._geocode(line)))
			else:
				used.add(match)
		sys.stderr.write('%s unmatched entries\n' % counter)
		sys.stderr.write('unused entries:\n%s\n' % repr(lfilter(lambda x: x not in used, _GEO_DICT)))

		# emit regenerated _GEO_DICT sorted by reversed domain components
		sys.stdout.write('_GEO_DICT = {\n')
		geo_dict_key_list = sorted(_GEO_DICT.keys(), key=lambda x: str.join('.', reversed(x.split('.'))))
		for entry in geo_dict_key_list:
			sys.stdout.write('\t%r: (%.6f, %.6f),\n' % (entry, _GEO_DICT[entry][0], _GEO_DICT[entry][1]))
		sys.stdout.write('}\n')

	def _geocode(self, loc):
		"""Geocode the domain part of *loc* (hostname with first 2 labels dropped).

		Returns a list of (address, (lat, lon)) tuples when the legacy
		'Placemark' answer format is returned, otherwise the raw response.
		"""
		result = self._jrc.get(params={'address': str.join('.', loc.split('.')[2:]), 'sensor': 'false'})
		if 'Placemark' in result:  # unfold Placemark entries
			place_list = []
			for entry in result['Placemark']:
				place_list.append((entry['address'], tuple(reversed(entry['Point']['coordinates'][:2]))))
			return place_list
		return result
Example #16
0
class CMSBaseProvider(DataProvider):
	"""Base data provider for CMS datasets (DBS / PhEDEx backed).

	Concrete subclasses implement the block / file / dataset queries; this
	base class handles lumi filtering, PhEDEx replica lookup and conversion
	into grid-control block dictionaries.
	"""
	# required format: <dataset path>[@<instance>][#<block>]
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
		"""Parse the dataset expression and read all CMS-specific options."""
		dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
		self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
			default={}, parser=parse_lumi_filter, strfun=str_lumi)
		if not self._lumi_filter.empty():
			config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = dataset_config.get_bool(
			['lumi metadata', '%s lumi metadata' % datasource_name], default=not self._lumi_filter.empty())
		config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
		# PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1 nodes don't!
		self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
			default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
		self._only_complete = dataset_config.get_bool('only complete sites', True)
		self._only_valid = dataset_config.get_bool('only valid', True)
		self._location_format = dataset_config.get_enum('location format',
			CMSLocationFormat, CMSLocationFormat.hostname)
		self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
		self._sitedb = SiteDB()

		# split '<dataset path>[@<instance>][#<block>]' into its components
		dataset_expr_parts = split_opt(dataset_expr, '@#')
		(self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
		instance_default = dataset_config.get('dbs instance', '')
		self._dataset_instance = self._dataset_instance or instance_default
		if not self._dataset_instance:
			self._dataset_instance = 'prod/global'
		elif '/' not in self._dataset_instance:
			self._dataset_instance = 'prod/%s' % self._dataset_instance
		self._dataset_block_selector = self._dataset_block_selector or 'all'

	def check_splitter(self, splitter):
		"""Return a splitter compatible with the active lumi filter."""
		# Check if splitter is valid
		if (DataSplitter.Skipped in splitter.get_needed_enums()) and not self._lumi_filter.empty():
			self._log.debug('Selected splitter %s is not compatible with active lumi filter!',
				splitter.__name__)
			self._log.warning('Active lumi section filter forced selection of HybridSplitter')
			return HybridSplitter
		return splitter

	def get_dataset_name_list(self):
		"""Return the (cached) list of dataset names, expanding DBS wildcards."""
		if self._cache_dataset is None:
			self._cache_dataset = [self._dataset_path]
			if '*' in self._dataset_path:
				activity = Activity('Getting dataset list for %s' % self._dataset_path)
				self._cache_dataset = list(self._get_cms_dataset_list(self._dataset_path))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._dataset_path)
				activity.finish()
		return self._cache_dataset

	def get_query_interval(self):
		"""Return the minimal delay between automatic dataset queries."""
		# Define how often the dataprovider can be queried automatically
		return 2 * 60 * 60  # 2 hour delay minimum

	def _fill_cms_fi_list(self, block, block_path):
		"""Fill *block* with its file list and optional run/lumi metadata."""
		activity_fi = Activity('Getting file information')
		lumi_used = False
		lumi_info_dict = {}
		if self._lumi_query:  # central lumi query
			lumi_info_dict = self._get_cms_lumi_dict(block_path)
		fi_list = []
		for (fi, lumi_info_list) in self._iter_cms_files(block_path, self._only_valid, self._lumi_query):
			self._raise_on_abort()
			if lumi_info_dict and not lumi_info_list:
				lumi_info_list = lumi_info_dict.get(fi[DataProvider.URL], [])
			if lumi_info_list:
				# flatten (run, [lumis]) pairs into two parallel lists
				(run_list_result, lumi_list_result) = ([], [])
				for (run, lumi_list) in sorted(lumi_info_list):
					run_list_result.extend([run] * len(lumi_list))
					lumi_list_result.extend(lumi_list)
				assert len(run_list_result) == len(lumi_list_result)
				fi[DataProvider.Metadata] = [run_list_result, lumi_list_result]
				lumi_used = True
			fi_list.append(fi)
		if lumi_used:
			block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
		block[DataProvider.FileList] = fi_list
		activity_fi.finish()

	def _filter_cms_blockinfo_list(self, dataset_path, do_query_sites):
		"""Yield (block name, SE list) pairs matching the block selector."""
		iter_dataset_block_name_selist = self._iter_cms_blocks(dataset_path, do_query_sites)
		n_blocks = 0
		selected_blocks = False
		for (dataset_block_name, selist) in iter_dataset_block_name_selist:
			n_blocks += 1
			block_name = str.split(dataset_block_name, '#')[1]
			if (self._dataset_block_selector != 'all') and (block_name != self._dataset_block_selector):
				continue
			selected_blocks = True
			yield (dataset_block_name, selist)
		if (n_blocks > 0) and not selected_blocks:
			raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (
				dataset_path, n_blocks, self._dataset_block_selector))

	def _get_cms_dataset_list(self, dataset_path):
		"""Expand *dataset_path* (may contain wildcards) - abstract."""
		raise AbstractError

	def _get_cms_lumi_dict(self, block_path):
		"""Return a central lumi lookup for *block_path*, or None if unsupported."""
		return None

	def _get_gc_block_list(self, use_phedex):
		"""Yield grid-control block dictionaries for all selected datasets.

		When *use_phedex* is set, replica info is queried from PhEDEx in a
		parallel thread while the file list is being filled.
		"""
		dataset_name_list = self.get_dataset_name_list()
		progress_ds = ProgressActivity('Getting dataset', len(dataset_name_list))
		for dataset_idx, dataset_path in enumerate(dataset_name_list):
			progress_ds.update_progress(dataset_idx, msg='Getting dataset %s' % dataset_path)
			counter = 0
			blockinfo_list = list(self._filter_cms_blockinfo_list(dataset_path, not use_phedex))
			progress_block = ProgressActivity('Getting block information', len(blockinfo_list))
			for (block_path, replica_infos) in blockinfo_list:
				result = {}
				result[DataProvider.Dataset] = block_path.split('#')[0]
				result[DataProvider.BlockName] = block_path.split('#')[1]
				progress_block.update_progress(counter,
					msg='Getting block information for ' + result[DataProvider.BlockName])

				if use_phedex:  # Start parallel phedex query
					replicas_dict = {}
					phedex_thread = start_thread('Query phedex site info for %s' % block_path,
						self._get_phedex_replica_list, block_path, replicas_dict)
					self._fill_cms_fi_list(result, block_path)
					phedex_thread.join()
					replica_infos = replicas_dict.get(block_path)
				else:
					self._fill_cms_fi_list(result, block_path)
				result[DataProvider.Locations] = self._process_replica_list(block_path, replica_infos)

				# skip blocks without any (valid) files
				if len(result[DataProvider.FileList]):
					counter += 1
					yield result
			progress_block.finish()

			if counter == 0:
				raise DatasetError('Dataset %s does not contain any valid blocks!' % dataset_path)
		progress_ds.finish()

	def _get_phedex_replica_list(self, block_path, replicas_dict):
		"""Fill replicas_dict[block_path] with (node, SE, is_complete) tuples."""
		activity_fi = Activity('Getting file replica information from PhEDex')
		# Get dataset se list from PhEDex (perhaps concurrent with get_dbs_file_list)
		replicas_dict[block_path] = []
		for phedex_block in self._pjrc.get(params={'block': block_path})['phedex']['block']:
			for replica in phedex_block['replica']:
				replica_info = (replica['node'], replica.get('se'), replica['complete'] == 'y')
				replicas_dict[block_path].append(replica_info)
		activity_fi.finish()

	def _iter_cms_blocks(self, dataset_path, do_query_sites):
		"""Iterate (block name, SE list) pairs - abstract."""
		raise AbstractError

	def _iter_cms_files(self, block_path, query_only_valid, query_lumi):
		"""Iterate (file info, lumi info list) pairs for a block - abstract."""
		raise AbstractError

	def _iter_formatted_locations(self, replica_infos):
		"""Yield display strings for replicas; incomplete ones are parenthesized."""
		for replica_info in replica_infos:
			(_, _, completed) = replica_info
			if completed:
				for entry in self._iter_replica_locations(replica_info):
					yield entry
			else:
				for entry in self._iter_replica_locations(replica_info):
					yield '(%s)' % entry

	def _iter_replica_locations(self, replica_info):
		"""Yield location strings for one replica in the configured format."""
		(name_node, name_hostname, _) = replica_info
		if self._location_format == CMSLocationFormat.siteDB:
			yield name_node
		else:
			if name_hostname is not None:
				name_hostnames = [name_hostname]
			else:
				# no SE hostname given - translate the CMS site name via SiteDB
				name_hostnames = self._sitedb.cms_name_to_se(name_node)
			for name_hostname in name_hostnames:
				if self._location_format == CMSLocationFormat.hostname:
					yield name_hostname
				else:
					yield '%s/%s' % (name_node, name_hostname)

	def _process_replica_list(self, block_path, replica_infos):
		"""Turn replica information into the final block location list."""
		def _empty_with_warning(error_msg, *args):
			self._log.warning('Dataset block %r ' + error_msg, block_path, *args)
			return []

		def _expanded_replica_locations(replica_infos):
			for replica_info in replica_infos:
				for entry in self._iter_replica_locations(replica_info):
					yield entry

		if not replica_infos:
			return _empty_with_warning('has no replica information!')
		replica_infos_selected = self._phedex_filter.filter_list(replica_infos, key=itemgetter(0))
		if not replica_infos_selected:
			return _empty_with_warning('is not available at the selected locations!\n' +
				'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
		if not self._only_complete:
			return list(_expanded_replica_locations(replica_infos_selected))
		replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
		if not replica_infos_complete:
			return _empty_with_warning('is not completely available at the selected locations!\n' +
				'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
		return list(_expanded_replica_locations(replica_infos_complete))
Example #17
0
	def __init__(self):
		"""Create a JSON REST client for the Google Maps geocoding endpoint.

		NOTE(review): keyless access to this endpoint is deprecated by Google -
		confirm the service still answers without an API key.
		"""
		from grid_control.utils.webservice import JSONRestClient
		self._jrc = JSONRestClient(url='http://maps.googleapis.com/maps/api/geocode/json')
Example #18
0
class CMSBaseProvider(DataProvider):
    """Base class for CMS data providers.

    Reads lumi filter, PhEDEx site filter and location format options from
    the config and converts CMS dataset/block/file queries (supplied by
    subclasses via the *Impl methods) into grid-control block dictionaries.
    """

    def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
        # Changing any of these options triggers a resync of datasets and parameters
        changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick,
                              datasetID)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=changeTrigger)
        # PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1's dont!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-T3_US_FNALLPC',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='weak',
                                              onChange=changeTrigger)
        self._phedexT1Filter = config.getFilter('phedex t1 accept',
                                                'T1_DE_KIT T1_US_FNAL',
                                                defaultMatcher='blackwhite',
                                                defaultFilter='weak',
                                                onChange=changeTrigger)
        self._phedexT1Mode = config.getEnum('phedex t1 mode',
                                            PhedexT1Mode,
                                            PhedexT1Mode.disk,
                                            onChange=changeTrigger)
        self.onlyComplete = config.getBool('only complete sites',
                                           True,
                                           onChange=changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=changeTrigger)
        # REST client for the PhEDEx block replica data service
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )

        # Dataset expression is split on '@' (dbs instance) and '#' (block name)
        (self._datasetPath, self._url,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        self._url = self._url or config.get('dbs instance', '')
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=changeTrigger)

    # Define how often the dataprovider can be queried automatically
    def queryLimit(self):
        return 2 * 60 * 60  # 2 hour delay minimum

    # Check if splitterClass is valid
    def checkSplitter(self, splitterClass):
        """Return splitterClass, or HybridSplitter if an active lumi filter
        conflicts with a splitter that needs DataSplitter.Skipped."""
        if (DataSplitter.Skipped in splitterClass.neededEnums()
            ) and not self._lumi_filter.empty():
            self._log.debug(
                'Selected splitter %s is not compatible with active lumi filter!',
                splitterClass.__name__)
            self._log.warning(
                'Active lumi section filter forced selection of HybridSplitter'
            )
            return HybridSplitter
        return splitterClass

    def _nodeFilter(self, nameSiteDB, complete):
        """Decide whether a PhEDEx node may contribute to the location list."""
        # Remove T0 and T1 by default
        result = not (nameSiteDB.startswith('T0_')
                      or nameSiteDB.startswith('T1_'))
        # check if listed on the accepted list
        if self._phedexT1Mode in [PhedexT1Mode.disk, PhedexT1Mode.accept]:
            result = result or (self._phedexT1Filter.filterList([nameSiteDB])
                                == [nameSiteDB])
        if self._phedexT1Mode == PhedexT1Mode.disk:
            result = result or nameSiteDB.lower().endswith('_disk')
        # apply phedex blacklist
        result = result and (self._phedexFilter.filterList([nameSiteDB])
                             == [nameSiteDB])
        # check for completeness at the site
        result = result and (complete or not self.onlyComplete)
        return result

    # Get dataset se list from PhEDex (perhaps concurrent with listFiles)
    def _getPhedexSEList(self, blockPath, dictSE):
        """Fill dictSE[blockPath] with formatted locations of accepted replicas."""
        dictSE[blockPath] = []
        for phedexBlock in self._pjrc.get(
                params={'block': blockPath})['phedex']['block']:
            for replica in phedexBlock['replica']:
                if self._nodeFilter(replica['node'],
                                    replica['complete'] == 'y'):
                    location = None
                    if self._locationFormat == CMSLocationFormat.hostname:
                        location = replica.get('se')
                    elif self._locationFormat == CMSLocationFormat.siteDB:
                        location = replica.get('node')
                    elif (self._locationFormat == CMSLocationFormat.both) and (
                            replica.get('node') or replica.get('se')):
                        location = '%s/%s' % (replica.get('node'),
                                              replica.get('se'))
                    if location:
                        dictSE[blockPath].append(location)
                    else:
                        # replica lacks the field needed by the chosen format
                        self._log.warning(
                            'Dataset block %s replica at %s / %s is skipped!',
                            blockPath, replica.get('node'), replica.get('se'))

    def getDatasets(self):
        """Return the cached dataset path list, expanding DBS wildcards once."""
        if self._cache_dataset is None:
            self._cache_dataset = [self._datasetPath]
            if '*' in self._datasetPath:
                self._cache_dataset = list(
                    self.getCMSDatasets(self._datasetPath))
                if not self._cache_dataset:
                    raise DatasetError(
                        'No datasets selected by DBS wildcard %s !' %
                        self._datasetPath)
        return self._cache_dataset

    def getCMSBlocks(self, datasetPath, getSites):
        """Yield (blockname, selist) pairs, restricted to the configured block."""
        iter_blockname_selist = self.getCMSBlocksImpl(datasetPath, getSites)
        n_blocks = 0
        selected_blocks = False
        for (blockname, selist) in iter_blockname_selist:
            n_blocks += 1
            if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1]
                                                  != self._datasetBlock):
                continue
            selected_blocks = True
            yield (blockname, selist)
        if (n_blocks > 0) and not selected_blocks:
            raise DatasetError(
                'Dataset %r contains %d blocks, but none were selected by %r' %
                (datasetPath, n_blocks, self._datasetBlock))

    def fillCMSFiles(self, block, blockPath):
        """Fill block with its file list and per-file run/lumi metadata."""
        lumi_used = False
        lumiDict = {}
        if self._lumi_query:  # central lumi query
            lumiDict = self.getCMSLumisImpl(blockPath)
        fileList = []
        for (fileInfo,
             listLumi) in self.getCMSFilesImpl(blockPath, self.onlyValid,
                                               self._lumi_query):
            if lumiDict and not listLumi:
                listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
            if listLumi:
                # Flatten (run, [lumis]) pairs into two parallel metadata lists
                (listLumiExt_Run, listLumiExt_Lumi) = ([], [])
                for (run, lumi_list) in sorted(listLumi):
                    listLumiExt_Run.extend([run] * len(lumi_list))
                    listLumiExt_Lumi.extend(lumi_list)
                fileInfo[DataProvider.Metadata] = [
                    listLumiExt_Run, listLumiExt_Lumi
                ]
                lumi_used = True
            fileList.append(fileInfo)
        if lumi_used:
            block.setdefault(DataProvider.Metadata,
                             []).extend(['Runs', 'Lumi'])
        block[DataProvider.FileList] = fileList

    def getCMSLumisImpl(self, blockPath):
        # Subclasses may override to supply a central lumi query; the result is
        # used as a {file url: lumi list} mapping in fillCMSFiles
        return None

    def getGCBlocks(self, usePhedex):
        """Yield grid-control block dictionaries for all selected datasets."""
        for datasetPath in self.getDatasets():
            counter = 0
            for (blockPath,
                 listSE) in self.getCMSBlocks(datasetPath,
                                              getSites=not usePhedex):
                result = {}
                result[DataProvider.Dataset] = blockPath.split('#')[0]
                result[DataProvider.BlockName] = blockPath.split('#')[1]

                if usePhedex:  # Start parallel phedex query
                    dictSE = {}
                    tPhedex = start_thread(
                        'Query phedex site info for %s' % blockPath,
                        self._getPhedexSEList, blockPath, dictSE)
                    self.fillCMSFiles(result, blockPath)
                    tPhedex.join()
                    listSE = dictSE.get(blockPath)
                else:
                    self.fillCMSFiles(result, blockPath)
                result[DataProvider.Locations] = listSE

                # Only blocks with at least one file are yielded
                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result

            if counter == 0:
                raise DatasetError(
                    'Dataset %s does not contain any valid blocks!' %
                    datasetPath)
# Example #19
class CMSBaseProvider(DataProvider):
    """Base class for CMS data providers.

    Collects replicas as raw (node, hostname, complete) tuples, filters and
    formats them in _processReplicas, and normalizes the DBS instance to
    '<server>/<db>' form. Subclasses implement the _get*Impl query methods.
    """

    def __init__(self, config, datasetExpr, datasetNick=None):
        # Changing any of these options triggers a resync of datasets and parameters
        self._changeTrigger = triggerResync(['datasets', 'parameters'])
        self._lumi_filter = config.getLookup('lumi filter', {},
                                             parser=parseLumiFilter,
                                             strfun=strLumi,
                                             onChange=self._changeTrigger)
        if not self._lumi_filter.empty():
            config.set('dataset processor', 'LumiDataProcessor', '+=')
        DataProvider.__init__(self, config, datasetExpr, datasetNick)
        # LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
        self._lumi_query = config.getBool('lumi metadata',
                                          not self._lumi_filter.empty(),
                                          onChange=self._changeTrigger)
        config.set('phedex sites matcher mode', 'shell', '?=')
        # PhEDex blacklist: 'T1_*_Disk nodes allow user jobs - other T1's dont!
        self._phedexFilter = config.getFilter('phedex sites',
                                              '-* T1_*_Disk T2_* T3_*',
                                              defaultMatcher='blackwhite',
                                              defaultFilter='strict',
                                              onChange=self._changeTrigger)
        self._onlyComplete = config.getBool('only complete sites',
                                            True,
                                            onChange=self._changeTrigger)
        self._locationFormat = config.getEnum('location format',
                                              CMSLocationFormat,
                                              CMSLocationFormat.hostname,
                                              onChange=self._changeTrigger)
        # REST client for the PhEDEx block replica data service
        self._pjrc = JSONRestClient(
            url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas'
        )
        # SiteDB is used to map CMS site names to storage element hostnames
        self._sitedb = SiteDB()

        # Dataset expression is split on '@' (dbs instance) and '#' (block name)
        (self._datasetPath, self._datasetInstance,
         self._datasetBlock) = optSplit(datasetExpr, '@#')
        instance_default = config.get('dbs instance',
                                      '',
                                      onChange=self._changeTrigger)
        self._datasetInstance = self._datasetInstance or instance_default
        # Normalize the DBS instance to '<server>/<db>' (default 'prod/global')
        if not self._datasetInstance:
            self._datasetInstance = 'prod/global'
        elif '/' not in self._datasetInstance:
            self._datasetInstance = 'prod/%s' % self._datasetInstance
        self._datasetBlock = self._datasetBlock or 'all'
        self.onlyValid = config.getBool('only valid',
                                        True,
                                        onChange=self._changeTrigger)

    # Define how often the dataprovider can be queried automatically
    def queryLimit(self):
        return 2 * 60 * 60  # 2 hour delay minimum

    # Check if splitterClass is valid
    def checkSplitter(self, splitterClass):
        """Return splitterClass, or HybridSplitter if an active lumi filter
        conflicts with a splitter that needs DataSplitter.Skipped."""
        if (DataSplitter.Skipped in splitterClass.neededEnums()
            ) and not self._lumi_filter.empty():
            self._log.debug(
                'Selected splitter %s is not compatible with active lumi filter!',
                splitterClass.__name__)
            self._log.warning(
                'Active lumi section filter forced selection of HybridSplitter'
            )
            return HybridSplitter
        return splitterClass

    def _replicaLocation(self, replica_info):
        """Yield formatted location strings for one (node, hostname, complete) tuple."""
        (name_node, name_hostname, _) = replica_info
        if self._locationFormat == CMSLocationFormat.siteDB:
            yield name_node
        else:
            # Missing hostnames are resolved via SiteDB
            if name_hostname is not None:
                name_hostnames = [name_hostname]
            else:
                name_hostnames = self._sitedb.cms_name_to_se(name_node)
            for name_hostname in name_hostnames:
                if self._locationFormat == CMSLocationFormat.hostname:
                    yield name_hostname
                else:
                    yield '%s/%s' % (name_node, name_hostname)

    def _fmtLocations(self, replica_infos):
        """Yield all replica locations; incomplete replicas are parenthesized."""
        for replica_info in replica_infos:
            (_, _, completed) = replica_info
            if completed:
                for entry in self._replicaLocation(replica_info):
                    yield entry
            else:
                for entry in self._replicaLocation(replica_info):
                    yield '(%s)' % entry

    def _processReplicas(self, blockPath, replica_infos):
        """Apply site filter and completeness requirement to the replica list
        and return the resulting location list (empty list with warning on failure)."""
        def empty_with_warning(*args):
            self._log.warning(*args)
            return []

        def expanded_replica_locations(replica_infos):
            for replica_info in replica_infos:
                for entry in self._replicaLocation(replica_info):
                    yield entry

        if not replica_infos:
            return empty_with_warning(
                'Dataset block %r has no replica information!', blockPath)
        replica_infos_selected = self._phedexFilter.filterList(
            replica_infos, key=itemgetter(0))
        if not replica_infos_selected:
            return empty_with_warning(
                'Dataset block %r is not available at the selected locations!\nAvailable locations: %s',
                blockPath, str.join(', ', self._fmtLocations(replica_infos)))
        if not self._onlyComplete:
            return list(expanded_replica_locations(replica_infos_selected))
        # keep only replicas whose 'complete' flag (third tuple entry) is set
        replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2],
                                         replica_infos_selected)
        if not replica_infos_complete:
            return empty_with_warning(
                'Dataset block %r is not completely available at the selected locations!\nAvailable locations: %s',
                blockPath, str.join(', ', self._fmtLocations(replica_infos)))
        return list(expanded_replica_locations(replica_infos_complete))

    # Get dataset se list from PhEDex (perhaps concurrent with listFiles)
    def _getPhedexReplicas(self, blockPath, dictReplicas):
        """Fill dictReplicas[blockPath] with (node, se, complete) tuples from PhEDEx."""
        dictReplicas[blockPath] = []
        for phedexBlock in self._pjrc.get(
                params={'block': blockPath})['phedex']['block']:
            for replica in phedexBlock['replica']:
                dictReplicas[blockPath].append(
                    (replica['node'], replica.get('se'),
                     replica['complete'] == 'y'))

    def getDatasets(self):
        """Return the cached dataset path list, expanding DBS wildcards once."""
        if self._cache_dataset is None:
            self._cache_dataset = [self._datasetPath]
            if '*' in self._datasetPath:
                self._cache_dataset = list(
                    self._getCMSDatasets(self._datasetPath))
                if not self._cache_dataset:
                    raise DatasetError(
                        'No datasets selected by DBS wildcard %s !' %
                        self._datasetPath)
        return self._cache_dataset

    def _getCMSBlocks(self, datasetPath, getSites):
        """Yield (blockname, selist) pairs, restricted to the configured block."""
        iter_blockname_selist = self._getCMSBlocksImpl(datasetPath, getSites)
        n_blocks = 0
        selected_blocks = False
        for (blockname, selist) in iter_blockname_selist:
            n_blocks += 1
            if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1]
                                                  != self._datasetBlock):
                continue
            selected_blocks = True
            yield (blockname, selist)
        if (n_blocks > 0) and not selected_blocks:
            raise DatasetError(
                'Dataset %r contains %d blocks, but none were selected by %r' %
                (datasetPath, n_blocks, self._datasetBlock))

    def _fillCMSFiles(self, block, blockPath):
        """Fill block with its file list and per-file run/lumi metadata."""
        lumi_used = False
        lumiDict = {}
        if self._lumi_query:  # central lumi query
            lumiDict = self._getCMSLumisImpl(blockPath)
        fileList = []
        for (fileInfo,
             listLumi) in self._getCMSFilesImpl(blockPath, self.onlyValid,
                                                self._lumi_query):
            if lumiDict and not listLumi:
                listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
            if listLumi:
                # Flatten (run, [lumis]) pairs into two parallel metadata lists
                (listLumiExt_Run, listLumiExt_Lumi) = ([], [])
                for (run, lumi_list) in sorted(listLumi):
                    listLumiExt_Run.extend([run] * len(lumi_list))
                    listLumiExt_Lumi.extend(lumi_list)
                fileInfo[DataProvider.Metadata] = [
                    listLumiExt_Run, listLumiExt_Lumi
                ]
                lumi_used = True
            fileList.append(fileInfo)
        if lumi_used:
            block.setdefault(DataProvider.Metadata,
                             []).extend(['Runs', 'Lumi'])
        block[DataProvider.FileList] = fileList

    def _getCMSLumisImpl(self, blockPath):
        # Subclasses may override to supply a central lumi query; the result is
        # used as a {file url: lumi list} mapping in _fillCMSFiles
        return None

    def _getGCBlocks(self, usePhedex):
        """Yield grid-control block dictionaries for all selected datasets."""
        for datasetPath in self.getDatasets():
            counter = 0
            for (blockPath,
                 replica_infos) in self._getCMSBlocks(datasetPath,
                                                      getSites=not usePhedex):
                result = {}
                result[DataProvider.Dataset] = blockPath.split('#')[0]
                result[DataProvider.BlockName] = blockPath.split('#')[1]

                if usePhedex:  # Start parallel phedex query
                    dictReplicas = {}
                    tPhedex = start_thread(
                        'Query phedex site info for %s' % blockPath,
                        self._getPhedexReplicas, blockPath, dictReplicas)
                    self._fillCMSFiles(result, blockPath)
                    tPhedex.join()
                    replica_infos = dictReplicas.get(blockPath)
                else:
                    self._fillCMSFiles(result, blockPath)
                result[DataProvider.Locations] = self._processReplicas(
                    blockPath, replica_infos)

                # Only blocks with at least one file are yielded
                if len(result[DataProvider.FileList]):
                    counter += 1
                    yield result

            if counter == 0:
                raise DatasetError(
                    'Dataset %s does not contain any valid blocks!' %
                    datasetPath)
class CMSBaseProvider(DataProvider):
	"""Base class for CMS data providers.

	Reads lumi filter, PhEDEx site filter and location format options from
	the config and converts CMS dataset/block/file queries (supplied by
	subclasses via the *Impl methods) into grid-control block dictionaries.
	"""

	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		# Changing any of these options triggers a resync of datasets and parameters
		changeTrigger = triggerResync(['datasets', 'parameters'])
		self._lumi_filter = config.getLookup('lumi filter', {}, parser = parseLumiFilter, strfun = strLumi, onChange = changeTrigger)
		if not self._lumi_filter.empty():
			config.set('dataset processor', 'LumiDataProcessor', '+=')
		DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
		# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
		self._lumi_query = config.getBool('lumi metadata', not self._lumi_filter.empty(), onChange = changeTrigger)
		# PhEDex blacklist: 'T1_DE_KIT', 'T1_US_FNAL' and '*_Disk' allow user jobs - other T1's dont!
		self._phedexFilter = config.getFilter('phedex sites', '-T3_US_FNALLPC',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Filter = config.getFilter('phedex t1 accept', 'T1_DE_KIT T1_US_FNAL',
			defaultMatcher = 'blackwhite', defaultFilter = 'weak', onChange = changeTrigger)
		self._phedexT1Mode = config.getEnum('phedex t1 mode', PhedexT1Mode, PhedexT1Mode.disk, onChange = changeTrigger)
		self.onlyComplete = config.getBool('only complete sites', True, onChange = changeTrigger)
		self._locationFormat = config.getEnum('location format', CMSLocationFormat, CMSLocationFormat.hostname, onChange = changeTrigger)
		# REST client for the PhEDEx block replica data service
		self._pjrc = JSONRestClient(url = 'https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')

		# Dataset expression is split on '@' (dbs instance) and '#' (block name)
		(self._datasetPath, self._url, self._datasetBlock) = optSplit(datasetExpr, '@#')
		self._url = self._url or config.get('dbs instance', '')
		self._datasetBlock = self._datasetBlock or 'all'
		self.onlyValid = config.getBool('only valid', True, onChange = changeTrigger)


	# Define how often the dataprovider can be queried automatically
	def queryLimit(self):
		return 2 * 60 * 60 # 2 hour delay minimum


	# Check if splitterClass is valid
	def checkSplitter(self, splitterClass):
		# Returns splitterClass, or HybridSplitter if an active lumi filter
		# conflicts with a splitter that needs DataSplitter.Skipped
		if (DataSplitter.Skipped in splitterClass.neededEnums()) and not self._lumi_filter.empty():
			self._log.debug('Selected splitter %s is not compatible with active lumi filter!', splitterClass.__name__)
			self._log.warning('Active lumi section filter forced selection of HybridSplitter')
			return HybridSplitter
		return splitterClass


	def _nodeFilter(self, nameSiteDB, complete):
		# Decide whether a PhEDEx node may contribute to the location list
		# Remove T0 and T1 by default
		result = not (nameSiteDB.startswith('T0_') or nameSiteDB.startswith('T1_'))
		# check if listed on the accepted list
		if self._phedexT1Mode in [PhedexT1Mode.disk, PhedexT1Mode.accept]:
			result = result or (self._phedexT1Filter.filterList([nameSiteDB]) == [nameSiteDB])
		if self._phedexT1Mode == PhedexT1Mode.disk:
			result = result or nameSiteDB.lower().endswith('_disk')
		# apply phedex blacklist
		result = result and (self._phedexFilter.filterList([nameSiteDB]) == [nameSiteDB])
		# check for completeness at the site
		result = result and (complete or not self.onlyComplete)
		return result


	# Get dataset se list from PhEDex (perhaps concurrent with listFiles)
	def _getPhedexSEList(self, blockPath, dictSE):
		# Fills dictSE[blockPath] with formatted locations of accepted replicas
		dictSE[blockPath] = []
		for phedexBlock in self._pjrc.get(params = {'block': blockPath})['phedex']['block']:
			for replica in phedexBlock['replica']:
				if self._nodeFilter(replica['node'], replica['complete'] == 'y'):
					location = None
					if self._locationFormat == CMSLocationFormat.hostname:
						location = replica.get('se')
					elif self._locationFormat == CMSLocationFormat.siteDB:
						location = replica.get('node')
					elif (self._locationFormat == CMSLocationFormat.both) and (replica.get('node') or replica.get('se')):
						location = '%s/%s' % (replica.get('node'), replica.get('se'))
					if location:
						dictSE[blockPath].append(location)
					else:
						# replica lacks the field needed by the chosen format
						self._log.warning('Dataset block %s replica at %s / %s is skipped!',
							blockPath, replica.get('node'), replica.get('se'))


	def getDatasets(self):
		# Returns the cached dataset path list, expanding DBS wildcards once
		if self._cache_dataset is None:
			self._cache_dataset = [self._datasetPath]
			if '*' in self._datasetPath:
				self._cache_dataset = list(self.getCMSDatasets(self._datasetPath))
				if not self._cache_dataset:
					raise DatasetError('No datasets selected by DBS wildcard %s !' % self._datasetPath)
		return self._cache_dataset


	def getCMSBlocks(self, datasetPath, getSites):
		# Yields (blockname, selist) pairs, restricted to the configured block
		iter_blockname_selist = self.getCMSBlocksImpl(datasetPath, getSites)
		n_blocks = 0
		selected_blocks = False
		for (blockname, selist) in iter_blockname_selist:
			n_blocks += 1
			if (self._datasetBlock != 'all') and (str.split(blockname, '#')[1] != self._datasetBlock):
				continue
			selected_blocks = True
			yield (blockname, selist)
		if (n_blocks > 0) and not selected_blocks:
			raise DatasetError('Dataset %r contains %d blocks, but none were selected by %r' % (datasetPath, n_blocks, self._datasetBlock))


	def fillCMSFiles(self, block, blockPath):
		# Fills block with its file list and per-file run/lumi metadata
		lumi_used = False
		lumiDict = {}
		if self._lumi_query: # central lumi query
			lumiDict = self.getCMSLumisImpl(blockPath)
		fileList = []
		for (fileInfo, listLumi) in self.getCMSFilesImpl(blockPath, self.onlyValid, self._lumi_query):
			if lumiDict and not listLumi:
				listLumi = lumiDict.get(fileInfo[DataProvider.URL], [])
			if listLumi:
				# Flatten (run, [lumis]) pairs into two parallel metadata lists
				(listLumiExt_Run, listLumiExt_Lumi) = ([], [])
				for (run, lumi_list) in sorted(listLumi):
					listLumiExt_Run.extend([run] * len(lumi_list))
					listLumiExt_Lumi.extend(lumi_list)
				fileInfo[DataProvider.Metadata] = [listLumiExt_Run, listLumiExt_Lumi]
				lumi_used = True
			fileList.append(fileInfo)
		if lumi_used:
			block.setdefault(DataProvider.Metadata, []).extend(['Runs', 'Lumi'])
		block[DataProvider.FileList] = fileList


	def getCMSLumisImpl(self, blockPath):
		# Subclasses may override to supply a central lumi query; the result is
		# used as a {file url: lumi list} mapping in fillCMSFiles
		return None


	def getGCBlocks(self, usePhedex):
		# Yields grid-control block dictionaries for all selected datasets
		for datasetPath in self.getDatasets():
			counter = 0
			for (blockPath, listSE) in self.getCMSBlocks(datasetPath, getSites = not usePhedex):
				result = {}
				result[DataProvider.Dataset] = blockPath.split('#')[0]
				result[DataProvider.BlockName] = blockPath.split('#')[1]

				if usePhedex: # Start parallel phedex query
					dictSE = {}
					tPhedex = start_thread('Query phedex site info for %s' % blockPath, self._getPhedexSEList, blockPath, dictSE)
					self.fillCMSFiles(result, blockPath)
					tPhedex.join()
					listSE = dictSE.get(blockPath)
				else:
					self.fillCMSFiles(result, blockPath)
				result[DataProvider.Locations] = listSE

				# Only blocks with at least one file are yielded
				if len(result[DataProvider.FileList]):
					counter += 1
					yield result

			if counter == 0:
				raise DatasetError('Dataset %s does not contain any valid blocks!' % datasetPath)