def __init__(self, config, datasource_name, dataset_expr,
		dataset_nick, dataset_proc, scanner_list_default):
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	# Configure scanners
	scanner_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
	self._interactive_assignment = config.is_interactive('dataset name assignment', True)

	def _create_scanner(scanner_name):
		return InfoScanner.create_instance(scanner_name, scanner_config, datasource_name)
	scanner_list = scanner_config.get_list('scanner', scanner_list_default) + ['NullScanner']
	self._scanner_list = lmap(_create_scanner, scanner_list)

	# Configure dataset / block naming and selection
	def _setup(prefix):
		selected_hash_list = scanner_config.get_list(join_config_locations(prefix, 'key select'), [])
		name = scanner_config.get(join_config_locations(prefix, 'name pattern'), '')
		return (selected_hash_list, name)
	(self._selected_hash_list_dataset, self._dataset_pattern) = _setup('dataset')
	(self._selected_hash_list_block, self._block_pattern) = _setup('block')

	# Configure hash input for separation of files into datasets / blocks
	def _get_active_hash_input(prefix, guard_entry_idx):
		hash_input_list_user = scanner_config.get_list(join_config_locations(prefix, 'hash keys'), [])
		hash_input_list_guard = scanner_config.get_list(join_config_locations(prefix, 'guard override'),
			lchain(imap(lambda scanner: scanner.get_guard_keysets()[guard_entry_idx], self._scanner_list)))
		return hash_input_list_user + hash_input_list_guard
	self._hash_input_set_dataset = _get_active_hash_input('dataset', 0)
	self._hash_input_set_block = _get_active_hash_input('block', 1)
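# --- Illustration (not grid-control code): the hash inputs configured above drive how
# scanned files are separated into datasets / blocks - files whose selected metadata
# keys hash identically end up in the same group. A minimal standalone sketch, with
# hypothetical names (group_by_hash, FILE_METADATA_LIST):
import hashlib

def group_by_hash(file_metadata_list, hash_keys):
	groups = {}
	for metadata in file_metadata_list:
		key_repr = str([metadata.get(key) for key in hash_keys])
		group_hash = hashlib.md5(key_repr.encode()).hexdigest()
		groups.setdefault(group_hash, []).append(metadata)
	return groups

FILE_METADATA_LIST = [
	{'OUTPUT_BASE': '/store/user/a', 'CONFIG_HASH': 'deadbeef'},
	{'OUTPUT_BASE': '/store/user/a', 'CONFIG_HASH': 'cafebabe'},
]
# Two distinct CONFIG_HASH values => two dataset groups when it is a hash key
print(len(group_by_hash(FILE_METADATA_LIST, ['OUTPUT_BASE', 'CONFIG_HASH'])))  # 2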
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
	self._lumi_filter = dataset_config.get_lookup(['lumi filter', '%s lumi filter' % datasource_name],
		default={}, parser=parse_lumi_filter, strfun=str_lumi)
	if not self._lumi_filter.empty():
		config.set('%s processor' % datasource_name, 'LumiDataProcessor', '+=')
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	# LumiDataProcessor instantiated in DataProcessor.__init__ will set lumi metadata as well
	self._lumi_query = dataset_config.get_bool(['lumi metadata', '%s lumi metadata' % datasource_name],
		default=not self._lumi_filter.empty())
	config.set('phedex sites matcher mode', 'ShellStyleMatcher', '?=')
	# PhEDEx blacklist: T1_*_Disk nodes allow user jobs - other T1 nodes do not!
	self._phedex_filter = dataset_config.get_filter('phedex sites', '-* T1_*_Disk T2_* T3_*',
		default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')
	self._only_complete = dataset_config.get_bool('only complete sites', True)
	self._only_valid = dataset_config.get_bool('only valid', True)
	self._allow_phedex = dataset_config.get_bool('allow phedex', True)
	self._location_format = dataset_config.get_enum('location format',
		CMSLocationFormat, CMSLocationFormat.hostname)
	self._pjrc = JSONRestClient(url='https://cmsweb.cern.ch/phedex/datasvc/json/prod/blockreplicas')
	self._sitedb = SiteDB()

	dataset_expr_parts = split_opt(dataset_expr, '@#')
	(self._dataset_path, self._dataset_instance, self._dataset_block_selector) = dataset_expr_parts
	instance_default = dataset_config.get('dbs instance', '')
	self._dataset_instance = self._dataset_instance or instance_default
	if not self._dataset_instance:
		self._dataset_instance = 'prod/global'
	elif '/' not in self._dataset_instance:
		self._dataset_instance = 'prod/%s' % self._dataset_instance
	self._dataset_block_selector = self._dataset_block_selector or 'all'
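# --- Illustration (not the real split_opt helper): a dataset expression has the shape
# '<dataset path>[@<dbs instance>][#<block selector>]'. The hypothetical parser below
# only approximates the decomposition the constructor above relies on:
def split_dataset_expr(dataset_expr):
	(path, _, block_selector) = dataset_expr.partition('#')
	(path, _, instance) = path.partition('@')
	return (path, instance, block_selector)

# ('/A/B/C', 'int', 'block1') - with the defaults above, the instance becomes
# 'prod/int' and an empty block selector would become 'all'
print(split_dataset_expr('/A/B/C@int#block1'))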
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	CMSBaseProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	self._url = config.get('das instance', 'https://cmsweb.cern.ch/das/cache',
		on_change=TriggerResync(['datasets', 'parameters']))
	if self._dataset_instance.startswith('http'):
		self._url = self._dataset_instance
		self._dataset_instance = ''
	self._gjrc = DASRestClient(get_cms_cert(config), self._url,
		'VOMS proxy needed to query DAS!', UserError)
def __new__(cls, config, datasource_name, repository, keep_old=True):
	provider_name_default = config.get(
		['default provider', '%s provider' % datasource_name], 'ListProvider')
	provider = config.get_composited_plugin(datasource_name, '', ':ThreadedMultiDatasetProvider:',
		cls=DataProvider, require_plugin=False,
		on_change=TriggerResync(['datasets', 'parameters']),
		bind_kwargs={'datasource_name': datasource_name,
			'provider_name_default': provider_name_default})
	if not provider:
		return NullParameterSource()
	instance = BaseDataParameterSource.__new__(cls)
	instance.provider = provider
	return instance
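# --- Illustration (hypothetical names): the __new__ above is a null-object fallback -
# when no provider is configured, construction returns a NullParameterSource-like
# stand-in instead of an instance of cls, so callers never need a None check:
class NullSource(object):
	def fill_parameter_content(self, pnum, result):
		pass  # nothing to contribute

class DataSource(object):
	def __new__(cls, provider):
		if not provider:
			return NullSource()  # not an instance of cls => __init__ is skipped
		return object.__new__(cls)

	def __init__(self, provider):
		self.provider = provider

print(type(DataSource(None)).__name__)     # NullSource
print(type(DataSource('list')).__name__)   # DataSource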
def __init__(self, config, datasource_name, repository, reader=None):
	LimitedResyncParameterSource.__init__(self)
	# needed for backwards compatible file names: datacache/datamap
	self._name = datasource_name.replace('dataset', 'data')
	(self._reader, self._len) = (None, None)
	self._set_reader(reader)
	self._part_proc = config.get_composited_plugin(
		['partition processor', '%s partition processor' % datasource_name],
		'TFCPartitionProcessor LocationPartitionProcessor ' +
		'MetaPartitionProcessor BasicPartitionProcessor',
		'MultiPartitionProcessor', cls=PartitionProcessor,
		on_change=TriggerResync(['parameters']), pargs=(datasource_name,))
	self._log.debug('%s: Using partition processor %s', datasource_name, repr(self._part_proc))
	repository['dataset:%s' % self._name] = self
def _setup_repository(self, config, psrc_repository):
	TaskModule._setup_repository(self, config, psrc_repository)
	psrc_list = []
	for datasource_name in config.get_list('datasource names', ['dataset'],
			on_change=TriggerResync(['datasets', 'parameters'])):
		data_config = config.change_view(view_class='TaggedConfigView',
			add_sections=[datasource_name])
		self._create_datasource(data_config, datasource_name, psrc_repository, psrc_list)
	self._has_dataset = (psrc_list != [])

	# Register signal handler for manual dataset refresh
	def _external_refresh(sig, frame):
		for psrc in psrc_list:
			self._log.info('External signal triggered resync of datasource %r',
				psrc.get_datasource_name())
			psrc.setup_resync(force=True)
	signal.signal(signal.SIGUSR2, _external_refresh)
	config.set_state(False, 'resync', detail='datasets')
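# --- Illustration (standalone, assumed names): the SIGUSR2 hookup above means a plain
# 'kill -USR2 <pid>' from the shell requests a dataset resync. The same mechanism in
# miniature - the handler only flips a flag that the main loop would poll later:
import os
import signal

RESYNC_REQUESTED = {'flag': False}

def _request_resync(sig, frame):
	RESYNC_REQUESTED['flag'] = True

signal.signal(signal.SIGUSR2, _request_resync)
os.kill(os.getpid(), signal.SIGUSR2)  # simulate the external 'kill -USR2'
print(RESYNC_REQUESTED['flag'])  # True - the next cycle would resync the datasets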
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	ConfigurablePlugin.__init__(self, config)
	self._log = logging.getLogger('%s.provider' % datasource_name)
	(self._datasource_name, self._dataset_expr) = (datasource_name, dataset_expr)
	self._dataset_nick_override = dataset_nick
	(self._cache_block, self._cache_dataset) = (None, None)
	self._dataset_query_interval = config.get_time(
		'%s default query interval' % datasource_name, 60, on_change=None)

	self._stats = dataset_proc or DataProcessor.create_instance('SimpleStatsDataProcessor',
		config, datasource_name, self._log,
		' * Dataset %s:\n\tcontains ' % repr(dataset_nick or dataset_expr))
	dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
	self._nick_producer = dataset_config.get_plugin(
		['nickname source', '%s nickname source' % datasource_name],
		'SimpleNickNameProducer', cls=DataProcessor, pargs=(datasource_name,))
	self._dataset_processor = dataset_proc or dataset_config.get_composited_plugin(
		'%s processor' % datasource_name,
		'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor ' +
		'URLCountDataProcessor EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor ' +
		'LocationDataProcessor', 'MultiDataProcessor',
		cls=DataProcessor, pargs=(datasource_name,))
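# --- Illustration (hypothetical CachedQuery class): the query interval configured above
# suggests time-based caching of provider lookups - results are reused until the
# interval elapses, then re-queried. A minimal sketch of that pattern:
import time

class CachedQuery(object):
	def __init__(self, query_fun, interval_sec):
		(self._query_fun, self._interval) = (query_fun, interval_sec)
		(self._result, self._timestamp) = (None, None)

	def get(self):
		if (self._timestamp is None) or (time.time() - self._timestamp > self._interval):
			self._result = self._query_fun()  # expensive remote lookup
			self._timestamp = time.time()
		return self._result

block_query = CachedQuery(lambda: ['block1', 'block2'], interval_sec=60)
print(block_query.get())  # first call queries; calls within 60s reuse the result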
def _on_change(config, old_obj, cur_obj, cur_entry, obj2str):
	self._log.critical('Dataset %r changed', dataset_expr)
	return TriggerResync(['datasets', 'parameters'])(config, old_obj, cur_obj, cur_entry, obj2str)