def _getResyncSource(self, psource_old, psource_new, mapJob2PID, pAdded, pMissing, disableNewPNum):
    # Construct complete parameter space psource with missing parameter entries and intervention state
    # NNNNNNNNNNNNN OOOOOOOOO | source: NEW (==self) and OLD (==from file)
    # <same><added> <missing> | same: both in NEW and OLD, added: only in NEW, missing: only in OLD
    oldMaxJobs = psource_old.getMaxJobs()
    # assign sequential job numbers to the added parameter entries
    sort_inplace(pAdded, key = itemgetter('GC_PARAM'))
    for (idx, entry) in enumerate(pAdded):
        if oldMaxJobs + idx != entry['GC_PARAM']:
            mapJob2PID[oldMaxJobs + idx] = entry['GC_PARAM']
    missingInfos = []
    newMaxJobs = psource_new.getMaxJobs()
    sort_inplace(pMissing, key = itemgetter('GC_PARAM'))
    for (idx, entry) in enumerate(pMissing):
        mapJob2PID[entry['GC_PARAM']] = newMaxJobs + idx
        tmp = psource_old.getJobInfo(newMaxJobs + idx, entry['GC_PARAM'])
        tmp.pop('GC_PARAM')
        if tmp[ParameterInfo.ACTIVE]:
            tmp[ParameterInfo.ACTIVE] = False
            disableNewPNum.add(newMaxJobs + idx)
        missingInfos.append(tmp)
    if missingInfos:
        return self._createAggregatedSource(psource_old, psource_new, missingInfos)
    return self._source

def get_dataset_info(opts, args, query_blocks=True):
    config = get_dataset_config(opts, args)
    if opts.threads is not None:
        config.set_int('dataprovider thread max', int(opts.threads) or 1)
    provider = config.get_composited_plugin('dataset', cls=DataProvider,
        bind_kwargs={'provider_name_default': config.get('dataset provider')},
        default_compositor=':ThreadedMultiDatasetProvider:')
    dataset_list = sorted(provider.get_dataset_name_list())
    if len(dataset_list) == 0:
        raise DatasetError('No datasets matched!')
    # Query blocks only if needed
    query_blocks = False
    for option in opts.__dict__:
        if option.startswith('list_') and (option != 'list_dataset_names') or (option == 'save'):
            if getattr(opts, option):
                query_blocks = True
    block_list = None
    if query_blocks:
        block_list = provider.get_block_list_cached(show_stats=False)
        if len(block_list) == 0:
            raise DatasetError('No blocks matched!')
        if opts.ordered:
            sort_inplace(block_list, key=itemgetter(DataProvider.Dataset, DataProvider.BlockName))
            for block in block_list:
                sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    return (provider, dataset_list, block_list)

def process(self, blockIter):
    if self._sortDS:
        dsCache = {}
        for block in blockIter:
            dsCache.setdefault(block[DataProvider.Dataset], []).append(block)

        def ds_generator():
            for ds in sorted(dsCache):
                if self._sortBlock:
                    sort_inplace(dsCache[ds], key=itemgetter(DataProvider.BlockName))
                for block in dsCache[ds]:
                    yield block
        blockIter = ds_generator()
    elif self._sortBlock:
        blockIter = sorted(blockIter, key=itemgetter(DataProvider.BlockName))
    # Yield blocks
    for block in blockIter:
        if self._sortFiles:
            sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
        if self._sortLocation:
            sort_inplace(block[DataProvider.Locations])
        yield block

def _handle_matching_block(block_list_added, block_list_missing, block_list_matching,
        block_old, block_new):
    # Compare different files according to their name - NOT full content
    get_file_key = itemgetter(DataProvider.URL)
    sort_inplace(block_old[DataProvider.FileList], key=get_file_key)
    sort_inplace(block_new[DataProvider.FileList], key=get_file_key)

    def _handle_matching_fi(fi_list_added, fi_list_missing, fi_list_matched, fi_old, fi_new):
        fi_list_matched.append((fi_old, fi_new))
    (fi_list_added, fi_list_missing, fi_list_matched) = get_list_difference(
        block_old[DataProvider.FileList], block_new[DataProvider.FileList],
        get_file_key, _handle_matching_fi, is_sorted=True)
    if fi_list_added:  # Create new block for added files in an existing block
        block_added = copy.copy(block_new)
        block_added[DataProvider.FileList] = fi_list_added
        block_added[DataProvider.NEntries] = sum(imap(itemgetter(DataProvider.NEntries), fi_list_added))
        block_list_added.append(block_added)
    block_list_matching.append((block_old, block_new, fi_list_missing, fi_list_matched))

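# Note: several of the resync helpers in this collection (_handle_matching_block above,
# _diff_pspi_list further down) delegate to a three-way sorted-merge diff named
# get_list_difference, which is not part of this collection. The following is a minimal
# sketch of how such a diff could be written, assuming the callback signature used above
# (added, missing, matched, item_old, item_new) - the actual grid-control implementation
# may differ in details.
def get_list_difference(list_old, list_new, key_fn, on_match, is_sorted=False):
    (list_added, list_missing, list_matched) = ([], [], [])
    if not is_sorted:  # work on copies sorted by the shared key
        list_old = sorted(list_old, key=key_fn)
        list_new = sorted(list_new, key=key_fn)
    (idx_old, idx_new) = (0, 0)
    while (idx_old < len(list_old)) and (idx_new < len(list_new)):
        (key_old, key_new) = (key_fn(list_old[idx_old]), key_fn(list_new[idx_new]))
        if key_old < key_new:  # entry only exists in the old list
            list_missing.append(list_old[idx_old])
            idx_old += 1
        elif key_old > key_new:  # entry only exists in the new list
            list_added.append(list_new[idx_new])
            idx_new += 1
        else:  # entry exists in both lists - let the callback decide how to record the match
            on_match(list_added, list_missing, list_matched, list_old[idx_old], list_new[idx_new])
            (idx_old, idx_new) = (idx_old + 1, idx_new + 1)
    list_missing.extend(list_old[idx_old:])
    list_added.extend(list_new[idx_new:])
    return (list_added, list_missing, list_matched)
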
def save_dataset(opts, provider):
    print('')
    blocks = provider.getBlocks()
    if opts.ordered:
        sort_inplace(blocks, key = itemgetter(DataProvider.Dataset, DataProvider.BlockName))
        for b in blocks:
            sort_inplace(b[DataProvider.FileList], key = itemgetter(DataProvider.URL))
    DataProvider.saveToFile(opts.save, blocks)
    print('Dataset information saved to ./%s' % opts.save)

def _finish_partition(self, block, partition, fi_list=None):
    # Copy infos from block
    for (dp_prop, ds_prop) in self._dp_ds_prop_list:
        if dp_prop in block:
            partition[ds_prop] = block[dp_prop]
    if DataProvider.Metadata in block:
        partition[DataSplitter.MetadataHeader] = block[DataProvider.Metadata]
    # Helper for very simple splitter
    if fi_list:
        partition[DataSplitter.FileList] = lmap(itemgetter(DataProvider.URL), fi_list)
        partition[DataSplitter.NEntries] = sum(imap(itemgetter(DataProvider.NEntries), fi_list))
        if DataProvider.Metadata in block:
            partition[DataSplitter.Metadata] = lmap(itemgetter(DataProvider.Metadata), fi_list)
    return partition

def _iter_blocks_by_dataset(self, map_dataset2block_list):
    for dataset_name in sorted(map_dataset2block_list):
        if self._sort_block:
            sort_inplace(map_dataset2block_list[dataset_name], key=itemgetter(DataProvider.BlockName))
        for block in map_dataset2block_list[dataset_name]:
            yield block

def __init__(self, block_list_old, block_list_new):
    activity = Activity('Performing resynchronization of dataset')
    block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
    (self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
    for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
        sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    activity.finish()

def _getBlocksInternal(self):
    # Split files into blocks/datasets via key functions and determine metadata intersection
    (protoBlocks, commonDS, commonB) = ({}, {}, {})

    def getActiveKeys(kUser, kGuard, gIdx):
        return kUser + (kGuard or lchain(imap(lambda x: x.getGuards()[gIdx], self._scanner)))
    keysDS = getActiveKeys(self._ds_keys_user, self._ds_keys_guard, 0)
    keysB = getActiveKeys(self._b_keys_user, self._b_keys_guard, 1)
    for fileInfo in ifilter(itemgetter(0), self._collectFiles()):
        hashDS = self._generateKey(keysDS,
            md5_hex(repr(self._datasetExpr)) + md5_hex(repr(self._datasetNick)), *fileInfo)
        hashB = self._generateKey(keysB, hashDS + md5_hex(repr(fileInfo[3])), *fileInfo)  # [3] == SE list
        if not self._ds_select or (hashDS in self._ds_select):
            if not self._b_select or (hashB in self._b_select):
                fileInfo[1].update({'DS_KEY': hashDS, 'BLOCK_KEY': hashB})
                protoBlocks.setdefault(hashDS, {}).setdefault(hashB, []).append(fileInfo)
                utils.intersectDict(commonDS.setdefault(hashDS, dict(fileInfo[1])), fileInfo[1])
                utils.intersectDict(commonB.setdefault(hashDS, {}).setdefault(hashB, dict(fileInfo[1])), fileInfo[1])
    # Generate names for blocks/datasets using common metadata
    (hashNameDictDS, hashNameDictB) = ({}, {})
    for hashDS in protoBlocks:
        hashNameDictDS[hashDS] = self._generateDatasetName(hashDS, commonDS[hashDS])
        for hashB in protoBlocks[hashDS]:
            hashNameDictB[hashB] = (hashDS, self._generateBlockName(hashB, commonB[hashDS][hashB]))
    self._findCollision('dataset', hashNameDictDS, commonDS, keysDS, lambda name, key: [key])
    self._findCollision('block', hashNameDictB, commonB, keysDS + keysB,
        lambda name, key: [name[0], key], lambda name: name[1])
    for block in self._buildBlocks(protoBlocks, hashNameDictDS, hashNameDictB):
        yield block

def iter_blocks_normed(self):
    activity = Activity('Retrieving %s' % self._dataset_expr)
    try:
        # Validation, Naming:
        for block in self._iter_blocks_raw():
            if not block.get(DataProvider.Dataset):
                raise DatasetError('Block does not contain the dataset name!')
            block.setdefault(DataProvider.BlockName, '0')
            block.setdefault(DataProvider.Provider, self.__class__.__name__)
            block.setdefault(DataProvider.Query, self._dataset_expr)
            block.setdefault(DataProvider.Locations, None)
            events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
            block.setdefault(DataProvider.NEntries, events)
            if self._dataset_nick_override:
                block[DataProvider.Nickname] = self._dataset_nick_override
            elif self._nick_producer:
                block = self._nick_producer.process_block(block)
                if not block:
                    raise DatasetError('Nickname producer failed!')
            yield block
    except Exception:
        raise DatasetRetrievalError('Unable to retrieve dataset %s' % repr(self._dataset_expr))
    activity.finish()

def _build_blocks(self, map_key2fm_list, map_key2name, map_key2metadata_dict):
    # Return named dataset
    for key in sorted(map_key2fm_list):
        result = {
            DataProvider.Dataset: map_key2name[key[:1]],
            DataProvider.BlockName: map_key2name[key[:2]],
        }
        fm_list = map_key2fm_list[key]
        # Determine location_list
        location_list = None
        for file_location_list in ifilter(lambda s: s is not None, imap(itemgetter(3), fm_list)):
            location_list = location_list or []
            location_list.extend(file_location_list)
        if location_list is not None:
            result[DataProvider.Locations] = list(UniqueList(location_list))
        # use first file [0] to get the initial metadata_dict [1]
        metadata_name_list = list(fm_list[0][1].keys())
        result[DataProvider.Metadata] = metadata_name_list

        # translate file metadata into data provider file info entries
        def _translate_fm2fi(url, metadata_dict, entries, location_list, obj_dict):
            if entries is None:
                entries = -1
            return {DataProvider.URL: url, DataProvider.NEntries: entries,
                DataProvider.Metadata: lmap(metadata_dict.get, metadata_name_list)}
        result[DataProvider.FileList] = lsmap(_translate_fm2fi, fm_list)
        yield result

def _diffParams(self, psource_old, psource_new, mapJob2PID, redoNewPNum, disableNewPNum):
    # Reduces psource output to essential information for diff - faster than keying
    def translatePSource(psource):
        keys_store = sorted(ifilter(lambda k: not k.untracked, psource.getJobKeys()))

        def translateEntry(meta):  # Translates parameter setting into hash
            tmp = md5()
            for key in ifilter(lambda k: k in meta, keys_store):
                if str(meta[key]):
                    tmp.update(str2bytes(key))
                    tmp.update(str2bytes(str(meta[key])))
            return {ParameterInfo.HASH: tmp.hexdigest(), 'GC_PARAM': meta['GC_PARAM'],
                ParameterInfo.ACTIVE: meta[ParameterInfo.ACTIVE]}
        for entry in psource.iterJobs():
            yield translateEntry(entry)

    params_old = list(translatePSource(psource_old))
    params_new = list(translatePSource(psource_new))

    def sameParams(paramsAdded, paramsMissing, paramsSame, oldParam, newParam):
        mapJob2PID[oldParam['GC_PARAM']] = newParam['GC_PARAM']
        if not oldParam[ParameterInfo.ACTIVE] and newParam[ParameterInfo.ACTIVE]:
            redoNewPNum.add(newParam['GC_PARAM'])
        if oldParam[ParameterInfo.ACTIVE] and not newParam[ParameterInfo.ACTIVE]:
            disableNewPNum.add(newParam['GC_PARAM'])

    return utils.DiffLists(params_old, params_new, itemgetter(ParameterInfo.HASH), sameParams)

def _processReplicas(self, blockPath, replica_infos):
    def empty_with_warning(*args):
        self._log.warning(*args)
        return []

    def expanded_replica_locations(replica_infos):
        for replica_info in replica_infos:
            for entry in self._replicaLocation(replica_info):
                yield entry

    if not replica_infos:
        return empty_with_warning('Dataset block %r has no replica information!', blockPath)
    replica_infos_selected = self._phedexFilter.filterList(replica_infos, key=itemgetter(0))
    if not replica_infos_selected:
        return empty_with_warning('Dataset block %r is not available at the selected locations!\nAvailable locations: %s',
            blockPath, str.join(', ', self._fmtLocations(replica_infos)))
    if not self._onlyComplete:
        return list(expanded_replica_locations(replica_infos_selected))
    replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
    if not replica_infos_complete:
        return empty_with_warning('Dataset block %r is not completely available at the selected locations!\nAvailable locations: %s',
            blockPath, str.join(', ', self._fmtLocations(replica_infos)))
    return list(expanded_replica_locations(replica_infos_complete))

def display(self):
    stateMap = dict(self._stateMap)

    def transform(data, label, level):
        if None in data:
            total = data.pop(None)
            if (len(data) > 1):
                for result in self._get_entry(stateMap, total, ['Total']):
                    yield result
                yield '='
        for idx, entry in enumerate(sorted(data)):
            if level == 1:
                for result in self._get_entry(stateMap, data[entry], [entry] + label):
                    yield result
            else:
                for result in transform(data[entry], [entry] + label, level - 1):
                    yield result
            if idx != len(data) - 1:
                yield '-'

    stats = self._getHierachicalStats()
    displayStates = lmap(itemgetter(1), self._stateMap)
    header = [('', 'Category')] + lzip(displayStates, displayStates)
    printTabular(header, transform(stats, [], len(self._idxList)),
        fmtString = 'l' + 'c' * len(stateMap), fmt = {'': lambda x: str.join(' ', x)})
    return 0

def _get_entries_for_url(url):
    fi = _fast_search(block_old[DataProvider.FileList], itemgetter(DataProvider.URL), url)
    if not fi:
        raise Exception('url %s not found in block %s\n%s' % (url, block_old, partition))
    return fi[DataProvider.NEntries]

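# Note: _fast_search, used here and in the partition resync code below, is assumed to be a
# binary-search lookup over a list that is already sorted by the given key (the file lists
# are sorted by URL exactly for this purpose). A minimal sketch of such a helper - not the
# actual grid-control implementation:
def _fast_search(lst, key_fn, value):
    # binary search in a list sorted by key_fn; return the matching item or None
    (lo, hi) = (0, len(lst))
    while lo < hi:
        mid = (lo + hi) // 2
        if key_fn(lst[mid]) < value:
            lo = mid + 1
        else:
            hi = mid
    if (lo < len(lst)) and (key_fn(lst[lo]) == value):
        return lst[lo]
    return None
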
def __init__(self, name, container_old, container_cur, parent=None,
        set_sections=unspecified, add_sections=None,
        set_names=unspecified, add_names=None,
        set_tags=unspecified, add_tags=None,
        set_classes=unspecified, add_classes=None, inherit_sections=False):
    parent = parent or self
    if inherit_sections and isinstance(parent, TaggedConfigView):
        add_sections = (parent.get_class_section_list() or []) + (add_sections or [])
    SimpleConfigView.__init__(self, name, container_old, container_cur, parent,
        set_sections=set_sections, add_sections=add_sections)
    self._class_section_list = self._init_variable(parent, '_class_section_list', None,
        set_classes, add_classes, norm_config_locations, lambda x: x.config_section_list)
    self._section_name_list = self._init_variable(parent, '_section_name_list', [],
        set_names, add_names, norm_config_locations)

    def _get_tag_tuple(tag_obj):
        try:
            config_tag_name = tag_obj.config_tag_name.lower()
        except Exception:
            raise APIError('Class %r does not define a valid tag name!' % tag_obj.__class__.__name__)
        return [(config_tag_name, tag_obj.get_object_name().lower())]
    self._section_tag_list = self._init_variable(parent, '_section_tag_list', [],
        set_tags, add_tags, identity, _get_tag_tuple)
    self._section_tag_order = lmap(itemgetter(0), self._section_tag_list)

def _addEntry(self, container, section, option, value, source):
    option = option.strip()
    opttype = '='
    if option[-1] in imap(itemgetter(0), ConfigEntry.OptTypeDesc.keys()):
        opttype = option[-1] + '='
        option = option[:-1].strip()
    container.append(ConfigEntry(section.strip(), option, value.strip(), opttype, source))

def show_report(self, job_db, jobnum_list):
    state_map = dict(self._state_map)

    def _transform(data, label, level):
        if None in data:
            total = data.pop(None)
            if len(data) > 1:
                for result in self._get_entry(state_map, total, ['Total']):
                    yield result
                yield '='
        for idx, entry in enumerate(sorted(data)):
            if level == 1:
                for result in self._get_entry(state_map, data[entry], [entry] + label):
                    yield result
            else:
                for result in _transform(data[entry], [entry] + label, level - 1):
                    yield result
            if idx != len(data) - 1:
                yield '-'

    stats = self._get_hierachical_stats_dict(job_db, jobnum_list)
    displace_states_list = lmap(itemgetter(1), self._state_map)
    header = [('', 'Category')] + lzip(displace_states_list, displace_states_list)
    self._show_table(header, _transform(stats, [], len(self._idx_list)),
        align_str='l' + 'c' * len(state_map), fmt_dict={'': lambda x: str.join(' ', x)})

def main(opts, args):
    config = get_dataset_config(opts, args)
    provider = config.getPlugin('dataset', cls = DataProvider)
    blocks = provider.getBlocks()
    if len(blocks) == 0:
        raise DatasetError('No blocks!')
    datasets = set(imap(itemgetter(DataProvider.Dataset), blocks))
    if len(datasets) > 1 or opts.info:
        headerbase = [(DataProvider.Dataset, 'Dataset')]
    else:
        print('Dataset: %s' % blocks[0][DataProvider.Dataset])
        headerbase = []
    if opts.list_datasets:
        list_datasets(blocks)
    if opts.list_blocks:
        list_blocks(blocks, headerbase)
    if opts.list_files:
        list_files(datasets, blocks)
    if opts.list_storage:
        list_storage(blocks, headerbase)
    if opts.metadata and not opts.save:
        list_metadata(datasets, blocks)
    if opts.block_metadata and not opts.save:
        list_block_metadata(datasets, blocks)
    if opts.config_entry:
        list_config_entries(opts, blocks, provider)
    if opts.info:
        list_infos(blocks)
    if opts.save:
        save_dataset(opts, provider)

def _resync_files(self, splitter, partition_mod, partition_num, size_list,
        fi_list_missing, fi_list_matched, block_new, metadata_setup_list, partition_list_added):
    # resync a single file in the partition, return next file index to process
    # Select processing mode for job (disable > complete > changed > ignore)
    # [ie. disable overrides all] using min
    # Result: one of [disable, complete, ignore] (changed -> complete or ignore)
    fi_idx = 0
    metadata_list_current = []
    proc_mode = ResyncMode.ignore
    while fi_idx < len(partition_mod[DataSplitter.FileList]):
        fi_removed = _fast_search(fi_list_missing, itemgetter(DataProvider.URL),
            partition_mod[DataSplitter.FileList][fi_idx])
        if fi_removed:
            proc_mode = self._handle_removed_file(proc_mode, fi_idx, partition_mod, size_list, fi_removed)
        else:
            (proc_mode, fi_idx) = self._handle_changed_file(splitter, proc_mode, fi_idx,
                partition_mod, partition_num, size_list, block_new, partition_list_added,
                fi_list_matched, metadata_list_current, metadata_setup_list)
    return (proc_mode, metadata_list_current)

def _process_replica_list(self, block_path, replica_infos):
    def _empty_with_warning(error_msg, *args):
        self._log.warning('Dataset block %r ' + error_msg, block_path, *args)
        return []

    def _expanded_replica_locations(replica_infos):
        for replica_info in replica_infos:
            for entry in self._iter_replica_locations(replica_info):
                yield entry

    if not replica_infos:
        return _empty_with_warning('has no replica information!')
    replica_infos_selected = self._phedex_filter.filter_list(replica_infos, key=itemgetter(0))
    if not replica_infos_selected:
        return _empty_with_warning('is not available at the selected locations!\n' +
            'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
    if not self._only_complete:
        return list(_expanded_replica_locations(replica_infos_selected))
    replica_infos_complete = lfilter(lambda nn_nh_c: nn_nh_c[2], replica_infos_selected)
    if not replica_infos_complete:
        return _empty_with_warning('is not completely available at the selected locations!\n' +
            'Available locations: %s', str.join(', ', self._iter_formatted_locations(replica_infos)))
    return list(_expanded_replica_locations(replica_infos_complete))

def process_block(self, block):
    # Check entry consistency
    events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
    if block.setdefault(DataProvider.NEntries, events) != events:
        error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
        error_msg = error_msg % (DataProvider.get_block_id(block), block[DataProvider.NEntries], events)
        self._handle_error(error_msg, self._mode)
    return block

def _get_js_class_infos(self):
    job_class_list = [('AT WMS', JobClass.ATWMS), ('RUNNING', JobClass.RUNNING),
        ('FAILING', JobClass.FAILING), ('SUCCESS', JobClass.SUCCESS)]
    state_class_map = {}
    for (job_class_name, job_class) in job_class_list:
        for job_state in job_class.state_list:
            state_class_map[job_state] = job_class_name
    return (state_class_map, lmap(itemgetter(0), job_class_list))

def _translate_pa2pspi_list(padapter):
    # Reduces parameter adapter output to essential information for diff - faster than keying
    meta_iter = ifilter(lambda k: not k.untracked, padapter.get_job_metadata())
    meta_list = sorted(meta_iter, key=lambda k: k.value)
    for psp in padapter.iter_jobs():  # Translates parameter space point into hash
        psp_item_iter = imap(lambda meta: (meta.value, psp.get(meta.value)), meta_list)
        hash_str = md5_hex(repr(lfilter(itemgetter(1), psp_item_iter)))
        yield (psp[ParameterInfo.ACTIVE], hash_str, psp['GC_PARAM'])

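# Illustration only - the names below are hypothetical and not part of the function above.
# The fingerprint is simply an md5 over the repr of the tracked (name, value) pairs with
# false-y values filtered out, so two parameter space points with identical tracked settings
# hash to the same string regardless of untracked metadata.
import hashlib

def md5_hex(value):  # assumed behaviour of the md5_hex helper: hex digest of the UTF-8 bytes
    return hashlib.md5(value.encode('utf-8')).hexdigest()

psp = {'GC_PARAM': 4, 'LUMI_RANGE': '1-10', 'SEED': 1234, 'UNSET_OPTION': ''}  # hypothetical point
meta_name_list = ['LUMI_RANGE', 'SEED', 'UNSET_OPTION']  # tracked metadata names, sorted by name
psp_item_list = [(name, psp.get(name)) for name in meta_name_list]
hash_str = md5_hex(repr([item for item in psp_item_list if item[1]]))  # empty values drop out
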
def process(self, block_iter):
    if self._sort_ds:
        map_dataset2block_list = {}
        for block in block_iter:
            map_dataset2block_list.setdefault(block[DataProvider.Dataset], []).append(block)
        block_iter = self._iter_blocks_by_dataset(map_dataset2block_list)
    elif self._sort_block:
        block_iter = sorted(block_iter, key=itemgetter(DataProvider.BlockName))  # pylint:disable=redefined-variable-type
    # Yield blocks
    for block in block_iter:
        if self._sort_files:
            sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
        if self._sort_location:
            sort_inplace(block[DataProvider.Locations])
        yield block

def _reduce_fn_list(self, block, fn_list_limit_map):
    dataset_name = block[DataProvider.Dataset]
    fn_list_limit = fn_list_limit_map[dataset_name]
    fi_list_removed = block[DataProvider.FileList][fn_list_limit:]
    nentry_removed_iter = imap(itemgetter(DataProvider.NEntries), fi_list_removed)
    block[DataProvider.NEntries] -= sum(nentry_removed_iter)
    block[DataProvider.FileList] = block[DataProvider.FileList][:fn_list_limit]
    fn_list_limit_map[dataset_name] -= len(block[DataProvider.FileList])

def __init__(self, config):
    DataProcessor.__init__(self, config)
    internal_config = config.changeView(viewClass = 'SimpleConfigView', setSections = ['dataprocessor'])
    internal_config.set('dataset processor', 'NullDataProcessor')
    self._url_filter = config.getFilter(['dataset ignore files', 'dataset ignore urls'], '', negate = True,
        filterParser = lambda value: self._parseFilter(internal_config, value),
        filterStr = lambda value: str.join('\n', value.split()),
        matchKey = itemgetter(DataProvider.URL),
        defaultMatcher = 'blackwhite', defaultFilter = 'weak',
        onChange = DataProcessor.triggerDataResync)

def _add_entry(self, container, section, option, value, source):
    opttype = '='
    try:
        option = option.strip()
        if option[-1] in imap(itemgetter(0), ConfigEntry.map_opt_type2desc.keys()):
            opttype = option[-1] + '='
            option = option[:-1].strip()
        container.append(ConfigEntry(section.strip(), option, value.strip(), opttype, source))
    except Exception:
        raise ConfigError('Unable to register config value [%s] %s %s %s (from %s)' % (
            section, option, opttype, value, source))

def _diff_pspi_list(pa_old, pa_new, result_redo, result_disable):
    map_jobnum2pnum = {}

    def _handle_matching_pspi(pspi_list_added, pspi_list_missing, pspi_list_same, pspi_old, pspi_new):
        map_jobnum2pnum[pspi_old[TrackingInfo.pnum]] = pspi_new[TrackingInfo.pnum]
        if not pspi_old[TrackingInfo.ACTIVE] and pspi_new[TrackingInfo.ACTIVE]:
            result_redo.add(pspi_new[TrackingInfo.pnum])
        if pspi_old[TrackingInfo.ACTIVE] and not pspi_new[TrackingInfo.ACTIVE]:
            result_disable.add(pspi_new[TrackingInfo.pnum])
    # pspi_list_changed is ignored, since it is already processed by the change handler above
    (pspi_list_added, pspi_list_missing, _) = get_list_difference(
        _translate_pa2pspi_list(pa_old), _translate_pa2pspi_list(pa_new),
        itemgetter(TrackingInfo.HASH), _handle_matching_pspi)
    return (map_jobnum2pnum, pspi_list_added, pspi_list_missing)

def _resyncFiles(self, modSI, jobNum, sizeInfo, filesMissing, filesMatched, newBlock, metaIdxLookup, extended):
    # Select processing mode for job (disable > complete > changed > ignore) [ie. disable overrides all] using min
    # Result: one of [disable, complete, ignore] (changed -> complete or ignore)
    idx = 0
    newMetadata = []
    procMode = ResyncMode.ignore
    while idx < len(modSI[DataSplitter.FileList]):
        rmFI = fast_search(filesMissing, itemgetter(DataProvider.URL), modSI[DataSplitter.FileList][idx])
        if rmFI:
            procMode = min(procMode, self._resyncRemovedFile(idx, modSI, sizeInfo, rmFI))
        else:
            (oldFI, newFI) = fast_search(filesMatched, lambda x: x[0][DataProvider.URL],
                modSI[DataSplitter.FileList][idx])
            (procMode, idx) = self._resyncChangedFile(procMode, idx, modSI, jobNum, sizeInfo,
                newBlock, extended, oldFI, newFI, newMetadata, metaIdxLookup)
    return (procMode, newMetadata)

def _iter_blocks_raw(self):
    # Handling dataset and block information separately leads to nasty, nested code
    (map_key2fm_list, map_key2metadata_dict) = ({}, {})
    self._assign_dataset_block(map_key2fm_list, map_key2metadata_dict,
        ifilter(itemgetter(0), self._iter_file_infos()))
    # Generate names for blocks/datasets using common metadata - creating map id -> name
    map_key2name = {}
    for (key, metadata_dict) in map_key2metadata_dict.items():
        if len(key) == 1:
            map_key2name[key] = self._get_dataset_name(metadata_dict, hash_dataset=key[0])
        else:
            map_key2name[key] = self._get_block_name(metadata_dict, hash_block=key[1])
    # Check for bijective mapping id <-> name:
    self._check_map_name2key(map_key2name, map_key2metadata_dict)
    # Yield finished dataset blocks
    for block in self._build_blocks(map_key2fm_list, map_key2name, map_key2metadata_dict):
        yield block

def __init__(self, stream, screen):
    (self.stream, self.screen, self.logged) = (stream, screen, True)
    # This is a list of (regular expression, GUI attributes). The attributes are applied
    # to matches of the regular expression in the output written into this stream.
    # Lookahead expressions should not overlap with other regular expressions.
    self.attrs = [
        (r'DONE(?!:)', [Console.COLOR_BLUE, Console.BOLD]),
        (r'FAILED(?!:)', [Console.COLOR_RED, Console.BOLD]),
        (r'SUCCESS(?!:)', [Console.COLOR_GREEN, Console.BOLD]),
        (r'(?<=DONE:)\s+[1-9]\d*', [Console.COLOR_BLUE, Console.BOLD]),
        (r'(?<=Failing jobs:)\s+[1-9]\d*', [Console.COLOR_RED, Console.BOLD]),
        (r'(?<=FAILED:)\s+[1-9]\d*', [Console.COLOR_RED, Console.BOLD]),
        (r'(?<=Successful jobs:)\s+[1-9]\d*', [Console.COLOR_GREEN, Console.BOLD]),
        (r'(?<=SUCCESS:)\s+[1-9]\d*', [Console.COLOR_GREEN, Console.BOLD]),
    ]
    self.regex = re.compile('(%s)' % '|'.join(imap(itemgetter(0), self.attrs)))

def __init__(self, stream, console, lock):
    (self._stream, self._console, self.logged, self._log, self._lock) = (stream, console, True, [None] * 100, lock)
    # This is a list of (regular expression, GUI attributes). The attributes are applied
    # to matches of the regular expression in the output written into this stream.
    # Lookahead expressions should not overlap with other regular expressions.
    attrs = [
        (r'DONE(?!:)', [Console.COLOR_BLUE, Console.BOLD]),
        (r'FAILED(?!:)', [Console.COLOR_RED, Console.BOLD]),
        (r'SUCCESS(?!:)', [Console.COLOR_GREEN, Console.BOLD]),
        (r'(?<=DONE:)\s+[1-9]\d*', [Console.COLOR_BLUE, Console.BOLD]),
        (r'(?<=Failing jobs:)\s+[1-9]\d*', [Console.COLOR_RED, Console.BOLD]),
        (r'(?<=FAILED:)\s+[1-9]\d*', [Console.COLOR_RED, Console.BOLD]),
        (r'(?<=Successful jobs:)\s+[1-9]\d*', [Console.COLOR_GREEN, Console.BOLD]),
        (r'(?<=SUCCESS:)\s+[1-9]\d*', [Console.COLOR_GREEN, Console.BOLD]),
    ]
    self._match_any_attr = re.compile('(%s)' % '|'.join(imap(itemgetter(0), attrs)))
    self._attrs = lmap(lambda expr_attr: (re.compile(expr_attr[0]), expr_attr[1]), attrs)

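# Toy illustration (hypothetical sample line, trimmed pattern list) of how the combined
# alternation built above is meant to be used: each match span marks text that should
# receive the associated console attributes; the [1-9]\d* patterns deliberately skip
# zero counters so they stay uncolored.
import re

line = 'Jobs - DONE: 12  FAILED: 0  SUCCESS: 12'
match_any_attr = re.compile('(%s)' % '|'.join([
    r'DONE(?!:)', r'FAILED(?!:)', r'SUCCESS(?!:)',
    r'(?<=DONE:)\s+[1-9]\d*', r'(?<=FAILED:)\s+[1-9]\d*', r'(?<=SUCCESS:)\s+[1-9]\d*',
]))
for match in match_any_attr.finditer(line):
    print(repr(match.group(0)), match.span())  # only the non-zero counters are matched
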
def _create_placeholder_psrc(pa_old, pa_new, map_jobnum2pnum, pspi_list_missing, result_disable):
    # Construct placeholder parameter source with missing parameter entries and intervention state
    psp_list_missing = []
    missing_pnum_start = pa_new.get_job_len()
    sort_inplace(pspi_list_missing, key=itemgetter(TrackingInfo.pnum))
    for (idx, pspi_missing) in enumerate(pspi_list_missing):
        map_jobnum2pnum[pspi_missing[TrackingInfo.pnum]] = missing_pnum_start + idx
        psp_missing = pa_old.get_job_content(missing_pnum_start + idx, pspi_missing[TrackingInfo.pnum])
        psp_missing.pop('GC_PARAM')
        if psp_missing[ParameterInfo.ACTIVE]:
            psp_missing[ParameterInfo.ACTIVE] = False
            result_disable.add(missing_pnum_start + idx)
        psp_list_missing.append(psp_missing)
    meta_list_new = pa_new.get_job_metadata()
    meta_name_list_new = lmap(lambda key: key.value, meta_list_new)
    meta_list_old = pa_old.get_job_metadata()
    meta_list_missing = lfilter(lambda key: key.value not in meta_name_list_new, meta_list_old)
    return ParameterSource.create_instance('InternalParameterSource', psp_list_missing, meta_list_missing)

def search_url(url):
    return fast_search(oldBlock[DataProvider.FileList], itemgetter(DataProvider.URL), url)