def __init__(self, block_list_old, block_list_new):
    activity = Activity('Performing resynchronization of dataset')
    block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
    (self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
    for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
        sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    activity.finish()

def get_endpoint(self):
    activity = Activity('Discovering available WMS services')
    wms_best_list = []
    for wms in self._list_endpoint_good():
        activity_wms = Activity('pinging WMS %s' % wms)
        if wms is None:
            continue
        ping, pingtime = self._ping_dict.get(wms, (None, 0))
        if time.time() - pingtime > 30 * 60:  # check every ~30min
            ping = ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
            self._ping_dict[wms] = (ping, time.time() + 10 * 60 * random.random())  # 10 min variation
        if ping is not None:
            wms_best_list.append((wms, ping))
        activity_wms.finish()
    activity.finish()
    if not wms_best_list:
        return None
    sort_inplace(wms_best_list, key=lambda name_ping: name_ping[1])
    result = _choice_exp(wms_best_list)
    if result is not None:
        activity = Activity('selecting WMS %s' % result)
        wms, ping = result
        # reduce timeout by 5min for chosen wms => re-ping every 6 submits
        self._ping_dict[wms] = (ping, self._ping_dict[wms][1] + 5 * 60)
        result = wms
        activity.finish()
    self._update_state()
    return result

def get_dataset_info(opts, args, query_blocks=True):
    config = get_dataset_config(opts, args)
    provider = config.get_composited_plugin('dataset', cls=DataProvider,
        bind_kwargs={'provider_name_default': config.get('dataset provider')},
        default_compositor=':ThreadedMultiDatasetProvider:')  # -T disables multi-threading further below
    dataset_list = sorted(provider.get_dataset_name_list())
    if len(dataset_list) == 0:
        raise DatasetError('No datasets matched!')
    # Query blocks only if needed
    query_blocks = False
    for option in opts.__dict__:
        if option.startswith('list_') and (option != 'list_dataset_names') or (option == 'save'):
            if getattr(opts, option):
                query_blocks = True
    block_list = None
    if query_blocks:
        block_list = provider.get_block_list_cached(show_stats=False)
        if len(block_list) == 0:
            raise DatasetError('No blocks matched!')
        if opts.ordered:
            sort_inplace(block_list, key=itemgetter(DataProvider.Dataset, DataProvider.BlockName))
            for block in block_list:
                sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    return (provider, dataset_list, block_list)

def resyncSources(oldBlocks, newBlocks):
    # Compare different blocks according to their name - NOT full content
    def keyBlock(x):
        return (x[DataProvider.Dataset], x[DataProvider.BlockName])
    sort_inplace(oldBlocks, key = keyBlock)
    sort_inplace(newBlocks, key = keyBlock)

    def onMatchingBlock(blocksAdded, blocksMissing, blocksMatching, oldBlock, newBlock):
        # Compare different files according to their name - NOT full content
        def keyFiles(x):
            return x[DataProvider.URL]
        sort_inplace(oldBlock[DataProvider.FileList], key = keyFiles)
        sort_inplace(newBlock[DataProvider.FileList], key = keyFiles)

        def onMatchingFile(filesAdded, filesMissing, filesMatched, oldFile, newFile):
            filesMatched.append((oldFile, newFile))

        (filesAdded, filesMissing, filesMatched) = \
            utils.DiffLists(oldBlock[DataProvider.FileList], newBlock[DataProvider.FileList],
                keyFiles, onMatchingFile, isSorted = True)
        if filesAdded:  # Create new block for added files in an existing block
            tmpBlock = copy.copy(newBlock)
            tmpBlock[DataProvider.FileList] = filesAdded
            tmpBlock[DataProvider.NEntries] = sum(imap(lambda x: x[DataProvider.NEntries], filesAdded))
            blocksAdded.append(tmpBlock)
        blocksMatching.append((oldBlock, newBlock, filesMissing, filesMatched))

    return utils.DiffLists(oldBlocks, newBlocks, keyBlock, onMatchingBlock, isSorted = True)

def get_dataset_info(opts, args, query_blocks=True):
    config = get_dataset_config(opts, args)
    if opts.threads is not None:
        config.set_int('dataprovider thread max', int(opts.threads) or 1)
    provider = config.get_composited_plugin('dataset', cls=DataProvider,
        bind_kwargs={'provider_name_default': config.get('dataset provider')},
        default_compositor=':ThreadedMultiDatasetProvider:')
    dataset_list = sorted(provider.get_dataset_name_list())
    if len(dataset_list) == 0:
        raise DatasetError('No datasets matched!')
    # Query blocks only if needed
    query_blocks = False
    for option in opts.__dict__:
        if option.startswith('list_') and (option != 'list_dataset_names') or (option == 'save'):
            if getattr(opts, option):
                query_blocks = True
    block_list = None
    if query_blocks:
        block_list = provider.get_block_list_cached(show_stats=False)
        if len(block_list) == 0:
            raise DatasetError('No blocks matched!')
        if opts.ordered:
            sort_inplace(block_list, key=itemgetter(DataProvider.Dataset, DataProvider.BlockName))
            for block in block_list:
                sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
    return (provider, dataset_list, block_list)

def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
    log = utils.ActivityLog('Performing resynchronization of dataset')
    (blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
    for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
        sort_inplace(rmBlock[DataProvider.FileList], key=lambda x: x[DataProvider.URL])
    log.finish()
    # User overview and setup starts here
    resultRedo = []
    resultDisable = []
    newSplitPathTMP = newSplitPath + '.tmp'
    resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
    self.savePartitions(newSplitPathTMP, resyncIter, sourceLen=self.getMaxJobs(),
        message='Performing resynchronization of dataset map (progress is estimated)')
    if self._interactive:
        # TODO: print info and ask
        if not utils.getUserBool('Do you want to use the new dataset partition?', False):
            return None
    os.rename(newSplitPathTMP, newSplitPath)
    return (resultRedo, resultDisable)

def _iter_blocks_by_dataset(self, map_dataset2block_list):
    for dataset_name in sorted(map_dataset2block_list):
        if self._sort_block:
            sort_inplace(map_dataset2block_list[dataset_name], key=itemgetter(DataProvider.BlockName))
        for block in map_dataset2block_list[dataset_name]:
            yield block

def _getResyncSource(self, psource_old, psource_new, mapJob2PID, pAdded, pMissing, disableNewPNum):
    # Construct complete parameter space psource with missing parameter entries and intervention state
    # NNNNNNNNNNNNN OOOOOOOOO | source: NEW (==self) and OLD (==from file)
    # <same><added> <missing> | same: both in NEW and OLD, added: only in NEW, missing: only in OLD
    oldMaxJobs = psource_old.getMaxJobs()
    # assign sequential job numbers to the added parameter entries
    sort_inplace(pAdded, key = itemgetter('GC_PARAM'))
    for (idx, entry) in enumerate(pAdded):
        if oldMaxJobs + idx != entry['GC_PARAM']:
            mapJob2PID[oldMaxJobs + idx] = entry['GC_PARAM']
    missingInfos = []
    newMaxJobs = psource_new.getMaxJobs()
    sort_inplace(pMissing, key = itemgetter('GC_PARAM'))
    for (idx, entry) in enumerate(pMissing):
        mapJob2PID[entry['GC_PARAM']] = newMaxJobs + idx
        tmp = psource_old.getJobInfo(newMaxJobs + idx, entry['GC_PARAM'])
        tmp.pop('GC_PARAM')
        if tmp[ParameterInfo.ACTIVE]:
            tmp[ParameterInfo.ACTIVE] = False
            disableNewPNum.add(newMaxJobs + idx)
        missingInfos.append(tmp)
    if missingInfos:
        return self._createAggregatedSource(psource_old, psource_new, missingInfos)
    return self._source

def _handle_matching_block(block_list_added, block_list_missing, block_list_matching, block_old, block_new):
    # Compare different files according to their name - NOT full content
    get_file_key = itemgetter(DataProvider.URL)
    sort_inplace(block_old[DataProvider.FileList], key=get_file_key)
    sort_inplace(block_new[DataProvider.FileList], key=get_file_key)

    def _handle_matching_fi(fi_list_added, fi_list_missing, fi_list_matched, fi_old, fi_new):
        fi_list_matched.append((fi_old, fi_new))

    (fi_list_added, fi_list_missing, fi_list_matched) = get_list_difference(
        block_old[DataProvider.FileList], block_new[DataProvider.FileList],
        get_file_key, _handle_matching_fi, is_sorted=True)
    if fi_list_added:  # Create new block for added files in an existing block
        block_added = copy.copy(block_new)
        block_added[DataProvider.FileList] = fi_list_added
        block_added[DataProvider.NEntries] = sum(imap(itemgetter(DataProvider.NEntries), fi_list_added))
        block_list_added.append(block_added)
    block_list_matching.append((block_old, block_new, fi_list_missing, fi_list_matched))

def process(self, blockIter):
    if self._sortDS:
        dsCache = {}
        for block in blockIter:
            dsCache.setdefault(block[DataProvider.Dataset], []).append(block)

        def ds_generator():
            for ds in sorted(dsCache):
                if self._sortBlock:
                    sort_inplace(dsCache[ds], key=itemgetter(DataProvider.BlockName))
                for block in dsCache[ds]:
                    yield block
        blockIter = ds_generator()
    elif self._sortBlock:
        blockIter = sorted(blockIter, key=itemgetter(DataProvider.BlockName))
    # Yield blocks
    for block in blockIter:
        if self._sortFiles:
            sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
        if self._sortLocation:
            sort_inplace(block[DataProvider.Locations])
        yield block

def ds_generator():
    for ds in sorted(dsCache):
        if self._sortBlock:
            sort_inplace(dsCache[ds], key=itemgetter(DataProvider.BlockName))
        for block in dsCache[ds]:
            yield block

def getWMS(self):
    log = utils.ActivityLog('Discovering available WMS services')
    wms_best_list = []
    for wms in self.listWMS_good():
        log = utils.ActivityLog('Discovering available WMS services - pinging %s' % wms)
        if wms is None:
            continue
        ping, pingtime = self.pingDict.get(wms, (None, 0))
        if time.time() - pingtime > 30 * 60:  # check every ~30min
            ping = utils.ping_host(wms.split('://')[1].split('/')[0].split(':')[0])
            self.pingDict[wms] = (ping, time.time() + 10 * 60 * random.random())  # 10 min variation
        if ping is not None:
            wms_best_list.append((wms, ping))
        log.finish()
    log.finish()
    if not wms_best_list:
        return None
    sort_inplace(wms_best_list, key = lambda name_ping: name_ping[1])
    result = choice_exp(wms_best_list)
    log = utils.ActivityLog('Discovering available WMS services - using %s' % wms)
    if result is not None:
        wms, ping = result
        # reduce timeout by 5min for chosen wms => re-ping every 6 submits
        self.pingDict[wms] = (ping, self.pingDict[wms][1] + 5*60)
        result = wms
    self.updateState()
    del log
    return result

def save_dataset(opts, provider):
    print('')
    blocks = provider.getBlocks()
    if opts.ordered:
        sort_inplace(blocks, key = itemgetter(DataProvider.Dataset, DataProvider.BlockName))
        for b in blocks:
            sort_inplace(b[DataProvider.FileList], key = itemgetter(DataProvider.URL))
    DataProvider.saveToFile(opts.save, blocks)
    print('Dataset information saved to ./%s' % opts.save)

def __init__(self, psource1, psource2, var1, var2 = None):
    psource1_values = {}
    for (pNum1, value) in self._iterParamItems(psource1, var1):
        psource1_values.setdefault(value, []).append(pNum1)
    self._combine_idx = []
    for (pNum2, value) in self._iterParamItems(psource2, var2 or var1):
        for pNum1 in psource1_values.get(value, []):
            self._combine_idx.append((pNum1, pNum2))
    sort_inplace(self._combine_idx)
    raise AbstractError

def split_list(iterable, fun, sort_key=unspecified):
    # single pass on iterable!
    (result_true, result_false) = ([], [])
    for value in iterable:
        if fun(value):
            result_true.append(value)
        else:
            result_false.append(value)
    if not unspecified(sort_key):
        sort_inplace(result_true, key=sort_key)
        sort_inplace(result_false, key=sort_key)
    return (result_true, result_false)

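A minimal usage sketch for the split_list helper above, assuming it is in scope together with the sort_inplace and unspecified helpers it relies on (presumably from the project's python_compat module); the values and lambdas are purely illustrative.

# Hypothetical usage of split_list (defined above); sort_inplace/unspecified are
# assumed to behave like list.sort(key=...) and a sentinel check, respectively.
is_even = lambda value: value % 2 == 0
(evens, odds) = split_list([5, 2, 9, 4, 7], is_even, sort_key=lambda value: value)
assert (evens, odds) == ([2, 4], [5, 7, 9])
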
def splitBlocks(self, blocks):
    for block in blocks:
        files = block[DataProvider.FileList]
        sort_inplace(files, key = lambda fi: self.metaKey(block[DataProvider.Metadata], block, fi))
        (fileStack, reprKey) = ([], None)
        for fi in files:
            if reprKey is None:
                reprKey = self.metaKey(block[DataProvider.Metadata], block, fi)
            curKey = self.metaKey(block[DataProvider.Metadata], block, fi)
            if curKey != reprKey:
                yield self.newBlock(block, fileStack)
                (fileStack, reprKey) = ([], curKey)
            fileStack.append(fi)
        yield self.newBlock(block, fileStack)

def divide_blocks(self, block_iter):
    for block in block_iter:
        fi_list = block[DataProvider.FileList]
        sort_inplace(fi_list, key=lambda fi: self._get_fi_class(fi, block))
        partition_fi_list = []
        if fi_list:
            fi_class_active = self._get_fi_class(fi_list[0], block)
        for fi in fi_list:
            fi_class_current = self._get_fi_class(fi, block)
            if fi_class_current != fi_class_active:
                yield self._create_sub_block(block, partition_fi_list)
                (partition_fi_list, fi_class_active) = ([], fi_class_current)
            partition_fi_list.append(fi)
        if partition_fi_list:
            yield self._create_sub_block(block, partition_fi_list)

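For reference, a self-contained sketch of the sort-then-group pattern that splitBlocks/divide_blocks rely on, using plain dictionaries and itertools.groupby instead of the DataProvider structures; the names and data below are illustrative only, not part of the original code.

import itertools

# Illustrative only: sorting by the class key first makes equal keys adjacent,
# so a single linear pass (groupby) yields one sub-list per key value - the same
# idea divide_blocks uses to emit one sub-block per file class.
def group_by_key(items, key_fun):
    items = sorted(items, key=key_fun)
    for (_, group) in itertools.groupby(items, key=key_fun):
        yield list(group)

fi_list = [{'url': 'a', 'class': 2}, {'url': 'b', 'class': 1}, {'url': 'c', 'class': 2}]
print(list(group_by_key(fi_list, key_fun=lambda fi: fi['class'])))
# [[{'url': 'b', 'class': 1}], [{'url': 'a', 'class': 2}, {'url': 'c', 'class': 2}]]
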
def merge_lumi_list(run_lumi_range_list):
    """ Merge consecutive lumi sections
    >>> merge_lumi_list([([1, 11], [1, 20]), ([1, 1], [1, 10]), ([1, 22], [1, 30])])
    [([1, 1], [1, 20]), ([1, 22], [1, 30])]
    >>> merge_lumi_list([([1, 1], [2, 2]), ([2, 3], [2, 10]), ([2, 11], [4, 30])])
    [([1, 1], [4, 30])]
    """
    sort_inplace(run_lumi_range_list, key=lambda run_lumi_range: tuple(run_lumi_range[0]))
    idx = 0
    while idx < len(run_lumi_range_list) - 1:
        (end_run, end_lumi) = run_lumi_range_list[idx][1]
        (start_next_run, start_next_lumi) = run_lumi_range_list[idx + 1][0]
        if (end_run == start_next_run) and (end_lumi == start_next_lumi - 1):
            run_lumi_range_list[idx] = (run_lumi_range_list[idx][0], run_lumi_range_list[idx + 1][1])
            del run_lumi_range_list[idx + 1]
        else:
            idx += 1
    return run_lumi_range_list

def mergeLumi(rlrange):
    """ Merge consecutive lumi sections
    >>> mergeLumi([([1, 11], [1, 20]), ([1, 1], [1, 10]), ([1, 22], [1, 30])])
    [([1, 1], [1, 20]), ([1, 22], [1, 30])]
    >>> mergeLumi([([1, 1], [2, 2]), ([2, 3], [2, 10]), ([2, 11], [4, 30])])
    [([1, 1], [4, 30])]
    """
    sort_inplace(rlrange, keyLumi)
    i = 0
    while i < len(rlrange) - 1:
        (end_run, end_lumi) = rlrange[i][1]
        (start_next_run, start_next_lumi) = rlrange[i + 1][0]
        if (end_run == start_next_run) and (end_lumi == start_next_lumi - 1):
            rlrange[i] = (rlrange[i][0], rlrange[i + 1][1])
            del rlrange[i + 1]
        else:
            i += 1
    return rlrange

def process(self, block_iter):
    if self._sort_ds:
        map_dataset2block_list = {}
        for block in block_iter:
            map_dataset2block_list.setdefault(block[DataProvider.Dataset], []).append(block)
        block_iter = self._iter_blocks_by_dataset(map_dataset2block_list)
    elif self._sort_block:
        block_iter = sorted(block_iter, key=itemgetter(DataProvider.BlockName))  # pylint:disable=redefined-variable-type
    # Yield blocks
    for block in block_iter:
        if self._sort_files:
            sort_inplace(block[DataProvider.FileList], key=itemgetter(DataProvider.URL))
        if self._sort_location:
            sort_inplace(block[DataProvider.Locations])
        yield block

def _create_placeholder_psrc(pa_old, pa_new, map_jobnum2pnum, pspi_list_missing, result_disable):
    # Construct placeholder parameter source with missing parameter entries and intervention state
    psp_list_missing = []
    missing_pnum_start = pa_new.get_job_len()
    sort_inplace(pspi_list_missing, key=itemgetter(TrackingInfo.pnum))
    for (idx, pspi_missing) in enumerate(pspi_list_missing):
        map_jobnum2pnum[pspi_missing[TrackingInfo.pnum]] = missing_pnum_start + idx
        psp_missing = pa_old.get_job_content(missing_pnum_start + idx, pspi_missing[TrackingInfo.pnum])
        psp_missing.pop('GC_PARAM')
        if psp_missing[ParameterInfo.ACTIVE]:
            psp_missing[ParameterInfo.ACTIVE] = False
            result_disable.add(missing_pnum_start + idx)
        psp_list_missing.append(psp_missing)
    meta_list_new = pa_new.get_job_metadata()
    meta_name_list_new = lmap(lambda key: key.value, meta_list_new)
    meta_list_old = pa_old.get_job_metadata()
    meta_list_missing = lfilter(lambda key: key.value not in meta_name_list_new, meta_list_old)
    return ParameterSource.create_instance('InternalParameterSource', psp_list_missing, meta_list_missing)

def resync_blocks(block_list_old, block_list_new):
    # Returns changes between two sets of blocks in terms of added, missing and changed blocks
    # Only the affected files are returned in the block file list
    def _get_block_key(block):
        # Compare different blocks according to their name - NOT full content
        return (block[DataProvider.Dataset], block[DataProvider.BlockName])
    sort_inplace(block_list_old, key=_get_block_key)
    sort_inplace(block_list_new, key=_get_block_key)

    def _handle_matching_block(block_list_added, block_list_missing, block_list_matching,
            block_old, block_new):
        # Compare different files according to their name - NOT full content
        get_file_key = itemgetter(DataProvider.URL)
        sort_inplace(block_old[DataProvider.FileList], key=get_file_key)
        sort_inplace(block_new[DataProvider.FileList], key=get_file_key)

        def _handle_matching_fi(fi_list_added, fi_list_missing, fi_list_matched, fi_old, fi_new):
            fi_list_matched.append((fi_old, fi_new))

        (fi_list_added, fi_list_missing, fi_list_matched) = get_list_difference(
            block_old[DataProvider.FileList], block_new[DataProvider.FileList],
            get_file_key, _handle_matching_fi, is_sorted=True)
        if fi_list_added:  # Create new block for added files in an existing block
            block_added = copy.copy(block_new)
            block_added[DataProvider.FileList] = fi_list_added
            block_added[DataProvider.NEntries] = sum(imap(itemgetter(DataProvider.NEntries), fi_list_added))
            block_list_added.append(block_added)
        block_list_matching.append((block_old, block_new, fi_list_missing, fi_list_matched))

    return get_list_difference(block_list_old, block_list_new, _get_block_key,
        _handle_matching_block, is_sorted=True)

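The snippet above delegates the actual comparison to get_list_difference. The following self-contained sketch is an assumption about the kind of single-pass merge over two key-sorted lists it is being used for, with the same handler call shape as _handle_matching_block; it is not the project's actual implementation.

# Illustrative sorted-merge diff (assumed behavior, matching the call shape of the
# get_list_difference(..., on_match, is_sorted=True) usage above).
def diff_sorted_lists(list_old, list_new, key_fun, on_match):
    (added, missing, matched) = ([], [], [])
    (idx_old, idx_new) = (0, 0)
    while (idx_old < len(list_old)) and (idx_new < len(list_new)):
        (key_old, key_new) = (key_fun(list_old[idx_old]), key_fun(list_new[idx_new]))
        if key_old < key_new:  # entry only exists in the old list -> missing
            missing.append(list_old[idx_old])
            idx_old += 1
        elif key_new < key_old:  # entry only exists in the new list -> added
            added.append(list_new[idx_new])
            idx_new += 1
        else:  # key exists in both lists -> let the handler decide what to record
            on_match(added, missing, matched, list_old[idx_old], list_new[idx_new])
            (idx_old, idx_new) = (idx_old + 1, idx_new + 1)
    missing.extend(list_old[idx_old:])
    added.extend(list_new[idx_new:])
    return (added, missing, matched)
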
def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
    activity = Activity('Performing resynchronization of dataset')
    (blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
    for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
        sort_inplace(rmBlock[DataProvider.FileList], key = lambda x: x[DataProvider.URL])
    activity.finish()
    # User overview and setup starts here
    resultRedo = []
    resultDisable = []
    newSplitPathTMP = newSplitPath + '.tmp'
    resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
    self.savePartitions(newSplitPathTMP, resyncIter, sourceLenHint = self.getMaxJobs(),
        message = 'Performing resynchronization of dataset map (progress is estimated)')
    if self._interactive:
        # TODO: print info and ask
        if not utils.getUserBool('Do you want to use the new dataset partition?', False):
            return
    os.rename(newSplitPathTMP, newSplitPath)
    return (resultRedo, resultDisable)

def _extend_map_jobnum2pnum(map_jobnum2pnum, jobnum_start, pspi_list_added):
    # assign sequential job numbers to the added parameter entries
    sort_inplace(pspi_list_added, key=itemgetter(TrackingInfo.pnum))
    for (pspi_idx, pspi_added) in enumerate(pspi_list_added):
        if jobnum_start + pspi_idx != pspi_added[TrackingInfo.pnum]:
            map_jobnum2pnum[jobnum_start + pspi_idx] = pspi_added[TrackingInfo.pnum]
