def __init__(self, config, datasource_name, dataset_expr, dataset_nick, provider_list):
	for provider in provider_list:
		provider.disable_stream_singletons()
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick)
	self._stats = DataProcessor.create_instance('SimpleStatsDataProcessor', config,
		'dataset', self._log, 'Summary: Running over ')
	self._provider_list = provider_list
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	(self._path, self._events, selist) = utils.optSplit(datasetExpr, '|@')
	self._selist = parseList(selist, ',') or None
	if not (self._path and self._events):
		raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
def __init__(self, config, datasetExpr, datasetNick, datasetID, providerList):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	self._providerList = providerList
	for provider in self._providerList:
		provider.setPassthrough()
def __init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc, scanner_list_default):
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	# Configure scanners
	scanner_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
	self._interactive_assignment = config.is_interactive('dataset name assignment', True)

	def _create_scanner(scanner_name):
		return InfoScanner.create_instance(scanner_name, scanner_config, datasource_name)
	scanner_list = scanner_config.get_list('scanner', scanner_list_default) + ['NullScanner']
	self._scanner_list = lmap(_create_scanner, scanner_list)

	# Configure dataset / block naming and selection
	def _setup(prefix):
		selected_hash_list = scanner_config.get_list(join_config_locations(prefix, 'key select'), [])
		name = scanner_config.get(join_config_locations(prefix, 'name pattern'), '')
		return (selected_hash_list, name)
	(self._selected_hash_list_dataset, self._dataset_pattern) = _setup('dataset')
	(self._selected_hash_list_block, self._block_pattern) = _setup('block')

	# Configure hash input for separation of files into datasets / blocks
	def _get_active_hash_input(prefix, guard_entry_idx):
		hash_input_list_user = scanner_config.get_list(join_config_locations(prefix, 'hash keys'), [])
		hash_input_list_guard = scanner_config.get_list(join_config_locations(prefix, 'guard override'),
			lchain(imap(lambda scanner: scanner.get_guard_keysets()[guard_entry_idx], self._scanner_list)))
		return hash_input_list_user + hash_input_list_guard
	self._hash_input_set_dataset = _get_active_hash_input('dataset', 0)
	self._hash_input_set_block = _get_active_hash_input('block', 1)
def __init__(self, config, datasetExpr, datasetNick = None):
	DataProvider.__init__(self, config, datasetExpr, datasetNick)
	(self._path, self._events, selist) = utils.optSplit(datasetExpr, '|@')
	self._selist = parseList(selist, ',') or None
	if not (self._path and self._events):
		raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	self._common_prefix = max(DataProvider.enum_value_list) + 1
	self._common_metadata = max(DataProvider.enum_value_list) + 2
	self._entry_handler_info = {
		'events': (DataProvider.NEntries, int, 'block entry counter'),
		'id': (None, None, 'dataset ID'),  # legacy key - skip
		'metadata': (DataProvider.Metadata, parse_json, 'metadata description'),
		'metadata common': (self._common_metadata, parse_json, 'common metadata'),
		'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
		'prefix': (self._common_prefix, str, 'common prefix'),
		'se list': (DataProvider.Locations, lambda value: parse_list(value, ','), 'block location'),
	}
	(path, self._forced_prefix, self._filter) = split_opt(dataset_expr, '@%')
	self._filename = config.resolve_path(path, True, 'Error resolving dataset file: %s' % path)
def __init__(self, config, datasetExpr, datasetNick, datasetID=0):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	(self._path, self._events, selist) = utils.optSplit(datasetExpr, "|@")
	self._selist = utils.parseList(selist, delimeter=",", onEmpty=None)
	if not (self._path and self._events):
		raise ConfigError("Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]")
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	(self._path, self._events, selist) = split_opt(dataset_expr, '|@')
	self._selist = parse_list(selist, ',') or None
	if not (self._path and self._events):
		raise ConfigError('Invalid dataset expression!\nCorrect: /local/path/to/file|events[@SE1,SE2]')
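# A minimal standalone sketch (assumed helper, not the library's split_opt/optSplit) of how a
# dataset expression of the documented form '/local/path/to/file|events[@SE1,SE2]' decomposes
# into path, event count and optional SE list:
def _split_dataset_expr_sketch(dataset_expr):
	(path_events, _, se_part) = dataset_expr.partition('@')
	(path, _, events) = path_events.partition('|')
	se_list = [entry for entry in se_part.split(',') if entry] or None
	return (path, events, se_list)

# Example: _split_dataset_expr_sketch('/local/path/to/file|10000@SE1,SE2')
# -> ('/local/path/to/file', '10000', ['SE1', 'SE2'])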
def __init__(self, config, datasetExpr, datasetNick, sList):
	DataProvider.__init__(self, config, datasetExpr, datasetNick)
	(self._ds_select, self._ds_name, self._ds_keys_user, self._ds_keys_guard) = self._setup(config, 'dataset')
	(self._b_select, self._b_name, self._b_keys_user, self._b_keys_guard) = self._setup(config, 'block')
	scanList = config.getList('scanner', sList) + ['NullScanner']
	self._scanner = lmap(lambda cls: InfoScanner.createInstance(cls, config), scanList)
def processBlock(self, block):
	if block[DataProvider.Locations] is not None:
		sites = self._locationfilter.filterList(block[DataProvider.Locations])
		if (sites is not None) and (len(sites) == 0) and (len(block[DataProvider.FileList]) != 0):
			if not len(block[DataProvider.Locations]):
				self._log.warning('Block %s is not available at any site!', DataProvider.bName(block))
			elif not len(sites):
				self._log.warning('Block %s is not available at any selected site!', DataProvider.bName(block))
		block[DataProvider.Locations] = sites
	return block
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	DataProvider.__init__(self, config, '', datasetNick, datasetID)

	def DSB(cFun, n, *args, **kargs):
		return (cFun('dataset %s' % n, *args, **kargs), cFun('block %s' % n, *args, **kargs))
	(self.nameDS, self.nameB) = DSB(config.get, 'name pattern', '')
	(self.kUserDS, self.kUserB) = DSB(config.getList, 'hash keys', [])
	(self.kGuardDS, self.kGuardB) = DSB(config.getList, 'guard override', [])
	self.kSelectDS = config.getList('dataset key select', [])
	scanList = config.getList('scanner', datasetExpr) + ['NullScanner']
	self.scanner = lmap(lambda cls: InfoScanner.createInstance(cls, config), scanList)
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
	DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick, dataset_proc)
	ds_config = config.change_view(view_class='SimpleConfigView',
		set_sections=['datasource %s' % dataset_expr])
	self._block = self._read_block(ds_config, dataset_expr, dataset_nick)

	def _on_change(config, old_obj, cur_obj, cur_entry, obj2str):
		self._log.critical('Dataset %r changed', dataset_expr)
		return TriggerResync(['datasets', 'parameters'])(config, old_obj, cur_obj, cur_entry, obj2str)
	ds_config.get('dataset hash', self._get_dataset_hash(), persistent=True, on_change=_on_change)
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	config = config.changeView(viewClass = 'SimpleConfigView',
		setSections = ['datasource %s' % datasetExpr])
	self._block = self._readBlockFromConfig(config, datasetExpr, datasetNick, datasetID)
	dataset_hash_new = md5_hex(repr(self._block))
	dataset_hash_old = config.get('dataset hash', dataset_hash_new, persistent = True)
	self._request_resync = dataset_hash_new != dataset_hash_old
	if self._request_resync:
		self._log.critical('Dataset %r changed', datasetExpr)
		config.setState(True, 'resync', detail = 'dataset')
		config.setState(True, 'resync', detail = 'parameters')
		config.set('dataset hash', dataset_hash_new)
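# A minimal standalone sketch (illustrative names, not the library API) of the 'dataset hash'
# bookkeeping used above: hash the repr of the block description, compare it with the hash
# persisted on a previous run, and treat a mismatch as a dataset change that requires a resync.
import hashlib

def _dataset_changed_sketch(stored_hash, block_description):
	dataset_hash_new = hashlib.md5(repr(block_description).encode()).hexdigest()
	return (stored_hash is not None) and (stored_hash != dataset_hash_new)

# Example: _dataset_changed_sketch(None, block) -> False on the first run;
# _dataset_changed_sketch(old_hash, changed_block) -> True on a later run.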
def processBlock(self, block):
	# Check entry consistency
	events = sum(imap(lambda x: x[DataProvider.NEntries], block[DataProvider.FileList]))
	if block.setdefault(DataProvider.NEntries, events) != events:
		self._handleError('Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)' % (
			DataProvider.bName(block), block[DataProvider.NEntries], events), self._mode)
	return block
def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
	log = utils.ActivityLog('Performing resynchronization of dataset')
	(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
	for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
		sort_inplace(rmBlock[DataProvider.FileList], key=lambda x: x[DataProvider.URL])
	log.finish()
	# User overview and setup starts here
	resultRedo = []
	resultDisable = []
	newSplitPathTMP = newSplitPath + '.tmp'
	resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
	self.savePartitions(newSplitPathTMP, resyncIter, sourceLen=self.getMaxJobs(),
		message='Performing resynchronization of dataset map (progress is estimated)')
	if self._interactive:
		# TODO: print info and ask
		if not utils.getUserBool('Do you want to use the new dataset partition?', False):
			return None
	os.rename(newSplitPathTMP, newSplitPath)
	return (resultRedo, resultDisable)
def __init__(self, block_list_old, block_list_new):
	activity = Activity('Performing resynchronization of dataset')
	block_resync_tuple = DataProvider.resync_blocks(block_list_old, block_list_new)
	(self.block_list_added, self._block_list_missing, self._block_list_matching) = block_resync_tuple
	for block_missing in self._block_list_missing:  # Files in matching blocks are already sorted
		sort_inplace(block_missing[DataProvider.FileList], key=itemgetter(DataProvider.URL))
	activity.finish()
def _read_block(self, ds_config, dataset_expr, dataset_nick):
	metadata_name_list = parse_json(ds_config.get('metadata', '[]', on_change=None))
	common_metadata = parse_json(ds_config.get('metadata common', '[]', on_change=None))
	if len(common_metadata) > len(metadata_name_list):
		raise DatasetError('Unable to set %d common metadata items ' % len(common_metadata) +
			'with %d metadata keys' % len(metadata_name_list))
	common_prefix = ds_config.get('prefix', '', on_change=None)
	fn_list = []
	has_events = False
	has_se_list = False
	for url in ds_config.get_option_list():
		if url == 'se list':
			has_se_list = True
		elif url == 'events':
			has_events = True
		elif url not in ['dataset hash', 'metadata', 'metadata common', 'nickname', 'prefix']:
			fi = self._read_fi(ds_config, url, metadata_name_list, common_metadata, common_prefix)
			fn_list.append(fi)
	if not fn_list:
		raise DatasetError('There are no dataset files specified for dataset %r' % dataset_expr)
	result = {
		DataProvider.Nickname: ds_config.get('nickname', dataset_nick or '', on_change=None),
		DataProvider.FileList: sorted(fn_list, key=lambda fi: fi[DataProvider.URL]),
	}
	result.update(DataProvider.parse_block_id(dataset_expr))
	if metadata_name_list:
		result[DataProvider.Metadata] = metadata_name_list
	if has_events:
		result[DataProvider.NEntries] = ds_config.get_int('events', -1, on_change=None)
	if has_se_list:
		result[DataProvider.Locations] = parse_list(ds_config.get('se list', '', on_change=None), ',')
	return result
def process_block(self, block):
	# Check uniqueness of URLs
	url_hash_list = []
	if self._check_url != DatasetUniqueMode.ignore:
		block[DataProvider.FileList] = list(self._process_fi_list(url_hash_list, block[DataProvider.FileList]))
		url_hash_list.sort()
	# Check uniqueness of blocks
	if self._check_block != DatasetUniqueMode.ignore:
		block_hash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
			url_hash_list, block[DataProvider.NEntries], block[DataProvider.Locations],
			block.get(DataProvider.Metadata))))
		if block_hash in self._recorded_block:
			msg = 'Multiple occurences of block: "%s"!' % DataProvider.get_block_id(block)
			msg += ' (This check can be configured with %r)' % 'dataset check unique block'
			if self._check_block == DatasetUniqueMode.warn:
				self._log.warning(msg)
			elif self._check_block == DatasetUniqueMode.abort:
				raise DatasetError(msg)
			elif self._check_block == DatasetUniqueMode.skip:
				return None
		self._recorded_block.add(block_hash)
	return block
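# A minimal standalone sketch (illustrative names, not the library API) of the duplicate
# detection pattern used above: hash the repr of the identifying fields and keep a set of
# already seen hashes.
import hashlib

def _seen_before_sketch(recorded_hashes, *identifying_fields):
	entry_hash = hashlib.md5(repr(identifying_fields).encode()).hexdigest()
	if entry_hash in recorded_hashes:
		return True  # duplicate - caller decides whether to warn, skip or abort
	recorded_hashes.add(entry_hash)
	return False

# Example: recorded = set(); _seen_before_sketch(recorded, url, 100) -> False;
# calling it again with the same arguments -> True.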
def split_partitions(self, block_iter, entry_first=0):
	for block in block_iter:
		entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
		for proto_partition in self._partition_block(block[DataProvider.FileList], entries_per_job, entry_first):
			entry_first = 0
			yield self._finish_partition(block, proto_partition)
def __init__(self, config, datasetExpr, datasetNick = None):
	DataProvider.__init__(self, config, datasetExpr, datasetNick)
	self._CommonPrefix = max(self.enumValues) + 1
	self._CommonMetadata = max(self.enumValues) + 2
	self._handleEntry = {
		'events': (DataProvider.NEntries, int, 'block entry counter'),
		'id': (None, None, 'dataset ID'),  # legacy key - skip
		'metadata': (DataProvider.Metadata, parseJSON, 'metadata description'),
		'metadata common': (self._CommonMetadata, parseJSON, 'common metadata'),
		'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
		'prefix': (self._CommonPrefix, str, 'common prefix'),
		'se list': (DataProvider.Locations, lambda value: parseList(value, ','), 'block location'),
	}
	(path, self._forcePrefix, self._filter) = utils.optSplit(datasetExpr, '@%')
	self._filename = config.resolvePath(path, True, 'Error resolving dataset file: %s' % path)
def process_block(self, block):
	# Check entry consistency
	events = sum(imap(itemgetter(DataProvider.NEntries), block[DataProvider.FileList]))
	if block.setdefault(DataProvider.NEntries, events) != events:
		error_msg = 'Inconsistency in block %s: Number of events doesn\'t match (b:%d != f:%d)'
		error_msg = error_msg % (DataProvider.get_block_id(block), block[DataProvider.NEntries], events)
		self._handle_error(error_msg, self._mode)
	return block
def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
	DataProvider.__init__(self, config, datasetExpr, datasetNick, datasetID)
	self._CommonPrefix = max(self.enumValues) + 1
	self._CommonMetadata = max(self.enumValues) + 2
	self._handleEntry = {
		'events': (DataProvider.NEntries, int, 'block entry counter'),
		'id': (DataProvider.DatasetID, int, 'dataset ID'),
		'metadata': (DataProvider.Metadata, parseJSON, 'metadata description'),
		'metadata common': (self._CommonMetadata, parseJSON, 'common metadata'),
		'nickname': (DataProvider.Nickname, str, 'dataset nickname'),
		'prefix': (self._CommonPrefix, str, 'common prefix'),
		'se list': (DataProvider.Locations, lambda value: parseList(value, ','), 'block location'),
	}
	(path, self._forcePrefix, self._filter) = utils.optSplit(datasetExpr, '@%')
	self._filename = config.resolvePath(path, True, 'Error resolving dataset file: %s' % path)
def getFilterEntries():
	for pat in value.split():
		if ':' not in pat.lstrip(':'):
			yield pat
		else:
			for block in DataProvider.getBlocksFromExpr(config, ':%s' % pat.lstrip(':')):
				for fi in block[DataProvider.FileList]:
					yield fi[DataProvider.URL]
def _create_block(self, block_name):
	result = {
		DataProvider.Locations: None,
		DataProvider.FileList: [],
		self._common_prefix: None,
		self._common_metadata: [],
	}
	result.update(DataProvider.parse_block_id(block_name.lstrip('[').rstrip(']')))
	return result
def getFilterEntries():
	for pat in value.split():
		if ':' not in pat.lstrip(':'):
			yield pat
		else:
			for dfac in DataProvider.bind(':%s' % pat.lstrip(':'), config = config):
				dproc = dfac.getBoundInstance()
				for block in dproc.getBlocks():
					for fi in block[DataProvider.FileList]:
						yield fi[DataProvider.URL]
def __init__(self, config, datasetExpr, datasetNick=None):
	DataProvider.__init__(self, config, datasetExpr, datasetNick)
	ds_config = config.changeView(viewClass='SimpleConfigView',
		setSections=['datasource %s' % datasetExpr])
	self._block = self._readBlockFromConfig(ds_config, datasetExpr, datasetNick)

	def onChange(config, old_obj, cur_obj, cur_entry, obj2str):
		self._log.critical('Dataset %r changed', datasetExpr)
		return triggerResync(['datasets', 'parameters'])(config, old_obj, cur_obj, cur_entry, obj2str)
	ds_config.get('dataset hash', self.getHash(), persistent=True, onChange=onChange)
def divide_blocks(self, block_iter):
	for block in block_iter:
		fi_idx_start = 0
		files_per_job = self._files_per_job.lookup(DataProvider.get_block_id(block))
		if files_per_job <= 0:
			raise PartitionError('Invalid number of files per job: %d' % files_per_job)
		while fi_idx_start < len(block[DataProvider.FileList]):
			fi_list = block[DataProvider.FileList][fi_idx_start:fi_idx_start + files_per_job]
			fi_idx_start += files_per_job
			if fi_list:
				yield self._create_sub_block(block, fi_list)
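# A minimal standalone sketch of the windowing performed by divide_blocks above: slice the file
# list into chunks of at most files_per_job entries. Names are illustrative, not the library API.
def _chunk_files_sketch(file_list, files_per_job):
	return [file_list[idx:idx + files_per_job] for idx in range(0, len(file_list), files_per_job)]

# Example: _chunk_files_sketch(['a', 'b', 'c', 'd', 'e'], 2) -> [['a', 'b'], ['c', 'd'], ['e']]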
def _get_fi_class(self, fi, block):
	metadata_name_list = block.get(DataProvider.Metadata, [])
	metadata_name_list_selected = self._metadata_user_list.lookup(DataProvider.get_block_id(block))
	metadata_idx_list = lmap(lambda metadata_name: safe_index(metadata_name_list, metadata_name),
		metadata_name_list_selected)

	def _query_metadata(idx):
		if (idx is not None) and (idx < len(fi[DataProvider.Metadata])):
			return fi[DataProvider.Metadata][idx]
		return ''
	return tuple(imap(_query_metadata, metadata_idx_list))
def _get_filter_entries():
	for pat in value.split():
		if ':' not in pat.lstrip(':'):
			yield pat
		else:
			block_iter = DataProvider.iter_blocks_from_expr(config, ':%s' % pat.lstrip(':'),
				dataset_proc=dataset_proc)
			for block in block_iter:
				for fi in block[DataProvider.FileList]:
					yield fi[DataProvider.URL]
def process_block(self, block):
	if block[DataProvider.Locations] is not None:
		sites = self._location_filter.filter_list(block[DataProvider.Locations])
		if (sites is not None) and (len(sites) == 0) and (len(block[DataProvider.FileList]) != 0):
			error_msg = 'Block %s is not available ' % DataProvider.get_block_id(block)
			if not len(block[DataProvider.Locations]):
				self._log.warning(error_msg + 'at any site!')
			elif not len(sites):
				self._log.warning(error_msg + 'at any selected site!')
		block[DataProvider.Locations] = sites
	return block
def discover_blocks(options):
	# Get work directory, create dbs dump directory
	if os.path.isdir(options.args[0]):
		workDir = os.path.abspath(os.path.normpath(options.args[0]))
	else:
		workDir = getConfig(configFile = options.args[0]).getWorkPath()
	if not options.opts.tempdir:
		options.opts.tempdir = os.path.join(workDir, 'dbs')
	if not os.path.exists(options.opts.tempdir):
		os.mkdir(options.opts.tempdir)
	# Get provider with dataset information
	if options.opts.input_file:
		provider = DataProvider.createInstance('ListProvider', getConfig(), options.opts.input_file, None)
	else:
		config = getConfig(configDict = {'dataset': options.config_dict})
		provider = DataProvider.createInstance('DBSInfoProvider', config, options.args[0], None)
	blocks = provider.getBlocks()
	DataProvider.saveToFile(os.path.join(options.opts.tempdir, 'dbs.dat'), blocks)
	if options.opts.discovery:
		sys.exit(os.EX_OK)
	return blocks
def divide_blocks(self, block_iter):
	for block in block_iter:
		(entries, fi_list) = (0, [])
		entries_per_job = self._entries_per_job.lookup(DataProvider.get_block_id(block))
		if entries_per_job <= 0:
			raise PartitionError('Invalid number of entries per job: %d' % entries_per_job)
		for fi in block[DataProvider.FileList]:
			if fi_list and (entries + fi[DataProvider.NEntries] > entries_per_job):
				yield self._create_sub_block(block, fi_list)
				(entries, fi_list) = (0, [])
			fi_list.append(fi)
			entries += fi[DataProvider.NEntries]
		if fi_list:
			yield self._create_sub_block(block, fi_list)
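# A minimal standalone sketch of the greedy grouping performed by the entry-based divide_blocks
# above: accumulate files until adding the next one would exceed entries_per_job, then start a
# new group. Names are illustrative, not the library API.
def _group_by_entries_sketch(entry_counts, entries_per_job):
	(entries, group, result) = (0, [], [])
	for count in entry_counts:
		if group and (entries + count > entries_per_job):
			result.append(group)
			(entries, group) = (0, [])
		group.append(count)
		entries += count
	if group:
		result.append(group)
	return result

# Example: _group_by_entries_sketch([40, 60, 30, 80], 100) -> [[40, 60], [30], [80]]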
def processBlock(self, block):
	# Check uniqueness of URLs
	recordedBlockURL = []
	if self._checkURL != DatasetUniqueMode.ignore:
		def processFI(fiList):
			for fi in fiList:
				urlHash = md5_hex(repr((fi[DataProvider.URL], fi[DataProvider.NEntries],
					fi.get(DataProvider.Metadata))))
				if urlHash in self._recordedURL:
					msg = 'Multiple occurences of URL: %r!' % fi[DataProvider.URL]
					msg += ' (This check can be configured with %r)' % 'dataset check unique url'
					if self._checkURL == DatasetUniqueMode.warn:
						self._log.warning(msg)
					elif self._checkURL == DatasetUniqueMode.abort:
						raise DatasetError(msg)
					elif self._checkURL == DatasetUniqueMode.skip:
						continue
				self._recordedURL.add(urlHash)
				recordedBlockURL.append(urlHash)
				yield fi
		block[DataProvider.FileList] = list(processFI(block[DataProvider.FileList]))
		recordedBlockURL.sort()
	# Check uniqueness of blocks
	if self._checkBlock != DatasetUniqueMode.ignore:
		blockHash = md5_hex(repr((block.get(DataProvider.Dataset), block[DataProvider.BlockName],
			recordedBlockURL, block[DataProvider.NEntries], block[DataProvider.Locations],
			block.get(DataProvider.Metadata))))
		if blockHash in self._recordedBlock:
			msg = 'Multiple occurences of block: "%s"!' % DataProvider.bName(block)
			msg += ' (This check can be configured with %r)' % 'dataset check unique block'
			if self._checkBlock == DatasetUniqueMode.warn:
				self._log.warning(msg)
			elif self._checkBlock == DatasetUniqueMode.abort:
				raise DatasetError(msg)
			elif self._checkBlock == DatasetUniqueMode.skip:
				return None
		self._recordedBlock.add(blockHash)
	return block
def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
	activity = Activity('Performing resynchronization of dataset')
	(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
	for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
		sort_inplace(rmBlock[DataProvider.FileList], key = lambda x: x[DataProvider.URL])
	activity.finish()
	# User overview and setup starts here
	resultRedo = []
	resultDisable = []
	newSplitPathTMP = newSplitPath + '.tmp'
	resyncIter = self._resyncIterator(resultRedo, resultDisable, blocksAdded, blocksMissing, blocksMatching)
	self.savePartitions(newSplitPathTMP, resyncIter, sourceLenHint = self.getMaxJobs(),
		message = 'Performing resynchronization of dataset map (progress is estimated)')
	if self._interactive:
		# TODO: print info and ask
		if not utils.getUserBool('Do you want to use the new dataset partition?', False):
			return
	os.rename(newSplitPathTMP, newSplitPath)
	return (resultRedo, resultDisable)
def main():
	usage = '%s [OPTIONS] <config file / work directory>' % sys.argv[0]
	parser = optparse.OptionParser(usage=usage)
	parser.add_option('-G', '--globaltag', dest='globaltag', default='crab2_tag', help='Specify global tag')
	parser.add_option('-F', '--input', dest='inputFile', default=None,
		help='Specify dbs input file to use instead of scanning job output')
	# parser.add_option('-k', '--key-select', dest='dataset key select', default='',
	#	help='Specify dataset keys to process')
	parser.add_option('-c', '--continue-migration', dest='continue_migration', default=False,
		action='store_true', help='Continue an already started migration')

	ogDiscover = optparse.OptionGroup(parser, 'Discovery options - ignored in case dbs input file is specified', '')
	ogDiscover.add_option('-n', '--name', dest='dataset name pattern', default='',
		help='Specify dbs path name - Example: DataSet_@NICK@_@VAR@')
	ogDiscover.add_option('-T', '--datatype', dest='datatype', default=None,
		help='Supply dataset type in case cmssw report did not specify it - valid values: "mc" or "data"')
	ogDiscover.add_option('-m', '--merge', dest='merge parents', default=False, action='store_true',
		help='Merge output files from different parent blocks into a single block [Default: Keep boundaries]')
	ogDiscover.add_option('-j', '--jobhash', dest='useJobHash', default=False, action='store_true',
		help='Use hash of all config files in job for dataset key calculation')
	ogDiscover.add_option('-u', '--unique-cfg', dest='uniqueCfg', default=False, action='store_true',
		help='Circumvent edmConfigHash collisions so each dataset is stored with unique config information')
	ogDiscover.add_option('-P', '--parent', dest='parent source', default='',
		help='Override parent information source - to bootstrap a reprocessing on local files')
	ogDiscover.add_option('-H', '--hash-keys', dest='dataset hash keys', default='',
		help='Include additional variables in dataset hash calculation')
	parser.add_option_group(ogDiscover)

	ogDiscover2 = optparse.OptionGroup(parser, 'Discovery options II - only available when config file is used', '')
	ogDiscover2.add_option('-J', '--job-selector', dest='selected', default=None, help='Specify dataset(s) to process')
	parser.add_option_group(ogDiscover2)

	ogMode = optparse.OptionGroup(parser, 'Processing mode', '')
	ogMode.add_option('-b', '--batch', dest='batch', default=False, action='store_true',
		help='Enable non-interactive batch mode [Default: Interactive mode]')
	ogMode.add_option('-d', '--discovery', dest='discovery', default=False, action='store_true',
		help='Enable discovery mode - just collect file information and exit')
	ogMode.add_option('', '--tempdir', dest='tmpDir', default='', help='Override temp directory')
	ogMode.add_option('-i', '--no-import', dest='doImport', default=True, action='store_false',
		help='Disable import of new datasets into target DBS instance - only temporary xml files are created, ' +
			'which can be added later via datasetDBSTool.py [Default: Import datasets]')
	parser.add_option_group(ogMode)

	ogInc = optparse.OptionGroup(parser, 'Incremental adding of files to DBS', '')
	ogInc.add_option('-I', '--incremental', dest='incremental', default=False, action='store_true',
		help='Skip import of existing files - Warning: this destroys coherent block structure!')
	# ogInc.add_option('-o', '--open-blocks', dest='closeBlock', default=True, action='store_false',
	#	help='Keep blocks open for addition of further files [Default: Close blocks]')
	parser.add_option_group(ogInc)

	ogInst = optparse.OptionGroup(parser, 'DBS instance handling', '')
	ogInst.add_option('-t', '--target-instance', dest='dbsTarget',
		default='https://cmsweb.cern.ch/dbs/prod/phys03', help='Specify target dbs instance url')
	ogInst.add_option('-s', '--source-instance', dest='dbsSource',
		default='https://cmsweb.cern.ch/dbs/prod/global',
		help='Specify source dbs instance url(s), where parent datasets are taken from')
	parser.add_option_group(ogInst)

	ogDbg = optparse.OptionGroup(parser, 'Display options', '')
	ogDbg.add_option('-D', '--display-dataset', dest='display_data', default=None,
		help='Display information associated with dataset key(s) (accepts "all")')
	ogDbg.add_option('-C', '--display-config', dest='display_cfg', default=None,
		help='Display information associated with config hash(es) (accepts "all")')
	ogDbg.add_option('-v', '--verbose', dest='verbosity', default=0, action='count', help='Increase verbosity')
	parser.add_option_group(ogDbg)

	(opts, args) = parser.parse_args()
	utils.verbosity(opts.verbosity)
	setattr(opts, 'include parent infos', True)
	setattr(opts, 'importLumi', True)
	setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys').replace(',', ' '))
	if opts.useJobHash:
		setattr(opts, 'dataset hash keys', getattr(opts, 'dataset hash keys') + ' CMSSW_CONFIG_JOBHASH')

	# 0) Get work directory, create dbs dump directory
	if len(args) != 1:
		utils.exitWithUsage(usage, 'Neither work directory nor config file specified!')
	if os.path.isdir(args[0]):
		opts.workDir = os.path.abspath(os.path.normpath(args[0]))
	else:
		opts.workDir = getConfig(configFile=args[0]).getWorkPath()
	if not opts.tmpDir:
		opts.tmpDir = os.path.join(opts.workDir, 'dbs')
	if not os.path.exists(opts.tmpDir):
		os.mkdir(opts.tmpDir)
	# Lock file in case several instances of this program are running
	mutex = FileMutex(os.path.join(opts.tmpDir, 'datasetDBSAdd.lock'))

	# 1) Get dataset information
	if opts.inputFile:
		provider = DataProvider.getInstance('ListProvider', getConfig(), opts.inputFile, None)
	else:
		config = getConfig(configDict = {'dataset': dict(parser.values.__dict__)})
		if opts.discovery:
			config.set('dataset name pattern', '@DS_KEY@')
		provider = DataProvider.getInstance('DBSInfoProvider', config, args[0], None)
	provider.saveState(os.path.join(opts.tmpDir, 'dbs.dat'))
	if opts.discovery:
		sys.exit(os.EX_OK)
	blocks = provider.getBlocks()

	# 2) Filter datasets
	if opts.incremental:
		# Query target DBS for all found datasets and perform dataset resync with "supposed" state
		dNames = set(map(lambda b: b[DataProvider.Dataset], blocks))
		dNames = filter(lambda ds: hasDataset(opts.dbsTarget, ds), dNames)
		config = getConfig(configDict = {None: {'dbs instance': opts.dbsTarget}})
		oldBlocks = reduce(operator.add, map(lambda ds: DBSApiv2(config, None, ds, None).getBlocks(), dNames), [])
		(blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(oldBlocks, blocks)
		if len(blocksMissing) or len(blocksChanged):
			if not utils.getUserBool(' * WARNING: Block structure has changed! Continue?', False):
				sys.exit(os.EX_OK)
		# Search for blocks which were partially added and generate "pseudo"-blocks with left over files
		setOldBlocks = set(map(lambda x: x[DataProvider.BlockName], oldBlocks))
		setAddedBlocks = set(map(lambda x: x[DataProvider.BlockName], blocksAdded))
		blockCollision = set.intersection(setOldBlocks, setAddedBlocks)
		if blockCollision and opts.closeBlock:  # Blocks are closed and contents have changed
			for block in blocksAdded:
				if block[DataProvider.BlockName] in blockCollision:
					block[DataProvider.BlockName] = utils.strGuid(md5(str(time.time())).hexdigest())
		blocks = blocksAdded

	# 3) Display dataset properties
	if opts.display_data or opts.display_cfg:
		raise APIError('Not yet reimplemented')

	# Set up logging
	logging.basicConfig(format='%(levelname)s: %(message)s')
	logger = logging.getLogger('dbs3-migration')
	logger.addHandler(NullHandler())
	logger.setLevel(logging.DEBUG)

	# Set up dbs clients
	dbs3_target_client = DBS3LiteClient(url=opts.dbsTarget)
	dbs3_source_client = DBS3LiteClient(url=opts.dbsSource)
	dbs3_migration_queue = DBS3MigrationQueue()

	for blockDump in generateDBS3BlockDumps(opts, blocks):
		if not opts.continue_migration:
			# Initiate the dbs3 to dbs3 migration of parent blocks
			logger.debug('Checking parentage for block: %s' % blockDump['block']['block_name'])
			unique_parent_lfns = set((parent[u'parent_logical_file_name']
				for parent in blockDump[u'file_parent_list']))
			unique_blocks = set((block['block_name'] for parent_lfn in unique_parent_lfns
				for block in dbs3_source_client.listBlocks(logical_file_name=parent_lfn)))
			for block_to_migrate in unique_blocks:
				if dbs3_target_client.listBlocks(block_name=block_to_migrate):
					# Block is already at the destination
					logger.debug('Block %s is already at destination' % block_to_migrate)
					continue
				migration_task = MigrationTask(block_name=block_to_migrate,
					migration_url='https://cmsweb.cern.ch/dbs/prod/global/DBSReader',
					dbs_client=dbs3_target_client)
				try:
					dbs3_migration_queue.add_migration_task(migration_task)
				except AlreadyQueued as aq:
					logger.debug(aq.message)
			dbs3_migration_queue.save_to_disk(os.path.join(opts.tmpDir, 'dbs3_migration.pkl'))
		else:
			try:
				dbs3_migration_queue = DBS3MigrationQueue.read_from_disk(os.path.join(opts.tmpDir, 'dbs3_migration.pkl'))
			except IOError as io_err:
				msg = "Probably, there is no DBS 3 migration for this dataset ongoing, Dude!"
				logger.exception('%s\n%s' % (io_err.message, msg))
				raise
		# Wait for all parent blocks to be migrated to dbs3
		do_migration(dbs3_migration_queue)
		# Insert block into dbs3
		dbs3_target_client.insertBulkBlock(blockDump)
def __init__(self, config, datasetExpr, datasetNick, providerList):
	DataProvider.__init__(self, config, datasetExpr, datasetNick)
	self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config, None, self._log,
		'Summary: Running over ')
	self._providerList = providerList
def resyncMapping(self, newSplitPath, oldBlocks, newBlocks):
	log = utils.ActivityLog('Performing resynchronization of dataset')
	(blocksAdded, blocksMissing, blocksMatching) = DataProvider.resyncSources(oldBlocks, newBlocks)
	for rmBlock in blocksMissing:  # Files in matching blocks are already sorted
		rmBlock[DataProvider.FileList].sort(lambda a, b: cmp(a[DataProvider.URL], b[DataProvider.URL]))
	del log

	# Get block information (oldBlock, newBlock, filesMissing, filesMatched) which splitInfo is based on
	def getMatchingBlock(splitInfo):
		# Comparison operator between dataset block and splitting
		def cmpSplitBlock(dsBlock, splitInfo):
			if dsBlock[DataProvider.Dataset] == splitInfo[DataSplitter.Dataset]:
				return cmp(dsBlock[DataProvider.BlockName], splitInfo[DataSplitter.BlockName])
			return cmp(dsBlock[DataProvider.Dataset], splitInfo[DataSplitter.Dataset])
		# Search for block in missing and matched blocks
		result = fast_search(blocksMissing, lambda x: cmpSplitBlock(x, splitInfo))
		if result:
			return (result, None, result[DataProvider.FileList], [])
		return fast_search(blocksMatching, lambda x: cmpSplitBlock(x[0], splitInfo))  # compare with old block

	#######################################
	# Process modifications of event sizes
	#######################################

	# Apply modification list to old splitting
	# Input: oldSplit, modList = [(rmfile, addfile), ...], doExpandOutside
	# With doExpandOutside, gc tries to handle expanding files via the splitting function
	def resyncSplitting(oldSplit, doExpandOutside, jobNum):
		if oldSplit.get(DataSplitter.Invalid, False):
			return (oldSplit, ResyncMode.ignore, [])

		(oldBlock, newBlock, filesMissing, filesMatched) = getMatchingBlock(oldSplit)
		modSI = copy.deepcopy(oldSplit)
		if newBlock:
			modSI[DataSplitter.Locations] = newBlock.get(DataProvider.Locations)
		# Determine size infos and get started
		search_url = lambda url: fast_search(oldBlock[DataProvider.FileList], lambda x: cmp(x[DataProvider.URL], url))
		sizeInfo = map(lambda url: search_url(url)[DataProvider.NEntries], modSI[DataSplitter.FileList])
		extended = []

		metaIdxLookup = []
		for meta in self.metaOpts:
			(oldIdx, newIdx) = (None, None)
			if oldBlock and (meta in oldBlock.get(DataProvider.Metadata, [])):
				oldIdx = oldBlock[DataProvider.Metadata].index(meta)
			if newBlock and (meta in newBlock.get(DataProvider.Metadata, [])):
				newIdx = newBlock[DataProvider.Metadata].index(meta)
			if (oldIdx != None) or (newIdx != None):
				metaIdxLookup.append((oldIdx, newIdx, self.metaOpts[meta]))

		# Select processing mode for job (disable > complete > changed > ignore)
		# [ie. disable overrides all] using min
		# Result: one of [disable, complete, ignore] (changed -> complete or ignore)
		procMode = ResyncMode.ignore

		# Remove files from splitting
		def removeFile(idx, rmFI):
			modSI[DataSplitter.Comment] += '[rm] ' + rmFI[DataProvider.URL]
			modSI[DataSplitter.Comment] += '-%d ' % rmFI[DataProvider.NEntries]
			if idx == len(modSI[DataSplitter.FileList]) - 1:
				# Removal of last file from current splitting
				modSI[DataSplitter.NEntries] = sum(sizeInfo) - modSI.get(DataSplitter.Skipped, 0)
				modSI[DataSplitter.Comment] += '[rm_last] '
			elif idx == 0:
				# Removal of first file from current splitting
				modSI[DataSplitter.NEntries] += max(0, sizeInfo[idx] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries])
				modSI[DataSplitter.NEntries] += modSI.get(DataSplitter.Skipped, 0)
				modSI[DataSplitter.Skipped] = 0
				modSI[DataSplitter.Comment] += '[rm_first] '
			else:
				# File in the middle is affected - solution very simple :)
				modSI[DataSplitter.Comment] += '[rm_middle] '
			modSI[DataSplitter.NEntries] -= rmFI[DataProvider.NEntries]
			modSI[DataSplitter.FileList].pop(idx)
			sizeInfo.pop(idx)

		# Process changed files in splitting - returns True if file index should be increased
		def changeFile(idx, oldFI, newFI):
			modSI[DataSplitter.Comment] += '[changed] ' + oldFI[DataProvider.URL]
			modSI[DataSplitter.Comment] += (' -%d ' % oldFI[DataProvider.NEntries])
			modSI[DataSplitter.Comment] += (' +%d ' % newFI[DataProvider.NEntries])

			def removeCompleteFile():
				modSI[DataSplitter.NEntries] -= oldFI[DataProvider.NEntries]
				modSI[DataSplitter.FileList].pop(idx)
				sizeInfo.pop(idx)

			def replaceCompleteFile():
				modSI[DataSplitter.NEntries] += newFI[DataProvider.NEntries]
				modSI[DataSplitter.NEntries] -= oldFI[DataProvider.NEntries]
				sizeInfo[idx] = newFI[DataProvider.NEntries]

			def expandOutside():
				fileList = newBlock.pop(DataProvider.FileList)
				newBlock[DataProvider.FileList] = [newFI]
				for extSplit in self.splitDatasetInternal([newBlock], oldFI[DataProvider.NEntries]):
					extSplit[DataSplitter.Comment] = oldSplit[DataSplitter.Comment] + '[ext_1] '
					extended.append(extSplit)
				newBlock[DataProvider.FileList] = fileList
				sizeInfo[idx] = newFI[DataProvider.NEntries]

			if idx == len(modSI[DataSplitter.FileList]) - 1:
				coverLast = modSI.get(DataSplitter.Skipped, 0) + modSI[DataSplitter.NEntries] - sum(sizeInfo[:-1])
				if coverLast == oldFI[DataProvider.NEntries]:
					# Change of last file, which ends in current splitting
					if doExpandOutside and (oldFI[DataProvider.NEntries] < newFI[DataProvider.NEntries]):
						expandOutside()
						modSI[DataSplitter.Comment] += '[last_add_1] '
					else:
						replaceCompleteFile()
						modSI[DataSplitter.Comment] += '[last_add_2] '
				elif coverLast > newFI[DataProvider.NEntries]:
					# Change of last file, which changes current coverage
					modSI[DataSplitter.NEntries] -= coverLast
					modSI[DataSplitter.NEntries] += oldFI[DataProvider.NEntries]
					replaceCompleteFile()
					modSI[DataSplitter.Comment] += '[last_add_3] '
				else:
					# Change of last file outside of current splitting
					sizeInfo[idx] = newFI[DataProvider.NEntries]
					modSI[DataSplitter.Comment] += '[last_add_4] '
			elif idx == 0:
				# First file is affected
				if newFI[DataProvider.NEntries] > modSI.get(DataSplitter.Skipped, 0):
					# First file changes and still lives in new splitting
					following = sizeInfo[0] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries]
					shrinkage = oldFI[DataProvider.NEntries] - newFI[DataProvider.NEntries]
					if following > 0:
						# First file not completely covered by current splitting
						if following < shrinkage:
							# Covered area of first file shrinks
							modSI[DataSplitter.NEntries] += following
							replaceCompleteFile()
							modSI[DataSplitter.Comment] += '[first_add_1] '
						else:
							# First file changes outside of current splitting
							sizeInfo[idx] = newFI[DataProvider.NEntries]
							modSI[DataSplitter.Comment] = '[first_add_2] '
					else:
						# Change of first file ending in current splitting - One could try to
						# 'reverse fix' expanding files to allow expansion via adding only the expanding part
						replaceCompleteFile()
						modSI[DataSplitter.Comment] += '[first_add_3] '
				else:
					# Removal of first file from current splitting
					modSI[DataSplitter.NEntries] += max(0, sizeInfo[idx] - modSI.get(DataSplitter.Skipped, 0) - modSI[DataSplitter.NEntries])
					modSI[DataSplitter.NEntries] += modSI.get(DataSplitter.Skipped, 0)
					modSI[DataSplitter.Skipped] = 0
					removeCompleteFile()
					return False
			else:
				# File in the middle is affected - solution very simple :)
				# Replace file - expanding files could be swapped to the (fully contained) end
				# to allow expansion via adding only the expanding part
				replaceCompleteFile()
				modSI[DataSplitter.Comment] += '[middle_add_1] '
			return True

		idx = 0
		newMetadata = []
		while idx < len(modSI[DataSplitter.FileList]):
			url = modSI[DataSplitter.FileList][idx]
			rmFI = fast_search(filesMissing, lambda x: cmp(x[DataProvider.URL], url))
			if rmFI:
				removeFile(idx, rmFI)
				procMode = min(procMode, self.mode_removed)
				for meta in modSI.get(DataSplitter.MetadataHeader, []):
					procMode = min(procMode, self.metaOpts.get(meta, ResyncMode.ignore))
				continue  # don't increase filelist index!

			(oldFI, newFI) = fast_search(filesMatched, lambda x: cmp(x[0][DataProvider.URL], url))
			if DataProvider.Metadata in newFI:
				newMetadata.append(newFI[DataProvider.Metadata])
				for (oldMI, newMI, metaProc) in metaIdxLookup:
					if (oldMI == None) or (newMI == None):
						procMode = min(procMode, metaProc)  # Metadata was removed
					elif (oldFI[DataProvider.Metadata][oldMI] != newFI[DataProvider.Metadata][newMI]):
						procMode = min(procMode, metaProc)  # Metadata was changed

			if oldFI[DataProvider.NEntries] == newFI[DataProvider.NEntries]:
				idx += 1
				continue

			oldEvts = modSI[DataSplitter.NEntries]
			oldSkip = modSI[DataSplitter.Skipped]
			if changeFile(idx, oldFI, newFI):
				idx += 1
			mode = utils.QM(oldFI[DataProvider.NEntries] < newFI[DataProvider.NEntries], self.mode_expanded, self.mode_shrunken)
			if mode == ResyncMode.changed:
				changed = (oldEvts != modSI[DataSplitter.NEntries]) or (oldSkip != modSI[DataSplitter.Skipped])
				mode = utils.QM(changed, ResyncMode.complete, ResyncMode.ignore)
			procMode = min(procMode, mode)

		# Disable invalid / invalidated splittings
		if (len(modSI[DataSplitter.FileList]) == 0) or (modSI[DataSplitter.NEntries] <= 0):
			procMode = ResyncMode.disable
		if procMode == ResyncMode.disable:
			modSI[DataSplitter.Invalid] = True
			return (modSI, ResyncMode.disable, [])  # Discard extensions

		# Update metadata
		if DataSplitter.Metadata in modSI:
			modSI.pop(DataSplitter.MetadataHeader)
			modSI.pop(DataSplitter.Metadata)
		if newMetadata:
			modSI[DataSplitter.MetadataHeader] = newBlock.get(DataProvider.Metadata)
			modSI[DataSplitter.Metadata] = newMetadata

		return (modSI, procMode, extended)

	# Process splittings
	def resyncIterator_raw():
		extList = []
		# Perform resync of existing splittings
		for jobNum in range(self.getMaxJobs()):
			splitInfo = self.getSplitInfo(jobNum)
			if DataSplitter.Comment not in splitInfo:
				splitInfo[DataSplitter.Comment] = 'src: %d ' % jobNum
			(modSplitInfo, procMode, extended) = resyncSplitting(splitInfo, True, jobNum)
			if (self.resyncOrder == ResyncOrder.append) and (procMode == ResyncMode.complete):
				extList.append(modSplitInfo)
				modSplitInfo = copy.copy(splitInfo)
				modSplitInfo[DataSplitter.Invalid] = True
				procMode = ResyncMode.disable
			extList.extend(extended)
			yield (jobNum, modSplitInfo, procMode)
		# Yield collected extensions of existing splittings
		for extSplitInfo in extList:
			yield (None, extSplitInfo, ResyncMode.ignore)
		# Yield completely new splittings
		if self.mode_new == ResyncMode.complete:
			for newSplitInfo in self.splitDatasetInternal(blocksAdded):
				yield (None, newSplitInfo, ResyncMode.ignore)

	def getSplitContainer():
		(rawInfo, extInfo) = ([], [])
		for (jobNum, splitInfo, procMode) in resyncIterator_raw():
			if jobNum != None:  # Separate existing and new splittings
				rawInfo.append((jobNum, splitInfo, procMode))
			else:
				extInfo.append((None, splitInfo, None))
		return (rawInfo, extInfo)

	def getReorderIterator(mainIter, altIter):  # alt source is used if main source contains invalid entries
		for (jobNum, splitInfo, procMode) in mainIter:
			if splitInfo.get(DataSplitter.Invalid, False) or (procMode == ResyncMode.disable):
				extInfo = next(altIter, None)
				while extInfo and extInfo[1].get(DataSplitter.Invalid, False):
					extInfo = next(altIter, None)
				if extInfo:
					yield (jobNum, extInfo[1], ResyncMode.complete)  # Overwrite invalid splittings
					continue
			yield (jobNum, splitInfo, procMode)
		for extInfo in altIter:
			yield (None, extInfo[1], ResyncMode.ignore)

	# Use reordering if setup - log interventions (disable, redo) according to procMode
	resultRedo = []
	resultDisable = []

	def resyncIterator():
		if self.resyncOrder == ResyncOrder.fillgap:
			rawInfo, extInfo = getSplitContainer()
			resyncIter = getReorderIterator(rawInfo, iter(extInfo))
		elif self.resyncOrder == ResyncOrder.reorder:
			rawInfo, extInfo = getSplitContainer()
			tsi = utils.TwoSidedIterator(rawInfo + extInfo)
			resyncIter = getReorderIterator(tsi.forward(), tsi.backward())
		else:
			resyncIter = resyncIterator_raw()
		for (jobNum, splitInfo, procMode) in resyncIter:
			if jobNum:
				if procMode == ResyncMode.complete:
					resultRedo.append(jobNum)
				if procMode == ResyncMode.disable:
					resultDisable.append(jobNum)
			yield splitInfo

	# User overview and setup starts here
	newSplitPathTMP = newSplitPath + '.tmp'
	self.saveState(newSplitPathTMP, resyncIterator(), sourceLen = self.getMaxJobs(),
		message = 'Performing resynchronization of dataset map (progress is estimated)')
	if self.interactive:
		# TODO: print info and ask
		if not getUserBool('Do you want to use the new dataset splitting?', False):
			return None
	os.rename(newSplitPathTMP, newSplitPath)
	return (resultRedo, resultDisable)
def _filter_block(block):
	if self._filter:
		return self._filter in '/%s#' % DataProvider.get_block_id(block)
	return True