def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._targetJobs = config.getInt('target partitions', -1, onChange=onChange)
    self._targetJobsDS = config.getInt('target partitions per nickname', -1, onChange=onChange)
    self._entries = {None: 0}
    self._files = {None: 0}
    self._config = config

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    self._location_filter = config.get_filter(self._get_dproc_opt('location filter'), '',
        default_matcher='BlackWhiteMatcher', default_filter='StrictListFilter')

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._locationfilter = config.getFilter('dataset location filter', '',
        defaultMatcher='blackwhite', defaultFilter='strict', onChange=onChange)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._targetJobs = config.getInt('target partitions', -1,
        onChange=DataProcessor.triggerDataResync)
    self._targetJobsDS = config.getInt('target partitions per nickname', -1,
        onChange=DataProcessor.triggerDataResync)
    self._entries = {None: 0}
    self._files = {None: 0}
    self._config = config

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    self._empty_files = config.get_bool(self._get_dproc_opt('remove empty files'), True)
    self._empty_block = config.get_bool(self._get_dproc_opt('remove empty blocks'), True)
    (self._removed_files, self._removed_blocks) = (0, 0)

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    self._limit_files = config.get_int(self._get_dproc_opt(['limit files', 'limit urls']), -1)
    self._limit_files_fraction = config.get_float(
        self._get_dproc_opt(['limit files fraction', 'limit urls fraction']), -1.)
    (self._limit_files_per_ds, self._files_per_ds) = ({}, {})

def __init__(self, config):
    DataProcessor.__init__(self, config)
    # Ensure the same nickname is used consistently in all blocks of a dataset
    self._checkConsistency = config.getBool('nickname check consistency', True)
    self._checkConsistencyData = {}
    # Check if two different datasets have the same nickname
    self._checkCollision = config.getBool('nickname check collision', True)
    self._checkCollisionData = {}

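# A minimal, self-contained sketch of the two checks configured above (the exact
# bookkeeping is an assumption, not shown in this excerpt): the consistency check
# verifies that every block of a dataset carries the same nickname, and the
# collision check verifies that no two datasets share one nickname.
def _sketch_check_nick(consistency_data, collision_data, dataset_name, nick):
    # consistency: remember the first nickname seen per dataset and compare
    if consistency_data.setdefault(dataset_name, nick) != nick:
        raise ValueError('Dataset %r uses inconsistent nicknames!' % dataset_name)
    # collision: remember the first dataset seen per nickname and compare
    if collision_data.setdefault(nick, dataset_name) != dataset_name:
        raise ValueError('Multiple datasets use the same nickname %r!' % nick)
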
def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._targetJobs = config.getInt('target partitions', -1)
    self._targetJobsDS = config.getInt('target partitions per nickname', -1)
    self._writeSettings = (self._targetJobs != -1) or (self._targetJobsDS != -1)
    self._entries = {None: 0}
    self._files = {None: 0}
    self._config = config

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._checkURLOpt = 'dataset check unique url'
    self._checkURL = config.getEnum(self._checkURLOpt, DatasetUniqueMode, DatasetUniqueMode.abort)
    self._checkBlockOpt = 'dataset check unique block'
    self._checkBlock = config.getEnum(self._checkBlockOpt, DatasetUniqueMode, DatasetUniqueMode.abort)

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._emptyFiles = config.getBool('dataset remove empty files', True, onChange=onChange)
    self._emptyBlock = config.getBool('dataset remove empty blocks', True, onChange=onChange)
    (self._removedFiles, self._removedBlocks) = (0, 0)

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    self._sort_ds = config.get_bool(self._get_dproc_opt('sort'), False)
    self._sort_block = config.get_bool(self._get_dproc_opt('block sort'), False)
    self._sort_files = config.get_bool(self._get_dproc_opt('files sort'), False)
    self._sort_location = config.get_bool(self._get_dproc_opt('location sort'), False)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    internal_config = config.changeView(viewClass='SimpleConfigView', setSections=['dataprocessor'])
    internal_config.set('dataset processor', 'NullDataProcessor')
    self._url_filter = config.getFilter(['dataset ignore files', 'dataset ignore urls'], '',
        negate=True,
        filterParser=lambda value: self._parseFilter(internal_config, value),
        filterStr=lambda value: str.join('\n', value.split()),
        matchKey=itemgetter(DataProvider.URL),
        defaultMatcher='blackwhite', defaultFilter='weak',
        onChange=DataProcessor.triggerDataResync)

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    internal_config = config.changeView(viewClass='SimpleConfigView', setSections=['dataprocessor'])
    internal_config.set('dataset processor', 'NullDataProcessor')
    config.set('dataset ignore urls matcher case sensitive', 'False')
    self._url_filter = config.getFilter(['dataset ignore files', 'dataset ignore urls'], '',
        negate=True,
        filterParser=lambda value: self._parseFilter(internal_config, value),
        filterStr=lambda value: str.join('\n', value.split()),
        defaultMatcher='blackwhite', defaultFilter='weak',
        onChange=onChange)

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    self._target_jobs = config.get_int(
        join_config_locations(['', datasource_name], 'target partitions'), -1)
    self._target_jobs_ds = config.get_int(
        join_config_locations(['', datasource_name], 'target partitions per nickname'), -1)
    self._entries = {None: 0}
    self._files = {None: 0}
    self._config = None
    if self.enabled():
        self._config = config

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    config.set('%s ignore urls matcher case sensitive' % datasource_name, 'False')
    self._url_filter = config.get_filter(self._get_dproc_opt(['ignore files', 'ignore urls']), '',
        negate=True, default_matcher='BlackWhiteMatcher', default_filter='WeakListFilter',
        filter_parser=lambda value: self._parse_filter(config, value),
        filter_str=lambda value: str.join('\n', value.split()))

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._sortDS = config.getBool('dataset sort', False, onChange=onChange)
    self._sortBlock = config.getBool('dataset block sort', False, onChange=onChange)
    self._sortFiles = config.getBool('dataset files sort', False, onChange=onChange)
    self._sortLocation = config.getBool('dataset location sort', False, onChange=onChange)

def process(self, block_iter):
    (self._limit_files_per_ds, self._files_per_ds) = ({}, {})  # reset counters
    if self._limit_files_fraction >= 0:
        block_list = list(DataProcessor.process(self, block_iter))
        goal_per_ds = {}  # calculate file limit per dataset
        for (dataset_name, fn_list_len) in self._files_per_ds.items():
            goal_per_ds[dataset_name] = int(self._limit_files_fraction * fn_list_len) or 1
        for block in block_list:
            self._reduce_fn_list(block, goal_per_ds)
            yield block
    else:
        for block in DataProcessor.process(self, block_iter):
            yield block

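# The per-dataset goal above uses "int(fraction * n) or 1" so that at least one
# file survives even for very small datasets, since int() truncates towards zero.
# A quick standalone illustration of that arithmetic:
for fn_list_len in (1, 3, 10):
    goal = int(0.25 * fn_list_len) or 1
    print(fn_list_len, '->', goal)  # 1 -> 1, 3 -> 1 (truncated 0 falls back to 1), 10 -> 2
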
def __init__(self, config, datasetExpr, datasetNick=None):
    ConfigurablePlugin.__init__(self, config)
    self._log = logging.getLogger('dataset.provider')
    (self._datasetExpr, self._datasetNick) = (datasetExpr, datasetNick)
    (self._cache_block, self._cache_dataset) = (None, None)
    self._dataset_query_interval = config.getTime('dataset default query interval', 60, onChange=None)
    triggerDataResync = triggerResync(['datasets', 'parameters'])
    self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config,
        triggerDataResync, self._log,
        ' * Dataset %s:\n\tcontains ' % repr(datasetNick or datasetExpr))
    self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer',
        cls=DataProcessor, pargs=(triggerDataResync,), onChange=triggerDataResync)
    self._datasetProcessor = config.getCompositePlugin('dataset processor',
        'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
        'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
        'MultiDataProcessor', cls=DataProcessor, pargs=(triggerDataResync,), onChange=triggerDataResync)

def __init__(self, config, datasource_name, dataset_expr, dataset_nick, provider_list):
    for provider in provider_list:
        provider.disable_stream_singletons()
    DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick)
    self._stats = DataProcessor.create_instance('SimpleStatsDataProcessor', config,
        'dataset', self._log, 'Summary: Running over ')
    self._provider_list = provider_list

def process(self, block_iter):
    if self.enabled() and self._config:
        block_list = list(DataProcessor.process(self, block_iter))
        if (self._target_jobs > 0) or (self._target_jobs_ds > 0):
            self._set_split_opt(self._config, 'files per job', dict(self._files),
                self._target_jobs, self._target_jobs_ds)
            self._set_split_opt(self._config, 'events per job', dict(self._entries),
                self._target_jobs, self._target_jobs_ds)
        self._config = None
        return block_list
    return block_iter

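# _set_split_opt is not part of this excerpt; a plausible reading (an assumption,
# not confirmed here) is that it derives splitter settings from the accumulated
# totals, e.g. via ceiling division of a total by the partition target:
def _sketch_split_value(total, target_jobs):
    # e.g. 1000 files with 300 target partitions -> 4 files per job
    return -(-total // target_jobs)  # ceiling division without math.ceil
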
def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
    ConfigurablePlugin.__init__(self, config)
    self._log = logging.getLogger('user.dataprovider')
    (self._datasetExpr, self._datasetNick, self._datasetID) = (datasetExpr, datasetNick, datasetID)
    (self._cache_block, self._cache_dataset, self._passthrough) = (None, None, False)
    self._stats = DataProcessor.createInstance('StatsDataProcessor', config)
    self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer', cls=DataProcessor)
    self._datasetProcessor = config.getCompositePlugin('dataset processor',
        'EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
        'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
        'MultiDataProcessor', cls=DataProcessor, onChange=triggerResync(['datasets', 'parameters']))

def __init__(self, config, datasetExpr, datasetNick=None, datasetID=0):
    ConfigurablePlugin.__init__(self, config)
    self._log = logging.getLogger('user.dataprovider')
    (self._datasetExpr, self._datasetNick, self._datasetID) = (datasetExpr, datasetNick, datasetID)
    (self._cache_block, self._cache_dataset, self._passthrough) = (None, None, False)
    self._stats = DataProcessor.createInstance('StatsDataProcessor', config)
    self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer', cls=DataProcessor)
    self._datasetProcessor = config.getCompositePlugin('dataset processor',
        'EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
        'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
        'MultiDataProcessor', cls=DataProcessor)

def _parse_filter(self, config, value):
    dataset_proc = DataProcessor.create_instance('NullDataProcessor')
    def _get_filter_entries():
        for pat in value.split():
            if ':' not in pat.lstrip(':'):
                yield pat
            else:
                block_iter = DataProvider.iter_blocks_from_expr(config,
                    ':%s' % pat.lstrip(':'), dataset_proc=dataset_proc)
                for block in block_iter:
                    for fi in block[DataProvider.FileList]:
                        yield fi[DataProvider.URL]
    return str.join('\n', _get_filter_entries())

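# The predicate above separates plain filename patterns from dataset expressions:
# after stripping leading ':' characters, any remaining ':' marks a
# "provider:spec" entry whose file URLs are expanded into the filter. A quick
# standalone check of that predicate with hypothetical entries:
for pat in ('/store/file.root', 'dbs:/A/B/C', ':list:/my/datasets.dbs'):
    is_dataset_expr = ':' in pat.lstrip(':')
    print(pat, '->', 'expand file URLs' if is_dataset_expr else 'keep as pattern')
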
def __init__(self, config, datasource_name, dataset_expr, dataset_nick=None, dataset_proc=None):
    ConfigurablePlugin.__init__(self, config)
    self._log = logging.getLogger('%s.provider' % datasource_name)
    (self._datasource_name, self._dataset_expr) = (datasource_name, dataset_expr)
    self._dataset_nick_override = dataset_nick
    (self._cache_block, self._cache_dataset) = (None, None)
    self._dataset_query_interval = config.get_time(
        '%s default query interval' % datasource_name, 60, on_change=None)
    self._stats = dataset_proc or DataProcessor.create_instance('SimpleStatsDataProcessor',
        config, datasource_name, self._log,
        ' * Dataset %s:\n\tcontains ' % repr(dataset_nick or dataset_expr))
    dataset_config = config.change_view(default_on_change=TriggerResync(['datasets', 'parameters']))
    self._nick_producer = dataset_config.get_plugin(
        ['nickname source', '%s nickname source' % datasource_name],
        'SimpleNickNameProducer', cls=DataProcessor, pargs=(datasource_name,))
    self._dataset_processor = dataset_proc or dataset_config.get_composited_plugin(
        '%s processor' % datasource_name,
        'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor ' +
        'URLCountDataProcessor EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor ' +
        'LocationDataProcessor', 'MultiDataProcessor', cls=DataProcessor, pargs=(datasource_name,))

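# All processor variants in this collection share one contract: consume an
# iterator of dataset blocks and pass on (possibly modified) blocks. A hedged
# sketch of a custom processor against the newer snake_case API shown above;
# the 'min files' option and the process_block hook are assumptions made for
# illustration, as is the import location:
# from grid_control.datasets import DataProcessor, DataProvider
class SketchMinFilesDataProcessor(DataProcessor):
    def __init__(self, config, datasource_name):
        DataProcessor.__init__(self, config, datasource_name)
        self._min_files = config.get_int(self._get_dproc_opt('min files'), 1)

    def process_block(self, block):
        # returning None drops the block from the stream
        if len(block[DataProvider.FileList]) < self._min_files:
            return None
        return block
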
def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)

def __init__(self, config, datasetExpr, datasetNick, providerList):
    DataProvider.__init__(self, config, datasetExpr, datasetNick)
    self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config,
        None, self._log, 'Summary: Running over ')
    self._providerList = providerList

def process(self, blockIter):
    self._recordedURL = set()
    self._recordedBlock = set()
    return DataProcessor.process(self, blockIter)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._emptyFiles = config.getBool('dataset remove empty files', True,
        onChange=DataProcessor.triggerDataResync)
    self._emptyBlock = config.getBool('dataset remove empty blocks', True,
        onChange=DataProcessor.triggerDataResync)

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._limitEntries = config.getInt(['dataset limit events', 'dataset limit entries'], -1,
        onChange=onChange)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self.reset()

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._emptyFiles = config.getBool('dataset remove empty files', True)
    self._emptyBlock = config.getBool('dataset remove empty blocks', True)

def __init__(self, config, onChange):
    DataProcessor.__init__(self, config, onChange)
    self._limitFiles = config.getInt(['dataset limit files', 'dataset limit urls'], -1,
        onChange=onChange)

def __init__(self, config, datasource_name):
    DataProcessor.__init__(self, config, datasource_name)
    (self._entries, self._blocks, self._files) = (0, 0, 0)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._ignoreURLs = config.getList(['dataset ignore urls', 'dataset ignore files'], [])

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._entries = 0
    self._blocks = 0

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._locationfilter = config.getFilter('dataset location filter', '',
        defaultMatcher='blackwhite', defaultFilter='strict',
        onChange=DataProcessor.triggerDataResync)

def process(self, block_iter):
    self._recorded_url = set()  # reset records
    self._recorded_block = set()
    return DataProcessor.process(self, block_iter)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._checkURL = config.getEnum('dataset check unique url',
        DatasetUniqueMode, DatasetUniqueMode.abort, onChange=DataProcessor.triggerDataResync)
    self._checkBlock = config.getEnum('dataset check unique block',
        DatasetUniqueMode, DatasetUniqueMode.abort, onChange=DataProcessor.triggerDataResync)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._limitEntries = config.getInt(['dataset limit events', 'dataset limit entries'], -1,
        onChange=DataProcessor.triggerDataResync)

def __init__(self, config):
    DataProcessor.__init__(self, config)
    self._limitFiles = config.getInt(['dataset limit files', 'dataset limit urls'], -1,
        onChange=DataProcessor.triggerDataResync)