Example #1
def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._targetJobs = config.getInt('target partitions', -1, onChange = onChange)
		self._targetJobsDS = config.getInt('target partitions per nickname', -1, onChange = onChange)
		self._entries = {None: 0}
		self._files = {None: 0}
		self._config = config
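Taken together, the examples on this page share one pattern: a processor receives the shared config object in its constructor, reads its options there, and optionally overrides process() to transform the stream of blocks. As a minimal sketch of that pattern, assuming grid-control's DataProcessor base class is importable (the import path below is an assumption, not taken from these snippets):

from grid_control.datasets import DataProcessor  # assumed import path

class BlockCountDataProcessor(DataProcessor):  # hypothetical example class
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._seen_blocks = 0  # simple pass-through counter

	def process(self, block_iter):
		# delegate to the base class, then count what flows through
		for block in DataProcessor.process(self, block_iter):
			self._seen_blocks += 1
			yield block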
Example #2
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     self._location_filter = config.get_filter(
         self._get_dproc_opt('location filter'),
         '',
         default_matcher='BlackWhiteMatcher',
         default_filter='StrictListFilter')
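The newer snake_case examples (#2, #5, #6, #12, ...) build option names with self._get_dproc_opt(...). Its definition is not shown on this page, but Examples #16 and #19 call join_config_locations(['', datasource_name], ...) directly for the same purpose, so a plausible reading is the following sketch (an assumption, not grid-control source):

	def _get_dproc_opt(self, opt):
		# look the option up both bare and prefixed with the datasource name,
		# e.g. 'location filter' and 'dataset location filter'
		return join_config_locations(['', self._datasource_name], opt)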
Example #3
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     self._locationfilter = config.getFilter('dataset location filter',
                                             '',
                                             defaultMatcher='blackwhite',
                                             defaultFilter='strict',
                                             onChange=onChange)
Example #4
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._targetJobs = config.getInt('target partitions', -1, onChange = DataProcessor.triggerDataResync)
		self._targetJobsDS = config.getInt('target partitions per nickname', -1, onChange = DataProcessor.triggerDataResync)
		self._entries = {None: 0}
		self._files = {None: 0}
		self._config = config
Example #5
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     self._empty_files = config.get_bool(
         self._get_dproc_opt('remove empty files'), True)
     self._empty_block = config.get_bool(
         self._get_dproc_opt('remove empty blocks'), True)
     (self._removed_files, self._removed_blocks) = (0, 0)
Example #6
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     self._limit_files = config.get_int(
         self._get_dproc_opt(['limit files', 'limit urls']), -1)
     self._limit_files_fraction = config.get_float(
         self._get_dproc_opt(
             ['limit files fraction', 'limit urls fraction']), -1.)
     (self._limit_files_per_ds, self._files_per_ds) = ({}, {})
Example #7
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		# Ensure the same nickname is used consistently in all blocks of a dataset
		self._checkConsistency = config.getBool('nickname check consistency', True)
		self._checkConsistencyData = {}
		# Check if two different datasets have the same nickname
		self._checkCollision = config.getBool('nickname check collision', True)
		self._checkCollisionData = {}
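The two dictionaries are presumably keyed per dataset and per nickname while blocks stream through. A hedged sketch of the kind of bookkeeping these flags could drive (the method name and attribute usage are assumptions; the real check code is not shown above):

	def _check_nickname_consistency(self, dataset_name, nick):
		# remember the first nickname seen for each dataset and flag deviations
		stored = self._checkConsistencyData.setdefault(dataset_name, nick)
		if stored != nick:
			raise Exception('Multiple nicknames in dataset %s' % dataset_name)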
Example #8
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._targetJobs = config.getInt('target partitions', -1)
		self._targetJobsDS = config.getInt('target partitions per nickname', -1)
		self._writeSettings = (self._targetJobs != -1) or (self._targetJobsDS != -1)
		self._entries = {None: 0}
		self._files = {None: 0}
		self._config = config
Example #9
 def __init__(self, config):
     DataProcessor.__init__(self, config)
     # Ensure the same nickname is used consistently in all blocks of a dataset
     self._checkConsistency = config.getBool('nickname check consistency',
                                             True)
     self._checkConsistencyData = {}
     # Check if two different datasets have the same nickname
     self._checkCollision = config.getBool('nickname check collision', True)
     self._checkCollisionData = {}
Example #10
 def __init__(self, config):
     DataProcessor.__init__(self, config)
     self._checkURLOpt = 'dataset check unique url'
     self._checkURL = config.getEnum(self._checkURLOpt, DatasetUniqueMode,
                                     DatasetUniqueMode.abort)
     self._checkBlockOpt = 'dataset check unique block'
     self._checkBlock = config.getEnum(self._checkBlockOpt,
                                       DatasetUniqueMode,
                                       DatasetUniqueMode.abort)
Example #11
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     self._emptyFiles = config.getBool('dataset remove empty files',
                                       True,
                                       onChange=onChange)
     self._emptyBlock = config.getBool('dataset remove empty blocks',
                                       True,
                                       onChange=onChange)
     (self._removedFiles, self._removedBlocks) = (0, 0)
Example #12
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     self._sort_ds = config.get_bool(self._get_dproc_opt('sort'), False)
     self._sort_block = config.get_bool(self._get_dproc_opt('block sort'),
                                        False)
     self._sort_files = config.get_bool(self._get_dproc_opt('files sort'),
                                        False)
     self._sort_location = config.get_bool(
         self._get_dproc_opt('location sort'), False)
Example #13
 def __init__(self, config):
     DataProcessor.__init__(self, config)
     self._targetJobs = config.getInt('target partitions', -1)
     self._targetJobsDS = config.getInt('target partitions per nickname',
                                        -1)
     self._writeSettings = (self._targetJobs != -1) or (self._targetJobsDS != -1)
     self._entries = {None: 0}
     self._files = {None: 0}
     self._config = config
Example #14
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		# private config view that forces the NullDataProcessor, presumably so
		# that expanding dataset expressions inside the filter (cf. Example #29)
		# does not recurse through the full dataset processor chain
		internal_config = config.changeView(viewClass = 'SimpleConfigView', setSections = ['dataprocessor'])
		internal_config.set('dataset processor', 'NullDataProcessor')
		self._url_filter = config.getFilter(['dataset ignore files', 'dataset ignore urls'], '', negate = True,
			filterParser = lambda value: self._parseFilter(internal_config, value),
			filterStr = lambda value: str.join('\n', value.split()),
			matchKey = itemgetter(DataProvider.URL),
			defaultMatcher = 'blackwhite', defaultFilter = 'weak',
			onChange = DataProcessor.triggerDataResync)
Example #15
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		internal_config = config.changeView(viewClass = 'SimpleConfigView', setSections = ['dataprocessor'])
		internal_config.set('dataset processor', 'NullDataProcessor')
		config.set('dataset ignore urls matcher case sensitive', 'False')
		self._url_filter = config.getFilter(['dataset ignore files', 'dataset ignore urls'], '', negate = True,
			filterParser = lambda value: self._parseFilter(internal_config, value),
			filterStr = lambda value: str.join('\n', value.split()),
			defaultMatcher = 'blackwhite', defaultFilter = 'weak',
			onChange = onChange)
Example #16
	def __init__(self, config, datasource_name):
		DataProcessor.__init__(self, config, datasource_name)
		self._target_jobs = config.get_int(
			join_config_locations(['', datasource_name], 'target partitions'), -1)
		self._target_jobs_ds = config.get_int(
			join_config_locations(['', datasource_name], 'target partitions per nickname'), -1)
		self._entries = {None: 0}
		self._files = {None: 0}
		self._config = None
		if self.enabled():
			self._config = config
Example #17
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     config.set('%s ignore urls matcher case sensitive' % datasource_name,
                'False')
     self._url_filter = config.get_filter(
         self._get_dproc_opt(['ignore files', 'ignore urls']),
         '',
         negate=True,
         default_matcher='BlackWhiteMatcher',
         default_filter='WeakListFilter',
         filter_parser=lambda value: self._parse_filter(config, value),
         filter_str=lambda value: str.join('\n', value.split()))
Example #18
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     self._sortDS = config.getBool('dataset sort', False, onChange=onChange)
     self._sortBlock = config.getBool('dataset block sort',
                                      False,
                                      onChange=onChange)
     self._sortFiles = config.getBool('dataset files sort',
                                      False,
                                      onChange=onChange)
     self._sortLocation = config.getBool('dataset location sort',
                                         False,
                                         onChange=onChange)
Example #19
 def __init__(self, config, datasource_name):
     DataProcessor.__init__(self, config, datasource_name)
     self._target_jobs = config.get_int(
         join_config_locations(['', datasource_name], 'target partitions'),
         -1)
     self._target_jobs_ds = config.get_int(
         join_config_locations(['', datasource_name],
                               'target partitions per nickname'), -1)
     self._entries = {None: 0}
     self._files = {None: 0}
     self._config = None
     if self.enabled():
         self._config = config
Example #20
 def process(self, block_iter):
     (self._limit_files_per_ds, self._files_per_ds) = ({}, {})  # reset counters
     if self._limit_files_fraction >= 0:
         block_list = list(DataProcessor.process(self, block_iter))
         goal_per_ds = {}  # calculate file limit per dataset
         for (dataset_name, fn_list_len) in self._files_per_ds.items():
             goal_per_ds[dataset_name] = int(
                 self._limit_files_fraction * fn_list_len) or 1
         for block in block_list:
             self._reduce_fn_list(block, goal_per_ds)
             yield block
     else:
         for block in DataProcessor.process(self, block_iter):
             yield block
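The int(...) or 1 idiom in the fraction branch guarantees that at least one file survives per dataset, because int() truncates toward zero. A quick standalone check:

assert (int(0.1 * 5) or 1) == 1    # int(0.5) == 0, so the `or 1` applies
assert (int(0.1 * 50) or 1) == 5   # larger datasets keep the plain fraction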
Example #21
    def __init__(self, config, datasetExpr, datasetNick=None):
        ConfigurablePlugin.__init__(self, config)
        self._log = logging.getLogger('dataset.provider')
        (self._datasetExpr, self._datasetNick) = (datasetExpr, datasetNick)
        (self._cache_block, self._cache_dataset) = (None, None)
        self._dataset_query_interval = config.getTime(
            'dataset default query interval', 60, onChange=None)

        triggerDataResync = triggerResync(['datasets', 'parameters'])
        self._stats = DataProcessor.createInstance(
            'SimpleStatsDataProcessor', config, triggerDataResync, self._log,
            ' * Dataset %s:\n\tcontains ' % repr(datasetNick or datasetExpr))
        self._nickProducer = config.getPlugin('nickname source',
                                              'SimpleNickNameProducer',
                                              cls=DataProcessor,
                                              pargs=(triggerDataResync, ),
                                              onChange=triggerDataResync)
        self._datasetProcessor = config.getCompositePlugin(
            'dataset processor',
            'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor '
            'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
            'MultiDataProcessor',
            cls=DataProcessor,
            pargs=(triggerDataResync, ),
            onChange=triggerDataResync)
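The 'dataset processor' option above defaults to a whole chain of processors wrapped in a MultiDataProcessor. Its implementation is not part of these examples, but the natural reading is that each processor's process() feeds the next; a hedged sketch of that chaining (the attribute name is assumed):

	def process(self, block_iter):
		# presumed MultiDataProcessor behaviour: pipe blocks through the chain
		for proc in self._processor_list:  # hypothetical attribute
			block_iter = proc.process(block_iter)
		return block_iter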
Example #22
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     internal_config = config.changeView(viewClass='SimpleConfigView',
                                         setSections=['dataprocessor'])
     internal_config.set('dataset processor', 'NullDataProcessor')
     config.set('dataset ignore urls matcher case sensitive', 'False')
     self._url_filter = config.getFilter(
         ['dataset ignore files', 'dataset ignore urls'],
         '',
         negate=True,
         filterParser=lambda value: self._parseFilter(
             internal_config, value),
         filterStr=lambda value: str.join('\n', value.split()),
         defaultMatcher='blackwhite',
         defaultFilter='weak',
         onChange=onChange)
Example #23
	def __init__(self, config, datasource_name, dataset_expr, dataset_nick, provider_list):
		for provider in provider_list:
			provider.disable_stream_singletons()
		DataProvider.__init__(self, config, datasource_name, dataset_expr, dataset_nick)
		self._stats = DataProcessor.create_instance('SimpleStatsDataProcessor', config,
			'dataset', self._log, 'Summary: Running over ')
		self._provider_list = provider_list
Example #24
	def process(self, block_iter):
		if self.enabled() and self._config:
			block_list = list(DataProcessor.process(self, block_iter))
			if (self._target_jobs > 0) or (self._target_jobs_ds > 0):
				self._set_split_opt(self._config, 'files per job', dict(self._files),
					self._target_jobs, self._target_jobs_ds)
				self._set_split_opt(self._config, 'events per job', dict(self._entries),
					self._target_jobs, self._target_jobs_ds)
			self._config = None
			return block_list
		return block_iter
Example #25
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		ConfigurablePlugin.__init__(self, config)
		self._log = logging.getLogger('user.dataprovider')
		(self._datasetExpr, self._datasetNick, self._datasetID) = (datasetExpr, datasetNick, datasetID)
		(self._cache_block, self._cache_dataset, self._passthrough) = (None, None, False)

		self._stats = DataProcessor.createInstance('StatsDataProcessor', config)
		self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer', cls = DataProcessor)
		self._datasetProcessor = config.getCompositePlugin('dataset processor',
			'EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
			'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
			'MultiDataProcessor', cls = DataProcessor, onChange = triggerResync(['datasets', 'parameters']))
Example #26
	def __init__(self, config, datasetExpr, datasetNick = None, datasetID = 0):
		ConfigurablePlugin.__init__(self, config)
		self._log = logging.getLogger('user.dataprovider')
		(self._datasetExpr, self._datasetNick, self._datasetID) = (datasetExpr, datasetNick, datasetID)
		(self._cache_block, self._cache_dataset, self._passthrough) = (None, None, False)

		self._stats = DataProcessor.createInstance('StatsDataProcessor', config)
		self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer', cls = DataProcessor)
		self._datasetProcessor = config.getCompositePlugin('dataset processor',
			'EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
			'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor',
			'MultiDataProcessor', cls = DataProcessor)
Example #27
 def process(self, block_iter):
     if self.enabled() and self._config:
         block_list = list(DataProcessor.process(self, block_iter))
         if (self._target_jobs > 0) or (self._target_jobs_ds > 0):
             self._set_split_opt(self._config, 'files per job',
                                 dict(self._files), self._target_jobs,
                                 self._target_jobs_ds)
             self._set_split_opt(self._config, 'events per job',
                                 dict(self._entries), self._target_jobs,
                                 self._target_jobs_ds)
         self._config = None
         return block_list
     return block_iter
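What _set_split_opt writes is not shown in these snippets, but the counters suggest it derives per-job settings from the accumulated totals, e.g. files per job from the file count and the requested number of partitions. A hedged back-of-the-envelope version of that derivation (the real formula may well differ):

total_files, target_jobs = 1000, 100                 # illustrative totals
files_per_job = max(1, total_files // target_jobs)   # -> 10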
Example #28
	def __init__(self, config, datasetExpr, datasetNick = None):
		ConfigurablePlugin.__init__(self, config)
		self._log = logging.getLogger('dataset.provider')
		(self._datasetExpr, self._datasetNick) = (datasetExpr, datasetNick)
		(self._cache_block, self._cache_dataset) = (None, None)
		self._dataset_query_interval = config.getTime('dataset default query interval', 60, onChange = None)

		triggerDataResync = triggerResync(['datasets', 'parameters'])
		self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config, triggerDataResync, self._log,
			' * Dataset %s:\n\tcontains ' % repr(datasetNick or datasetExpr))
		self._nickProducer = config.getPlugin('nickname source', 'SimpleNickNameProducer',
			cls = DataProcessor, pargs = (triggerDataResync,), onChange = triggerDataResync)
		self._datasetProcessor = config.getCompositePlugin('dataset processor',
			'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor URLCountDataProcessor ' +
			'EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor LocationDataProcessor', 'MultiDataProcessor',
			cls = DataProcessor, pargs = (triggerDataResync,), onChange = triggerDataResync)
Example #29
    def _parse_filter(self, config, value):
        dataset_proc = DataProcessor.create_instance('NullDataProcessor')

        def _get_filter_entries():
            for pat in value.split():
                if ':' not in pat.lstrip(':'):
                    yield pat
                else:
                    block_iter = DataProvider.iter_blocks_from_expr(
                        config,
                        ':%s' % pat.lstrip(':'),
                        dataset_proc=dataset_proc)
                    for block in block_iter:
                        for fi in block[DataProvider.FileList]:
                            yield fi[DataProvider.URL]

        return str.join('\n', _get_filter_entries())
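In effect, plain patterns pass through unchanged, while any entry still containing ':' after stripping a leading ':' is treated as a dataset expression and expanded into the URLs of its blocks. The split logic is easy to replicate standalone ('prov:expr' below is a placeholder, not a real provider name):

def split_patterns(value):
	# mirror of the branch in _get_filter_entries above
	plain, expressions = [], []
	for pat in value.split():
		if ':' not in pat.lstrip(':'):
			plain.append(pat)          # literal URL pattern
		else:
			expressions.append(pat)    # 'provider:expr' to be expanded
	return (plain, expressions)

assert split_patterns('bad.root prov:expr') == (['bad.root'], ['prov:expr'])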
Example #30
    def __init__(self,
                 config,
                 datasource_name,
                 dataset_expr,
                 dataset_nick=None,
                 dataset_proc=None):
        ConfigurablePlugin.__init__(self, config)
        self._log = logging.getLogger('%s.provider' % datasource_name)
        (self._datasource_name, self._dataset_expr) = (datasource_name,
                                                       dataset_expr)
        self._dataset_nick_override = dataset_nick
        (self._cache_block, self._cache_dataset) = (None, None)
        self._dataset_query_interval = config.get_time(
            '%s default query interval' % datasource_name, 60, on_change=None)

        self._stats = dataset_proc or DataProcessor.create_instance(
            'SimpleStatsDataProcessor', config, datasource_name, self._log,
            ' * Dataset %s:\n\tcontains ' % repr(dataset_nick or dataset_expr))

        dataset_config = config.change_view(
            default_on_change=TriggerResync(['datasets', 'parameters']))
        self._nick_producer = dataset_config.get_plugin(
            ['nickname source',
             '%s nickname source' % datasource_name],
            'SimpleNickNameProducer',
            cls=DataProcessor,
            pargs=(datasource_name, ))
        self._dataset_processor = dataset_proc or dataset_config.get_composited_plugin(
            '%s processor' % datasource_name,
            'NickNameConsistencyProcessor EntriesConsistencyDataProcessor URLDataProcessor '
            'URLCountDataProcessor EntriesCountDataProcessor EmptyDataProcessor UniqueDataProcessor '
            'LocationDataProcessor',
            'MultiDataProcessor',
            cls=DataProcessor,
            pargs=(datasource_name, ))
Example #31
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
Example #32
	def __init__(self, config, datasetExpr, datasetNick, providerList):
		DataProvider.__init__(self, config, datasetExpr, datasetNick)
		self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor', config, None, self._log, 'Summary: Running over ')
		self._providerList = providerList
Example #33
 def process(self, blockIter):
     self._recordedURL = set()  # reset records before each pass
     self._recordedBlock = set()
     return DataProcessor.process(self, blockIter)
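Examples #33, #48 and #52 are the same hook in different versions: the two sets are cleared at the start of every pass so that duplicate detection starts fresh. A hedged sketch of the per-URL check these sets presumably feed (the method name and return convention are assumptions):

 def _is_new_url(self, url):
     if url in self._recordedURL:
         return False  # already seen in this pass
     self._recordedURL.add(url)
     return True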
Example #34
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._emptyFiles = config.getBool('dataset remove empty files', True, onChange = DataProcessor.triggerDataResync)
		self._emptyBlock = config.getBool('dataset remove empty blocks', True, onChange = DataProcessor.triggerDataResync)
Example #35
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     self._limitEntries = config.getInt(
         ['dataset limit events', 'dataset limit entries'],
         -1,
         onChange=onChange)
Example #36
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self.reset()
Example #37
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._emptyFiles = config.getBool('dataset remove empty files', True)
		self._emptyBlock = config.getBool('dataset remove empty blocks', True)
Example #38
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._limitFiles = config.getInt(['dataset limit files', 'dataset limit urls'], -1,
			onChange = onChange)
Example #39
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._emptyFiles = config.getBool('dataset remove empty files', True, onChange = onChange)
		self._emptyBlock = config.getBool('dataset remove empty blocks', True, onChange = onChange)
		(self._removedFiles, self._removedBlocks) = (0, 0)
Example #40
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._limitEntries = config.getInt(['dataset limit events', 'dataset limit entries'], -1,
			onChange = onChange)
Example #41
 def __init__(self, config, datasetExpr, datasetNick, providerList):
     DataProvider.__init__(self, config, datasetExpr, datasetNick)
     self._stats = DataProcessor.createInstance('SimpleStatsDataProcessor',
                                                config, None, self._log,
                                                'Summary: Running over ')
     self._providerList = providerList
Example #42
	def __init__(self, config, datasource_name):
		DataProcessor.__init__(self, config, datasource_name)
		(self._entries, self._blocks, self._files) = (0, 0, 0)
Example #43
	def __init__(self, config, onChange):
		DataProcessor.__init__(self, config, onChange)
		self._sortDS = config.getBool('dataset sort', False, onChange = onChange)
		self._sortBlock = config.getBool('dataset block sort', False, onChange = onChange)
		self._sortFiles = config.getBool('dataset files sort', False, onChange = onChange)
		self._sortLocation = config.getBool('dataset location sort', False, onChange = onChange)
Example #44
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._ignoreURLs = config.getList(['dataset ignore urls', 'dataset ignore files'], [])
Example #45
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._checkURLOpt = 'dataset check unique url'
		self._checkURL = config.getEnum(self._checkURLOpt, DatasetUniqueMode, DatasetUniqueMode.abort)
		self._checkBlockOpt = 'dataset check unique block'
		self._checkBlock = config.getEnum(self._checkBlockOpt, DatasetUniqueMode, DatasetUniqueMode.abort)
Example #46
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._entries = 0
		self._blocks = 0
Example #47
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._locationfilter = config.getFilter('dataset location filter', '',
			defaultMatcher = 'blackwhite', defaultFilter = 'strict',
			onChange = DataProcessor.triggerDataResync)
Example #48
	def process(self, block_iter):
		self._recorded_url = set()  # reset records
		self._recorded_block = set()
		return DataProcessor.process(self, block_iter)
Example #49
 def __init__(self, config, onChange):
     DataProcessor.__init__(self, config, onChange)
     self._limitFiles = config.getInt(
         ['dataset limit files', 'dataset limit urls'],
         -1,
         onChange=onChange)
Example #50
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._checkURL = config.getEnum('dataset check unique url', DatasetUniqueMode, DatasetUniqueMode.abort,
			onChange = DataProcessor.triggerDataResync)
		self._checkBlock = config.getEnum('dataset check unique block', DatasetUniqueMode, DatasetUniqueMode.abort,
			onChange = DataProcessor.triggerDataResync)
Example #51
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._limitEntries = config.getInt(['dataset limit events', 'dataset limit entries'], -1,
			onChange = DataProcessor.triggerDataResync)
Example #52
	def process(self, blockIter):
		self._recordedURL = set()
		self._recordedBlock = set()
		return DataProcessor.process(self, blockIter)
Example #53
	def __init__(self, config):
		DataProcessor.__init__(self, config)
		self._limitFiles = config.getInt(['dataset limit files', 'dataset limit urls'], -1,
			onChange = DataProcessor.triggerDataResync)
Example #54
	def __init__(self, config, datasource_name):
		DataProcessor.__init__(self, config, datasource_name)
		(self._entries, self._blocks, self._files) = (0, 0, 0)
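Finally, a short usage sketch combining the instantiation patterns above: create_instance with a plugin class name appears in Examples #14/#29 ('NullDataProcessor') and #23 ('SimpleStatsDataProcessor'). The block source and consumer below are assumptions supplied for illustration:

proc = DataProcessor.create_instance('NullDataProcessor')  # pass-through processor
for block in proc.process(block_iter):  # block_iter: assumed iterable of blocks
	handle(block)  # hypothetical consumer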