def _create_datasource(self, config, datasource_name, psrc_repository, psrc_list):
    data_ps = ParameterSource.create_instance('DataParameterSource', config, datasource_name, psrc_repository)
    if not isinstance(data_ps, ParameterSource.get_class('NullParameterSource')):
        config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@', section='storage')
        config.set('default lookup', 'DATASETNICK', section='parameters')
        psrc_list.append(data_ps)
    return data_ps
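# Hypothetical usage sketch (not part of the original code): how a task module
# might call _create_datasource for each configured datasource name and keep
# only the non-null sources. The names 'datasource_names' and 'psrc_repository'
# and the surrounding method context are assumptions for illustration only.
#
#     psrc_list = []
#     for datasource_name in datasource_names:
#         self._create_datasource(config, datasource_name, psrc_repository, psrc_list)
#     # psrc_list now holds only real (non-NullParameterSource) data sources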
def setup_dataset(config, dataset):
    if dataset.lower() == 'true':
        log.info('Registering dummy data provider data')
        dataSplitter = DummySplitter()
    else:
        dataSplitter = DataSplitter.loadPartitionsForScript(dataset)
    config = config.changeView(setSections = None)
    partProcessor = config.getCompositePlugin('partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls = 'PartitionProcessor', onChange = None)
    ParameterSource.createInstance('DataParameterSource', config.getWorkPath(), 'data',
        None, dataSplitter, partProcessor, repository)
def setup_dataset(config, dataset, repository):
    if dataset.lower() == 'true':
        logging.info('Registering dummy data provider data')

        def _create_partition(ds_name, nick, n_events, fn_list):
            return {DataSplitter.Dataset: ds_name, DataSplitter.Nickname: nick,
                DataSplitter.FileList: fn_list, DataSplitter.NEntries: n_events}
        reader = PartitionReader.create_instance('TrivialPartitionReader', [
            _create_partition('ds1', 'data_1', 23, ['a', 'b']),
            _create_partition('ds1', 'data_1', 42, ['1']),
            _create_partition('ds2', 'data_2', 123, ['m', 'n']),
            _create_partition('ds2', 'data_3', 987, ['x', 'y', 'z'])
        ])
    else:
        reader = DataSplitter.load_partitions(dataset)
    config = config.change_view(set_sections=None, default_on_change=None)
    ParameterSource.create_instance('BaseDataParameterSource', config, 'dataset', repository, reader)
def setup_dataset(config, dataset):
    if dataset.lower() == 'true':
        log.info('Registering dummy data provider data')
        dataSplitter = DummySplitter()
    else:
        dataSplitter = DataSplitter.loadPartitionsForScript(dataset)
    config = config.changeView(setSections=None)
    partProcessor = config.getCompositePlugin(
        'partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls='PartitionProcessor', onChange=None)
    ParameterSource.createInstance('DataParameterSource', config.getWorkPath(), 'data',
        None, dataSplitter, partProcessor, repository)
def setup_dataset(config, dataset):
    if dataset.lower() == 'true':
        utils.vprint('Registering dummy data provider data')
        dataSplitter = DummySplitter()
    else:
        dataSplitter = DataSplitter.loadStateForScript(dataset)
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    DataParameterSource.datasetsAvailable['data'] = DataParameterSource(
        config.getWorkPath(), 'data', None, dataSplitter, DataSplitProcessorTest())
def _setupJobParameters(self, config):
    data_config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['dataset'])
    self.dataSplitter = None
    self._data_refresh = -1

    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        if (old_obj == '') and (cur_obj != ''):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._log.info('Dataset setup was changed - forcing resync...')
        config.setState(True, 'resync', detail = 'dataset')
        config.setState(True, 'init', detail = 'config')  # This will trigger a write of the new options
        return cur_obj
    dataProvider = data_config.getCompositePlugin('dataset', '', ':MultiDatasetProvider:',
        cls = DataProvider, requirePlugin = False, onChange = userRefresh)
    self._forceRefresh = config.getState('resync', detail = 'dataset')
    config.setState(False, 'resync', detail = 'dataset')
    if not dataProvider:
        return

    tmp_config = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['storage'])
    tmp_config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    tmp_config = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['parameters'])
    tmp_config.set('default lookup', 'DATASETNICK')

    splitterName = data_config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(data_config)

    # Create and register dataset parameter source
    partProcessor = data_config.getCompositePlugin('partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls = PartitionProcessor, onChange = triggerResync(['parameters']))
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    self._dataPS = DataParameterSource(data_config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, partProcessor)
    DataParameterSource.datasetsAvailable['data'] = self._dataPS

    # Select dataset refresh rate
    self._data_refresh = data_config.getTime('dataset refresh', -1, onChange = None)
    if self._data_refresh > 0:
        self._dataPS.resyncSetup(interval = max(self._data_refresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % strTime(self._data_refresh), -1)
    else:
        self._dataPS.resyncSetup(interval = 0)
    if self._forceRefresh:
        self._dataPS.resyncSetup(force = True)

    def externalRefresh(sig, frame):
        self._dataPS.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def setupJobParameters(self, config, pm):
    config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['dataset'])
    self.dataSplitter = None
    self.dataRefresh = -1

    def userRefresh(config, old_obj, cur_obj, cur_entry, obj2str):
        if (old_obj == '') and (cur_obj != ''):
            raise UserError('It is currently not possible to attach a dataset to a non-dataset task!')
        self._log.info('Dataset setup was changed - forcing resync...')
        config.setState(True, 'resync', detail = 'dataset')
        config.setState(True, 'init', detail = 'config')  # This will trigger a write of the new options
        return cur_obj
    dataProvider = config.getCompositePlugin('dataset', '', ':MultiDatasetProvider:',
        cls = DataProvider, requirePlugin = False, onChange = userRefresh)
    self._forceRefresh = config.getState('resync', detail = 'dataset')
    config.setState(False, 'resync', detail = 'dataset')
    if not dataProvider:
        return

    tmp_config = config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['storage'])
    tmp_config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    tmp_config = config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['parameters'])
    tmp_config.set('default lookup', 'DATASETNICK')

    splitterName = config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self.dataSplitter = splitterClass(config)

    # Create and register dataset parameter source
    partProcessor = config.getCompositePlugin('partition processor',
        'BasicPartitionProcessor LocationPartitionProcessor',
        'MultiPartitionProcessor', cls = PartitionProcessor)
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    self._dataPS = DataParameterSource(config.getWorkPath(), 'data',
        dataProvider, self.dataSplitter, partProcessor)
    DataParameterSource.datasetsAvailable['data'] = self._dataPS

    # Select dataset refresh rate
    self.dataRefresh = config.getTime('dataset refresh', -1, onChange = None)
    if self.dataRefresh > 0:
        self._dataPS.resyncSetup(interval = max(self.dataRefresh, dataProvider.queryLimit()))
        utils.vprint('Dataset source will be queried every %s' % strTime(self.dataRefresh), -1)
    else:
        self._dataPS.resyncSetup(interval = 0)
    if self._forceRefresh:
        self._dataPS.resyncSetup(force = True)

    def externalRefresh(sig, frame):
        self._dataPS.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self.dataSplitter.getMaxJobs() == 0:
        raise UserError('There are no events to process')
def setup_dataset(config, dataset, repository):
    if dataset.lower() == 'true':
        logging.info('Registering dummy data provider data')

        def _create_partition(ds_name, nick, n_events, fn_list):
            return { DataSplitter.Dataset: ds_name, DataSplitter.Nickname: nick,
                DataSplitter.FileList: fn_list, DataSplitter.NEntries: n_events }
        reader = PartitionReader.create_instance('TrivialPartitionReader', [
            _create_partition('ds1', 'data_1', 23, ['a', 'b']),
            _create_partition('ds1', 'data_1', 42, ['1']),
            _create_partition('ds2', 'data_2', 123, ['m', 'n']),
            _create_partition('ds2', 'data_3', 987, ['x', 'y', 'z'])
        ])
    else:
        reader = DataSplitter.load_partitions(dataset)
    config = config.change_view(set_sections=None, default_on_change=None)
    ParameterSource.create_instance('BaseDataParameterSource', config, 'dataset', repository, reader)
def setup_dataset(config, dataset):
    if dataset.lower() == 'true':
        utils.vprint('Registering dummy data provider data')
        dataSplitter = DummySplitter()
    else:
        dataSplitter = DataSplitter.loadPartitionsForScript(dataset)
    config = config.changeView(setSections=None)
    partProcessor = config.getCompositePlugin(
        'partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls='PartitionProcessor', onChange=None)
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    DataParameterSource.datasetsAvailable['data'] = DataParameterSource(
        config.getWorkPath(), 'data', None, dataSplitter, partProcessor)
def _setupJobParameters(self, config, psrc_repository):
    TaskModule._setupJobParameters(self, config, psrc_repository)
    data_config = config.changeView(viewClass = 'TaggedConfigView', addSections = ['dataset'])
    self._dataSplitter = None
    dataProvider = data_config.getCompositePlugin('dataset', '', ':MultiDatasetProvider:',
        cls = DataProvider, requirePlugin = False, onChange = triggerResync(['datasets', 'parameters']))
    self._forceRefresh = config.getState('resync', detail = 'datasets')
    config.setState(False, 'resync', detail = 'datasets')
    if not dataProvider:
        return

    tmp_config = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['storage'])
    tmp_config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    tmp_config = data_config.changeView(viewClass = 'TaggedConfigView',
        setClasses = None, setNames = None, setTags = [], addSections = ['parameters'])
    tmp_config.set('default lookup', 'DATASETNICK')

    splitterName = data_config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self._dataSplitter = splitterClass(data_config)

    # Create and register dataset parameter source
    self._partProcessor = data_config.getCompositePlugin('partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls = PartitionProcessor, onChange = triggerResync(['parameters']))
    dataPS = ParameterSource.createInstance('DataParameterSource', data_config.getWorkPath(),
        'data', dataProvider, self._dataSplitter, self._partProcessor, psrc_repository)

    # Select dataset refresh rate
    data_refresh = data_config.getTime('dataset refresh', -1, onChange = None)
    if data_refresh >= 0:
        data_refresh = max(data_refresh, dataProvider.queryLimit())
        self._log.info('Dataset source will be queried every %s', strTime(data_refresh))
    dataPS.resyncSetup(interval = data_refresh, force = self._forceRefresh)

    def externalRefresh(sig, frame):
        self._log.info('External signal triggered resync of dataset source')
        dataPS.resyncSetup(force = True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self._dataSplitter.getMaxJobs() == 0:
        if data_refresh < 0:
            raise UserError('Currently used dataset does not provide jobs to process')
        self._log.warning('Currently used dataset does not provide jobs to process')
def force_intervention():
    DataParameterSource = ParameterSource.getClass('DataParameterSource')
    for dp in DataParameterSource.datasetSources:
        dp.intervention = (set([1]), set([0]), True)
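# Note on the tuple above (an assumption about DataParameterSource internals,
# not stated in this snippet): the intervention is read as
# (job numbers to redo, job numbers to disable, parameter-space size changed),
# so this forces job 1 to be redone and job 0 to be disabled at the next resync.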
def save_parameters(psource, fn):
    utils.vprint('')
    ParameterSource.getClass('GCDumpParameterSource').write(fn, psource)
    utils.vprint('Parameter information saved to ./%s' % fn)
def save_parameters(psrc, fn):
    logging.info('')
    ParameterSource.get_class('GCDumpParameterSource').write(fn,
        psrc.get_job_len(), psrc.get_job_metadata(), psrc.iter_jobs())
    logging.info('Parameter information saved to ./%s', fn)
def save_parameters(psource, fn):
    log.info('')
    ParameterSource.getClass('GCDumpParameterSource').write(fn, psource)
    log.info('Parameter information saved to ./%s', fn)
def _get_dataset_lookup_psrc(psrc):
    is_lookup_cls = isinstance(psrc, ParameterSource.get_class('LookupBaseParameterSource'))
    return is_lookup_cls and ('DATASETNICK' in psrc.get_parameter_deps())
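# Minimal sketch (assumed helper, not part of the original module): using
# _get_dataset_lookup_psrc to pick out the parameter sources of a repository
# that resolve their values via a DATASETNICK lookup. 'psrc_repository' is
# assumed to be a mapping of names to ParameterSource instances.
def _iter_dataset_lookup_psrc_list(psrc_repository):
    for psrc in psrc_repository.values():
        if _get_dataset_lookup_psrc(psrc):
            yield psrc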
def _setupJobParameters(self, config, psrc_repository):
    TaskModule._setupJobParameters(self, config, psrc_repository)
    data_config = config.changeView(viewClass='TaggedConfigView', addSections=['dataset'])
    self._dataSplitter = None
    dataProvider = data_config.getCompositePlugin(
        'dataset', '', ':MultiDatasetProvider:', cls=DataProvider,
        requirePlugin=False, onChange=triggerResync(['datasets', 'parameters']))
    self._forceRefresh = config.getState('resync', detail='datasets')
    config.setState(False, 'resync', detail='datasets')
    if not dataProvider:
        return

    tmp_config = data_config.changeView(viewClass='TaggedConfigView',
        setClasses=None, setNames=None, setTags=[], addSections=['storage'])
    tmp_config.set('se output pattern', '@NICK@_job_@GC_JOB_ID@_@X@')
    tmp_config = data_config.changeView(viewClass='TaggedConfigView',
        setClasses=None, setNames=None, setTags=[], addSections=['parameters'])
    tmp_config.set('default lookup', 'DATASETNICK')

    splitterName = data_config.get('dataset splitter', 'FileBoundarySplitter')
    splitterClass = dataProvider.checkSplitter(DataSplitter.getClass(splitterName))
    self._dataSplitter = splitterClass(data_config)

    # Create and register dataset parameter source
    self._partProcessor = data_config.getCompositePlugin(
        'partition processor',
        'TFCPartitionProcessor LocationPartitionProcessor MetaPartitionProcessor BasicPartitionProcessor',
        'MultiPartitionProcessor', cls=PartitionProcessor,
        onChange=triggerResync(['parameters']))
    dataPS = ParameterSource.createInstance('DataParameterSource',
        data_config.getWorkPath(), 'data', dataProvider,
        self._dataSplitter, self._partProcessor, psrc_repository)

    # Select dataset refresh rate
    data_refresh = data_config.getTime('dataset refresh', -1, onChange=None)
    if data_refresh >= 0:
        data_refresh = max(data_refresh, dataProvider.queryLimit())
        self._log.info('Dataset source will be queried every %s', strTime(data_refresh))
    dataPS.resyncSetup(interval=data_refresh, force=self._forceRefresh)

    def externalRefresh(sig, frame):
        self._log.info('External signal triggered resync of dataset source')
        dataPS.resyncSetup(force=True)
    signal.signal(signal.SIGUSR2, externalRefresh)

    if self._dataSplitter.getMaxJobs() == 0:
        if data_refresh < 0:
            raise UserError('Currently used dataset does not provide jobs to process')
        self._log.warning('Currently used dataset does not provide jobs to process')
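# Usage note (assumption about the surrounding process, not from the original
# code): because externalRefresh is registered for SIGUSR2, a running
# grid-control instance can be told to resync its dataset source from the
# shell, for example:
#     kill -USR2 <pid-of-the-grid-control-process>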
def main():
    # Set config based on settings from config file or command line
    configFile = None
    if os.path.exists(args[0]):
        configFile = args[0]
    config = getConfig(configFile, section = 'global')
    config.changeView(setSections = ['jobs']).set('nseeds', '1', '?=')
    configParameters = config.changeView(setSections = ['parameters'])
    if opts.parameters:
        utils.vprint('Provided options:')
        for p in opts.parameters:
            k, v = p.split('=', 1)
            configParameters.set(k.strip(), v.strip().replace('\\n', '\n'), '=')
            utils.vprint('\t%s: %s' % (k.strip(), v.strip()))
        utils.vprint('')
    if not os.path.exists(args[0]):
        configParameters.set('parameters', str.join(' ', args).replace('\\n', '\n'))
    if opts.dataset:
        configParameters.set('default lookup', 'DATASETNICK')
    # configParameters.set('parameter adapter', 'BasicParameterAdapter', '=')  # Don't track parameter changes
    if opts.verbosity > 2:
        config.changeView(setSections = None).write(sys.stdout)

    # Initialize ParameterFactory
    configTask = config.changeView(setSections = [config.get(['task', 'module'], 'DummyTask')])
    pm = config.getPlugin('parameter factory', 'SimpleParameterFactory', cls = ParameterFactory).getInstance()

    # Create dataset parameter source
    class DummySplitter:
        def getMaxJobs(self):
            return 3

        def getSplitInfo(self, pNum):
            mkEntry = lambda ds, fl, n, nick: { DataSplitter.Dataset: ds, DataSplitter.Nickname: nick,
                DataSplitter.FileList: fl, DataSplitter.NEntries: n }
            rndStr = lambda: md5(str(random.random())).hexdigest()[:10]
            tmp = [ mkEntry('ds1', ['a', 'b'], 23, 'data_1'), mkEntry('ds1', ['1'], 42, 'data_1'),
                mkEntry('ds2', ['m', 'n'], 123, 'data_2'), mkEntry('ds2', ['x', 'y', 'z'], 987, 'data_3') ]
            return tmp[pNum]

    class DataSplitProcessorTest:
        def getKeys(self):
            return map(lambda k: ParameterMetadata(k, untracked=True),
                ['DATASETINFO', 'DATASETID', 'DATASETPATH', 'DATASETBLOCK', 'DATASETNICK'])

        def process(self, pNum, splitInfo, result):
            result.update({
                'DATASETINFO': '',
                'DATASETID': splitInfo.get(DataSplitter.DatasetID, None),
                'DATASETPATH': splitInfo.get(DataSplitter.Dataset, None),
                'DATASETBLOCK': splitInfo.get(DataSplitter.BlockName, None),
                'DATASETNICK': splitInfo.get(DataSplitter.Nickname, None),
                'DATASETSPLIT': pNum,
            })

    if opts.dataset.lower() == 'true':
        utils.vprint('Registering dummy data provider data')
        dataSplitter = DummySplitter()
    elif opts.dataset:
        dataSplitter = DataSplitter.loadState(opts.dataset)
    if opts.dataset:
        DataParameterSource.datasetsAvailable['data'] = DataParameterSource(
            config.getWorkPath(), 'data', None, dataSplitter, DataSplitProcessorTest())

    psource = pm.getSource(config)

    if opts.forceiv:
        for dp in DataParameterSource.datasetSources:
            dp.intervention = (set([1]), set([0]), True)

    if opts.listparams:
        result = []
        needGCParam = False
        if psource.getMaxJobs() != None:
            countActive = 0
            for jobNum in range(psource.getMaxJobs()):
                info = psource.getJobInfo(jobNum)
                if info[ParameterInfo.ACTIVE]:
                    countActive += 1
                if opts.inactive or info[ParameterInfo.ACTIVE]:
                    if not info[ParameterInfo.ACTIVE]:
                        info['GC_PARAM'] = 'N/A'
                    if str(info['GC_PARAM']) != str(jobNum):
                        needGCParam = True
                    result.append(info)
            if opts.displaymode == 'parseable':
                utils.vprint('Count,%d,%d' % (countActive, psource.getMaxJobs()))
            else:
                utils.vprint('Number of parameter points: %d' % psource.getMaxJobs())
                if countActive != psource.getMaxJobs():
                    utils.vprint('Number of active parameter points: %d' % countActive)
        else:
            result.append(psource.getJobInfo(123))
        enabledOutput = opts.output.split(',')
        output = filter(lambda k: not opts.output or k in enabledOutput, psource.getJobKeys())
        stored = filter(lambda k: k.untracked == False, output)
        untracked = filter(lambda k: k.untracked == True, output)

        if opts.collapse > 0:
            result_old = result
            result = {}
            result_nicks = {}
            head = [('COLLATE_JOBS', '# of jobs')]
            if 'DATASETSPLIT' in stored:
                stored.remove('DATASETSPLIT')
                if (opts.collapse == 1):
                    stored.append('DATASETNICK')
                    head.append(('DATASETNICK', 'DATASETNICK'))
                elif opts.collapse == 2:
                    head.append(('COLLATE_NICK', '# of nicks'))
            for pset in result_old:
                if ('DATASETSPLIT' in pset) and (opts.collapse == 1):
                    pset.pop('DATASETSPLIT')
                nickname = None
                if ('DATASETNICK' in pset) and (opts.collapse == 2):
                    nickname = pset.pop('DATASETNICK')
                h = md5(repr(map(lambda key: pset.get(key), stored))).hexdigest()
                result.setdefault(h, []).append(pset)
                result_nicks.setdefault(h, set()).add(nickname)

            def doCollate(h):
                tmp = result[h][0]
                tmp['COLLATE_JOBS'] = len(result[h])
                tmp['COLLATE_NICK'] = len(result_nicks[h])
                return tmp
            result = map(doCollate, result)
        else:
            head = [('GC_JOB_ID', '#')]
            if needGCParam:
                head.append(('GC_PARAM', 'GC_PARAM'))
        if opts.active:
            head.append((ParameterInfo.ACTIVE, 'ACTIVE'))
        if opts.visible:
            stored = opts.visible.split(',')
        head.extend(sorted(zip(stored, stored)))
        if opts.untracked:
            head.extend(sorted(map(lambda n: (n, '(%s)' % n),
                filter(lambda n: n not in ['GC_PARAM', 'GC_JOB_ID'], untracked))))
        utils.vprint('')
        utils.printTabular(head, result)

    if opts.save:
        utils.vprint('')
        ParameterSource.getClass('GCDumpParameterSource').write(opts.save, psource)
        utils.vprint('Parameter information saved to ./%s' % opts.save)

    if opts.intervention:
        utils.vprint('')
        tmp = psource.getJobIntervention()
        if tmp:
            if opts.displaymode == 'parseable':
                utils.vprint('R: %s' % str.join(',', map(str, tmp[0])))
                utils.vprint('D: %s' % str.join(',', map(str, tmp[1])))
            else:
                utils.vprint(' Redo: %r' % tmp[0])
                utils.vprint('Disable: %r' % tmp[1])
        else:
            if opts.displaymode == 'parseable':
                utils.vprint('NOINT')
            else:
                utils.vprint('No intervention')
def save_parameters(psrc, fn):
    logging.info('')
    ParameterSource.get_class('GCDumpParameterSource').write(fn,
        psrc.get_job_len(), psrc.get_job_metadata(), psrc.iter_jobs())
    logging.info('Parameter information saved to ./%s', fn)