def main():
	if opts.save_jobjson or opts.save_jobgc or opts.get_events:
		(workDir, nJobs, jobList) = getWorkJobs(args)
		(log, incomplete, splitter, splitInfo) = (None, False, None, {})
		(lumiDict, readDict, writeDict) = ({}, {}, {})
		try:
			splitter = DataSplitter.loadState(os.path.join(workDir, 'datamap.tar'))
		except Exception:
			pass
		jobList = sorted(jobList)

		for jobNum in jobList:
			del log
			log = utils.ActivityLog('Reading job logs - [%d / %d]' % (jobNum, jobList[-1]))
			jobInfo = getJobInfo(workDir, jobNum, lambda retCode: retCode == 0)
			if not jobInfo:
				if not incomplete:
					print 'WARNING: Not all jobs have finished - results will be incomplete!'
					incomplete = True
				continue

			if not parameterized:
				if splitter:
					splitInfo = splitter.getSplitInfo(jobNum)
				outputName = splitInfo.get(DataSplitter.Nickname, splitInfo.get(DataSplitter.DatasetID, 0))
			else:
				outputName = jobInfo['file'].split()[2].replace('_%d_' % jobNum, '_').replace('/', '_').replace('__', '_')

			# Read framework report files to get number of events
			try:
				outputDir = os.path.join(workDir, 'output', 'job_' + str(jobNum))
				for fwkXML in getCMSSWInfo(os.path.join(outputDir, 'cmssw.dbs.tar.gz')):
					for run in fwkXML.getElementsByTagName('Run'):
						for lumi in run.getElementsByTagName('LumiSection'):
							run_id = int(run.getAttribute('ID'))
							lumi_id = int(lumi.getAttribute('ID'))
							lumiDict.setdefault(outputName, {}).setdefault(run_id, set()).add(lumi_id)
					for outFile in fwkXML.getElementsByTagName('File'):
						pfn = outFile.getElementsByTagName('PFN')[0].childNodes[0].data
						if pfn not in writeDict.setdefault(outputName, {}):
							writeDict[outputName][pfn] = 0
						writeDict[outputName][pfn] += int(outFile.getElementsByTagName('TotalEvents')[0].childNodes[0].data)
					for inFile in fwkXML.getElementsByTagName('InputFile'):
						if outputName not in readDict:
							readDict[outputName] = 0
						readDict[outputName] += int(inFile.getElementsByTagName('EventsRead')[0].childNodes[0].data)
			except KeyboardInterrupt:
				sys.exit(os.EX_OK)
			except Exception:
				# A stray 'raise' used to make this message unreachable;
				# report the problem and skip the job instead.
				print 'Error while parsing framework output of job %d!' % jobNum
				continue

		del log
		log = utils.ActivityLog('Simplifying lumi sections')
		lumis = {}
		for sample in lumiDict:
			for run in lumiDict[sample]:
				for lumi in lumiDict[sample][run]:
					lumis.setdefault(sample, []).append(([run, lumi], [run, lumi]))
		for sample in lumiDict:
			lumis[sample] = mergeLumi(lumis[sample])
		del log

		# Use a separate loop variable to avoid shadowing the 'lumis' dictionary
		for sample, sampleLumis in lumis.items():
			print 'Sample:', sample
			print '========================================='
			print 'Number of events processed: %12d' % readDict[sample]
			print ' Number of events written: %12d' % sum(writeDict.get(sample, {}).values())
			if writeDict.get(sample, None):
				print
				head = [(0, ' Output filename'), (1, 'Events')]
				utils.printTabular(head, map(lambda pfn: {0: pfn, 1: writeDict[sample][pfn]}, writeDict[sample]))
			if opts.save_jobjson:
				outputJSON(sampleLumis, open(os.path.join(workDir, 'processed_%s.json' % sample), 'w'))
				print 'Saved processed lumi sections in', os.path.join(workDir, 'processed_%s.json' % sample)
			if opts.save_jobgc:
				print
				print 'List of processed lumisections:'
				print '-----------------------------------------'
				outputGC(sampleLumis)
			print

	###########################
	# Lumi filter manipulation
	###########################
	if opts.save_exprgc or opts.save_exprjson or opts.save_exprfull:
		if len(args) == 0:
			raise Exception('No arguments given!')
		try:
			lumis = parseLumiFilter(str.join(' ', args))
		except Exception:
			raise Exception('Could not parse: %s' % str.join(' ', args))

		if opts.save_exprgc:
			outputGC(lumis)
		if opts.save_exprjson:
			outputJSON(lumis)
		if opts.save_exprfull:
			result = {}
			for rlrange in lumis:
				(start, end) = rlrange
				assert start[0] == end[0]  # start and end must refer to the same run
				# list.extend returns None, so the old 'llist = ...' assignment was dropped
				result.setdefault(start[0], []).extend(range(start[1], end[1] + 1))
			print result
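# The 'lumis' structure above feeds mergeLumi with a list of ([run, lumi], [run, lumi])
# range pairs. A minimal standalone sketch of the merge step, assuming mergeLumi simply
# fuses overlapping or directly adjacent ranges within the same run (the real
# grid-control implementation lives elsewhere; this version is only illustrative):
def mergeLumi_sketch(rlranges):
	merged = []
	for (start, end) in sorted(rlranges):
		# Extend the previous range if the new one touches it and belongs to the same run
		if merged and (merged[-1][1][0] == start[0]) and (merged[-1][1][1] + 1 >= start[1]):
			merged[-1][1] = max(merged[-1][1], end)
			continue
		merged.append([start, end])
	return merged

# Example: two touching lumi ranges in run 1 collapse into one, run 2 stays separate
# mergeLumi_sketch([([1, 1], [1, 5]), ([1, 6], [1, 9]), ([2, 1], [2, 2])])
# => [[[1, 1], [1, 9]], [[2, 1], [2, 2]]]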
if opts.findrm:
	removed = []
	utils.eprint = lambda *x: {}  # silence error output while resyncing block lists
	oldDP = DataProvider.loadState(args[0])
	for new in args[1:]:
		newDP = DataProvider.loadState(new)
		(blocksAdded, blocksMissing, blocksChanged) = DataProvider.resyncSources(oldDP.getBlocks(), newDP.getBlocks())
		for block in blocksMissing:
			tmp = dict(block)
			tmp[-1] = new
			removed.append(tmp)
		oldDP = newDP
	utils.printTabular([(DataProvider.Dataset, 'Dataset'), (DataProvider.BlockName, 'Block'),
		(-1, 'Removed in file')], removed)

if opts.invalid:
	splitter = DataSplitter.loadState(opts.invalid)
	def getInvalid():
		for jobNum in range(splitter.getMaxJobs()):
			splitInfo = splitter.getSplitInfo(jobNum)
			if splitInfo.get(DataSplitter.Invalid, False):
				yield str(jobNum)
	print str.join(',', getInvalid())

if opts.jdl:
	print job.get('jdl')

if opts.state:
	try:
		newState = getattr(Job, opts.state)
	except Exception:
		print 'Invalid state: %s' % opts.state
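# DataProvider.resyncSources is used above as a three-way diff between two block
# lists. A minimal standalone sketch of just the 'blocksMissing' part, assuming
# blocks are dicts keyed by dataset and block name (key names chosen for
# illustration only - the real blocks use DataProvider enum keys):
def missingBlocks_sketch(oldBlocks, newBlocks):
	def blockKey(block):
		return (block.get('Dataset'), block.get('BlockName'))
	newKeys = set(map(blockKey, newBlocks))
	return filter(lambda block: blockKey(block) not in newKeys, oldBlocks)

# Example: block 'b' of dataset '/ds/x' disappeared between the two snapshots
# old = [{'Dataset': '/ds/x', 'BlockName': 'a'}, {'Dataset': '/ds/x', 'BlockName': 'b'}]
# new = [{'Dataset': '/ds/x', 'BlockName': 'a'}]
# missingBlocks_sketch(old, new) => [{'Dataset': '/ds/x', 'BlockName': 'b'}]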
def main():
	# Set config based on settings from config file or command line
	configFile = None
	if os.path.exists(args[0]):
		configFile = args[0]
	config = getConfig(configFile, section = 'global')
	config.changeView(setSections = ['jobs']).set('nseeds', '1', '?=')
	configParameters = config.changeView(setSections = ['parameters'])
	if opts.parameters:
		utils.vprint('Provided options:')
		for p in opts.parameters:
			k, v = p.split('=', 1)
			configParameters.set(k.strip(), v.strip().replace('\\n', '\n'), '=')
			utils.vprint('\t%s: %s' % (k.strip(), v.strip()))
		utils.vprint('')
	if not os.path.exists(args[0]):
		configParameters.set('parameters', str.join(' ', args).replace('\\n', '\n'))
	if opts.dataset:
		configParameters.set('default lookup', 'DATASETNICK')
#	configParameters.set('parameter adapter', 'BasicParameterAdapter', '=')  # Don't track parameter changes
	if opts.verbosity > 2:
		config.changeView(setSections = None).write(sys.stdout)

	# Initialize ParameterFactory
	configTask = config.changeView(setSections = [config.get(['task', 'module'], 'DummyTask')])
	pm = config.getPlugin('parameter factory', 'SimpleParameterFactory', cls = ParameterFactory).getInstance()

	# Create dataset parameter source
	class DummySplitter:
		def getMaxJobs(self):
			return 3
		def getSplitInfo(self, pNum):
			mkEntry = lambda ds, fl, n, nick: { DataSplitter.Dataset: ds, DataSplitter.Nickname: nick,
				DataSplitter.FileList: fl, DataSplitter.NEntries: n }
			rndStr = lambda: md5(str(random.random())).hexdigest()[:10]  # unused helper
			tmp = [ mkEntry('ds1', ['a', 'b'], 23, 'data_1'), mkEntry('ds1', ['1'], 42, 'data_1'),
				mkEntry('ds2', ['m', 'n'], 123, 'data_2'), mkEntry('ds2', ['x', 'y', 'z'], 987, 'data_3') ]
			return tmp[pNum]

	class DataSplitProcessorTest:
		def getKeys(self):
			return map(lambda k: ParameterMetadata(k, untracked = True),
				['DATASETINFO', 'DATASETID', 'DATASETPATH', 'DATASETBLOCK', 'DATASETNICK'])
		def process(self, pNum, splitInfo, result):
			result.update({
				'DATASETINFO': '',
				'DATASETID': splitInfo.get(DataSplitter.DatasetID, None),
				'DATASETPATH': splitInfo.get(DataSplitter.Dataset, None),
				'DATASETBLOCK': splitInfo.get(DataSplitter.BlockName, None),
				'DATASETNICK': splitInfo.get(DataSplitter.Nickname, None),
				'DATASETSPLIT': pNum,
			})

	# Guard the .lower() call - opts.dataset may be unset
	if opts.dataset and opts.dataset.lower() == 'true':
		utils.vprint('Registering dummy data provider data')
		dataSplitter = DummySplitter()
	elif opts.dataset:
		dataSplitter = DataSplitter.loadState(opts.dataset)
	if opts.dataset:
		DataParameterSource.datasetsAvailable['data'] = DataParameterSource(
			config.getWorkPath(), 'data', None, dataSplitter, DataSplitProcessorTest())

	psource = pm.getSource(config)

	if opts.forceiv:
		for dp in DataParameterSource.datasetSources:
			dp.intervention = (set([1]), set([0]), True)

	if opts.listparams:
		result = []
		needGCParam = False
		if psource.getMaxJobs() is not None:
			countActive = 0
			for jobNum in range(psource.getMaxJobs()):
				info = psource.getJobInfo(jobNum)
				if info[ParameterInfo.ACTIVE]:
					countActive += 1
				if opts.inactive or info[ParameterInfo.ACTIVE]:
					if not info[ParameterInfo.ACTIVE]:
						info['GC_PARAM'] = 'N/A'
					if str(info['GC_PARAM']) != str(jobNum):
						needGCParam = True
					result.append(info)
			if opts.displaymode == 'parseable':
				utils.vprint('Count,%d,%d' % (countActive, psource.getMaxJobs()))
			else:
				utils.vprint('Number of parameter points: %d' % psource.getMaxJobs())
				if countActive != psource.getMaxJobs():
					utils.vprint('Number of active parameter points: %d' % countActive)
		else:
			# Infinite parameter space - just sample an arbitrary point
			result.append(psource.getJobInfo(123))

		enabledOutput = opts.output.split(',')
		output = filter(lambda k: not opts.output or k in enabledOutput, psource.getJobKeys())
		stored = filter(lambda k: k.untracked == False, output)
		untracked = filter(lambda k: k.untracked == True, output)

		if opts.collapse > 0:
			result_old = result
			result = {}
			result_nicks = {}
			head = [('COLLATE_JOBS', '# of jobs')]
			if 'DATASETSPLIT' in stored:
				stored.remove('DATASETSPLIT')
				if opts.collapse == 1:
					stored.append('DATASETNICK')
					head.append(('DATASETNICK', 'DATASETNICK'))
				elif opts.collapse == 2:
					head.append(('COLLATE_NICK', '# of nicks'))
			for pset in result_old:
				if ('DATASETSPLIT' in pset) and (opts.collapse == 1):
					pset.pop('DATASETSPLIT')
				nickname = None
				if ('DATASETNICK' in pset) and (opts.collapse == 2):
					nickname = pset.pop('DATASETNICK')
				# Group parameter points by a hash over the values of the tracked keys
				h = md5(repr(map(lambda key: pset.get(key), stored))).hexdigest()
				result.setdefault(h, []).append(pset)
				result_nicks.setdefault(h, set()).add(nickname)

			def doCollate(h):
				tmp = result[h][0]
				tmp['COLLATE_JOBS'] = len(result[h])
				tmp['COLLATE_NICK'] = len(result_nicks[h])
				return tmp
			result = map(doCollate, result)
		else:
			head = [('GC_JOB_ID', '#')]
			if needGCParam:
				head.append(('GC_PARAM', 'GC_PARAM'))
		if opts.active:
			head.append((ParameterInfo.ACTIVE, 'ACTIVE'))
		if opts.visible:
			stored = opts.visible.split(',')
		head.extend(sorted(zip(stored, stored)))
		if opts.untracked:
			head.extend(sorted(map(lambda n: (n, '(%s)' % n),
				filter(lambda n: n not in ['GC_PARAM', 'GC_JOB_ID'], untracked))))
		utils.vprint('')
		utils.printTabular(head, result)

	if opts.save:
		utils.vprint('')
		ParameterSource.getClass('GCDumpParameterSource').write(opts.save, psource)
		utils.vprint('Parameter information saved to ./%s' % opts.save)

	if opts.intervention:
		utils.vprint('')
		tmp = psource.getJobIntervention()
		if tmp:
			if opts.displaymode == 'parseable':
				utils.vprint('R: %s' % str.join(',', map(str, tmp[0])))
				utils.vprint('D: %s' % str.join(',', map(str, tmp[1])))
			else:
				utils.vprint('   Redo: %r' % tmp[0])
				utils.vprint('Disable: %r' % tmp[1])
		else:
			if opts.displaymode == 'parseable':
				utils.vprint('NOINT')
			else:
				utils.vprint('No intervention')
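# The '--collapse' branch above groups parameter points by hashing the values of
# the tracked ('stored') keys. A minimal standalone sketch of that technique,
# assuming plain dicts as parameter points (function and key names are
# illustrative, not part of grid-control):
import hashlib

def collapse_sketch(psets, storedKeys):
	groups = {}
	for pset in psets:
		# md5 over the repr of the selected values yields a stable grouping key
		h = hashlib.md5(repr([pset.get(key) for key in storedKeys]).encode('utf-8')).hexdigest()
		groups.setdefault(h, []).append(pset)
	result = []
	for h in groups:
		tmp = groups[h][0]  # keep the first point of each group as representative
		tmp['COLLATE_JOBS'] = len(groups[h])  # number of jobs sharing identical parameters
		result.append(tmp)
	return result

# Example: two points differing only in an untracked key collapse into one entry
# collapse_sketch([{'A': 1, 'B': 'x'}, {'A': 1, 'B': 'y'}], ['A'])
# => [{'A': 1, 'B': 'x', 'COLLATE_JOBS': 2}]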