def getInputFileToNumEventMapping(m_list):
    """ Returns a dictionary of the file names and the number of events in each file """
    print( "|====================================================================================================================|" )
    print( "| Doing the mapping from file to number of events |" )
    print( "|====================================================================================================================|" )
    m_numEventsPerFile = {}
    import PyUtils.PoolFile as PF
    # Known-bad castor files that PoolFile cannot open: skip them outright.
    failedFiles = [
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91890_lb7.root",
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91891_lb13.root",
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91891_lb24.root"
    ]
    # Iterate directly over the list instead of range(m_list.__len__()).
    for fileName in m_list:
        if fileName in failedFiles:
            continue
        poolFile = PF.PoolFile(fileName)
        # fileInfos() is a whitespace-separated summary string; token 6 is
        # the number of events -- assumes PyUtils.PoolFile's format, TODO confirm.
        m_numEventsPerFile[fileName] = int(poolFile.fileInfos().split()[6])
    print(m_numEventsPerFile)
    return m_numEventsPerFile
def main(args):
    """read a POOL file and dump its content.

    Returns a shell-style exit code: 0 on success, 1 if any file failed.
    """
    files = args.files
    # Accept a single file name as well as a list of names.
    if isinstance(files, str):
        files = [files]
    import sys
    import os.path as osp
    for i, f in enumerate(files):
        files[i] = osp.expandvars(osp.expanduser(f))
    exitcode = 0
    for fname in files:
        try:
            import PyUtils.PoolFile as PF
            PF.PoolOpts.FAST_MODE = args.fast
            pool_file = PF.PoolFile(fname)
            pool_file.checkFile(sorting=args.sort_fct)
            if args.detailed_dump:
                dump_file = osp.basename(fname) + '.txt'
                print("## dumping details into [%s]" % (dump_file, ))
                pool_file.detailedDump(dump_file)
            if args.output:
                oname = args.output
                print("## saving report into [%s]..." % (oname, ))
                pool_file.saveReport(oname)
        except Exception as e:
            # Keep going over the remaining files, but report failure.
            print("## Caught exception [%s] !!" % str(e.__class__))
            print("## What:", e)
            print(sys.exc_info()[0])
            print(sys.exc_info()[1])
            exitcode = 1
    # NOTE(review): the original's tail (a bare "except:" catch-all) is cut
    # off at the chunk boundary; the computed exit code is returned here.
    return exitcode
pass fileNames = set(fileNames) # Check the consistency with the CSV output: if len(fileNames) > 1 and options.csvFileName: print( "WARNING CSV output is only available when processing a single " "input file") pass # Loop over the specified file(s): for fileName in fileNames: # Open the file: import PyUtils.PoolFile as PF poolFile = PF.PoolFile(fileName) # Loop over all the branches of the file, and sum up the information # about them in a smart way... summedData = {} categData = {} categTrigData = {} categTrigDynVars = {} for d in poolFile.data: # Skip metadata/TAG/etc. branches: # if d.dirType != "B": continue # The name of this branch: brName = d.name # Check if this is a static auxiliary store: m = re.match("(.*)Aux\..*", d.name) if m:
def __init__(self, TOTALCPUS, LOCALDIR, FILELIST, OutputLevel,
             doDetailedSplitting=False, nEventsPerFile=-1):
    """Distribute the input files listed in FILELIST over TOTALCPUS sub-jobs.

    TOTALCPUS           -- number of parallel sub-jobs to fill
    LOCALDIR            -- directory holding the files: '' means FILELIST
                           holds full paths, a 'castor' path is listed with
                           rfdir, anything else with 'ls -l'
    FILELIST            -- text file with one input file name per line
    OutputLevel         -- verbosity flag (stored, not used here)
    doDetailedSplitting -- if True, split by number of events per file
                           instead of by file size / file count
    nEventsPerFile      -- events per sub-job (-1 = share all events evenly)
    """

    def sort_by_value(d):
        """ Returns the keys of dictionary d sorted by their values """
        items = d.items()
        backitems = [[v[1], v[0]] for v in items]
        backitems.sort()
        backitems.reverse()
        return [backitems[i][1] for i in range(0, len(backitems))]

    self.doDetailedSplitting = doDetailedSplitting
    self.OutputLevel = OutputLevel
    self.Files = FILELIST
    self.LocalDir = LOCALDIR
    self.nSubJobs = TOTALCPUS
    self.totalNumberOfEvents = 0
    self.totalNumberOfFiles = 0
    self.nEventsPerFile = nEventsPerFile
    inputFileList = []

    if not self.doDetailedSplitting:
        # Read the list of wanted file names (with ensures the handle closes).
        with open(FILELIST, "r") as inputfiles:
            filelist = inputfiles.read().split('\n')
        for i in range(0, len(filelist)):
            filelist[i] = filelist[i].rstrip()

        if not LOCALDIR:
            print("Reading Custom File")
            FinalListSorted = []
            for line in filelist:
                if line and line[0] != '#':
                    FinalListSorted.append(line)
            #print (FinalListSorted)
        elif "castor" in LOCALDIR:
            print("Reading castor directory " + LOCALDIR + " Please wait...")
            extendedFileList = os.popen("rfdir " + LOCALDIR[7:]).read().splitlines()
        else:
            print("Reading directory " + LOCALDIR + " Please wait...")
            extendedFileList = os.popen("ls -l " + LOCALDIR).read().splitlines()

        if LOCALDIR:
            # Collect (name, size) for every long-listing entry.
            i = 0
            SizeList = {}
            for line in extendedFileList:
                curr = line.split()
                if curr[0] != 'total':
                    SizeList[i] = {}
                    SizeList[i][0] = curr[8].rstrip()   # file name column
                    SizeList[i][1] = curr[4].rstrip()   # size column (bytes)
                    i = i + 1
            # Keep only the files that were asked for, sorted by size.
            FinalList = {}
            for i in range(0, len(SizeList)):
                if SizeList[i][0] in filelist:
                    FinalList[SizeList[i][0]] = int(SizeList[i][1])
            FinalListSorted = sort_by_value(FinalList)

        currCPU = 0
        self.CPUsFiles = {}
        nFiles = len(FinalListSorted)
        nRemainder = nFiles % TOTALCPUS
        # NOTE(review): '//' (integer division). A plain '/' yields a float
        # under python3 and breaks the per-CPU file counting below.
        nFilesPerCpu = nFiles // TOTALCPUS
        nFilesForThisCpu = 1
        sumFileSize = 0
        # Guard against an empty list before peeking at element 0.
        if nFiles > 0 and len(FinalListSorted[0].split()) == 2:
            for i in range(0, nFiles):
                sumFileSize += int(FinalListSorted[i].split()[1])
        print(sumFileSize)
        # Aim slightly below the exact average so the last CPU is not starved.
        averageSizePerCpu = sumFileSize / TOTALCPUS * 0.97
        print(averageSizePerCpu)
        sumSizeOnCpu = 0

        if sumFileSize != 0:
            # File sizes are known: balance by cumulative size.
            for i in range(0, nFiles):
                if currCPU in self.CPUsFiles:
                    self.CPUsFiles[currCPU].append(
                        LOCALDIR + FinalListSorted[i].split()[0])
                else:
                    self.CPUsFiles[currCPU] = [
                        LOCALDIR + FinalListSorted[i].split()[0]
                    ]
                sumSizeOnCpu += int(FinalListSorted[i].split()[1])
                if (sumSizeOnCpu > averageSizePerCpu and i < nFiles - 1
                        and (sumSizeOnCpu +
                             int(FinalListSorted[i + 1].split()[1]))
                        > averageSizePerCpu * 1.04):
                    print("File size on CPU: ", currCPU, '\t', sumSizeOnCpu)
                    currCPU = currCPU + 1
                    # Never spill past the last CPU.
                    if currCPU >= TOTALCPUS:
                        currCPU = TOTALCPUS - 1
                    else:
                        sumSizeOnCpu = 0
                elif (nFiles - i == TOTALCPUS - currCPU):
                    # Exactly one file left per remaining CPU: advance.
                    currCPU = currCPU + 1
                    print("File size on CPU: ", currCPU, '\t', sumSizeOnCpu)
        else:
            # No size information: balance by file count.
            for i in range(0, nFiles):
                if currCPU in self.CPUsFiles:
                    self.CPUsFiles[currCPU].append(
                        LOCALDIR + FinalListSorted[i].split()[0])
                else:
                    self.CPUsFiles[currCPU] = [
                        LOCALDIR + FinalListSorted[i].split()[0]
                    ]
                extraFiles = 0
                # The first nRemainder CPUs take one extra file each.
                if (currCPU < nRemainder):
                    extraFiles = 1
                if (nFilesForThisCpu < nFilesPerCpu + extraFiles):
                    nFilesForThisCpu = nFilesForThisCpu + 1
                else:
                    currCPU = currCPU + 1
                    nFilesForThisCpu = 1

    # Doing the detailed splitting (per-event bookkeeping).
    else:
        import PyUtils.PoolFile as PF
        # Getting the number of events in each file.
        numEventsPerFile = {}
        print("==================================================")
        print("The input file are: (May take some time..)")
        with open(FILELIST, "r") as inputfiles:
            for line in inputfiles:
                if line.rstrip().find(".root") > 0:
                    fullFileName = self.LocalDir + "/" + line.rstrip()
                    inputFileList.append(fullFileName)
                    poolFile = PF.PoolFile(fullFileName)
                    thisNumEvents = int(poolFile.dataHeader.nEntries)
                    self.totalNumberOfEvents += thisNumEvents
                    self.totalNumberOfFiles += 1
                    print(fullFileName, " with ", thisNumEvents, " events")
                    numEventsPerFile[fullFileName] = thisNumEvents
        print("==================================================")

        # Work out, per sub-job: the input files, the number of events to
        # process, and the number of events to skip in the first file.
        self.m_skipEvents = {}
        self.nEvents = 0
        self.m_inputFiles = {}
        if self.nEventsPerFile == -1:
            # Share all events evenly over the sub-jobs.
            self.nEvents = int(self.totalNumberOfEvents / self.nSubJobs)
        else:
            self.nEvents = self.nEventsPerFile

        # Local counters: next unexhausted file, events consumed from it.
        m_usedFiles = 0
        m_numberEventsUsed = 0
        for subJob in range(self.nSubJobs):
            self.m_inputFiles[subJob] = []
            m_eventsNeeded = self.nEvents
            while (m_eventsNeeded != 0
                   and m_usedFiles < self.totalNumberOfFiles):
                # Either the current file has enough events left to finish
                # this sub-job, or it does not.
                if m_eventsNeeded <= numEventsPerFile[inputFileList[m_usedFiles]]:
                    numEventsPerFile[inputFileList[m_usedFiles]] -= m_eventsNeeded
                    self.m_inputFiles[subJob].append(inputFileList[m_usedFiles])
                    self.m_skipEvents[subJob] = m_numberEventsUsed
                    m_numberEventsUsed += m_eventsNeeded
                    m_eventsNeeded = 0
                    print("self.m_skipEvents[" + str(subJob) + "]",
                          self.m_skipEvents[subJob])
                    print("m_numberEventsUsed", m_numberEventsUsed)
                # If it doesn't: exhaust this file and move to the next one.
                else:
                    m_eventsNeeded -= numEventsPerFile[inputFileList[m_usedFiles]]
                    self.m_skipEvents[subJob] = m_numberEventsUsed
                    self.m_inputFiles[subJob].append(inputFileList[m_usedFiles])
                    m_usedFiles += 1
                    m_numberEventsUsed = 0
for Path in paths: if "castor" in Path: print "Reading castor directory. Please wait..." inputfiles = os.popen("rfdir " + Path).read().splitlines() else: print "Reading directory. Please wait..." inputfiles = os.popen("ls -l " + Path).read().splitlines() for line in inputfiles: filename = line.split()[8] if "root" in filename or "ESD" in filename or "data" in filename: fullFilename = Path + filename try: if ReadNEvents: poolFile = PF.PoolFile(fullFilename) numEvents = int(poolFile.dataHeader.nEntries) print filename print numEvents if numEvents > EventCut: outputText.write(fullFilename + '\t' + str(numEvents) + '\n') else: print "File with few events, skipping..." else: if not PoolFileCatalog: outputText.write(fullFilename + '\n') if PoolFileCatalog: print " Creating as PoolFileToCatalogue Please wait..." print " pool_insertFileToCatalog ", fullFilename
def finalize(self): import sys import PyUtils.PoolFile as PF class ShutUp: def __init__(self): self.save = os.dup(sys.stdout.fileno()) self.quiet = open("/dev/null", "w") return def mute(self): os.dup2(self.quiet.fileno(), sys.stdout.fileno()) return def unMute(self): os.dup2(self.save, sys.stdout.fileno()) return def _unfold(pr): return { 'name': pr.name, 'memSize': pr.memSize, 'diskSize': pr.diskSize, 'memSizeNoZip': pr.memSizeNoZip, 'nEntries': pr.nEntries, 'dirType': pr.dirType, 'details': pr.details, } import tempfile, atexit, os tmpfileno, tmpfile = tempfile.mkstemp(suffix='.py_shelve.dat') os.close(tmpfileno) atexit.register(os.unlink, tmpfile) import commands sc, checkfile = commands.getstatusoutput('which checkFile.py') if sc != 0: self.msg.info('could not fetch checkFile.py !') self.msg.info('no POOL file post-processing') return checkfile = os.path.realpath(checkfile) def run_check_file(pool_file, dbfile, app=checkfile): from subprocess import call with open('/dev/null', 'w') as dev_null: res = call([app, "-f", pool_file, "-o", dbfile, "--fast"], stdout=dev_null, stderr=dev_null) return res _msg = ShutUp() self.msg.info("Finalizing [%s]", self.name) ## FIXME: move to 'with' stmt when py2.5 is there for proper RAII ## and leaner code... 
self.inputPoolFiles = [f for f in self.inputPoolFiles] if len(self.inputPoolFiles) > 0: self.msg.info("Content of input POOL files:") _msg.mute() _msg.unMute() for i, inFileName in enumerate(self.inputPoolFiles[:20]): try: _msg.mute() sc = run_check_file(pool_file=inFileName, dbfile=tmpfile) if sc != 0: raise RuntimeError \ ('error running check file (rc=%r)'%sc) inFile = PF.PoolFile(tmpfile) _msg.unMute() self.svc.meta['inputPoolFiles/%i' % i] = { 'infos': inFile._fileInfos, 'nbrEvts': inFile.dataHeader.nEntries, 'dataHeader': [_unfold(inFile.dataHeader)], 'data': [_unfold(p) for p in inFile.data] } self.msg.info(" - [%s] %8.3f kb (%i events)", inFile._fileInfos['name'], inFile._fileInfos['size'] / Units.kB, inFile.dataHeader.nEntries) del inFile except Exception, err: _msg.unMute() self.msg.info("Could not run checkFile on [%s] !!", inFileName) self.msg.info("Reason: %s", err) if 'inFile' in dir(): del inFile _msg.unMute()
class PoolMonTool(object):
    """
    Persistency monitoring tool: measures memory and disk sizes of input and output containers (in POOL files)
    """

    def __init__(self, svc):
        super(PoolMonTool, self).__init__()
        self.svc = svc
        self.name = svc.name + ".Pool"
        from AthenaCommon.AppMgr import ServiceMgr as svcMgr
        # Collect input POOL files from the configured event selector,
        # stripping any technology prefix.
        inFiles = set()
        if hasattr(svcMgr, 'EventSelector') and \
           hasattr(svcMgr.EventSelector, 'InputCollections'):
            for inFile in svcMgr.EventSelector.InputCollections:
                if inFile.startswith("ROOTTREE:"):
                    inFile = inFile[len("ROOTTREE:"):]
                inFiles.add(inFile)
        # Collect output POOL files from every configured output stream.
        outFiles = set()
        from AthenaCommon import CfgMgr
        from AthenaCommon.Configurable import Configurable
        for c in Configurable.allConfigurables.values():
            if not isinstance(c, CfgMgr.AthenaOutputStream):
                continue
            try:
                outFile = c.properties()["OutputFile"]
            except KeyError:
                continue
            if outFile.startswith("ROOTTREE:"):
                outFile = outFile[len("ROOTTREE:"):]
            outFiles.add(outFile)
        self.inputPoolFiles = [i for i in inFiles]
        self.outputPoolFiles = [o for o in outFiles]

    @property
    def msg(self):
        # Lazily fetch a logger named after this tool.
        import AthenaCommon.Logging as L
        return L.logging.getLogger(self.name)

    def initialize(self):
        self.msg.info("Initializing [%s]", self.name)
        self.msg.info("InputPoolFiles: %r", self.inputPoolFiles)
        self.msg.info("OutputPoolFiles: %r", self.outputPoolFiles)
        return

    def finalize(self):
        """Summarize input and output POOL file contents into self.svc.meta
        via checkFile.py (first 20 input files; all output files).
        """
        import sys
        import PyUtils.PoolFile as PF

        class ShutUp:
            """Temporarily redirect stdout to /dev/null (checkFile is chatty)."""
            def __init__(self):
                self.save = os.dup(sys.stdout.fileno())
                self.quiet = open("/dev/null", "w")
                return
            def mute(self):
                os.dup2(self.quiet.fileno(), sys.stdout.fileno())
                return
            def unMute(self):
                os.dup2(self.save, sys.stdout.fileno())
                return

        def _unfold(pr):
            # Flatten a PoolRecord into a plain dict for the metadata store.
            return {
                'name': pr.name,
                'memSize': pr.memSize,
                'diskSize': pr.diskSize,
                'memSizeNoZip': pr.memSizeNoZip,
                'nEntries': pr.nEntries,
                'dirType': pr.dirType,
                'details': pr.details,
            }

        import tempfile, atexit, os
        tmpfileno, tmpfile = tempfile.mkstemp(suffix='.py_shelve.dat')
        os.close(tmpfileno)
        # Make sure the scratch file disappears at interpreter exit.
        atexit.register(os.unlink, tmpfile)

        # 'commands' was removed in python3; subprocess has the same helper.
        import subprocess
        sc, checkfile = subprocess.getstatusoutput('which checkFile.py')
        if sc != 0:
            self.msg.info('could not fetch checkFile.py !')
            self.msg.info('no POOL file post-processing')
            return
        checkfile = os.path.realpath(checkfile)

        def run_check_file(pool_file, dbfile, app=checkfile):
            # Run checkFile.py silently; return its exit code.
            from subprocess import call
            with open('/dev/null', 'w') as dev_null:
                res = call([app, "-f", pool_file, "-o", dbfile, "--fast"],
                           stdout=dev_null, stderr=dev_null)
            return res

        _msg = ShutUp()
        self.msg.info("Finalizing [%s]", self.name)
        self.inputPoolFiles = [f for f in self.inputPoolFiles]
        if len(self.inputPoolFiles) > 0:
            self.msg.info("Content of input POOL files:")
            _msg.mute()
            _msg.unMute()
            for i, inFileName in enumerate(self.inputPoolFiles[:20]):
                try:
                    _msg.mute()
                    sc = run_check_file(pool_file=inFileName, dbfile=tmpfile)
                    if sc != 0:
                        raise RuntimeError \
                            ('error running check file (rc=%r)' % sc)
                    inFile = PF.PoolFile(tmpfile)
                    _msg.unMute()
                    self.svc.meta['inputPoolFiles/%i' % i] = {
                        'infos': inFile._fileInfos,
                        'nbrEvts': inFile.dataHeader.nEntries,
                        'dataHeader': [_unfold(inFile.dataHeader)],
                        'data': [_unfold(p) for p in inFile.data]
                    }
                    self.msg.info(" - [%s] %8.3f kb (%i events)",
                                  inFile._fileInfos['name'],
                                  inFile._fileInfos['size'] / Units.kB,
                                  inFile.dataHeader.nEntries)
                    del inFile
                except Exception as err:
                    # Best-effort: log and continue with the next file.
                    _msg.unMute()
                    self.msg.info("Could not run checkFile on [%s] !!",
                                  inFileName)
                    self.msg.info("Reason: %s", err)
                    if 'inFile' in dir():
                        del inFile
            _msg.unMute()
        if len(self.outputPoolFiles) > 0:
            self.msg.info("Content of output POOL files:")
            for i, outFileName in enumerate(self.outputPoolFiles):
                try:
                    _msg.mute()
                    sc = run_check_file(pool_file=outFileName, dbfile=tmpfile)
                    if sc != 0:
                        raise RuntimeError \
                            ('error running check file (rc=%r)' % sc)
                    outFile = PF.PoolFile(tmpfile)
                    _msg.unMute()
                    self.svc.meta['outputPoolFiles/%i' % i] = {
                        'infos': outFile._fileInfos,
                        'nbrEvts': outFile.dataHeader.nEntries,
                        'dataHeader': [_unfold(outFile.dataHeader)],
                        'data': [_unfold(p) for p in outFile.data]
                    }
                    self.msg.info(" - [%s] %8.3f kb (%i events)",
                                  outFile._fileInfos['name'],
                                  outFile._fileInfos['size'] / Units.kB,
                                  outFile.dataHeader.nEntries)
                    del outFile
                except Exception as err:
                    # Best-effort: log and continue with the next file.
                    _msg.unMute()
                    self.msg.info("Could not run checkFile on [%s] !!",
                                  outFileName)
                    self.msg.info("Reason: %s", err)
                    if 'outFile' in dir():
                        del outFile
            _msg.unMute()