Example #1
def getInputFileToNumEventMapping(m_list):
    """ Returns a dictionary of the file names
        and the number of events in each file
   """
    print(
        "|====================================================================================================================|"
    )
    print(
        "|                                         Doing the mapping from file to number of events                            |"
    )
    print(
        "|====================================================================================================================|"
    )
    m_numEventsPerFile = {}

    import PyUtils.PoolFile as PF

    failedFiles = [
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91890_lb7.root",
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91891_lb13.root",
        "/castor/cern.ch/grid/atlas/caf/atlcal/perm/id/cosmics/ESDs/InDetESD_91891_lb24.root"
    ]

    for fname in m_list:
        if fname not in failedFiles:
            poolFile = PF.PoolFile(fname)
            # fileInfos() is parsed for the event count (7th whitespace-separated token).
            m_numEventsPerFile[fname] = int(poolFile.fileInfos().split()[6])

    print(m_numEventsPerFile)
    return m_numEventsPerFile
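A minimal usage sketch for the helper above, assuming PyUtils.PoolFile is available in the environment; the input paths are hypothetical placeholders:

# Hypothetical ESD paths; any readable POOL files would work here.
esdFiles = [
    "/castor/cern.ch/user/someone/InDetESD_example_lb1.root",
    "/castor/cern.ch/user/someone/InDetESD_example_lb2.root",
]
eventsPerFile = getInputFileToNumEventMapping(esdFiles)
print("Total events:", sum(eventsPerFile.values()))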
Example #2
def main(args):
    """read a POOL file and dump its content.
    """
    files = args.files
    if isinstance(files, basestring):
        files = [files]

    import sys
    import os
    import os.path as osp

    for i, f in enumerate(files):
        files[i] = osp.expandvars(osp.expanduser(f))

    exitcode = 0
    for fname in files:
        try:
            import PyUtils.PoolFile as PF
            PF.PoolOpts.FAST_MODE = args.fast
            pool_file = PF.PoolFile(fname)
            pool_file.checkFile(sorting=args.sort_fct)
            if args.detailed_dump:
                dump_file = osp.basename(fname) + '.txt'
                print "## dumping details into [%s]" % (dump_file, )
                pool_file.detailedDump(dump_file)
            if args.output:
                oname = args.output
                print "## saving report into [%s]..." % (oname, )
                pool_file.saveReport(oname)
        except Exception, e:
            print "## Caught exception [%s] !!" % str(e.__class__)
            print "## What:", e
            print sys.exc_info()[0]
            print sys.exc_info()[1]
            exitcode = 1
            pass

        except:
            # Catch anything not derived from Exception.
            print "## Caught an unexpected, non-standard exception !!"
            print sys.exc_info()[0]
            print sys.exc_info()[1]
            exitcode = 1
            pass

    return exitcode
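A hedged sketch of a command-line driver for main() above, assuming an argparse namespace exposing the attributes the function reads (files, fast, sort_fct, detailed_dump, output); the flag names and defaults are illustrative, not necessarily the tool's real CLI:

import argparse
import sys

parser = argparse.ArgumentParser(description="read a POOL file and dump its content")
parser.add_argument("files", nargs="+", help="input POOL file(s)")
parser.add_argument("--fast", action="store_true", help="activate PoolOpts.FAST_MODE")
parser.add_argument("--sort-fct", dest="sort_fct", default="diskSize",
                    help="sorting criterion passed to checkFile()")
parser.add_argument("--detailed-dump", dest="detailed_dump", action="store_true",
                    help="also write a per-file .txt dump")
parser.add_argument("-o", "--output", default=None, help="file name for the saved report")

if __name__ == "__main__":
    sys.exit(main(parser.parse_args()))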
Example #3
        pass

    fileNames = set(fileNames)
    # Check the consistency with the CSV output:
    if len(fileNames) > 1 and options.csvFileName:
        print(
            "WARNING  CSV output is only available when processing a single "
            "input file")
        pass

    # Loop over the specified file(s):
    for fileName in fileNames:

        # Open the file:
        import PyUtils.PoolFile as PF
        poolFile = PF.PoolFile(fileName)

        # Loop over all the branches of the file, and sum up the information
        # about them in a smart way...
        summedData = {}
        categData = {}
        categTrigData = {}
        categTrigDynVars = {}
        for d in poolFile.data:
            # Skip metadata/TAG/etc. branches:
            # if d.dirType != "B": continue
            # The name of this branch:
            brName = d.name
            # Check if this is a static auxiliary store:
            m = re.match(r"(.*)Aux\..*", d.name)
            if m:
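The regular expression above recovers the container name of a static auxiliary store by stripping the trailing "Aux.<variable>" part; a short self-contained sketch of that grouping, using made-up branch names:

import re
from collections import defaultdict

# Made-up branch names of the kind the loop above iterates over.
branchNames = ["InDetTrackParticlesAux.pt",
               "InDetTrackParticlesAux.eta",
               "EventInfoAux.runNumber"]

perContainer = defaultdict(list)
for name in branchNames:
    m = re.match(r"(.*)Aux\..*", name)
    if m:
        perContainer[m.group(1)].append(name)

print(dict(perContainer))
# {'InDetTrackParticles': [...], 'EventInfo': ['EventInfoAux.runNumber']}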
Example #4
    def __init__(self,
                 TOTALCPUS,
                 LOCALDIR,
                 FILELIST,
                 OutputLevel,
                 doDetailedSplitting=False,
                 nEventsPerFile=-1):
        def sort_by_value(d):
            """ Returns the keys of dictionary d sorted by their values (largest first) """
            backitems = sorted(((v, k) for k, v in d.items()), reverse=True)
            return [k for v, k in backitems]

        self.doDetailedSplitting = doDetailedSplitting
        self.OutputLevel = OutputLevel
        self.Files = FILELIST
        self.LocalDir = LOCALDIR
        self.nSubJobs = TOTALCPUS
        self.totalNumberOfEvents = 0
        self.totalNumberOfFiles = 0
        self.nEventsPerFile = nEventsPerFile
        inputFileList = []

        if not self.doDetailedSplitting:
            inputfiles = open(FILELIST, "r")
            filelist = [line.rstrip() for line in inputfiles.read().split('\n')]
            inputfiles.close()
            if not LOCALDIR:

                print("Reading Custom File")
                FinalListSorted = []
                for line in filelist:
                    if line and line[0] != '#':
                        FinalListSorted.append(line)

                #print (FinalListSorted)

            elif "castor" in LOCALDIR:
                print("Reading castor directory " + LOCALDIR +
                      " Please wait...")
                extendedFileList = os.popen("rfdir " +
                                            LOCALDIR[7:]).read().splitlines()
            else:
                print("Reading directory " + LOCALDIR + " Please wait...")
                extendedFileList = os.popen("ls -l " +
                                            LOCALDIR).read().splitlines()

            if LOCALDIR:
                i = 0
                SizeList = {}
                for line in extendedFileList:
                    curr = line.split()
                    if curr[0] != 'total':
                        #print (curr[8], " ", curr[4])
                        SizeList[i] = {}
                        SizeList[i][0] = curr[8].rstrip()
                        SizeList[i][1] = curr[4].rstrip()
                        i = i + 1
                FinalList = {}
                count = 0

                for i in range(0, len(SizeList)):
                    #print (SizeList[i][0])
                    if SizeList[i][0] in filelist:
                        #print ("Accepted")
                        #print (SizeList[i][0], " size:", SizeList[i][1])
                        FinalList[SizeList[i][0]] = int(SizeList[i][1])

                #SizeListSorted = [ (k,SizeList[k]) for k in sorted(SizeList.values())]

                FinalListSorted = sort_by_value(FinalList)
                #print ("Sorted list" )
                #for i in range(0,len(FinalListSorted)):
                #   print (FinalListSorted[i], "\tsize:\t", FinalList[FinalListSorted[i]])
            currCPU = 0
            self.CPUsFiles = {}
            nFiles = len(FinalListSorted)
            nRemainder = nFiles % TOTALCPUS
            nFilesPerCpu = nFiles // TOTALCPUS  # integer division: base number of files per CPU
            nFilesForThisCpu = 1
            sumFileSize = 0
            if len(FinalListSorted[0].split()) == 2:
                for i in range(0, nFiles):
                    sumFileSize += int(FinalListSorted[i].split()[1])
            print(sumFileSize)
            averageSizePerCpu = sumFileSize / TOTALCPUS * 0.97
            print(averageSizePerCpu)
            sumSizeOnCpu = 0
            #print ("NFile, Remainder, NfilesperCpu  ",  nFiles, " ", nRemainder, " ", nFilesPerCpu)
            #If the file size is present then use it to split the files
            if sumFileSize != 0:
                for i in range(0, nFiles):
                    if currCPU in self.CPUsFiles:
                        self.CPUsFiles[currCPU].append(
                            LOCALDIR + FinalListSorted[i].split()[0])
                    else:
                        self.CPUsFiles[currCPU] = [
                            LOCALDIR + FinalListSorted[i].split()[0]
                        ]
                        extraFiles = 0
                    sumSizeOnCpu += int(FinalListSorted[i].split()[1])
                    if (sumSizeOnCpu > averageSizePerCpu and i < nFiles - 1 and
                        (sumSizeOnCpu + int(FinalListSorted[i + 1].split()[1]))
                            > averageSizePerCpu * 1.04):
                        print("File size on CPU: ", currCPU, '\t',
                              sumSizeOnCpu)
                        currCPU = currCPU + 1
                        if currCPU >= TOTALCPUS:
                            currCPU = TOTALCPUS - 1
                        else:
                            sumSizeOnCpu = 0
                    elif (nFiles - i == TOTALCPUS - currCPU):
                        currCPU = currCPU + 1
                print("File size on CPU: ", currCPU, '\t', sumSizeOnCpu)
            else:
                for i in range(0, nFiles):
                    #print (FinalListSorted[i], "CPU: ", currCPU, " FPCPU: " , nFilesForThisCpu)
                    if currCPU in self.CPUsFiles:
                        self.CPUsFiles[currCPU].append(
                            LOCALDIR + FinalListSorted[i].split()[0])
                    else:
                        self.CPUsFiles[currCPU] = [
                            LOCALDIR + FinalListSorted[i].split()[0]
                        ]
                    extraFiles = 0
                    if (currCPU < nRemainder):
                        extraFiles = 1
                    if (nFilesForThisCpu < nFilesPerCpu + extraFiles):
                        nFilesForThisCpu = nFilesForThisCpu + 1
                    else:
                        currCPU = currCPU + 1
                        nFilesForThisCpu = 1

        # Doing the detailed splitting
        else:
            import PyUtils.PoolFile as PF

            #Getting the number of events in each file.
            inputfiles = open(FILELIST, "r")
            numEventsPerFile = {}
            print("==================================================")
            print("The input file are: (May take some time..)")
            for line in inputfiles:
                if line.rstrip().find(".root") > 0:

                    fullFileName = self.LocalDir + "/" + line.rstrip()

                    inputFileList.append(fullFileName)

                    poolFile = PF.PoolFile(fullFileName)
                    thisNumEvents = int(poolFile.dataHeader.nEntries)

                    self.totalNumberOfEvents += thisNumEvents

                    self.totalNumberOfFiles += 1

                    print(fullFileName, " with ", thisNumEvents, " events")

                    numEventsPerFile[fullFileName] = thisNumEvents
            print("==================================================")

            # Getting the number of events to process, the number to skip, and the input files

            # The relevant quantities for each subJob
            self.m_skipEvents = {}
            self.nEvents = 0
            self.m_inputFiles = {}

            # nEventsPerFile == -1 means process all the events
            if self.nEventsPerFile == -1:
                self.nEvents = int(self.totalNumberOfEvents / self.nSubJobs)
            else:
                self.nEvents = self.nEventsPerFile

            # local counters
            m_usedFiles = 0
            m_numberEventsUsed = 0
            for subJob in range(self.nSubJobs):
                self.m_inputFiles[subJob] = []
                m_eventsNeeded = self.nEvents

                while (m_eventsNeeded != 0
                       and m_usedFiles < self.totalNumberOfFiles):
                    # Two cases: the file indexed by m_usedFiles either has enough
                    # events to complete the events needed, or it doesn't

                    # If it does
                    if m_eventsNeeded <= numEventsPerFile[
                            inputFileList[m_usedFiles]]:
                        numEventsPerFile[
                            inputFileList[m_usedFiles]] -= m_eventsNeeded

                        # Debugging
                        #print ("subJob",subJob)
                        #print ("m_eventsNeeded",m_eventsNeeded)
                        #print ("self.m_inputFiles",self.m_inputFiles)
                        #print ("m_usedFiles",m_usedFiles)

                        self.m_inputFiles[subJob].append(
                            inputFileList[m_usedFiles])
                        self.m_skipEvents[subJob] = m_numberEventsUsed
                        m_numberEventsUsed += m_eventsNeeded
                        m_eventsNeeded = 0
                        print("self.m_skipEvents[" + str(subJob) + "]",
                              self.m_skipEvents[subJob])
                        print("m_numberEventsUsed", m_numberEventsUsed)

                    # If it doesn't
                    else:
                        m_eventsNeeded -= numEventsPerFile[
                            inputFileList[m_usedFiles]]
                        self.m_skipEvents[subJob] = m_numberEventsUsed
                        self.m_inputFiles[subJob].append(
                            inputFileList[m_usedFiles])
                        m_usedFiles += 1
                        m_numberEventsUsed = 0
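The detailed-splitting branch above walks the file list once, handing each sub-job a fixed number of events and recording how many events of its first file it must skip. A simplified sketch of the same bookkeeping, with a hypothetical {file: nEvents} map and no cap on the number of sub-jobs:

def split_by_events(numEventsPerFile, nEventsPerJob):
    """Return a list of (input files, events to skip) pairs, one per sub-job."""
    files = list(numEventsPerFile)
    remaining = dict(numEventsPerFile)   # events not yet assigned, per file
    jobs = []
    idx, skip = 0, 0                     # current file and events already consumed from it
    while idx < len(files):
        needed, jobFiles, jobSkip = nEventsPerJob, [], skip
        while needed and idx < len(files):
            fname = files[idx]
            jobFiles.append(fname)
            if remaining[fname] > needed:
                remaining[fname] -= needed
                skip += needed
                needed = 0
            else:
                needed -= remaining[fname]
                idx += 1
                skip = 0
        jobs.append((jobFiles, jobSkip))
    return jobs

print(split_by_events({"a.root": 100, "b.root": 50}, 60))
# [(['a.root'], 0), (['a.root', 'b.root'], 60), (['b.root'], 20)]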
Example #5
for Path in paths:
    if "castor" in Path:
        print "Reading castor directory. Please wait..."
        inputfiles = os.popen("rfdir " + Path).read().splitlines()
    else:
        print "Reading directory. Please wait..."
        inputfiles = os.popen("ls -l " + Path).read().splitlines()

    for line in inputfiles:
        curr = line.split()
        # Skip the 'total' header line from 'ls -l' and any malformed lines.
        if len(curr) < 9 or curr[0] == 'total':
            continue
        filename = curr[8]
        if "root" in filename or "ESD" in filename or "data" in filename:
            fullFilename = Path + filename
            try:
                if ReadNEvents:
                    poolFile = PF.PoolFile(fullFilename)
                    numEvents = int(poolFile.dataHeader.nEntries)
                    print filename
                    print numEvents

                    if numEvents > EventCut:
                        outputText.write(fullFilename + '\t' + str(numEvents) +
                                         '\n')
                    else:
                        print "File with few events, skipping..."
                else:
                    if not PoolFileCatalog:
                        outputText.write(fullFilename + '\n')
                    if PoolFileCatalog:
                        print " Creating as PoolFileToCatalogue Please wait..."
                        print "  pool_insertFileToCatalog ", fullFilename
Example #6
    def finalize(self):
        import sys
        import PyUtils.PoolFile as PF

        class ShutUp:
            def __init__(self):
                self.save = os.dup(sys.stdout.fileno())
                self.quiet = open("/dev/null", "w")
                return

            def mute(self):
                os.dup2(self.quiet.fileno(), sys.stdout.fileno())
                return

            def unMute(self):
                os.dup2(self.save, sys.stdout.fileno())
                return

        def _unfold(pr):
            return {
                'name': pr.name,
                'memSize': pr.memSize,
                'diskSize': pr.diskSize,
                'memSizeNoZip': pr.memSizeNoZip,
                'nEntries': pr.nEntries,
                'dirType': pr.dirType,
                'details': pr.details,
            }

        import tempfile, atexit, os
        tmpfileno, tmpfile = tempfile.mkstemp(suffix='.py_shelve.dat')
        os.close(tmpfileno)
        atexit.register(os.unlink, tmpfile)

        import commands
        sc, checkfile = commands.getstatusoutput('which checkFile.py')
        if sc != 0:
            self.msg.info('could not fetch checkFile.py !')
            self.msg.info('no POOL file post-processing')
            return
        checkfile = os.path.realpath(checkfile)

        def run_check_file(pool_file, dbfile, app=checkfile):
            from subprocess import call
            with open('/dev/null', 'w') as dev_null:
                res = call([app, "-f", pool_file, "-o", dbfile, "--fast"],
                           stdout=dev_null,
                           stderr=dev_null)
            return res

        _msg = ShutUp()
        self.msg.info("Finalizing [%s]", self.name)
        ## FIXME: move to 'with' stmt when py2.5 is there for proper RAII
        ##        and leaner code...
        self.inputPoolFiles = [f for f in self.inputPoolFiles]
        if len(self.inputPoolFiles) > 0:
            self.msg.info("Content of input POOL files:")
            _msg.mute()
            _msg.unMute()
            for i, inFileName in enumerate(self.inputPoolFiles[:20]):
                try:
                    _msg.mute()
                    sc = run_check_file(pool_file=inFileName, dbfile=tmpfile)
                    if sc != 0:
                        raise RuntimeError \
                              ('error running check file (rc=%r)'%sc)
                    inFile = PF.PoolFile(tmpfile)
                    _msg.unMute()
                    self.svc.meta['inputPoolFiles/%i' % i] = {
                        'infos': inFile._fileInfos,
                        'nbrEvts': inFile.dataHeader.nEntries,
                        'dataHeader': [_unfold(inFile.dataHeader)],
                        'data': [_unfold(p) for p in inFile.data]
                    }
                    self.msg.info(" - [%s] %8.3f kb (%i events)",
                                  inFile._fileInfos['name'],
                                  inFile._fileInfos['size'] / Units.kB,
                                  inFile.dataHeader.nEntries)
                    del inFile
                except Exception, err:
                    _msg.unMute()
                    self.msg.info("Could not run checkFile on [%s] !!",
                                  inFileName)
                    self.msg.info("Reason: %s", err)
                    if 'inFile' in dir(): del inFile
                _msg.unMute()
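The ShutUp helper above silences the chatter that PoolFile/ROOT print to stdout by temporarily pointing the stdout file descriptor at /dev/null with os.dup2. A small stand-alone sketch of the same trick as a context manager (an alternative formulation, not what the example itself uses):

import os
import sys
from contextlib import contextmanager

@contextmanager
def muted_stdout():
    # Same dup2 trick as ShutUp.mute()/unMute(), wrapped for 'with' blocks.
    saved = os.dup(sys.stdout.fileno())
    devnull = open(os.devnull, "w")
    try:
        sys.stdout.flush()
        os.dup2(devnull.fileno(), sys.stdout.fileno())
        yield
    finally:
        sys.stdout.flush()
        os.dup2(saved, sys.stdout.fileno())
        os.close(saved)
        devnull.close()

with muted_stdout():
    print("swallowed by /dev/null")
print("visible again")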
Example #7
class PoolMonTool(object):
    """
    Persistency monitoring tool: measures memory and disk sizes of input and
    output containers (in POOL files)
    """
    def __init__(self, svc):
        super(PoolMonTool, self).__init__()
        self.svc = svc
        self.name = svc.name + ".Pool"
        from AthenaCommon.AppMgr import ServiceMgr as svcMgr
        inFiles = set()
        if hasattr(svcMgr,'EventSelector') and \
           hasattr(svcMgr.EventSelector, 'InputCollections') :
            for inFile in svcMgr.EventSelector.InputCollections:
                if inFile.startswith("ROOTTREE:"):
                    inFile = inFile[len("ROOTTREE:"):]
                inFiles.add(inFile)
        outFiles = set()
        from AthenaCommon import CfgMgr
        from AthenaCommon.Configurable import Configurable
        for c in Configurable.allConfigurables.values():
            if not isinstance(c, CfgMgr.AthenaOutputStream): continue
            try:
                outFile = c.properties()["OutputFile"]
            except KeyError:
                continue
            if outFile.startswith("ROOTTREE:"):
                outFile = outFile[len("ROOTTREE:"):]
            outFiles.add(outFile)

        self.inputPoolFiles = [i for i in inFiles]
        self.outputPoolFiles = [o for o in outFiles]

    @property
    def msg(self):
        import AthenaCommon.Logging as L
        return L.logging.getLogger(self.name)

    def initialize(self):
        self.msg.info("Initializing [%s]", self.name)
        self.msg.info("InputPoolFiles:  %r", self.inputPoolFiles)
        self.msg.info("OutputPoolFiles: %r", self.outputPoolFiles)
        return

    def finalize(self):
        import sys
        import PyUtils.PoolFile as PF

        class ShutUp:
            def __init__(self):
                self.save = os.dup(sys.stdout.fileno())
                self.quiet = open("/dev/null", "w")
                return

            def mute(self):
                os.dup2(self.quiet.fileno(), sys.stdout.fileno())
                return

            def unMute(self):
                os.dup2(self.save, sys.stdout.fileno())
                return

        def _unfold(pr):
            return {
                'name': pr.name,
                'memSize': pr.memSize,
                'diskSize': pr.diskSize,
                'memSizeNoZip': pr.memSizeNoZip,
                'nEntries': pr.nEntries,
                'dirType': pr.dirType,
                'details': pr.details,
            }

        import tempfile, atexit, os
        tmpfileno, tmpfile = tempfile.mkstemp(suffix='.py_shelve.dat')
        os.close(tmpfileno)
        atexit.register(os.unlink, tmpfile)

        import commands
        sc, checkfile = commands.getstatusoutput('which checkFile.py')
        if sc != 0:
            self.msg.info('could not fetch checkFile.py !')
            self.msg.info('no POOL file post-processing')
            return
        checkfile = os.path.realpath(checkfile)

        def run_check_file(pool_file, dbfile, app=checkfile):
            from subprocess import call
            with open('/dev/null', 'w') as dev_null:
                res = call([app, "-f", pool_file, "-o", dbfile, "--fast"],
                           stdout=dev_null,
                           stderr=dev_null)
            return res

        _msg = ShutUp()
        self.msg.info("Finalizing [%s]", self.name)
        ## FIXME: move to 'with' stmt when py2.5 is there for proper RAII
        ##        and leaner code...
        self.inputPoolFiles = [f for f in self.inputPoolFiles]
        if len(self.inputPoolFiles) > 0:
            self.msg.info("Content of input POOL files:")
            _msg.mute()
            _msg.unMute()
            for i, inFileName in enumerate(self.inputPoolFiles[:20]):
                try:
                    _msg.mute()
                    sc = run_check_file(pool_file=inFileName, dbfile=tmpfile)
                    if sc != 0:
                        raise RuntimeError \
                              ('error running check file (rc=%r)'%sc)
                    inFile = PF.PoolFile(tmpfile)
                    _msg.unMute()
                    self.svc.meta['inputPoolFiles/%i' % i] = {
                        'infos': inFile._fileInfos,
                        'nbrEvts': inFile.dataHeader.nEntries,
                        'dataHeader': [_unfold(inFile.dataHeader)],
                        'data': [_unfold(p) for p in inFile.data]
                    }
                    self.msg.info(" - [%s] %8.3f kb (%i events)",
                                  inFile._fileInfos['name'],
                                  inFile._fileInfos['size'] / Units.kB,
                                  inFile.dataHeader.nEntries)
                    del inFile
                except Exception, err:
                    _msg.unMute()
                    self.msg.info("Could not run checkFile on [%s] !!",
                                  inFileName)
                    self.msg.info("Reason: %s", err)
                    if 'inFile' in dir(): del inFile
                _msg.unMute()
        if len(self.outputPoolFiles) > 0:
            self.msg.info("Content of output POOL files:")
            for i, outFileName in enumerate(self.outputPoolFiles):
                try:
                    _msg.mute()
                    sc = run_check_file(pool_file=outFileName, dbfile=tmpfile)
                    if sc != 0:
                        raise RuntimeError \
                              ('error running check file (rc=%r)'%sc)

                    outFile = PF.PoolFile(tmpfile)
                    _msg.unMute()
                    self.svc.meta['outputPoolFiles/%i' % i] = {
                        'infos': outFile._fileInfos,
                        'nbrEvts': outFile.dataHeader.nEntries,
                        'dataHeader': [_unfold(outFile.dataHeader)],
                        'data': [_unfold(p) for p in outFile.data]
                    }
                    self.msg.info(" - [%s] %8.3f kb (%i events)",
                                  outFile._fileInfos['name'],
                                  outFile._fileInfos['size'] / Units.kB,
                                  outFile.dataHeader.nEntries)
                    del outFile
                except Exception, err:
                    _msg.unMute()
                    self.msg.info("Could not run checkFile on [%s] !!",
                                  outFileName)
                    self.msg.info("Reason: %s", err)
                    if 'outFile' in dir(): del outFile
                _msg.unMute()
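The PyUtils.PoolFile access pattern is the same in every example above: open the file, read dataHeader.nEntries for the event count, and loop over data for the per-container size records. A minimal sketch of that pattern, assuming a readable POOL file at the (hypothetical) path shown:

import PyUtils.PoolFile as PF

poolFile = PF.PoolFile("/path/to/some/ESD.pool.root")   # hypothetical path
print("events:", poolFile.dataHeader.nEntries)
for d in poolFile.data:
    # name, diskSize, memSize and nEntries are the fields the examples above rely on.
    print(d.name, d.diskSize, d.memSize, d.nEntries)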