Example #1
0
    def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None, \
            selectionString = None, weightString = None,
            isData = False, color = 0, texName = None):
        '''Load a CMG crab output directory.

        Walks baseDirectory recursively and collects files whose base name ends in '_<n>'
        (n being the integer job number) with extension '.root' or '.tgz'. A job is used only
        if both files exist, a normalization can be read from a 'SkimReport.txt' member of the
        tgz archive, and the root file passes helpers.checkRootFile for the tree 'treeName'.

        Arguments:
            name            -- name passed on to the constructor
            baseDirectory   -- directory that is searched recursively
            treeFilename    -- unused here; kept for signature compatibility
            treeName        -- name of the tree that must be present in each root file
            maxN            -- if a positive number, process at most maxN jobs (lowest job numbers first)
            selectionString, weightString, isData, color, texName -- passed on to the constructor

        Returns cls(...) built from the good files and the summed normalization.
        Raises helpers.EmptySampleError if no job yields both a normalization and a good tree file.
        '''
        import tarfile
        from cmg_helpers import read_cmg_normalization

        maxN = maxN if maxN is not None and maxN>0 else None

        # Walk through all subdirectories and pick up pairs of files '..._n.root and ..._n.tgz where n is the job number'
        treeFiles = {}
        zipFiles  = {}
        for root, subFolders, filenames in os.walk( baseDirectory ):
            for filename in filenames:
                base, ext = os.path.splitext( filename )
                if ext not in ('.root', '.tgz'):
                    continue
                try:
                    n = int(base.split('_')[-1])
                except ValueError:
                    # filename is not of the form 'xyz_n' where n is the job number
                    continue
                # add the tgz and root files to the dicts.
                filename_ = os.path.join(root, filename)
                if ext=='.root':
                    treeFiles[n] = filename_
                else:
                    zipFiles[n] = filename_

        # Find pairs of zip and root files
        pairs = set(zipFiles) & set(treeFiles)
        n_jobs = len( set(zipFiles) | set(treeFiles) )

        # Process jobs in a deterministic order and honour maxN
        jobs = sorted(pairs)
        if maxN is not None:
            jobs = jobs[:maxN]

        normalization = 0
        files = []
        failedJobs = []
        for n in jobs:
            sumW = None
            # 'with' guarantees the archive is closed even if reading the report raises
            with tarfile.open( zipFiles[n], 'r:gz' ) as tf:
                for f in tf.getmembers():
                    if "SkimReport.txt" not in f.name:
                        continue
                    report = tf.extractfile(f)
                    # extractfile returns None for members that are not files or links
                    if report is None:
                        continue
                    sumW = read_cmg_normalization(report)
                    if sumW is not None: break
            if sumW is None:
                logger.warning( "No normalization found when reading tar file %s", zipFiles[n] )

            # Check treefile for whether the tree 'treeName' can be found.
            # This is an implicit check for broken, recovered or otherwise corrupted root files.
            treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects = [treeName] ) else None

            if treeFile is None: logger.warning( "File %s looks broken. Checked for presence of tree %s.", treeFiles[n] , treeName )

            # If both, normalization and treefile are OK call it successful.
            if sumW and treeFile:
                files.append( treeFile )
                normalization += sumW
                logger.debug( "Successfully read job %i and incremented normalization by %7.2f",  n, sumW )
            else:
                failedJobs.append( n )

        # Don't allow empty samples
        if len(files) == 0:
            raise helpers.EmptySampleError(
                "Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"
                .format(name, len(pairs), baseDirectory))

        # Log statements (n_jobs > 0 here because at least one file was accepted)
        eff = 100*len(failedJobs)/float( n_jobs )
        logger.debug("Loaded CMGOutput sample %s. Total number of  jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                          name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

        logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
        return cls( name = name, treeName = treeName, files = files, normalization = normalization, 
                selectionString = selectionString, weightString = weightString, 
                isData = isData, color = color, texName = texName )
Example #2
0
    def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None, xSection = -1,\
            selectionString = None, weightString = None,
            isData = False, color = 0, texName = None):
        '''Load a CMG crab output directory.

        Walks baseDirectory recursively and collects files whose base name ends in '_<n>'
        (n being the integer job number) with extension '.root' or '.tgz'. A job is used only
        if both files exist, a normalization can be read from a 'SkimReport.txt' member of the
        tgz archive, and the root file passes helpers.checkRootFile for the tree 'treeName'.

        Arguments:
            name            -- name passed on to the constructor
            baseDirectory   -- directory that is searched recursively
            treeFilename    -- unused here; kept for signature compatibility
            treeName        -- name of the tree that must be present in each root file
            maxN            -- if a positive number, process at most maxN jobs (lowest job numbers first)
            xSection        -- passed on to the constructor unchanged
            selectionString, weightString, isData, color, texName -- passed on to the constructor

        Returns cls(...) built from the good files and the summed normalization.
        Raises helpers.EmptySampleError if no job yields both a normalization and a good tree file.
        '''
        import tarfile
        from cmg_helpers import read_cmg_normalization

        maxN = maxN if maxN is not None and maxN>0 else None

        # Walk through all subdirectories and pick up pairs of files '..._n.root and ..._n.tgz where n is the job number'
        treeFiles = {}
        zipFiles  = {}
        for root, subFolders, filenames in os.walk( baseDirectory ):
            for filename in filenames:
                base, ext = os.path.splitext( filename )
                if ext not in ('.root', '.tgz'):
                    continue
                try:
                    n = int(base.split('_')[-1])
                except ValueError:
                    # filename is not of the form 'xyz_n' where n is the job number
                    continue
                # add the tgz and root files to the dicts.
                filename_ = os.path.join(root, filename)
                if ext=='.root':
                    treeFiles[n] = filename_
                else:
                    zipFiles[n] = filename_

        # Find pairs of zip and root files
        pairs = set(zipFiles) & set(treeFiles)
        n_jobs = len( set(zipFiles) | set(treeFiles) )

        # Process jobs in a deterministic order and honour maxN
        jobs = sorted(pairs)
        if maxN is not None:
            jobs = jobs[:maxN]

        normalization = 0
        files = []
        failedJobs = []
        for n in jobs:
            sumW = None
            # 'with' guarantees the archive is closed even if reading the report raises
            with tarfile.open( zipFiles[n], 'r:gz' ) as tf:
                for f in tf.getmembers():
                    if "SkimReport.txt" not in f.name:
                        continue
                    report = tf.extractfile(f)
                    # extractfile returns None for members that are not files or links
                    if report is None:
                        continue
                    sumW = read_cmg_normalization(report)
                    if sumW is not None: break
            if sumW is None:
                logger.warning( "No normalization found when reading tar file %s", zipFiles[n] )

            # Check treefile for whether the tree 'treeName' can be found.
            # This is an implicit check for broken, recovered or otherwise corrupted root files.
            treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects = [treeName] ) else None

            if treeFile is None: logger.warning( "File %s looks broken. Checked for presence of tree %s.", treeFiles[n] , treeName )

            # If both, normalization and treefile are OK call it successful.
            if sumW and treeFile:
                files.append( treeFile )
                normalization += sumW
                logger.debug( "Successfully read job %i and incremented normalization by %7.2f",  n, sumW )
            else:
                failedJobs.append( n )

        # Don't allow empty samples
        if len(files) == 0:
            raise helpers.EmptySampleError(
                "Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"
                .format(name, len(pairs), baseDirectory))

        # Log statements (n_jobs > 0 here because at least one file was accepted)
        eff = 100*len(failedJobs)/float( n_jobs )
        logger.debug("Loaded CMGOutput sample %s. Total number of  jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                          name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

        logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
        return cls( name = name, treeName = treeName, files = files, normalization = normalization, xSection = xSection, 
                selectionString = selectionString, weightString = weightString, 
                isData = isData, color = color, texName = texName )
Example #3
0
    def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None, \
            selectionString = None, weightString = None, 
            isData = False, color = 0, texName = None):
        '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure. 
           Expects the presence of the tree root file and the SkimReport.txt

           Arguments:
               name          -- name passed on to the constructor
               baseDirectory -- directory whose immediate subdirectories are the chunk candidates
               treeFilename  -- file name of the tree file searched for inside each chunk directory
               chunkString   -- if set, only use subdirectories that equal chunkString or that
                                start with chunkString and end with '_Chunk'
               treeName      -- name of the tree that must be present in the tree file
               maxN          -- if a positive number, stop after collecting maxN chunk directories
               selectionString, weightString, isData, color, texName -- passed on to the constructor

           Returns cls(...) built from the good tree files and the summed normalization.
           Raises helpers.EmptySampleError if no chunk provides both a normalization and a good tree file.
        ''' 
        from cmg_helpers import read_cmg_normalization
        maxN = maxN if maxN is not None and maxN>0 else None

        # Reading all subdirectories in base directory. If chunkString != None, require cmg output name formatting
        chunkDirectories = []

        for x in os.listdir(baseDirectory): 
            if os.path.isdir(os.path.join(baseDirectory, x)):
                if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x==chunkString:
                    chunkDirectories.append(os.path.join(baseDirectory, x))
                    # maxN is None when unlimited, so this comparison is then never true
                    if len(chunkDirectories)==maxN:break

        logger.debug( "Found %i chunk directories with chunkString %s in base directory %s", \
                           len(chunkDirectories), chunkString, baseDirectory )
        normalization = 0
        files = []
        failedChunks=[]
        goodChunks  =[]

        for chunkDirectory in chunkDirectories:
            logger.debug("Reading chunk %s", chunkDirectory)

            sumW = None
            treeFile = None
            # A single walk over the chunk finds both the skim report and the tree file
            for root, subFolders, filenames in os.walk( chunkDirectory ):
                # Determine normalization constant
                if 'SkimReport.txt' in filenames:
                    skimReportFilename = os.path.join(root, 'SkimReport.txt')
                    with open(skimReportFilename, 'r') as fin:
                        sumW = read_cmg_normalization(fin)
                    if not sumW:
                        logger.warning( "Read chunk %s and found report '%s' but could not read normalization.",
                                             chunkDirectory, skimReportFilename )
                # Load tree file 
                if treeFilename in filenames:
                    candidate = os.path.join(root, treeFilename)
                    # Checking whether root file is OK and contains a tree.
                    # Only accept it if the check passes, so broken files never enter the sample.
                    if helpers.checkRootFile(candidate, checkForObjects=[treeName] ):
                        treeFile = candidate
                    else:
                        logger.warning( "Read chunk %s and found tree file '%s' but file looks broken.",  chunkDirectory, candidate )

            # If both, normalization and treefile are OK call it successful.
            if sumW and treeFile:
                files.append( treeFile )
                normalization += sumW
                logger.debug( "Successfully read chunk %s and incremented normalization by %7.2f",  chunkDirectory, sumW )
                goodChunks.append( chunkDirectory )
            else:
                failedChunks.append( chunkDirectory )

        # Don't allow empty samples
        if len(goodChunks) == 0:
            raise helpers.EmptySampleError(
                "Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"
                .format(name, len(chunkDirectories), baseDirectory))

        # Log statements (chunkDirectories is non-empty here because at least one chunk was good)
        eff = 100*len(failedChunks)/float( len(chunkDirectories) )
        logger.debug("Loaded CMGOutput sample %s. Total number of chunks : %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                          name, len(chunkDirectories), normalization, len(failedChunks), eff)

        for chunk in failedChunks:
            logger.debug( "Failed to load chunk %s", chunk)
        logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
        return cls( name = name, treeName = treeName, files = files, normalization = normalization, 
            selectionString = selectionString, weightString = weightString,
            isData = isData, color = color, texName = texName )
Example #4
0
    def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None, \
            selectionString = None, xSection = -1, weightString = None, 
            isData = False, color = 0, texName = None):
        '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure. 
           Expects the presence of the tree root file and the SkimReport.txt

           Arguments:
               name          -- name passed on to the constructor
               baseDirectory -- directory whose immediate subdirectories are the chunk candidates
               treeFilename  -- file name of the tree file searched for inside each chunk directory
               chunkString   -- if set, only use subdirectories that equal chunkString or that
                                start with chunkString and end with '_Chunk'
               treeName      -- name of the tree that must be present in the tree file
               maxN          -- if a positive number, stop after collecting maxN chunk directories
               xSection      -- passed on to the constructor unchanged
               selectionString, weightString, isData, color, texName -- passed on to the constructor

           Returns cls(...) built from the good tree files and the summed normalization.
           Raises helpers.EmptySampleError if no chunk provides both a normalization and a good tree file.
        ''' 
        from cmg_helpers import read_cmg_normalization
        maxN = maxN if maxN is not None and maxN>0 else None

        # Reading all subdirectories in base directory. If chunkString != None, require cmg output name formatting
        chunkDirectories = []

        for x in os.listdir(baseDirectory): 
            if os.path.isdir(os.path.join(baseDirectory, x)):
                if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x==chunkString:
                    chunkDirectories.append(os.path.join(baseDirectory, x))
                    # maxN is None when unlimited, so this comparison is then never true
                    if len(chunkDirectories)==maxN:break

        logger.debug( "Found %i chunk directories with chunkString %s in base directory %s", \
                           len(chunkDirectories), chunkString, baseDirectory )
        normalization = 0
        files = []
        failedChunks=[]
        goodChunks  =[]

        for chunkDirectory in chunkDirectories:
            logger.debug("Reading chunk %s", chunkDirectory)

            sumW = None
            treeFile = None
            # A single walk over the chunk finds both the skim report and the tree file
            for root, subFolders, filenames in os.walk( chunkDirectory ):
                # Determine normalization constant
                if 'SkimReport.txt' in filenames:
                    skimReportFilename = os.path.join(root, 'SkimReport.txt')
                    with open(skimReportFilename, 'r') as fin:
                        sumW = read_cmg_normalization(fin)
                    if not sumW:
                        logger.warning( "Read chunk %s and found report '%s' but could not read normalization.",
                                             chunkDirectory, skimReportFilename )
                # Load tree file 
                if treeFilename in filenames:
                    candidate = os.path.join(root, treeFilename)
                    # Checking whether root file is OK and contains a tree.
                    # Only accept it if the check passes, so broken files never enter the sample.
                    if helpers.checkRootFile(candidate, checkForObjects=[treeName] ):
                        treeFile = candidate
                    else:
                        logger.warning( "Read chunk %s and found tree file '%s' but file looks broken.",  chunkDirectory, candidate )

            # If both, normalization and treefile are OK call it successful.
            if sumW and treeFile:
                files.append( treeFile )
                normalization += sumW
                logger.debug( "Successfully read chunk %s and incremented normalization by %7.2f",  chunkDirectory, sumW )
                goodChunks.append( chunkDirectory )
            else:
                failedChunks.append( chunkDirectory )

        # Don't allow empty samples
        if len(goodChunks) == 0:
            raise helpers.EmptySampleError(
                "Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"
                .format(name, len(chunkDirectories), baseDirectory))

        # Log statements (chunkDirectories is non-empty here because at least one chunk was good)
        eff = 100*len(failedChunks)/float( len(chunkDirectories) )
        logger.debug("Loaded CMGOutput sample %s. Total number of chunks : %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                          name, len(chunkDirectories), normalization, len(failedChunks), eff)

        for chunk in failedChunks:
            logger.debug( "Failed to load chunk %s", chunk)
        logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
        return cls( name = name, treeName = treeName, files = files, normalization = normalization, 
            selectionString = selectionString, weightString = weightString, xSection = xSection,
            isData = isData, color = color, texName = texName )