def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG crab output directory.

    Walks baseDirectory recursively and pairs up files of the form '..._n.root' and '..._n.tgz',
    where n is the job number. For every pair the normalization is read from the first
    'SkimReport.txt' member of the tgz that yields one, and the root file is checked for the
    presence of treeName. Only jobs where both succeed contribute a file and a normalization.

    name             -- sample name, passed on to the constructor
    baseDirectory    -- directory that is searched recursively
    treeFilename     -- accepted for signature compatibility; not used by this loader
    treeName         -- name of the tree that must be present in each root file
    maxN             -- if positive, consider at most maxN jobs (lowest job numbers first);
                        None or a non-positive value means no limit
    selectionString, weightString, isData, color, texName -- passed on to the constructor unchanged

    Returns cls(...) built from the good files and the summed normalization.
    Raises helpers.EmptySampleError if no job provides both a normalization and a good tree file.
    '''
    import tarfile
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Walk through all subdirectories and pick up pairs of files '..._n.root and ..._n.tgz where n is the job number'
    treeFiles = {}
    zipFiles  = {}
    for root, subFolders, filenames in os.walk( baseDirectory ):
        for filename in filenames:
            base, ext = os.path.splitext( filename )
            try:
                n = int(base.split('_')[-1])
            except ValueError:
                # filename is not of the form 'xyz_n' where n is the job number
                continue
            # add the tgz and root files to the dicts
            filename_ = os.path.join(root, filename)
            if ext=='.root':
                treeFiles[n] = filename_
            elif ext=='.tgz':
                zipFiles[n] = filename_

    # Find pairs of zip and root files
    pairs  = set(zipFiles) & set(treeFiles)
    n_jobs = len( set(zipFiles) | set(treeFiles) )

    # Sort for a reproducible file order and honour maxN (a slice with None keeps everything)
    jobs = sorted(pairs)[:maxN]

    normalization = 0
    files = []
    failedJobs = []
    for n in jobs:
        sumW = None
        # The context manager closes the archive even if reading the report raises
        with tarfile.open( zipFiles[n], 'r:gz' ) as tf:
            for member in tf.getmembers():
                if "SkimReport.txt" not in member.name:
                    continue
                report = tf.extractfile(member)
                if report is None:
                    # extractfile returns None for members that are not regular files or links
                    continue
                sumW = read_cmg_normalization(report)
                if sumW is not None:
                    break
        if sumW is None:
            logger.warning( "No normalization found when reading tar file %s", zipFiles[n] )

        # Check treefile for whether the tree 'treeName' can be found.
        # This is an implicit check for broken, recovered or otherwise corrupted root files.
        treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects = [treeName] ) else None
        if treeFile is None:
            logger.warning( "File %s looks broken. Checked for presence of tree %s.", treeFiles[n] , treeName )

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read job %i and incremented normalization by %7.2f",  n, sumW )
        else:
            failedJobs.append( n )

    # Don't allow empty samples
    if len(files) == 0:
        raise helpers.EmptySampleError("Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"\
              .format(name, n_jobs, baseDirectory))

    # Log statements
    eff = 100*len(failedJobs)/float( n_jobs )
    # n_jobs counts every job number seen, len(pairs) only those with both a tgz and a root file
    logger.debug("Loaded CMGOutput sample %s. Total number of jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                      name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color = color, texName = texName )
def fromCMGCrabDirectory(cls, name, baseDirectory, treeFilename = 'tree.root', treeName = 'tree', maxN = None, xSection = -1,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG crab output directory.

    Walks baseDirectory recursively and pairs up files of the form '..._n.root' and '..._n.tgz',
    where n is the job number. For every pair the normalization is read from the first
    'SkimReport.txt' member of the tgz that yields one, and the root file is checked for the
    presence of treeName. Only jobs where both succeed contribute a file and a normalization.

    name             -- sample name, passed on to the constructor
    baseDirectory    -- directory that is searched recursively
    treeFilename     -- accepted for signature compatibility; not used by this loader
    treeName         -- name of the tree that must be present in each root file
    maxN             -- if positive, consider at most maxN jobs (lowest job numbers first);
                        None or a non-positive value means no limit
    xSection         -- passed on to the constructor unchanged (default -1)
    selectionString, weightString, isData, color, texName -- passed on to the constructor unchanged

    Returns cls(...) built from the good files and the summed normalization.
    Raises helpers.EmptySampleError if no job provides both a normalization and a good tree file.
    '''
    import tarfile
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Walk through all subdirectories and pick up pairs of files '..._n.root and ..._n.tgz where n is the job number'
    treeFiles = {}
    zipFiles  = {}
    for root, subFolders, filenames in os.walk( baseDirectory ):
        for filename in filenames:
            base, ext = os.path.splitext( filename )
            try:
                n = int(base.split('_')[-1])
            except ValueError:
                # filename is not of the form 'xyz_n' where n is the job number
                continue
            # add the tgz and root files to the dicts
            filename_ = os.path.join(root, filename)
            if ext=='.root':
                treeFiles[n] = filename_
            elif ext=='.tgz':
                zipFiles[n] = filename_

    # Find pairs of zip and root files
    pairs  = set(zipFiles) & set(treeFiles)
    n_jobs = len( set(zipFiles) | set(treeFiles) )

    # Sort for a reproducible file order and honour maxN (a slice with None keeps everything)
    jobs = sorted(pairs)[:maxN]

    normalization = 0
    files = []
    failedJobs = []
    for n in jobs:
        sumW = None
        # The context manager closes the archive even if reading the report raises
        with tarfile.open( zipFiles[n], 'r:gz' ) as tf:
            for member in tf.getmembers():
                if "SkimReport.txt" not in member.name:
                    continue
                report = tf.extractfile(member)
                if report is None:
                    # extractfile returns None for members that are not regular files or links
                    continue
                sumW = read_cmg_normalization(report)
                if sumW is not None:
                    break
        if sumW is None:
            logger.warning( "No normalization found when reading tar file %s", zipFiles[n] )

        # Check treefile for whether the tree 'treeName' can be found.
        # This is an implicit check for broken, recovered or otherwise corrupted root files.
        treeFile = treeFiles[n] if helpers.checkRootFile(treeFiles[n], checkForObjects = [treeName] ) else None
        if treeFile is None:
            logger.warning( "File %s looks broken. Checked for presence of tree %s.", treeFiles[n] , treeName )

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read job %i and incremented normalization by %7.2f",  n, sumW )
        else:
            failedJobs.append( n )

    # Don't allow empty samples
    if len(files) == 0:
        raise helpers.EmptySampleError("Could not find valid crab CMG output for sample {0}. Total number of jobs: {1}. baseDirectory: {2}"\
              .format(name, n_jobs, baseDirectory))

    # Log statements
    eff = 100*len(failedJobs)/float( n_jobs )
    # n_jobs counts every job number seen, len(pairs) only those with both a tgz and a root file
    logger.debug("Loaded CMGOutput sample %s. Total number of jobs: %i, both tgz and root: %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                      name, n_jobs, len(pairs), normalization, len(failedJobs), eff)

    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization, xSection = xSection,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color = color, texName = texName )
def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None,
        selectionString = None, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure.
       Expects the presence of the tree root file and the SkimReport.txt

    name             -- sample name, passed on to the constructor
    baseDirectory    -- directory whose immediate subdirectories are the chunk candidates
    treeFilename     -- file name of the tree file searched for inside each chunk
    chunkString      -- if set, only subdirectories that equal chunkString, or that start with
                        chunkString and end with '_Chunk', are used
    treeName         -- name of the tree that must be present in the tree file
    maxN             -- if positive, use at most maxN chunk directories (in sorted order);
                        None or a non-positive value means no limit
    selectionString, weightString, isData, color, texName -- passed on to the constructor unchanged

    Returns cls(...) built from the good tree files and the summed normalization.
    Raises helpers.EmptySampleError if no chunk provides both a normalization and a good tree file.
    '''
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Reading all subdirectories in base directory. If chunkString != None, require cmg output name formatting
    chunkDirectories = []

    # FIXME: Better to loop only over subdirectories in base directory?
    # os.listdir returns entries in arbitrary order; sort so that the maxN selection is reproducible
    for x in sorted(os.listdir(baseDirectory)):
        if os.path.isdir(os.path.join(baseDirectory, x)):
            if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x==chunkString:
                chunkDirectories.append(os.path.join(baseDirectory, x))
                if len(chunkDirectories)==maxN:break

    logger.debug( "Found %i chunk directories with chunkString %s in base directory %s", \
                       len(chunkDirectories), chunkString, baseDirectory )

    normalization = 0
    files = []
    failedChunks=[]
    goodChunks  =[]

    for chunkDirectory in chunkDirectories:
        logger.debug("Reading chunk %s", chunkDirectory)

        # Find normalization. If several reports exist, the last one visited wins.
        sumW = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Determine normalization constant
            if 'SkimReport.txt' in filenames:
                skimReportFilename = os.path.join(root, 'SkimReport.txt')
                with open(skimReportFilename, 'r') as fin:
                    sumW = read_cmg_normalization(fin)
                if not sumW:
                    logger.warning( "Read chunk %s and found report '%s' but could not read normalization.",
                                         chunkDirectory, skimReportFilename )

        # Find treefile. If several tree files exist, the last one visited wins.
        treeFile = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Load tree file
            if treeFilename in filenames:
                treeFile = os.path.join(root, treeFilename)
                # Checking whether root file is OK and contains a tree
                if not helpers.checkRootFile(treeFile, checkForObjects=[treeName] ):
                    logger.warning( "Read chunk %s and found tree file '%s' but file looks broken.",  chunkDirectory, treeFile )
                    # Discard the broken file so that the chunk is counted as failed below
                    treeFile = None

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read chunk %s and incremented normalization by %7.2f",  chunkDirectory, sumW )
            goodChunks.append( chunkDirectory )
        else:
            failedChunks.append( chunkDirectory )

    # Don't allow empty samples
    if len(goodChunks) == 0:
        raise helpers.EmptySampleError("Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"\
              .format(name, len(chunkDirectories), baseDirectory))

    # Log statements
    eff = 100*len(failedChunks)/float( len(chunkDirectories) )
    logger.debug("Loaded CMGOutput sample %s. Total number of chunks : %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                      name, len(chunkDirectories), normalization, len(failedChunks), eff)

    for chunk in failedChunks:
        logger.debug( "Failed to load chunk %s", chunk)
    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString,
        isData = isData, color = color, texName = texName )
def fromCMGOutput(cls, name, baseDirectory, treeFilename = 'tree.root', chunkString = None, treeName = 'tree', maxN = None,
        selectionString = None, xSection = -1, weightString = None, isData = False, color = 0, texName = None):
    '''Load a CMG output directory from e.g. unzipped crab output in the 'Chunks' directory structure.
       Expects the presence of the tree root file and the SkimReport.txt

    name             -- sample name, passed on to the constructor
    baseDirectory    -- directory whose immediate subdirectories are the chunk candidates
    treeFilename     -- file name of the tree file searched for inside each chunk
    chunkString      -- if set, only subdirectories that equal chunkString, or that start with
                        chunkString and end with '_Chunk', are used
    treeName         -- name of the tree that must be present in the tree file
    maxN             -- if positive, use at most maxN chunk directories (in sorted order);
                        None or a non-positive value means no limit
    xSection         -- passed on to the constructor unchanged (default -1)
    selectionString, weightString, isData, color, texName -- passed on to the constructor unchanged

    Returns cls(...) built from the good tree files and the summed normalization.
    Raises helpers.EmptySampleError if no chunk provides both a normalization and a good tree file.
    '''
    from cmg_helpers import read_cmg_normalization

    maxN = maxN if maxN is not None and maxN>0 else None

    # Reading all subdirectories in base directory. If chunkString != None, require cmg output name formatting
    chunkDirectories = []

    # FIXME: Better to loop only over subdirectories in base directory?
    # os.listdir returns entries in arbitrary order; sort so that the maxN selection is reproducible
    for x in sorted(os.listdir(baseDirectory)):
        if os.path.isdir(os.path.join(baseDirectory, x)):
            if not chunkString or (x.startswith(chunkString) and x.endswith('_Chunk')) or x==chunkString:
                chunkDirectories.append(os.path.join(baseDirectory, x))
                if len(chunkDirectories)==maxN:break

    logger.debug( "Found %i chunk directories with chunkString %s in base directory %s", \
                       len(chunkDirectories), chunkString, baseDirectory )

    normalization = 0
    files = []
    failedChunks=[]
    goodChunks  =[]

    for chunkDirectory in chunkDirectories:
        logger.debug("Reading chunk %s", chunkDirectory)

        # Find normalization. If several reports exist, the last one visited wins.
        sumW = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Determine normalization constant
            if 'SkimReport.txt' in filenames:
                skimReportFilename = os.path.join(root, 'SkimReport.txt')
                with open(skimReportFilename, 'r') as fin:
                    sumW = read_cmg_normalization(fin)
                if not sumW:
                    logger.warning( "Read chunk %s and found report '%s' but could not read normalization.",
                                         chunkDirectory, skimReportFilename )

        # Find treefile. If several tree files exist, the last one visited wins.
        treeFile = None
        for root, subFolders, filenames in os.walk( chunkDirectory ):
            # Load tree file
            if treeFilename in filenames:
                treeFile = os.path.join(root, treeFilename)
                # Checking whether root file is OK and contains a tree
                if not helpers.checkRootFile(treeFile, checkForObjects=[treeName] ):
                    logger.warning( "Read chunk %s and found tree file '%s' but file looks broken.",  chunkDirectory, treeFile )
                    # Discard the broken file so that the chunk is counted as failed below
                    treeFile = None

        # If both, normalization and treefile are OK call it successful.
        if sumW and treeFile:
            files.append( treeFile )
            normalization += sumW
            logger.debug( "Successfully read chunk %s and incremented normalization by %7.2f",  chunkDirectory, sumW )
            goodChunks.append( chunkDirectory )
        else:
            failedChunks.append( chunkDirectory )

    # Don't allow empty samples
    if len(goodChunks) == 0:
        raise helpers.EmptySampleError("Could not find good CMGOutput chunks for sample {0}. Total number of chunks: {1}. baseDirectory: {2}"\
              .format(name, len(chunkDirectories), baseDirectory))

    # Log statements
    eff = 100*len(failedChunks)/float( len(chunkDirectories) )
    logger.debug("Loaded CMGOutput sample %s. Total number of chunks : %i. Normalization: %7.2f Bad: %i. Inefficiency: %3.3f", \
                      name, len(chunkDirectories), normalization, len(failedChunks), eff)

    for chunk in failedChunks:
        logger.debug( "Failed to load chunk %s", chunk)
    logger.debug( "Read %i chunks and total normalization of %f", len(files), normalization )
    return cls( name = name, treeName = treeName, files = files, normalization = normalization,
        selectionString = selectionString, weightString = weightString, xSection = xSection,
        isData = isData, color = color, texName = texName )