def sendSkims(in_dir, num_jobs, cut, out_parent, file_tag, overwrite, cache):
    """Split the ROOT files in in_dir across num_jobs batch skim jobs.

    in_dir     -- directory holding the input *.root files
    num_jobs   -- number of batch jobs to distribute the files over
    cut        -- selection string handed to the skimming script
    out_parent -- parent dir for the output; if None it is derived from the
                  standard babymaker directory layout of in_dir
    file_tag   -- substring selecting which input files to skim
    overwrite  -- if True, resubmit even when the output file exists
    cache      -- if True, the generated jobs run through the local cache
    """
    in_dir = utilities.fullPath(in_dir)
    skim_name = getSkimName(cut)
    if out_parent is None:  # identity test instead of "== None"
        # Derive the .../babymaker/babies/YYYY_MM_DD/... parent from in_dir.
        dir_pat = re.compile(
            "(.*?/cms[0-9]+/cms[0-9]+r0/babymaker/babies/[0-9]{4}_[0-9]{2}_[0-9]{2}/.*?)/"
        )
        match = dir_pat.search(in_dir + "/")
        out_parent = match.group(0)
    out_dir = os.path.join(out_parent, "skim_" + skim_name)
    # glob.glob already returns a list; the former wrapping comprehension
    # ([f for f in glob.glob(...)]) was redundant.
    in_files = glob.glob(
        utilities.fullPath(os.path.join(in_dir, "*" + file_tag + "*.root")))
    out_files = [
        f.replace(in_dir, out_dir).replace(".root", "_" + skim_name + ".root")
        for f in in_files
    ]
    in_files = splitJobs(in_files, num_jobs)
    out_files = splitJobs(out_files, num_jobs)
    total_jobs = 0
    for ijob in xrange(len(in_files)):
        total_jobs += 1
        sendSkimJob(
            in_files[ijob], out_files[ijob], cut, overwrite, cache,
            skim_name + "_" + file_tag + "_" + str(ijob) + "_of_" +
            str(num_jobs) + ".py")
    print("Submitted " + str(total_jobs) + " jobs.")
    print("Output sent to {}".format(out_dir))
def cacheCopy(src, dst, min_free, file_map, no_delete): #Cache a copy of src if possible, removing old files from cache if necessary src_size = os.stat( src ).st_size * 2 #Safety factor of 2 to account for file growth if cached copy is modified du = os.statvfs(utilities.fullPath("/scratch/babymaker")) avail = du.f_bsize * du.f_bavail while avail - src_size < min_free: #Keep deleting until there's room if no_delete: return removed_file = removeOldCache(file_map) if not removed_file: return du = os.statvfs(utilities.fullPath("/scratch/babymaker")) avail = du.f_bsize * du.f_bavail print("Caching " + src + " to " + dst + "\n") try: shutil.copy(src, dst) os.chmod(dst, 0775) while not cacheUpToDate(dst, src): now = time.time() os.utime(dst, (now, now)) except: os.remove(dst) utilities.ePrint("Failed to cache " + src + " to " + dst + "\n") raise
def removeOldCache(file_map): #Deletes oldest cached file
    # Scan the whole cache tree for the least-recently-used file that is not
    # an active cache target (i.e. not among file_map's values).
    found_file = False
    oldest_mod_time = 0
    oldest_path = ""
    for root, dirs, files in os.walk(utilities.fullPath("/scratch/babymaker")):
        for f in files:
            path = os.path.join(root, f)
            if path in file_map.itervalues():
                continue  # in active use by the current run; never evict
            mod_time = lastTime(path)
            if mod_time < oldest_mod_time or not found_file:
                found_file = True
                oldest_mod_time = mod_time
                oldest_path = path
    if time.time()-oldest_mod_time <= 86400.: #Don't delete files used in last 24 hours
        return False
    oldest_path = utilities.fullPath(oldest_path)
    if found_file:
        # Returns True only if a file was actually deleted, so the caller
        # knows whether retrying frees more space.
        print("Deleting "+oldest_path+" from cache\n")
        try:
            os.remove(oldest_path)
        except:
            return False
        # Prune newly-empty parent directories. The first rmdir targets the
        # just-removed file's own path and fails; rmdir on any non-empty
        # directory also raises OSError, which is deliberately suppressed.
        # The finally clause guarantees the walk moves up one level each
        # iteration until the filesystem root is reached.
        while oldest_path != "/" and oldest_path != "":
            try:
                os.rmdir(oldest_path)
            except OSError:
                pass
            finally:
                oldest_path = os.path.dirname(oldest_path)
        return True
    else:
        return False
def cacheCopy(src, dst, min_free, file_map, no_delete): #Cache a copy of src if possible, removing old files from cache if necessary
    src_size = os.stat(src).st_size * 2 #Safety factor of 2 to account for file growth if cached copy is modified
    du = os.statvfs(utilities.fullPath("/scratch/babymaker"))
    avail = du.f_bsize*du.f_bavail
    # Evict least-recently-used cache entries until at least min_free bytes
    # would remain after copying; give up (no copy) if eviction is disabled
    # or nothing more can be removed.
    while avail-src_size < min_free: #Keep deleting until there's room
        if no_delete:
            return
        removed_file = removeOldCache(file_map)
        if not removed_file:
            return
        du = os.statvfs(utilities.fullPath("/scratch/babymaker"))
        avail = du.f_bsize*du.f_bavail
    print("Caching "+src+" to "+dst+"\n")
    try:
        shutil.copy(src, dst)
        os.chmod(dst, 0775)
        # Keep refreshing dst's timestamps until cacheUpToDate agrees the
        # cached copy is current relative to src.
        while not cacheUpToDate(dst, src):
            now = time.time()
            os.utime(dst, (now, now))
    except:
        # NOTE(review): if shutil.copy failed before creating dst, this
        # os.remove raises and masks the original exception — confirm.
        os.remove(dst)
        utilities.ePrint("Failed to cache "+src+" to "+dst+"\n")
        raise
def removeOldCache(file_map): #Deletes oldest cached file
    # Find the least-recently-used cached file that is not an active cache
    # target (a value of file_map); returns True iff a file was deleted.
    found_file = False
    oldest_mod_time = 0
    oldest_path = ""
    for root, dirs, files in os.walk(utilities.fullPath("/scratch/babymaker")):
        for f in files:
            path = os.path.join(root, f)
            if path in file_map.itervalues():
                continue  # currently mapped by this run; never evict
            mod_time = lastTime(path)
            if mod_time < oldest_mod_time or not found_file:
                found_file = True
                oldest_mod_time = mod_time
                oldest_path = path
    if time.time() - oldest_mod_time <= 86400.: #Don't delete files used in last 24 hours
        return False
    oldest_path = utilities.fullPath(oldest_path)
    if found_file:
        print("Deleting " + oldest_path + " from cache\n")
        try:
            os.remove(oldest_path)
        except:
            return False
        # Walk up from the deleted file pruning empty directories; rmdir on a
        # non-empty (or nonexistent) path raises OSError, which is ignored.
        # The finally clause always advances one directory level upward.
        while oldest_path != "/" and oldest_path != "":
            try:
                os.rmdir(oldest_path)
            except OSError:
                pass
            finally:
                oldest_path = os.path.dirname(oldest_path)
        return True
    else:
        return False
def expand(files):
    """Glob-expand each pattern in files into a flat list of full paths.

    A pattern that matches nothing is kept as-is (after path normalization),
    so non-glob arguments pass through unchanged.
    """
    result = []
    for pattern in files:
        matches = glob.glob(pattern)
        if not matches:
            matches = [pattern]
        result.extend(utilities.fullPath(m) for m in matches)
    return result
def sendSlimJobs(input_dir, skims, slims, overwrite, output_dir):
    """Submit one slimming job for every (skim directory, slim file) pair.

    skims/slims are glob patterns; an empty skims list means every
    "skim_*" directory under input_dir.
    """
    input_dir = utilities.fullPath(input_dir)
    if skims == []:
        skims = ["*"]  # default to all skim_* directories
    skim_dirs = []
    for pattern in skims:
        for match in glob.glob(os.path.join(input_dir, "skim_" + pattern)):
            skim_dirs.append(utilities.fullPath(match))
    slim_files = []
    for pattern in slims:
        for match in glob.glob(pattern):
            slim_files.append(utilities.fullPath(match))
    total_jobs = 0
    for slim_file in slim_files:
        for skim_dir in skim_dirs:
            total_jobs += sendSlimJob(skim_dir, slim_file, overwrite, output_dir)
    print("Submitted " + str(total_jobs) + " jobs.")
def sendSkimJob(in_files, out_files, cut, overwrite, cache, exe_name):
    # Generate a standalone python driver script (named exe_name, placed in a
    # "run" dir next to the outputs) and submit it to the batch system.
    python_dir = utilities.fullPath(os.path.dirname(__file__))
    run_dir = os.path.join(os.path.dirname(out_files[0]), "run")
    utilities.ensureDir(run_dir)
    run_file = os.path.join(run_dir, exe_name)
    with open(run_file, "w") as f:
        f.write('#! /usr/bin/env python\n')
        f.write('import sys\n')
        f.write('sys.path.append("' + python_dir + '")\n')
        f.write('import subprocess\n')
        f.write('import cache\n')
        for in_file, out_file in itertools.izip(in_files, out_files):
            if os.path.exists(out_file) and not overwrite:
                continue  # keep existing output unless overwriting
            if cache:
                # Run skim_ntuple.py through the cache layer (caching both the
                # output and input files of this pair).
                f.write('cache.cacheRun(["' + out_file + '","' + in_file + '"],["' + os.path.join(python_dir, 'skim_ntuple.py') + '","' + cut + '","' + out_file + '","' + in_file + '"],False,10000000000,0.5,False)\n')
            else:
                # Run skim_ntuple.py directly, without caching.
                f.write('subprocess.call(["' + os.path.join(python_dir, 'skim_ntuple.py') + '","' + cut + '","' + out_file + '","' + in_file + '"])\n')
    os.chmod(run_file, 0755)
    subprocess.call(["JobSubmit.csh", "run/wrapper.sh", run_file])
def mapFiles(command, file_map): #Replace executable arguments with cached equivalent
    # First pass: glob-expand any argument that names existing file(s);
    # non-matching arguments are passed through untouched.
    expanded_args = []
    for arg in command:
        globbed = glob.glob(arg)
        if len(globbed) > 0: #Argument represents file(s)
            for f in globbed:
                expanded_args.append(utilities.fullPath(f))
        else:
            expanded_args.append(arg)
    command = []
    # Inverse map: cached path -> original network path, seeded from file_map.
    inv_file_map = dict((cached,net) for net,cached in file_map.iteritems())
    for arg in expanded_args:
        if arg in file_map and cacheUpToDate(file_map[arg], arg): #Check if generated cache for file
            command.append(file_map[arg])
        elif isNetFile(arg): #Check if pre-existing cache
            cache_path = cachePath(arg)
            if cacheUpToDate(cache_path, arg):
                command.append(cache_path)
                inv_file_map[cache_path] = arg
            else:
                command.append(arg)  # stale cache; use the network path
        else:
            command.append(arg)  # not a cacheable file argument
    # Returns the rewritten command plus the cached->network path mapping.
    return command, inv_file_map
def mapFiles(command, file_map):
    """Rewrite command-line arguments to point at cached copies when current.

    Glob-expands file-like arguments, substitutes up-to-date cached paths,
    and returns (rewritten command, mapping of cached path -> network path).
    """
    # Expand glob-style arguments into concrete full paths.
    args = []
    for token in command:
        hits = glob.glob(token)
        if hits:
            args.extend(utilities.fullPath(hit) for hit in hits)
        else:
            args.append(token)
    inv_file_map = dict((cached, net) for net, cached in file_map.iteritems())
    mapped = []
    for token in args:
        if token in file_map and cacheUpToDate(file_map[token], token):
            # A cache generated for this run exists and is current.
            mapped.append(file_map[token])
            continue
        if isNetFile(token):
            # Otherwise try a pre-existing cache entry for this network file.
            candidate = cachePath(token)
            if cacheUpToDate(candidate, token):
                mapped.append(candidate)
                inv_file_map[candidate] = token
                continue
        mapped.append(token)
    return mapped, inv_file_map
def skimFiles(in_files, out_file, cut, keep_existing):
    """Chain in_files into one TTree, apply cut, and write the skim to out_file.

    in_files      -- input ROOT file paths (chained as tree "tree")
    out_file      -- output ROOT file path (parent dirs created as needed)
    cut           -- selection, expanded through expandCut before use
    keep_existing -- if True and out_file exists, do nothing
    """
    in_files = [ utilities.fullPath(in_file) for in_file in in_files ]
    out_file = utilities.fullPath(out_file)
    utilities.ensureDir(os.path.dirname(out_file))
    cut = expandCut(cut)
    # Single-string prints: this is Python 2 code (the file uses print
    # statements elsewhere), so multi-argument print(...) would emit a
    # tuple repr like ('INPUT FILES:', [...], '\n') instead of the message.
    print("INPUT FILES: " + str(in_files) + "\n")
    print("OUTPUT FILE: " + out_file + "\n")
    print(" CUT: " + cut + "\n")
    if keep_existing and os.path.exists(out_file):
        print("Keeping pre-existing "+out_file+"\n")
        return
    in_tree = ROOT.TChain("tree", "tree")
    for in_file in in_files:
        in_tree.Add(in_file)
    with utilities.ROOTFile(out_file, "recreate") as out:
        out_tree = in_tree.CopyTree(cut)
        out_tree.Write()
def skimFiles(in_files, out_file, cut, keep_existing):
    """Chain in_files into one TTree, apply cut, and write the skim to out_file.

    in_files      -- input ROOT file paths (chained as tree "tree")
    out_file      -- output ROOT file path (parent dirs created as needed)
    cut           -- selection, expanded through expandCut before use
    keep_existing -- if True and out_file exists, do nothing
    """
    in_files = [utilities.fullPath(in_file) for in_file in in_files]
    out_file = utilities.fullPath(out_file)
    utilities.ensureDir(os.path.dirname(out_file))
    cut = expandCut(cut)
    # Single-string prints: this is Python 2 code (the file uses print
    # statements elsewhere), so multi-argument print(...) would emit a
    # tuple repr like ('INPUT FILES:', [...], '\n') instead of the message.
    print("INPUT FILES: " + str(in_files) + "\n")
    print("OUTPUT FILE: " + out_file + "\n")
    print(" CUT: " + cut + "\n")
    if keep_existing and os.path.exists(out_file):
        print("Keeping pre-existing " + out_file + "\n")
        return
    in_tree = ROOT.TChain("tree", "tree")
    for in_file in in_files:
        in_tree.Add(in_file)
    with utilities.ROOTFile(out_file, "recreate") as out:
        out_tree = in_tree.CopyTree(cut)
        out_tree.Write()
def cacheRun(caches, command, fragile, abs_limit, rel_limit, no_delete):
    # Install the handler on every catchable signal so cached state can be
    # cleaned up on termination (SIGKILL/SIGSTOP can never be caught; the
    # SIG_ prefix filters out SIG_DFL/SIG_IGN, which are not signal numbers).
    for s in [sig for sig in dir(signal) if sig.startswith("SIG") and not sig.startswith("SIG_") and sig!="SIGKILL" and sig!="SIGSTOP"]:
        signum = getattr(signal, s)
        signal.signal(signum,signalHandler)
    if not os.path.isdir("/scratch/babymaker"):
        # No local cache area on this machine: run the command uncached.
        cacheRecurse([], dict(), command, True, 0, True)
        return
    caches = expand(caches)
    du = os.statvfs(utilities.fullPath("/scratch/babymaker"))
    # Required free space is the larger of the absolute byte limit and a
    # fraction (rel_limit) of the total filesystem size.
    min_free = max(abs_limit, du.f_bsize*du.f_blocks*rel_limit)
    cacheRecurse(caches, dict(), command, fragile, min_free, no_delete)
def sendSkims(in_dir, num_jobs, cut, out_parent, file_tag, overwrite):
    """Split the ROOT files in in_dir across num_jobs batch skim jobs.

    in_dir     -- directory holding the input *.root files
    num_jobs   -- number of batch jobs to distribute the files over
    cut        -- selection string handed to the skimming script
    out_parent -- parent dir for the output; if None it is derived from the
                  standard babymaker directory layout of in_dir
    file_tag   -- substring selecting which input files to skim
    overwrite  -- if True, resubmit even when the output file exists
    """
    in_dir = utilities.fullPath(in_dir)
    skim_name = getSkimName(cut)
    if out_parent is None:  # identity test instead of "== None"
        # Derive the .../babymaker/babies/YYYY_MM_DD/... parent from in_dir.
        dir_pat = re.compile("(.*?/cms[0-9]+/cms[0-9]+r0/babymaker/babies/[0-9]{4}_[0-9]{2}_[0-9]{2}/.*?)/")
        match = dir_pat.search(in_dir+"/")
        out_parent = match.group(0)
    out_dir = os.path.join(out_parent,"skim_"+skim_name)
    # glob.glob already returns a list; the former wrapping comprehension
    # ([f for f in glob.glob(...)]) was redundant.
    in_files = glob.glob(utilities.fullPath(os.path.join(in_dir, "*"+file_tag+"*.root")))
    out_files = [ f.replace(in_dir, out_dir).replace(".root","_"+skim_name+".root") for f in in_files ]
    in_files = splitJobs(in_files, num_jobs)
    out_files = splitJobs(out_files, num_jobs)
    total_jobs = 0
    for ijob in xrange(len(in_files)):
        total_jobs += 1
        sendSkimJob(in_files[ijob], out_files[ijob], cut, overwrite,
                    skim_name+"_"+file_tag+"_"+str(ijob)+"_of_"+str(num_jobs)+".py")
    print("Submitted "+str(total_jobs)+" jobs.")
def cacheRun(caches, command, fragile, abs_limit, rel_limit, no_delete):
    """Run command through the file cache, cleaning up on catchable signals."""
    # Register the handler on every real signal except the two that can
    # never be caught (the SIG_ prefix excludes SIG_DFL/SIG_IGN).
    catchable = [name for name in dir(signal)
                 if name.startswith("SIG")
                 and not name.startswith("SIG_")
                 and name != "SIGKILL"
                 and name != "SIGSTOP"]
    for name in catchable:
        signal.signal(getattr(signal, name), signalHandler)
    if not os.path.isdir("/scratch/babymaker"):
        # No scratch area available: execute the command without caching.
        cacheRecurse([], dict(), command, True, 0, True)
        return
    caches = expand(caches)
    stats = os.statvfs(utilities.fullPath("/scratch/babymaker"))
    # Keep free the larger of the absolute limit and rel_limit of the disk.
    min_free = max(abs_limit, stats.f_bsize * stats.f_blocks * rel_limit)
    cacheRecurse(caches, dict(), command, fragile, min_free, no_delete)
def killZombies(in_dirs):
    # Recursively delete every .root file under the given glob patterns that
    # ROOT cannot open cleanly (zombie or unopenable files).
    in_dirs = [ utilities.fullPath(d) for sublist in in_dirs for d in glob.glob(sublist) ]
    ROOT.gErrorIgnoreLevel = 6000  # silence ROOT error output while probing
    for d in in_dirs:
        for root, dirs, files in os.walk(d):
            print "In "+root
            for f in files:
                path = os.path.join(root, f)
                if os.path.splitext(f)[1] != ".root":
                    continue  # only ROOT files are checked
                tfile = ROOT.TFile(path, "read")
                kill = tfile.IsZombie() or not tfile.IsOpen()
                tfile.Close()
                if kill:
                    print "Removing "+path
                    os.remove(path)
def killZombies(in_dirs):
    """Delete every .root file under the given glob patterns that ROOT
    cannot open cleanly (zombie or unopenable files)."""
    directories = []
    for pattern in in_dirs:
        for match in glob.glob(pattern):
            directories.append(utilities.fullPath(match))
    ROOT.gErrorIgnoreLevel = 6000  # suppress ROOT error chatter while probing
    for directory in directories:
        for dirpath, subdirs, filenames in os.walk(directory):
            print("In " + dirpath)
            for filename in filenames:
                if os.path.splitext(filename)[1] != ".root":
                    continue
                full_path = os.path.join(dirpath, filename)
                probe = ROOT.TFile(full_path, "read")
                is_bad = probe.IsZombie() or not probe.IsOpen()
                probe.Close()
                if is_bad:
                    print("Removing " + full_path)
                    os.remove(full_path)
def sendSkimJob(in_files, out_files, cut, overwrite, exe_name):
    # Generate a standalone python driver script (named exe_name, placed in a
    # "run" dir next to the outputs) and submit it to the batch system.
    python_dir = utilities.fullPath(os.path.dirname(__file__))
    run_dir = os.path.join(os.path.dirname(out_files[0]), "run")
    utilities.ensureDir(run_dir)
    run_file = os.path.join(run_dir, exe_name)
    with open(run_file, "w") as f:
        f.write('#! /usr/bin/env python\n')
        f.write('import sys\n')
        f.write('sys.path.append("'+python_dir+'")\n')
        f.write('import cache\n')
        for in_file, out_file in itertools.izip(in_files, out_files):
            if os.path.exists(out_file) and not overwrite:
                continue  # keep existing output unless overwriting
            # Each pair runs skim_ntuple.py through the cache layer, caching
            # both the output and input file of this pair.
            f.write('cache.cacheRun(["'+out_file+'","'+in_file+'"],["'
                    +os.path.join(python_dir,'skim_ntuple.py')
                    +'","'+cut+'","'+out_file+'","'+in_file
                    +'"],False,10000000000,0.5,False)\n')
    os.chmod(run_file, 0755)
    subprocess.call(["JobSubmit.csh","run/wrapper.sh",run_file])
def cachePath(path):
    """Map a network file path to its location under the scratch cache.

    The first five characters of *path* are dropped before joining —
    presumably stripping a "/net/"-style prefix; verify against callers.
    """
    tail = path[5:]
    return os.path.join(utilities.fullPath("/scratch/babymaker"), tail)
def deleteTreeglobal(in_files):
    """Glob-expand the given patterns, then run recursiveDelete on each match."""
    # Expand every pattern up front so deletions cannot change what the
    # later globs would have matched.
    targets = []
    for pattern in in_files:
        targets.extend(utilities.fullPath(match) for match in glob.glob(pattern))
    for target in targets:
        recursiveDelete(target)