def copylist(fname="",
             jobs=InputArgument(1, "Number of parallel jobs to use", ["--njobs", "-j"], int)):
    """Takes a text file and downloads the files from grid"""
    if jobs is None:
        jobs = 1
    verbose_msg("Copying files from list", fname, "with", jobs, "jobs")
    fname = path.normpath(fname)
    if not path.isfile(fname):
        warning_msg("Input file", fname, "does not exist! Aborting")
        return
    sofar = copied(fname, "So far")
    f = open(fname, "r")
    Group = []
    for line in f:
        if "%" in line:
            msg("Character % encountered! Aborting")
            break
        if "#" in line:
            msg("Character # encountered! Skipping")
            continue
        line = "./" + line
        if jobs == 1:
            copyfile(line)
        else:
            Group.append(line)
    if jobs > 1:
        msg("Copying list in parallel with", jobs, "jobs")
        run_in_parallel(processes=jobs,
                        job_runner=copyfile,
                        job_arguments=Group,
                        job_message="Downloading files",
                        linearize_single_core=True)
    copied(fname, extra_msg="In recent run", last_time=sofar)

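# A minimal sketch (assumption) of what the run_in_parallel helper used above could
# look like if it were built on multiprocessing.Pool; the real helper is defined
# elsewhere in this package and may differ. run_in_parallel_sketch is a hypothetical name.
from multiprocessing import Pool


def run_in_parallel_sketch(processes, job_runner, job_arguments,
                           job_message="", linearize_single_core=True):
    print(job_message, "with", processes, "job(s)")
    if linearize_single_core and processes <= 1:
        # Run sequentially when a single core is requested
        return [job_runner(arg) for arg in job_arguments]
    with Pool(processes=processes) as pool:
        # Map the single-argument job over the argument list in parallel
        return pool.map(job_runner, job_arguments)
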
def build_list_of_files(file_list):
    verbose_msg("Building list of files from", file_list)
    # Check that the runlist does not have duplicates
    unique_file_list = set(file_list)
    if len(file_list) != len(unique_file_list):
        fatal_msg("Runlist has duplicated entries, fix runlist!",
                  len(unique_file_list), "unique files, while got",
                  len(file_list), "files")
    file_status = {"Does not exist": [],
                   "Cannot be open": [],
                   "Was recovered": [],
                   "Is Ok": []}
    if check_input_file_integrity:
        # Check that input files can be opened
        for i in file_list:
            verbose_msg("Checking that TFile", i.strip(), "can be processed")
            file_status[is_root_file_sane(i)].append(i)
    recovered_files = file_status["Was recovered"]
    not_readable = []
    for i in file_status:
        if i == "Is Ok":
            continue
        not_readable += file_status[i]
    if len(recovered_files) > 0:
        msg("Recovered", len(recovered_files), "files:\n",
            "\n".join(recovered_files))
    if len(not_readable) > 0:
        warning_msg(len(not_readable), "over", len(file_list),
                    "files cannot be read and will be skipped")
        for i in not_readable:
            if i not in file_list:
                warning_msg("did not find file to remove", f"'{i}'")
                continue
            file_list.remove(i)
    files_per_batch = []
    iter_file_list = iter(file_list)
    for i in range(0, len(file_list)):
        sub_set = list(islice(iter_file_list, batch_size))
        if len(sub_set) <= 0:
            continue
        files_per_batch.append(sub_set)
    run_list = []
    if len(files_per_batch) > 0:
        for i, lines in enumerate(files_per_batch):
            p = os.path.join(out_path, f"{i}")
            if not os.path.isdir(p):
                os.makedirs(p)
            run_list.append(os.path.join(p, f"ListForRun5Analysis.{i}.txt"))
            with open(run_list[-1], "w") as f:
                for j in lines:
                    f.write(j.strip() + "\n")
    msg("Number of runs:", len(run_list))
    return run_list

def build_list_of_files(file_list):
    # Check that the runlist does not have duplicates
    if len(file_list) != len(set(file_list)):
        fatal_msg("Runlist has duplicated entries, fix runlist!")
    not_readable = []
    for i in file_list:  # Check that input files can be opened
        f = TFile(i.strip(), "READ")
        if not f.IsOpen():
            verbose_msg("Cannot open AOD file:", i, color=bcolors.WARNING)
            not_readable.append(i)
    if len(not_readable) > 0:
        warning_msg(len(not_readable), "files cannot be read and will be skipped")
        for i in not_readable:
            file_list.remove(i)
    files_per_batch = []
    iter_file_list = iter(file_list)
    for i in range(0, len(file_list)):
        sub_set = list(islice(iter_file_list, batch_size))
        if len(sub_set) <= 0:
            continue
        files_per_batch.append(sub_set)
    run_list = []
    if len(files_per_batch) > 0:
        for i, lines in enumerate(files_per_batch):
            p = os.path.join(out_path, f"{i}")
            if not os.path.isdir(p):
                os.makedirs(p)
            run_list.append(os.path.join(p, f"ListForRun5Analysis.{i}.txt"))
            with open(run_list[-1], "w") as f:
                for j in lines:
                    f.write(j.strip() + "\n")
    msg("Number of runs:", len(run_list))
    return run_list

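# Both versions of build_list_of_files batch the (deduplicated, readable) inputs with
# itertools.islice. A self-contained toy example of that batching idiom:
from itertools import islice


def batch_file_list(items, batch_size):
    # Consume the iterator batch_size elements at a time until it is exhausted
    iter_items = iter(items)
    batches = []
    for _ in range(0, len(items)):
        sub_set = list(islice(iter_items, batch_size))
        if len(sub_set) <= 0:
            continue
        batches.append(sub_set)
    return batches


print(batch_file_list(["a.root", "b.root", "c.root", "d.root", "e.root"], 2))
# -> [['a.root', 'b.root'], ['c.root', 'd.root'], ['e.root']]
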
def inspect(name, tree_name):
    tree_name = f"{name}/{tree_name}"
    t = input_file.Get(tree_name)
    if not t:
        warning_msg("Did not get tree", tree_name)
        return -1
    if verbose:
        input_file.Get(name).ls()
    verbose_msg(tree_name, t.GetEntries())
    return t.GetEntries()

def must_be_same(*args):
    counts = []
    names = []
    for k in args:
        counts.append(dictionary_of_counts[k])
        names.append(k)
    if len(set(counts)) != 1:
        add_bad()
        warning_msg("Did not get equal counts for", ", ".join(names), counts,
                    "in DF", df_index, "/", len(list_of_keys), ":", i.GetName())

def main(input_files, args=None):
    if type(input_files) is not list:
        input_files = [input_files]
    if len(input_files) <= 0:
        warning_msg("Passed no input, use: --input_files")
        return
    if args.command == "listfiles":
        for i in input_files:
            list_of_files = []
            if os.path.isfile(i):
                paths_to_list = []
                with open(i) as fsecondary:
                    for j in fsecondary:
                        j = j.strip().strip(" ").strip(",")
                        if j == "":
                            continue
                        for k in j.split(","):
                            paths_to_list.append(k)
                for j in paths_to_list:
                    list_of_files += listfiles(Path=j,
                                               What=args.what,
                                               MustHave=args.musthave,
                                               MustHaveCount=args.musthavecount,
                                               MustNotHaveCount=args.mustnothavecount,
                                               MustNotHave=args.mustnothave)
            else:
                list_of_files = listfiles(Path=i,
                                          What=args.what,
                                          MustHave=args.musthave,
                                          MustHaveCount=args.musthavecount,
                                          MustNotHaveCount=args.mustnothavecount,
                                          MustNotHave=args.mustnothave)
            append = args.append
            do_write_files = args.outfile
            if len(list_of_files) > 0 and do_write_files:
                writefiles(list_of_files,
                           do_write_files,
                           append=(i == list_of_files[0]) or append)
    elif args.command == "copyfile":
        for i in input_files:
            copyfile(i)
    elif args.command == "copylist":
        for i in input_files:
            copylist(i, jobs=args.jobs)
    elif args.command == "copied":
        for i in input_files:
            print(copied(i))
    elif args.command == "merge_aod":
        for i in input_files:
            merge_aod(i, input_file=args.what)
    else:
        warning_msg("Did not do anything")

def is_root_file_sane(file_name_to_check):
    file_name_to_check = file_name_to_check.strip()
    if not os.path.isfile(file_name_to_check):
        warning_msg("File", file_name_to_check, "does not exist")
        return "Does not exist"
    file_to_check = TFile(file_name_to_check, "READ")
    if not file_to_check.IsOpen():
        warning_msg("Cannot open AOD file:", file_name_to_check)
        return "Cannot be open"
    elif file_to_check.TestBit(TFile.kRecovered):
        verbose_msg(file_name_to_check, "was a recovered file")
        return "Was recovered"
    else:
        verbose_msg(file_name_to_check, "is OK")
        return "Is Ok"

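# Usage sketch: the status strings returned by is_root_file_sane are meant to be used
# as dictionary keys, as done in build_list_of_files above. The file names below are
# placeholders, not real inputs.
file_status = {"Does not exist": [], "Cannot be open": [],
               "Was recovered": [], "Is Ok": []}
for checked_file in ["AO2D_0.root", "AO2D_1.root"]:  # hypothetical file list
    file_status[is_root_file_sane(checked_file)].append(checked_file)
broken_files = [f for status, files in file_status.items()
                if status != "Is Ok" for f in files]
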
def run_o2_analysis(tmp_script_name,
                    remove_tmp_script=False,
                    explore_bad_files=False,
                    time_it=True):
    global number_of_runs
    verbose_msg("> starting run with", tmp_script_name)
    cmd = f"bash {tmp_script_name}"
    if do_bash_script:
        with open("parallelbash.sh", "a") as fout:
            with open("parallelbash.sh", "r") as fin:
                lastline = fin.readlines()[-1]
                if lastline.startswith("#"):
                    lastline = int(lastline.strip("#"))
                else:
                    lastline = 0
            fout.write(f"echo Running {lastline}\n")
            fout.write(f"{cmd} &\n")
            lastline += 1
            if lastline % (bash_parallel_jobs + 1) == 0:
                fout.write(f"wait\n")
            fout.write(f"\n#{lastline}\n")
        return
    if explore_bad_files:
        if run_cmd(cmd, check_status=True, throw_fatal=False, time_it=time_it) == False:
            list_name = os.listdir(os.path.dirname(tmp_script_name))
            for i in list_name:
                if "ListForRun5Analysis" in i:
                    list_name = i
                    break
            if type(list_name) != list:
                with open(os.path.join(os.path.dirname(tmp_script_name), list_name)) as f:
                    list_name = []
                    for i in f:
                        list_name.append(i)
            warning_msg("Issue when running", tmp_script_name, "with", list_name)
    else:
        run_cmd(cmd, log_file=f"{tmp_script_name}.log", time_it=time_it)
    if remove_tmp_script:
        os.remove(tmp_script_name)
    verbose_msg("< end run with", tmp_script_name)
    return tmp_script_name

def get_the_daughters():
    idaughters = []
    if d0 > -1 and d1 > -1:
        for j in range(d0, d1 + 1):
            entry = numpy.where(npy["part_index"] == j)[0]
            if len(entry) > 1:
                raise ValueError("Entry size is too high!")
            if len(entry) == 0:
                raise ValueError("Entry size is too low!")
            entry = entry[0]
            if 0:
                d_m0 = npy["fMother0"][entry]
                d_m1 = npy["fMother1"][entry]
            else:
                d_m0 = npy["fIndexArray_Mothers"][entry][0]
                d_m1 = npy["fIndexArray_Mothers"][entry][
                    int(npy["fIndexArray_Mothers_size"][entry]) - 1]
            if d_m0 != part_index and d_m1 != part_index:
                if not continue_on_inconsistency:
                    raise ValueError("Daughter", j, "has a different mother!",
                                     "d_m0", d_m0, "d_m1", d_m1, "w.r.t.", part_index)
                else:
                    warning_msg("Daughter", j, "has a different mother!",
                                "d_m0", d_m0, "d_m1", d_m1, "w.r.t.", part_index)
            if d_m0 == d_m1 and 0:
                raise ValueError("Daughter has same mother!", d_m0, d_m1)
            idaughters.append(entry)
    if len(idaughters) == 0:
        warning_msg("Found no daughters")
        return idaughters
    # Checking that indices are increasing
    if sorted(idaughters) != idaughters:
        raise ValueError("Daughters are not in order!")
    # Checking that indices have no holes
    if idaughters != [*range(idaughters[0], idaughters[-1] + 1)]:
        raise ValueError("Daughters have hole in indices!", idaughters)
    return idaughters

def check_momentum(daughters):
    d_p = daughters_pxpypz(daughters)
    if d_p is None:
        return
    m_p = [px, py, pz]
    m_p_d = {0: "Px", 1: "Py", 2: "Pz"}
    momentum_format = "(px={:.5f}, py={:.5f}, pz={:.5f})"
    for j in enumerate(m_p):
        if abs(j[1] - d_p[j[0]]) > 0.001:
            e_msg = ["Non-closure in", m_p_d[j[0]], "=",
                     momentum_format.format(*d_p)]
            if not continue_on_inconsistency:
                raise ValueError(*e_msg)
            else:
                warning_msg(*e_msg)
                warning_msg(" mother =", momentum_format.format(*m_p))

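# Toy illustration (made-up numbers) of the 0.001 closure tolerance used in
# check_momentum: only components whose mother/daughter difference exceeds
# the tolerance are reported.
mother_p = (1.20000, -0.50000, 3.10000)        # mother px, py, pz
daughter_sum = (1.20040, -0.50010, 3.09800)    # summed daughter px, py, pz
for label, m, d in zip(("Px", "Py", "Pz"), mother_p, daughter_sum):
    if abs(m - d) > 0.001:
        print("Non-closure in", label, "=", d, "vs mother", m)
# Only Pz (difference 0.002) is flagged here.
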
def check_root_file(file_name):
    if not file_name.endswith(".root"):
        warning_msg("Testing a non root file:", file_name)
        return True
    if not path.isfile(file_name):
        warning_msg("Testing a non existing file:", file_name)
        return True
    try:
        f = TFile(file_name, "READ")
        if f.TestBit(TFile.kRecovered):
            msg("File", file_name, "was recovered", color=bcolors.WARNING)
            return False
        if not f.IsOpen():
            msg("File", file_name, "is not open", color=bcolors.WARNING)
            return False
    except OSError:
        msg("Issue when checking file", file_name, color=bcolors.WARNING)
        return False
    verbose_msg(file_name, "is ok and has size",
                os.path.getsize(file_name) * 1e-6, "MB")
    return True

def useapi(ccdb_path, host):
    global timestamps
    objectlist = get_ccdb_api(host).list(ccdb_path, False, "text/plain")
    bunch_objects = []
    starting_sequence = "ID: "
    for i in objectlist.split("\n"):
        if starting_sequence in i:
            bunch_objects.append("")
        if len(bunch_objects) <= 0:
            warning_msg("Skipping", i, "because found no object there")
            continue
        bunch_objects[-1] += f"{i}\n"
    verbose_msg("Found", len(bunch_objects), "objects in path", ccdb_path)
    for counter, i in enumerate(bunch_objects):
        if 0:
            print(f"Object #{counter}/{len(bunch_objects)-1}")
            print(i)
        t = {}
        for j in i.split("\n"):
            save_fields(j, fields_of_interest=t)
        # print(t)
        timestamps.setdefault(ccdb_path, []).append(t)

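# Toy example of the block-splitting logic above: lines are accumulated into a new
# block whenever the "ID: " marker is seen. The listing text below is a hypothetical
# stand-in for the CCDB browse output, not its real format.
listing = "header line\nID: 1\nPath: qc/objectA\nID: 2\nPath: qc/objectB"
blocks = []
for line in listing.split("\n"):
    if "ID: " in line:
        blocks.append("")
    if len(blocks) <= 0:
        continue  # anything before the first marker is skipped
    blocks[-1] += f"{line}\n"
print(len(blocks), "blocks")  # -> 2 blocks, one per object
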
def check_rootfile(fname):
    try:
        f = TFile(fname, "READ")
        if f.TestBit(TFile.kRecovered):
            warning_msg("File", fname, "was recovered")
            return False
        elif not f.IsOpen():
            warning_msg("File", fname, "is not open")
            return False
    except OSError:
        warning_msg("Issue when checking file", fname)
        return False
    return True

def print_evt(event_filter=">= 0"):
    pdg_db = TDatabasePDG()
    ev_df = df.Filter(f"fIndexMcCollisions {event_filter}")
    npy = ev_df.AsNumpy()
    print()
    lastmother = 0
    for i, part_index in enumerate(npy["part_index"]):
        ev = npy["fIndexMcCollisions"][i]
        count("events", ev)
        if 0:
            m0 = npy["fMother0"][i]
            m1 = npy["fMother1"][i]
            d0 = npy["fDaughter0"][i]
            d1 = npy["fDaughter1"][i]
        else:
            m_arr = npy["fIndexArray_Mothers"][i]
            d_arr = npy["fIndexSlice_Daughters"][i]
            m_size = npy["fIndexArray_Mothers_size"][i]
            # print(m_size)
            # print("Mothers", m_arr)
            # print("Daughters", d_arr)
            if len(m_arr) == 0:
                m0 = -1
                m1 = -1
            else:
                m0 = m_arr[0]
                m1 = m_arr[int(m_size) - 1]
            d0 = d_arr[0]
            d1 = d_arr[1]
            # print(d_arr)
        pdg = npy["fPdgCode"][i]
        px = npy["fPx"][i]
        py = npy["fPy"][i]
        pz = npy["fPz"][i]
        eta = npy["eta"][i]
        is_ps = bool(npy["isPhysicalPrimary"][i])
        is_pt = bool(npy["isProducedByTransport"][i])
        process = npy["fStatusCode"][i]

        def getpname(pdg_code):
            p = pdg_db.GetParticle(int(pdg_code))
            if p:
                p = p.GetName()
            else:
                p = "Undef"
            return p

        part = getpname(pdg)
        summary_line = f" ({part_index}) ev {ev} m0 {m0} m1 {m1}, d0 {d0} d1 {d1}, pdg {pdg} '{part}', physical primary {is_ps}, in transport {is_pt}, process {process}"
        if abs(pdg) not in [21, 2101, 2103, 2203, 1, 2, 3, 4, 5] and m0 > -1:
            if lastmother != m0 and count("mothers", m0):
                raise ValueError("Duplicate mothers for ", summary_line)
            lastmother = m0
        if d1 > -1 and d0 > d1:
            if not continue_on_inconsistency:
                raise ValueError("d0 > d1:", summary_line)
            else:
                warning_msg("d0 > d1 for", part_index)

        def get_the_daughters():
            idaughters = []
            if d0 > -1 and d1 > -1:
                for j in range(d0, d1 + 1):
                    entry = numpy.where(npy["part_index"] == j)[0]
                    if len(entry) > 1:
                        raise ValueError("Entry size is too high!")
                    if len(entry) == 0:
                        raise ValueError("Entry size is too low!")
                    entry = entry[0]
                    if 0:
                        d_m0 = npy["fMother0"][entry]
                        d_m1 = npy["fMother1"][entry]
                    else:
                        d_m0 = npy["fIndexArray_Mothers"][entry][0]
                        d_m1 = npy["fIndexArray_Mothers"][entry][
                            int(npy["fIndexArray_Mothers_size"][entry]) - 1]
                    if d_m0 != part_index and d_m1 != part_index:
                        if not continue_on_inconsistency:
                            raise ValueError("Daughter", j, "has a different mother!",
                                             "d_m0", d_m0, "d_m1", d_m1,
                                             "w.r.t.", part_index)
                        else:
                            warning_msg("Daughter", j, "has a different mother!",
                                        "d_m0", d_m0, "d_m1", d_m1,
                                        "w.r.t.", part_index)
                    if d_m0 == d_m1 and 0:
                        raise ValueError("Daughter has same mother!", d_m0, d_m1)
                    idaughters.append(entry)
            if len(idaughters) == 0:
                warning_msg("Found no daughters")
                return idaughters
            # Checking that indices are increasing
            if sorted(idaughters) != idaughters:
                raise ValueError("Daughters are not in order!")
            # Checking that indices have no holes
            if idaughters != [*range(idaughters[0], idaughters[-1] + 1)]:
                raise ValueError("Daughters have hole in indices!", idaughters)
            return idaughters

        def daughters_pxpypz(daughters):
            d_px = 0
            d_py = 0
            d_pz = 0
            if len(daughters) == 0:
                return None
            for j in daughters:
                d_px += npy["fPx"][j]
                d_py += npy["fPy"][j]
                d_pz += npy["fPz"][j]
            return d_px, d_py, d_pz

        def daughters_pdg(daughters):
            d_pdgs = []
            for j in daughters:
                d_pdgs.append(npy["fPdgCode"][j])
            return d_pdgs

        def check_momentum(daughters):
            d_p = daughters_pxpypz(daughters)
            if d_p is None:
                return
            m_p = [px, py, pz]
            m_p_d = {0: "Px", 1: "Py", 2: "Pz"}
            momentum_format = "(px={:.5f}, py={:.5f}, pz={:.5f})"
            for j in enumerate(m_p):
                if abs(j[1] - d_p[j[0]]) > 0.001:
                    e_msg = ["Non-closure in", m_p_d[j[0]], "=",
                             momentum_format.format(*d_p)]
                    if not continue_on_inconsistency:
                        raise ValueError(*e_msg)
                    else:
                        warning_msg(*e_msg)
                        warning_msg(" mother =", momentum_format.format(*m_p))

        def is_decay_channel(desired_pdg_codes,
                             daughters,
                             fill_counter=True,
                             min_prongs=0,
                             max_prongs=10):
            d_pdgs = daughters_pdg(daughters)
            if len(daughters) >= min_prongs and len(daughters) <= max_prongs:
                print(pdg, part, "decaying in", len(d_pdgs), "particles")
                for i, j in enumerate(d_pdgs):
                    if 0:
                        this_m0 = npy["fMother0"][daughters[i]]
                        this_m1 = npy["fMother1"][daughters[i]]
                    else:
                        this_m0 = npy["fIndexArray_Mothers"][daughters[i]][0]
                        this_m1 = npy["fIndexArray_Mothers"][daughters[i]][
                            int(npy["fIndexArray_Mothers_size"][daughters[i]]) - 1]
                    print(" >", j, getpname(j),
                          "index", daughters[i], npy["part_index"][daughters[i]],
                          "m0", this_m0, "m1", this_m1,
                          " -> physical primary", npy["isPhysicalPrimary"][daughters[i]])
            if desired_pdg_codes is not None:
                for i in desired_pdg_codes:
                    if i not in d_pdgs:
                        return False
            if fill_counter:
                count(f"{bcolors.BOKGREEN} {pdg} {part} {bcolors.ENDC} in {d_pdgs}",
                      part_index)
            return True

        extra = []
        if m0 < 0 and m1 < 0 and d0 < 1 and d1 < 0:
            extra.append("Sterile")
        if d1 < 0 and d1 != d0:
            extra.append(bcolors.BWARNING + "Problematic" + bcolors.ENDC)
        if pdg in pdg_of_interest:
            extra.append(", px={:.3f} py={:.2f} pz={:.2f}".format(px, py, pz))
            extra.append(", eta={:.4f}".format(eta))
            extra.append(bcolors.BOKGREEN + "PDG of interest" + bcolors.ENDC)
        extra = " ".join(extra)
        extra = extra.strip()
        count(part, part_index)
        if verbose or pdg in pdg_of_interest:
            print(summary_line, extra)
        if pdg in pdg_of_interest:
            daughters = get_the_daughters()
            check_momentum(daughters)
            is_decay_channel(None, daughters=daughters, fill_counter=True)

def get_ccdb_obj(ccdb_path,
                 timestamp,
                 out_path,
                 host,
                 show,
                 tag=False,
                 overwrite_preexisting=True,
                 use_o2_api=True,
                 check_metadata=True,
                 interesting_metadata=["ObjectType", "PassName", "PeriodName",
                                       "RunNumber", "Valid-From", "Valid-Until", ""]):
    """
    Gets the ccdb object from 'ccdb_path' and 'timestamp' and downloads it into 'out_path'.
    If 'tag' is True then the filename will be renamed after the timestamp.
    """

    def check_rootfile(fname):
        try:
            f = TFile(fname, "READ")
            if f.TestBit(TFile.kRecovered):
                warning_msg("File", fname, "was recovered")
                return False
            elif not f.IsOpen():
                warning_msg("File", fname, "is not open")
                return False
        except OSError:
            warning_msg("Issue when checking file", fname)
            return False
        return True

    verbose_msg("Getting obj", host, ccdb_path,
                "with timestamp", timestamp, convert_timestamp(timestamp))
    out_name = "snapshot.root"
    if tag:
        out_name = f"snapshot_{timestamp}.root"
    out_path = os.path.normpath(out_path)
    fullname = os.path.join(out_path, ccdb_path, out_name)
    if os.path.isfile(fullname) and not overwrite_preexisting:
        if check_rootfile(fullname):
            msg("File", fullname, "already existing, not overwriting")
            return
    if use_o2_api:
        api = get_ccdb_api(host)
        if timestamp == -1:
            timestamp = o2.ccdb.getCurrentTimestamp()
        metadata = std.map('string,string')()
        api.retrieveBlob(ccdb_path, out_path, metadata, timestamp)
        if tag:
            os.rename(os.path.join(out_path, ccdb_path, "snapshot.root"), fullname)
    else:
        cmd = f"o2-ccdb-downloadccdbfile --host {host} --path {ccdb_path} --dest {out_path} --timestamp {timestamp}"
        cmd += f" -o {out_name}"
        print(cmd)
        subprocess.run(cmd.split())
    if not os.path.isfile(fullname):
        raise ValueError("File", fullname, "not found")
    if not check_rootfile(fullname):
        raise ValueError("File", fullname, "is not Ok")
    if check_metadata:
        f = TFile(os.path.join(fullname), "READ")
        meta = f.Get("ccdb_meta")
        verbose_msg("Metadata")
        m_d = {"Valid-From": None, "Valid-Until": None}
        for i in meta:
            if i[0] in m_d:
                m_d[i[0]] = int(i[1])
            if interesting_metadata[0] != "" and i[0] not in interesting_metadata:
                continue
            if i[0] in m_d:
                verbose_msg(i, convert_timestamp(int(i[1])))
            else:
                verbose_msg(i)
        if timestamp < m_d["Valid-From"] or timestamp > m_d["Valid-Until"]:
            warning_msg("Timestamp asked is outside of window", timestamp, m_d)

        def print_info(entry):
            print("Object", entry, meta[entry])

        print_info("Last-Modified")
        if show:
            obj = f.Get("ccdb_object")
            obj.Draw()
            time_box = TPaveText(.01, .9, 0.3, 0.99, "NDC")
            time_box.AddText(ccdb_path)
            time_box.AddText(f"timestamp {timestamp}")
            time_box.AddText(f"{convert_timestamp(timestamp)}")
            time_box.Draw()
            gPad.Update()
            input("Press enter to continue")
            # obj.Print("ALL")
    return fullname

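# Hypothetical usage of get_ccdb_obj: the path, host and output directory below are
# placeholders chosen for illustration and do not come from the source.
get_ccdb_obj(ccdb_path="qc/TOF/SomeObject",          # placeholder CCDB path
             timestamp=-1,                           # -1 -> current timestamp (when use_o2_api is True)
             out_path="./ccdb_snapshots",            # placeholder output directory
             host="http://ccdb-test.cern.ch:8080",   # placeholder host
             show=False,
             tag=False)
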
input_files = []
for i in args.input_files:
    i = os.path.normpath(i)
    if i.endswith(".root"):
        input_files.append(i)
    elif i.endswith(".txt"):
        with open(i, "r") as f:
            for j in f:
                j = j.strip()
                input_files.append(os.path.join(os.path.abspath(os.path.dirname(i)),
                                                os.path.normpath(j)))
run_in_parallel(args.njobs, main, input_files, "Checking file",
                linearize_single_core=True)
if len(bad_files) > 0:
    warning_msg("There were", len(bad_files), "bad files")
    for i in bad_files:
        msg(i)
if args.output is not None:
    msg("Writing good files to", args.output)
    with open(args.output, "w") as f:
        for i in input_files:
            if i not in bad_files:
                f.write(i + "\n")

def main(mode,
         input_file,
         out_path,
         out_tag="",
         batch_size=4,
         n_max_files=100,
         dpl_configuration_file=None,
         njobs=1,
         merge_output=True,
         merge_only=False,
         shm_mem_size=16000000000,
         rate_lim=1000000000,
         readers=1,
         avoid_overwriting_merge=False,
         clean_localhost_after_running=True,
         extra_arguments=""):
    if len(input_file) == 1:
        input_file = input_file[0]
    else:
        input_file = input_file[0:n_max_files]
    if not merge_only:
        msg("Running", f"'{mode}'", "analysis on", f"'{input_file}'",
            color=bcolors.BOKBLUE)
        msg("Maximum", n_max_files, "files with batch size", batch_size,
            "and", njobs, "jobs" if njobs > 1 else "job",
            color=bcolors.BOKBLUE)
    else:
        msg("Merging output of", f"'{mode}'", "analysis", color=bcolors.BOKBLUE)
    o2_arguments = f"-b --shm-segment-size {shm_mem_size} --aod-memory-rate-limit {rate_lim} --readers {readers}"
    o2_arguments += extra_arguments
    if mode not in analyses:
        raise ValueError("Did not find analyses matching mode", mode,
                         ", please choose in", ", ".join(analyses.keys()))
    an = analyses[mode]
    tag = mode + out_tag
    # Build input file list
    input_file_list = []

    def build_list_of_files(file_list):
        # Check that the runlist does not have duplicates
        if len(file_list) != len(set(file_list)):
            fatal_msg("Runlist has duplicated entries, fix runlist!")
        not_readable = []
        for i in file_list:  # Check that input files can be opened
            f = TFile(i.strip(), "READ")
            if not f.IsOpen():
                verbose_msg("Cannot open AOD file:", i, color=bcolors.WARNING)
                not_readable.append(i)
        if len(not_readable) > 0:
            warning_msg(len(not_readable), "files cannot be read and will be skipped")
            for i in not_readable:
                file_list.remove(i)
        files_per_batch = []
        iter_file_list = iter(file_list)
        for i in range(0, len(file_list)):
            sub_set = list(islice(iter_file_list, batch_size))
            if len(sub_set) <= 0:
                continue
            files_per_batch.append(sub_set)
        run_list = []
        if len(files_per_batch) > 0:
            for i, lines in enumerate(files_per_batch):
                p = os.path.join(out_path, f"{i}")
                if not os.path.isdir(p):
                    os.makedirs(p)
                run_list.append(os.path.join(p, f"ListForRun5Analysis.{i}.txt"))
                with open(run_list[-1], "w") as f:
                    for j in lines:
                        f.write(j.strip() + "\n")
        msg("Number of runs:", len(run_list))
        return run_list

    if type(input_file) is list:
        input_file = [os.path.join(os.getcwd(), i) for i in input_file]
        input_file_list = build_list_of_files(input_file)
    elif not input_file.endswith(".root"):
        with open(input_file, "r") as f:
            lines = f.readlines()
        msg("Building input list from", len(lines), "inputs, limiting to", n_max_files)
        if len(lines) > n_max_files:
            lines = lines[0:n_max_files]
        input_file_list = build_list_of_files(lines)
    else:
        input_file_list = [os.path.join(os.getcwd(), input_file)]
    if dpl_configuration_file is not None:
        dpl_configuration_file = os.path.join(os.getcwd(), dpl_configuration_file)
    run_list = []
    for i, j in enumerate(input_file_list):
        run_list.append(set_o2_analysis(an,
                                        o2_arguments=o2_arguments,
                                        input_file=j,
                                        tag=tag,
                                        dpl_configuration_file=dpl_configuration_file))
    if not merge_only:
        run_in_parallel(processes=njobs,
                        job_runner=run_o2_analysis,
                        job_arguments=run_list,
                        job_message="Running analysis")
    if clean_localhost_after_running:
        run_cmd("find /tmp/ -maxdepth 1 -name localhost* -user $(whoami) | xargs rm -v")
    if (merge_output or merge_only) and len(run_list) > 1:
        files_to_merge = []
        for i in input_file_list:
            p = os.path.dirname(os.path.abspath(i))
            for j in os.listdir(p):
                if j.endswith(f"_{tag}.root"):
                    files_to_merge.append(os.path.join(p, j))
        if len(files_to_merge) == 0:
            warning_msg("Did not find any file to merge for tag", tag)
            return
        if len(files_to_merge) > len(run_list):
            fatal_msg("Trying to merge too many files!", tag)
        msg("Merging", len(files_to_merge), "results", color=bcolors.BOKBLUE)
        files_per_type = {}  # List of files to be merged per type
        for i in files_to_merge:
            fn = os.path.basename(i)
            files_per_type.setdefault(fn, [])
            files_per_type[fn].append(i)
        merged_files = []
        for i in files_per_type:
            merged_file = os.path.join(out_path, i)
            if avoid_overwriting_merge and os.path.isfile(merged_file):
                warning_msg("file", merged_file,
                            "is already found, remove it before merging, you can use the --mergeonly flag to avoid running the analysis again")
                continue
            merged_files.append(merged_file)
            merge_file_list = os.path.join(os.path.dirname(os.path.abspath(merged_file)),
                                           "tomerge_" + "".join(i.split(".")[:-1]) + ".txt")
            verbose_msg("List of files to be merged:", merge_file_list)
            with open(merge_file_list, "w") as fmerge:
                for j in files_per_type[i]:
                    fmerge.write(j + "\n")
            run_cmd(f"hadd -j {njobs} -f {merged_file} `cat {merge_file_list}`",
                    log_file=merge_file_list.replace(".txt", ".log"))
        if len(merged_files) == 0:
            warning_msg("Merged no files")
        else:
            msg("Merging completed, merged:", *merged_files, color=bcolors.BOKGREEN)

def main(mode,
         input_file,
         out_path,
         out_tag="",
         batch_size=4,
         n_max_files=100,
         dpl_configuration_file=None,
         njobs=1,
         merge_output=True,
         merge_only=False,
         shm_mem_size=16000000000,
         rate_lim=1000000000,
         readers=1,
         avoid_overwriting_merge=False,
         clean_localhost_after_running=True,
         extra_arguments="",
         resume_previous_analysis=False,
         check_input_file_integrity=True,
         analysis_timeout=None,
         linearize_single_core=True):
    if do_bash_script:
        njobs = 1
        linearize_single_core = True
    if len(input_file) == 1:
        input_file = input_file[0]
    else:
        input_file = input_file[0:n_max_files]
    if not merge_only:
        msg("Running", f"'{mode}'", "analysis on", f"'{input_file}'",
            color=bcolors.BOKBLUE)
        msg("Maximum", n_max_files, "files with batch size", batch_size,
            "and", njobs, "jobs" if njobs > 1 else "job",
            color=bcolors.BOKBLUE)
    else:
        msg("Merging output of", f"'{mode}'", "analysis", color=bcolors.BOKBLUE)
    if analysis_timeout is not None:
        msg("Using analysis timeout of", analysis_timeout, "seconds",
            color=bcolors.BOKBLUE)
        analysis_timeout = f"--time-limit {analysis_timeout}"
    else:
        analysis_timeout = ""
    o2_arguments = f"-b --shm-segment-size {shm_mem_size} --aod-memory-rate-limit {rate_lim} --readers {readers} {analysis_timeout}"
    o2_arguments += extra_arguments
    if mode not in analyses:
        raise ValueError("Did not find analyses matching mode", mode,
                         ", please choose in", ", ".join(analyses.keys()))
    an = analyses[mode]
    tag = mode + out_tag
    # Build input file list
    input_file_list = []

    def is_root_file_sane(file_name_to_check):
        file_name_to_check = file_name_to_check.strip()
        if not os.path.isfile(file_name_to_check):
            warning_msg("File", file_name_to_check, "does not exist")
            return "Does not exist"
        file_to_check = TFile(file_name_to_check, "READ")
        if not file_to_check.IsOpen():
            warning_msg("Cannot open AOD file:", file_name_to_check)
            return "Cannot be open"
        elif file_to_check.TestBit(TFile.kRecovered):
            verbose_msg(file_name_to_check, "was a recovered file")
            return "Was recovered"
        else:
            verbose_msg(file_name_to_check, "is OK")
            return "Is Ok"

    def build_list_of_files(file_list):
        verbose_msg("Building list of files from", file_list)
        # Check that the runlist does not have duplicates
        unique_file_list = set(file_list)
        if len(file_list) != len(unique_file_list):
            fatal_msg("Runlist has duplicated entries, fix runlist!",
                      len(unique_file_list), "unique files, while got",
                      len(file_list), "files")
        file_status = {"Does not exist": [],
                       "Cannot be open": [],
                       "Was recovered": [],
                       "Is Ok": []}
        if check_input_file_integrity:
            # Check that input files can be opened
            for i in file_list:
                verbose_msg("Checking that TFile", i.strip(), "can be processed")
                file_status[is_root_file_sane(i)].append(i)
        recovered_files = file_status["Was recovered"]
        not_readable = []
        for i in file_status:
            if i == "Is Ok":
                continue
            not_readable += file_status[i]
        if len(recovered_files) > 0:
            msg("Recovered", len(recovered_files), "files:\n",
                "\n".join(recovered_files))
        if len(not_readable) > 0:
            warning_msg(len(not_readable), "over", len(file_list),
                        "files cannot be read and will be skipped")
            for i in not_readable:
                if i not in file_list:
                    warning_msg("did not find file to remove", f"'{i}'")
                    continue
                file_list.remove(i)
        files_per_batch = []
        iter_file_list = iter(file_list)
        for i in range(0, len(file_list)):
            sub_set = list(islice(iter_file_list, batch_size))
            if len(sub_set) <= 0:
                continue
            files_per_batch.append(sub_set)
        run_list = []
        if len(files_per_batch) > 0:
            for i, lines in enumerate(files_per_batch):
                p = os.path.join(out_path, f"{i}")
                if not os.path.isdir(p):
                    os.makedirs(p)
                run_list.append(os.path.join(p, f"ListForRun5Analysis.{i}.txt"))
                with open(run_list[-1], "w") as f:
                    for j in lines:
                        f.write(j.strip() + "\n")
        msg("Number of runs:", len(run_list))
        return run_list

    if type(input_file) is list:
        input_file = [os.path.join(os.getcwd(), i) for i in input_file]
        input_file_list = build_list_of_files(input_file)
    elif not input_file.endswith(".root"):
        with open(input_file, "r") as f:
            lines = f.readlines()
        msg("Building input list from", len(lines), "inputs, limiting to", n_max_files)
        if len(lines) > n_max_files:
            lines = lines[0:n_max_files]
        lines = [os.path.join(os.path.dirname(os.path.abspath(input_file)), i)
                 for i in lines]
        input_file_list = build_list_of_files(lines)
    else:
        input_file_list = [os.path.join(os.getcwd(), input_file)]
    if dpl_configuration_file is not None:
        dpl_configuration_file = os.path.join(os.getcwd(), dpl_configuration_file)
    run_list = []
    for i, j in enumerate(input_file_list):
        run_list.append(set_o2_analysis(an,
                                        o2_arguments=o2_arguments,
                                        input_file=j,
                                        tag=tag,
                                        dpl_configuration_file=dpl_configuration_file,
                                        resume_previous_analysis=resume_previous_analysis,
                                        write_runner_script=not merge_only))
    if not merge_only:
        if do_bash_script:
            with open("parallelbash.sh", "w") as f:
                f.write(f"#!/bin/bash\n\n")
                f.write(f"echo \"Start running\"\n\n")
                f.write(f"date\n\n")
                f.write("""function trap_ctrlc (){
    # perform cleanup here
    echo "Ctrl-C caught...performing clean up"
    exit 2
}\n\n""")
                f.write("""trap "trap_ctrlc" 2\n""")
        run_in_parallel(processes=njobs,
                        job_runner=run_o2_analysis,
                        job_arguments=run_list,
                        job_message=f"Running analysis, it's {datetime.datetime.now()}",
                        linearize_single_core=linearize_single_core)
        if do_bash_script:
            with open("parallelbash.sh", "a") as f:
                f.write(f"wait\n\n")
                f.write(f"date\n\n")
            msg("Now run bash script `bash parallelbash.sh`")
            return
    if clean_localhost_after_running:
        run_cmd("find /tmp/ -maxdepth 1 -name localhost* -user $(whoami) | xargs rm -v 2>&1",
                check_status=False)
    if (merge_output or merge_only) and len(run_list) > 1:
        files_to_merge = []
        for i in input_file_list:
            p = os.path.dirname(os.path.abspath(i))
            for j in os.listdir(p):
                if j.endswith(f"_{tag}.root"):
                    files_to_merge.append(os.path.join(p, j))
        if len(files_to_merge) == 0:
            warning_msg("Did not find any file to merge for tag", tag)
            return
        files_per_type = {}  # List of files to be merged per type
        # List of files to be merged per type that are not declared sane
        non_sane_files_per_type = {}
        for i in files_to_merge:
            fn = os.path.basename(i)
            if is_root_file_sane(i) != "Is Ok":
                non_sane_files_per_type.setdefault(fn, []).append(i)
                warning_msg("Result file", i, "is not sane")
                continue
            files_per_type.setdefault(fn, [])
            files_per_type[fn].append(i)
        for i in non_sane_files_per_type:
            warning_msg("Non sane files for type", i)
            for j in non_sane_files_per_type[i]:
                msg(j)
        merged_files = []
        for i in files_per_type:
            merged_file = os.path.join(out_path, i)
            if avoid_overwriting_merge and os.path.isfile(merged_file):
                warning_msg("file", merged_file,
                            "is already found, remove it before merging, you can use the --mergeonly flag to avoid running the analysis again")
                continue
            merged_files.append(merged_file)
            merge_file_list = os.path.join(os.path.dirname(os.path.abspath(merged_file)),
                                           "tomerge_" + "".join(i.split(".")[:-1]) + ".txt")
            verbose_msg("List of files to be merged:", merge_file_list)
            with open(merge_file_list, "w") as fmerge:
                for j in files_per_type[i]:
                    fmerge.write(j + "\n")
            if len(files_per_type[i]) > len(run_list):
                fatal_msg("Trying to merge too many files of type", i,
                          "for tag", tag, ":", len(files_per_type[i]),
                          "vs", len(run_list), "runs")
            msg("Merging", len(files_per_type[i]), "files to", merged_file)
            run_cmd(f"hadd -j {njobs} -f {merged_file} `cat {merge_file_list}`",
                    log_file=merge_file_list.replace(".txt", ".log"),
                    time_it=True,
                    comment=f"Merging to {merged_file}")
        if len(merged_files) == 0:
            warning_msg("Merged no files")
        else:
            msg("Merging completed, merged:", *merged_files, color=bcolors.BOKGREEN)

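# Toy example (made-up paths) of the per-type grouping used before merging: result
# files from different run directories are grouped by basename so that each output
# name gets its own hadd invocation.
import os

files_to_merge = ["out/0/AnalysisResults_trackqa.root",
                  "out/1/AnalysisResults_trackqa.root",
                  "out/0/QAResults_trackqa.root",
                  "out/1/QAResults_trackqa.root"]
files_per_type = {}
for result_file in files_to_merge:
    files_per_type.setdefault(os.path.basename(result_file), []).append(result_file)
for merged_name, inputs in files_per_type.items():
    print(merged_name, "<-", len(inputs), "inputs")
# -> AnalysisResults_trackqa.root <- 2 inputs
# -> QAResults_trackqa.root <- 2 inputs
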
def main(input_files,
         do_merge=True,
         sanity_file=None,
         max_bunch_size=200,
         out_path="./",
         over_write_lists=False,
         jobs=1):
    msg("Merging to", out_path, "with maximum input size", max_bunch_size)
    out_path = os.path.normpath(out_path)
    if not os.path.exists(out_path):
        warning_msg("Output path", out_path, "does not exist")
        ans = input("Create it? (Y/[N])")
        if ans == "Y":
            os.makedirs(out_path)
        else:
            msg("Exit")
            return
    sane_files = None
    if sanity_file is not None:
        msg("Using sanity file", sanity_file)
        sane_files = []
        with open(sanity_file, "r") as f:
            for i in f:
                sane_files.append(os.path.abspath(os.path.normpath(i.strip())))
    size_of_files = {}
    for i in input_files:
        i = os.path.normpath(i.strip())
        if sane_files is not None and os.path.abspath(i) not in sane_files:
            msg("Skipping", i, "because not in sanity file")
            continue
        size_of_files[i] = os.path.getsize(i) * 1e-6
    bunched_files = [[]]
    bunched_sizes = []
    bunch_size = []
    for i in size_of_files:
        verbose_msg("Checking file", i, "of size", size_of_files[i], "MB")
        if sum(bunch_size) > max_bunch_size:
            verbose_msg("Bunch size", sum(bunch_size), "reached limit with",
                        len(bunch_size), "files", max_bunch_size, "MB",
                        "preparing next bunch!")
            bunched_files.append([])
            bunched_sizes.append(sum(bunch_size))
            bunch_size = []
        bunch_size.append(size_of_files[i])
        bunched_files[-1].append(i)
    bunched_sizes.append(sum(bunch_size))
    verbose_msg("Got", len(bunched_files), "bunches")
    for i, j in enumerate(bunched_files):
        verbose_msg(f"{i})", bunched_sizes[i], "MB, with", len(j), j)
    msg("Preparing", len(bunched_files), "bunched lists")
    bunched_aod_names.clear()
    for i, j in enumerate(bunched_files):
        fn = f"aod_merge_list_bunch{i}.txt"
        verbose_msg("Writing bunch", i, "to", fn)
        if not over_write_lists:
            if os.path.isfile(fn):
                fatal_msg(fn, "already present, remove it first")
        with open(fn, "w") as f:
            for k in j:
                f.write(k + "\n")
        if do_merge:
            out_aod = os.path.join(out_path, f"AO2D_Merge_{i}.root")
            if os.path.isfile(out_aod):
                fatal_msg(out_aod, "already present")
            bunched_aod_names[fn] = {"out_aod": out_aod,
                                     "file_index": i,
                                     "total_files": len(bunched_files),
                                     "input_size": bunched_sizes[i]}
    run_in_parallel(jobs,
                    run_merge,
                    list(bunched_aod_names.keys()),
                    job_message="Running AOD merging",
                    linearize_single_core=True)

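# Toy example (made-up sizes, in MB) of the greedy bunching above: a new bunch is
# started once the running size of the current bunch exceeds max_bunch_size.
size_of_files = {"a.root": 120, "b.root": 90, "c.root": 150, "d.root": 60}
max_bunch_size = 200
bunched_files = [[]]
bunch_size = []
for name in size_of_files:
    if sum(bunch_size) > max_bunch_size:
        bunched_files.append([])
        bunch_size = []
    bunch_size.append(size_of_files[name])
    bunched_files[-1].append(name)
print(bunched_files)
# -> [['a.root', 'b.root'], ['c.root', 'd.root']]  (120+90=210 > 200 closes the first bunch)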