def parallel_draw(drawer, jobs, mode, ncores, batch_opts):
    if len(jobs) == 0:
        return

    njobs = ncores
    if mode in ["multiprocessing"]:
        njobs = len(jobs)

    grouped_jobs = [list(x) for x in np.array_split(jobs, njobs)]
    tasks = [{
        "task": multidraw,
        "args": (drawer, args),
        "kwargs": {},
    } for args in grouped_jobs]

    if mode == "multiprocessing" and ncores == 0:
        pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        pysge.sge_submit(
            tasks, "zdb-draw", "_ccsp_temp/",
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
            return_files=True,
        )
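# Usage sketch for parallel_draw (hypothetical names): `my_drawer` and the
# job tuples below are illustrative stand-ins for whatever `multidraw`
# actually dispatches to; only the call shape is taken from the function
# above.
#
#     def my_drawer(job):
#         ...  # render one plot for this job
#
#     jobs = [("hist_met", "region_A"), ("hist_met", "region_B")]
#     parallel_draw(my_drawer, jobs, "multiprocessing", ncores=2, batch_opts="")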
def submit_tasks(tasks, mode="multiprocessing", ncores=0, batch_opts=""):
    if mode == "multiprocessing" and ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks, "zdb", "_ccsp_temp/",
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
            return_files=True,
        )
    elif mode == "condor":
        import conpy
        results = conpy.condor_submit(
            "zdb", "_ccsp_temp/",
            tasks=tasks,
            options=batch_opts,
            sleep=5,
            request_resubmission_options=True,
        )
    else:
        # fail loudly instead of returning an unbound name for unknown modes
        raise ValueError("Unrecognised mode: {}".format(mode))
    return results
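# Minimal sketch of the task format submit_tasks consumes: pysge-style dicts
# with a "task" callable, positional "args" and keyword "kwargs". `_square`
# is an illustrative function, not part of this module.
#
#     def _square(x):
#         return x * x
#
#     tasks = [{"task": _square, "args": (i,), "kwargs": {}} for i in range(4)]
#     results = submit_tasks(tasks, mode="multiprocessing", ncores=2)
#     # results should come back as [0, 1, 4, 9]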
def run(
    sequence, datasets, name, outdir, tempdir, mode, batch_opts, ncores,
    nblocks_per_dataset, nblocks_per_process, nfiles_per_dataset,
    nfiles_per_process, blocksize, cachesize, quiet, dryrun, sample,
    predetermined_nevents_in_file,
):
    process = AtUproot(
        outdir,
        quiet=quiet,
        max_blocks_per_dataset=nblocks_per_dataset,
        max_blocks_per_process=nblocks_per_process,
        max_files_per_dataset=nfiles_per_dataset,
        max_files_per_process=nfiles_per_process,
        nevents_per_block=blocksize,
        predetermined_nevents_in_file=predetermined_nevents_in_file,
        branch_cache=LFUCache(int(cachesize*1024**3), get_size),
    )
    tasks = process.run(datasets, sequence)

    if mode == "multiprocessing" and ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks, name, tempdir,
            options=batch_opts,
            dryrun=dryrun,
            sleep=5,
            request_resubmission_options=True,
            return_files=True,
        )
    return results
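# Call sketch (all values are placeholders): `sequence` and `datasets` are
# built elsewhere in the framework, and cachesize is in GiB (scaled by
# 1024**3 above); the limit arguments are passed through to AtUproot
# unchanged.
#
#     results = run(
#         sequence, datasets, name="zdb", outdir="output/",
#         tempdir="_ccsp_temp/", mode="multiprocessing", batch_opts="",
#         ncores=4, nblocks_per_dataset=-1, nblocks_per_process=-1,
#         nfiles_per_dataset=-1, nfiles_per_process=1, blocksize=1000000,
#         cachesize=8, quiet=True, dryrun=False, sample=None,
#         predetermined_nevents_in_file=None,
#     )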
def main():
    options = parse_args()
    mode = options.mode
    njobs = options.ncores

    # setup jobs
    with open(options.config, 'r') as f:
        cfg = yaml.full_load(f)

    # group jobs
    files = cfg["files"]
    if options.nfiles > 0:
        files = files[:options.nfiles]
    if mode in ["multiprocessing"] or njobs < 0:
        njobs = len(files)

    grouped_files = [list(x) for x in np.array_split(files, njobs)]
    tasks = [
        {"task": df_skim, "args": (fs, cfg, options.output.format(idx)), "kwargs": {}}
        for idx, fs in enumerate(grouped_files)
    ]

    if mode == "multiprocessing" and options.ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=options.ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            "zdb", "_ccsp_temp/",
            tasks=tasks,
            options=options.sge_opts,
            sleep=5,
            request_resubmission_options=True,
        )
    print("Finished!")
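# The --config file is YAML with at least a "files" list naming the inputs
# to skim; a minimal example (other keys, e.g. the "query" used by analyse
# below, are omitted):
#
#     files:
#         - /path/to/input_000.root
#         - /path/to/input_001.root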
def parallel_draw(draw, jobs, options):
    if len(jobs) == 0:
        return

    mode = options.mode
    njobs = options.ncores
    if options.mode in ["multiprocessing"]:
        # one group per job; splitting into len(jobs)+1 groups would leave an
        # empty group and hence an empty task
        njobs = len(jobs)

    jobs = [list(x) for x in np.array_split(jobs, njobs)]
    tasks = [{
        "task": multidraw,
        "args": (draw, args),
        "kwargs": {},
    } for args in jobs]

    if mode == "multiprocessing" and options.ncores == 0:
        results = pysge.local_submit(tasks)
    elif mode == "multiprocessing":
        results = pysge.mp_submit(tasks, ncores=options.ncores)
    elif mode == "sge":
        results = pysge.sge_submit(
            tasks, "zdb", "_ccsp_temp/",
            options=options.sge_opts,
            request_resubmission_options=True,
            return_files=True,
        )
    else:
        results = []
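# Why njobs is capped at len(jobs): np.array_split pads with empty groups
# when asked for more groups than there are items, and each empty group
# would otherwise become a no-op task.
#
#     >>> import numpy as np
#     >>> [list(x) for x in np.array_split([1, 2, 3], 5)]
#     [[1], [2], [3], [], []]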
def main():
    options = parse_args()

    results = pysge.sge_resume(
        "zdb", options.path,
        options=options.sge_opts,
        sleep=5,
        request_resubmission_options=True,
    )

    njobs = options.ncores
    if options.mode in ["multiprocessing"] or options.ncores < 0:
        njobs = len(results)

    grouped_args = [list(x) for x in np.array_split(results, njobs)]
    tasks = [{
        "task": df_open_merge,
        "args": (args,),
        "kwargs": {"quiet": True},
    } for args in grouped_args]

    if options.mode == "multiprocessing" and options.ncores == 0:
        merge_results = pysge.local_submit(tasks)
        df = pd.DataFrame()
        for result in merge_results:
            df = df_merge(df, result)
    elif options.mode == "multiprocessing":
        merge_results = pysge.mp_submit(tasks, ncores=options.ncores)
        df = pd.DataFrame()
        for result in merge_results:
            df = df_merge(df, result)
    elif options.mode == "sge":
        merge_results = pysge.sge_submit(
            "zdb-merge", "_ccsp_temp/",
            tasks=tasks,
            options=options.sge_opts,
            sleep=5,
            request_resubmission_options=True,
        )
        df = df_open_merge(merge_results)
    else:
        df = pd.DataFrame()
    print(df)

    path, table = options.output.split(":")
    df.to_hdf(
        path, table, format='table', append=False,
        complevel=9, complib='zlib',
    )
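# options.output follows the "{file_name}:{table_name}" convention used
# throughout these scripts; the merged frame written above can be read back
# with pandas:
#
#     df = pd.read_hdf("merged.h5", "MergedTable")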
outpaths = [
    "data/hists_qcd_estimation.h5:DataAggEvents",
    "data/hists_qcd_estimation.h5:MCAggEvents",
    "data/hists_qcd_estimation.h5:MCAggEvents_jes",
    "data/hists_qcd_estimation.h5:MCAggEvents_jer",
    "data/hists_qcd_estimation.h5:MCAggEvents_unclust",
    "data/hists_qcd_estimation.h5:MCAggEvents_lepscales",
]

tasks = []
for idx, outpath in enumerate(outpaths):
    start = 10 * idx
    stop = min(10 * (idx + 1), len(paths))
    tasks.append({
        "task": job,
        "args": (
            [os.path.abspath(p) for p in paths[start:stop]],
            os.path.abspath(outpath),
        ),
        "kwargs": {},
    })

pysge.sge_submit(
    tasks, "merge", "_ccsp_temp",
    options="-q hep.q -l h_vmem=24G -pe hep.pe 8",
)
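# Each output path takes a fixed slice of 10 inputs (outpaths[idx] receives
# paths[10*idx:10*(idx+1)]), so the number of output paths has to cover the
# input list:
#
#     import math
#     assert len(outpaths) >= math.ceil(len(paths) / 10)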
def analyse(
    config, mode="multiprocessing", ncores=0, nfiles=-1, batch_opts="",
    output=None, chunksize=500000, merge_opts={},
):
    if output is not None and len(output.split(":")) != 2:
        raise ValueError(
            "The output kwarg should be None or a string with the format "
            "'{file_name}:{table_name}' instead of " + "{}".format(output))

    njobs = ncores
    # setup jobs
    with open(config, 'r') as f:
        cfg = yaml.full_load(f)

    # group jobs
    files = cfg["files"]
    if nfiles > 0:
        files = files[:nfiles]
    if mode in ["multiprocessing"] or njobs < 0:
        njobs = len(files)

    grouped_files = [list(x) for x in np.array_split(files, njobs)]
    tasks = [{
        "task": df_process,
        "args": (fs, cfg["query"]),
        "kwargs": {"chunksize": chunksize},
    } for fs in grouped_files]

    results = submit_tasks(tasks, mode=mode, ncores=ncores, batch_opts=batch_opts)

    if mode == 'multiprocessing':
        df = functools.reduce(lambda x, y: df_merge(x, y), results)
    else:
        # grouped multi-merge
        merge_njobs = merge_opts.get("ncores", 100)
        grouped_merges = [
            list(x) for x in np.array_split(results, merge_njobs)
        ]
        tasks = [{
            "task": df_open_merge,
            "args": (r,),
            "kwargs": {},
        } for r in grouped_merges]

        merge_mode = merge_opts.get("mode", "multiprocessing")
        if merge_mode == "multiprocessing" and ncores == 0:
            semimerged_results = pysge.local_submit(tasks)
            df = functools.reduce(lambda x, y: df_merge(x, y), semimerged_results)
        elif merge_mode == "multiprocessing":
            semimerged_results = pysge.mp_submit(tasks, ncores=ncores)
            df = functools.reduce(lambda x, y: df_merge(x, y), semimerged_results)
        elif merge_mode == "sge":
            semimerged_results = pysge.sge_submit(
                tasks, "zdb-merge", "_ccsp_temp",
                options=merge_opts.get("batch_opts", "-q hep.q"),
                sleep=5,
                request_resubmission_options=True,
                return_files=True,
            )
            df = df_open_merge(semimerged_results)

    if output is not None:
        path, table = output.split(":")
        df.to_hdf(
            path, table, format='table', append=False,
            complevel=9, complib='zlib',
        )
    else:
        return df
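# Usage sketch (hypothetical paths): run the query on SGE and also merge on
# the batch system, writing the result to an HDF5 table. The merge_opts keys
# ("mode", "ncores", "batch_opts") are the ones read in the function above.
#
#     analyse(
#         "configs/query.yaml",
#         mode="sge",
#         batch_opts="-q hep.q",
#         output="data/results.h5:Results",
#         merge_opts={"mode": "sge", "ncores": 50, "batch_opts": "-q hep.q"},
#     )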
def main():
    with open("data_v2.txt", 'r') as f:
        datain = f.read()

    datasets, tasks = [], []
    for block in datain.split("\n\n"):
        if len(block) == 0:
            continue
        lines = block.split("\n")
        das = lines[1]  # DAS dataset path
        files = sorted(list(set(lines[3].split(" "))))  # deduplicated file list
        summary = eval(lines[5])[0]  # DAS summary dict, e.g. {'nevents': ...}
        parent = get_parent(das)
        print(parent)
        #runyear, runletter, ver = get_runera(das)

        tasks.extend([{
            "task": get_nevents_sumweights,
            "args": (f'{xrd_redir}{p}',),
            "kwargs": {"param": None},
        } for p in files])

        isdata = True
        tree = "Events"
        xsec = None
        datasets.append({
            "name": parent,
            "parent": parent,
            "isdata": isdata,
            "nevents": int(summary["nevents"]),
            "sumweights": None,
            "files": [f'{xrd_redir}{f}' for f in files],
            "file_nevents": [],
            "DAS": das,
            "tree": tree,
            "xsection": xsec,
        })

    #results = pysge.local_submit(tasks)
    #results = pysge.mp_submit(tasks, 6)
    results = pysge.sge_submit(tasks, "dasq", "_ccsp_temp")

    all_files_results = {}
    for r in results:
        all_files_results.update(r)

    new_datasets = []
    for d in datasets:
        tot_nevts = 0
        tot_sumw = 0.
        fnevts = []
        for p in d["files"]:
            #nevts, sumw = all_files_results[p]
            nevts = all_files_results[p]
            sumw = nevts  # data: sum of weights is just the event count
            tot_nevts += nevts
            tot_sumw += sumw
            fnevts.append(nevts)

        if tot_nevts != d["nevents"]:
            print("Mismatch in nevents from files {} and summary {} for {}".format(
                tot_nevts, d["nevents"], d["DAS"],
            ))

        new_datasets.append({
            "name": d["name"],
            "parent": d["parent"],
            "isdata": d["isdata"],
            "nevents": tot_nevts,
            "sumweights": tot_sumw,
            "files": d["files"],
            "file_nevents": fnevts,
            "DAS": d["DAS"],
            "tree": d["tree"],
            "xsection": d["xsection"],
        })

    with open("data_v2.yaml", 'w') as f:
        yaml.dump(new_datasets, f, indent=4)
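# Layout of one double-newline-separated block in data_v2.txt, as inferred
# from the parsing above (indices are lines within the block; lines 0, 2 and
# 4 are ignored):
#
#     <label line, ignored>
#     /SomePrimaryDataset/Run2018A-.../NANOAOD     # line 1: DAS path
#     <header, ignored>
#     /store/data/...1.root /store/data/...2.root  # line 3: space-separated files
#     <header, ignored>
#     [{'nevents': 123456, ...}]                   # line 5: eval'd summary list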