def do_dask_pandas_job(client, filelist):
    """Run the Pandas-based test processor on a Dask cluster and verify output.

    The executor is created with ``use_dataframes=True``, so the runner
    returns a distributed (Dask) DataFrame rather than accumulated histograms.
    """
    from coffea import nanoevents
    from coffea.processor.test_items import NanoTestProcessorPandas

    dask_executor = processor.DaskExecutor(client=client, use_dataframes=True)
    runner = processor.Runner(executor=dask_executor, schema=nanoevents.NanoAODSchema)
    output = runner(filelist, "Events", processor_instance=NanoTestProcessorPandas())

    # Can save to Parquet straight from the distributed DataFrame without
    # explicitly collecting the outputs:
    #
    #   import dask.dataframe as dd
    #   dd.to_parquet(df=output, path=/output/path/)
    #
    # It's also possible to do some operations on distributed DataFrames
    # without collecting them. For example, split the dataframe by column
    # value and save to different directories:
    #
    #   dd.to_parquet(df=output[output.dataset=='ZJets'], path=/output/path/ZJets/)
    #   dd.to_parquet(df=output[output.dataset=='Data'], path=/output/path/Data/)

    # Alternatively, keep working with the output locally: collect the Dask
    # DataFrame back into a Pandas DataFrame.
    output = output.compute()

    assert output[output.dataset == "ZJets"].shape[0] == 6
    assert output[output.dataset == "Data"].shape[0] == 18
def test_dataframe_analysis(executor, schema, chunksize, maxchunks, skipbadfiles):
    """Run NanoTestProcessor over the test samples and check cutflow counts.

    Parametrized over executor class, schema, chunk size, ``maxchunks`` and
    ``skipbadfiles``; the expected counts in the ``maxchunks`` branch depend
    on whether the single processed chunk covers the whole file
    (``chunksize == 100_000``) or only a slice of it.
    """
    from coffea.processor.test_items import NanoTestProcessor

    filelist = {
        "ZJets": [osp.abspath("tests/samples/nano_dy.root")],
        "Data": [osp.abspath("tests/samples/nano_dimuon.root")],
    }
    executor = executor()
    run = processor.Runner(
        executor=executor,
        schema=schema,
        chunksize=chunksize,
        maxchunks=maxchunks,
        skipbadfiles=skipbadfiles,
    )

    hists = run(filelist, "Events", processor_instance=NanoTestProcessor())

    if maxchunks is None:
        assert hists["cutflow"]["ZJets_pt"] == 18
        assert hists["cutflow"]["ZJets_mass"] == 6
        assert hists["cutflow"]["Data_pt"] == 84
        assert hists["cutflow"]["Data_mass"] == 66
    else:
        assert maxchunks == 1
        # BUG FIX: the original wrote `assert x == 18 if cond else 2`, which
        # parses as `assert ((x == 18) if cond else 2)` — when cond is false
        # the assert checks the truthy constant and always passes.  The
        # conditional must be parenthesized so it selects the expected value.
        assert hists["cutflow"]["ZJets_pt"] == (18 if chunksize == 100_000 else 2)
        assert hists["cutflow"]["ZJets_mass"] == (6 if chunksize == 100_000 else 1)
        assert hists["cutflow"]["Data_pt"] == (84 if chunksize == 100_000 else 13)
        assert hists["cutflow"]["Data_mass"] == (66 if chunksize == 100_000 else 12)
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run NanoEventsProcessor on Dask with an optional column cache.

    Registers the column cache on the cluster, enables worker affinity only
    when a cache strategy is in use (so cached columns land on the same
    workers), and passes canary cache keys to the processor so the test can
    observe whether the cache was hit.  Returns the ``"worker"`` entry of the
    output histograms.
    """
    from coffea.nanoevents import schemas
    from coffea.processor.dask import register_columncache
    from coffea.processor.test_items import NanoEventsProcessor

    register_columncache(client)

    # Idiom fix: `True if x is not None else False` is just `x is not None`.
    # Affinity only matters when results are cached on specific workers.
    worker_affinity = cachestrategy is not None
    executor = processor.DaskExecutor(client=client, worker_affinity=worker_affinity)
    run = processor.Runner(
        executor=executor,
        schema=schemas.NanoAODSchema,
        cachestrategy=cachestrategy,
        savemetrics=True,
    )
    hists, metrics = run(
        filelist,
        "Events",
        processor_instance=NanoEventsProcessor(
            canaries=[
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
                "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
            ]
        ),
    )

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
    return hists["worker"]
def do_parsl_job(filelist, flatten=False, compression=0, config=None):
    """Run NanoTestProcessor through the Parsl executor and check cutflows.

    ``flatten`` is accepted for call-site compatibility but is not used here.
    """
    from coffea.processor.test_items import NanoTestProcessor

    parsl_executor = processor.ParslExecutor(compression=compression, config=config)
    runner = processor.Runner(executor=parsl_executor)
    hists = runner(filelist, "Events", processor_instance=NanoTestProcessor())

    expected = {"ZJets_pt": 18, "ZJets_mass": 6, "Data_pt": 84, "Data_mass": 66}
    for key, count in expected.items():
        assert hists["cutflow"][key] == count
def do_dask_job(client, filelist, compression=0):
    """Run NanoTestProcessor through the Dask executor and check cutflows."""
    from coffea.processor.test_items import NanoTestProcessor

    dask_executor = processor.DaskExecutor(client=client, compression=compression)
    runner = processor.Runner(executor=dask_executor)
    hists = runner(filelist, "Events", processor_instance=NanoTestProcessor())

    expected = {"ZJets_pt": 18, "ZJets_mass": 6, "Data_pt": 84, "Data_mass": 66}
    for key, count in expected.items():
        assert hists["cutflow"][key] == count
def run(query, chunksize, workers, file):
    """Benchmark one query over ``file`` and return ``(output, metrics)``.

    Picks a futures executor for ``workers > 1`` or an iterative one
    otherwise, runs the query with the given chunk size, and records wall
    time, achieved chunk size, and per-category CPU time deltas (taken from
    the module-level ``proc`` — presumably a psutil Process; verify against
    the surrounding file).
    """
    if not file.startswith("/dev/shm"):
        # Best effort: flush dirty pages and drop the OS page cache so the
        # benchmark measures cold reads (skipped for in-memory /dev/shm).
        # https://stackoverflow.com/questions/9551838/how-to-purge-disk-i-o-caches-on-linux
        try:
            subprocess.run("sync", check=True)
            subprocess.run(
                ["sudo", "bash", "-c", "echo 3 > /proc/sys/vm/drop_caches"],
                check=True,
            )
        except (PermissionError, subprocess.CalledProcessError, FileNotFoundError):
            # BUG FIX: with check=True a failing `sudo` raises
            # CalledProcessError (and a missing binary FileNotFoundError),
            # which the original `except PermissionError` never caught — the
            # benchmark aborted on hosts without passwordless sudo.  Cache
            # dropping is best-effort, so swallow all three.
            pass

    tic = time.monotonic()
    cputic = proc.cpu_times()

    if workers > 1:
        executor = processor.FuturesExecutor(workers=workers, status=False)
    else:
        executor = processor.IterativeExecutor(status=False)
    runner = processor.Runner(
        executor=executor,
        schema=nanoevents.NanoAODSchema,
        savemetrics=True,
        chunksize=chunksize,
    )
    output, metrics = runner(
        fileset={"SingleMu": [file]},
        treename="Events",
        processor_instance=query(),
    )

    toc = time.monotonic()
    cputoc = proc.cpu_times()

    metrics["query"] = query.__name__
    metrics["tgt_chunksize"] = chunksize
    # Achieved average chunk size, as opposed to the requested target above.
    metrics["chunksize"] = metrics["entries"] / metrics["chunks"]
    metrics["workers"] = workers
    metrics["walltime"] = toc - tic
    metrics["path"] = os.path.dirname(file)
    metrics.update(
        {
            n: f - i
            for n, f, i in zip(
                "user system children_user children_system iowait".split(),
                cputoc,
                cputic,
            )
        }
    )
    return output, metrics
def test_dataframe_analysis(executor, schema, chunksize):
    """Run NanoTestProcessor over the test samples and check cutflow counts."""
    from coffea.processor.test_items import NanoTestProcessor

    filelist = {
        "ZJets": [osp.abspath("tests/samples/nano_dy.root")],
        "Data": [osp.abspath("tests/samples/nano_dimuon.root")],
    }
    runner = processor.Runner(executor=executor(), schema=schema, chunksize=chunksize)
    hists = runner(filelist, "Events", processor_instance=NanoTestProcessor())

    expected = {"ZJets_pt": 18, "ZJets_mass": 6, "Data_pt": 84, "Data_mass": 66}
    for key, count in expected.items():
        assert hists["cutflow"][key] == count
def test_nanoevents_analysis(executor, compression, maxchunks):
    """Run NanoEventsProcessor with skipbadfiles enabled and check cutflows.

    The fileset includes a deliberately non-existent file ("DummyBad") to
    exercise ``skipbadfiles=True``, plus per-dataset user metadata that the
    processor validates via the ``checkusermeta`` flag.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    def _entry(path, **metadata):
        # Build one fileset entry; metadata is attached only when provided.
        spec = {"treename": "Events", "files": [osp.abspath(path)]}
        if metadata:
            spec["metadata"] = metadata
        return spec

    filelist = {
        "DummyBad": _entry("tests/samples/non_existent.root"),
        "ZJets": _entry(
            "tests/samples/nano_dy.root", checkusermeta=True, someusermeta="hello"
        ),
        "Data": _entry(
            "tests/samples/nano_dimuon.root", checkusermeta=True, someusermeta2="world"
        ),
    }

    runner = processor.Runner(
        executor=executor(compression=compression),
        skipbadfiles=True,
        schema=processor.NanoAODSchema,
        maxchunks=maxchunks,
    )
    hists = runner(filelist, "Events", processor_instance=NanoEventsProcessor())

    expected = {"ZJets_pt": 18, "ZJets_mass": 6, "Data_pt": 84, "Data_mass": 66}
    for key, count in expected.items():
        assert hists["cutflow"][key] == count
f"cp nano_dy.parquet /mnt/cephfs/nanoevents/ZJets/nano_dy.{i}.parquet" ) os.system( f"cp nano_dimuon.parquet /mnt/cephfs/nanoevents/Data/nano_dimuon.{i}.parquet" ) from dask.distributed import Client, LocalCluster cluster = LocalCluster(processes=True, threads_per_worker=1) client = Client(cluster) executor = processor.DaskExecutor(client=client) run = processor.Runner( executor=executor, use_skyhook=True, format="parquet", schema=schemas.NanoAODSchema, ) hists = run( { "ZJets": "/mnt/cephfs/nanoevents/ZJets", "Data": "/mnt/cephfs/nanoevents/Data", }, "Events", processor_instance=NanoEventsProcessor(), ) assert hists["cutflow"]["ZJets_pt"] == 108 assert hists["cutflow"]["ZJets_mass"] == 36 assert hists["cutflow"]["Data_pt"] == 504
f"cp nano_dy.parquet /mnt/cephfs/nanoevents/ZJets/nano_dy.{i}.parquet" ) os.system( f"cp nano_dimuon.parquet /mnt/cephfs/nanoevents/Data/nano_dimuon.{i}.parquet" ) from dask.distributed import Client, LocalCluster cluster = LocalCluster(processes=True, threads_per_worker=1) client = Client(cluster) executor = processor.DaskExecutor(client=client) run = processor.Runner( executor=executor, ceph_config_path="/tmp/testradosparquetjob/ceph.conf", format="parquet", schema=schemas.NanoAODSchema, ) hists = run( { "ZJets": "/mnt/cephfs/nanoevents/ZJets", "Data": "/mnt/cephfs/nanoevents/Data", }, "Events", processor_instance=NanoEventsProcessor(), ) assert hists["cutflow"]["ZJets_pt"] == 108 assert hists["cutflow"]["ZJets_mass"] == 36 assert hists["cutflow"]["Data_pt"] == 504