Example #1
def do_dask_pandas_job(client, filelist):
    from coffea import nanoevents, processor
    from coffea.processor.test_items import NanoTestProcessorPandas

    executor = processor.DaskExecutor(client=client, use_dataframes=True)
    run = processor.Runner(executor=executor, schema=nanoevents.NanoAODSchema)

    output = run(filelist,
                 "Events",
                 processor_instance=NanoTestProcessorPandas())

    # The output can be saved to Parquet straight from the distributed DataFrame,
    # without explicitly collecting it:
    #
    # import dask.dataframe as dd
    # dd.to_parquet(df=output, path="/output/path/")
    #
    # It is also possible to operate on the distributed DataFrame without collecting it.
    # For example, split it by column value and save to different directories:
    #
    # dd.to_parquet(df=output[output.dataset == 'ZJets'], path="/output/path/ZJets/")
    # dd.to_parquet(df=output[output.dataset == 'Data'], path="/output/path/Data/")

    # Alternatively, keep working with the output.
    # Convert the Dask DataFrame back to a pandas DataFrame:
    output = output.compute()

    assert output[output.dataset == "ZJets"].shape[0] == 6
    assert output[output.dataset == "Data"].shape[0] == 18
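For context, a minimal driver for do_dask_pandas_job might look like the sketch below. The LocalCluster setup is an assumption made for illustration; the sample file paths are the ones used by the other tests on this page.

# Hypothetical driver for do_dask_pandas_job; the local cluster setup is an
# assumption made for illustration.
import os.path as osp

from dask.distributed import Client, LocalCluster

if __name__ == "__main__":
    cluster = LocalCluster(processes=True, threads_per_worker=1)
    client = Client(cluster)
    filelist = {
        "ZJets": [osp.abspath("tests/samples/nano_dy.root")],
        "Data": [osp.abspath("tests/samples/nano_dimuon.root")],
    }
    do_dask_pandas_job(client, filelist)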
Example #2
def test_dataframe_analysis(executor, schema, chunksize, maxchunks, skipbadfiles):
    import os.path as osp

    from coffea import processor
    from coffea.processor.test_items import NanoTestProcessor

    filelist = {
        "ZJets": [osp.abspath("tests/samples/nano_dy.root")],
        "Data": [osp.abspath("tests/samples/nano_dimuon.root")],
    }

    executor = executor()
    run = processor.Runner(
        executor=executor,
        schema=schema,
        chunksize=chunksize,
        maxchunks=maxchunks,
        skipbadfiles=skipbadfiles,
    )

    hists = run(filelist, "Events", processor_instance=NanoTestProcessor())

    if maxchunks is None:
        assert hists["cutflow"]["ZJets_pt"] == 18
        assert hists["cutflow"]["ZJets_mass"] == 6
        assert hists["cutflow"]["Data_pt"] == 84
        assert hists["cutflow"]["Data_mass"] == 66
    else:
        assert maxchunks == 1
        assert hists["cutflow"]["ZJets_pt"] == 18 if chunksize == 100_000 else 2
        assert hists["cutflow"]["ZJets_mass"] == 6 if chunksize == 100_000 else 1
        assert hists["cutflow"]["Data_pt"] == 84 if chunksize == 100_000 else 13
        assert hists["cutflow"]["Data_mass"] == 66 if chunksize == 100_000 else 12
Example #3
def do_dask_cached(client, filelist, cachestrategy=None):
    from coffea import processor
    from coffea.nanoevents import schemas
    from coffea.processor.dask import register_columncache
    from coffea.processor.test_items import NanoEventsProcessor

    register_columncache(client)

    worker_affinity = cachestrategy is not None
    executor = processor.DaskExecutor(client=client,
                                      worker_affinity=worker_affinity)
    run = processor.Runner(
        executor=executor,
        schema=schemas.NanoAODSchema,
        cachestrategy=cachestrategy,
        savemetrics=True,
    )

    hists, metrics = run(
        filelist,
        "Events",
        processor_instance=NanoEventsProcessor(canaries=[
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
            "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
        ]),
    )

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
    return hists["worker"]
Example #4
def do_parsl_job(filelist, flatten=False, compression=0, config=None):
    from coffea import processor
    from coffea.processor.test_items import NanoTestProcessor

    executor = processor.ParslExecutor(compression=compression, config=config)
    run = processor.Runner(executor=executor)

    hists = run(filelist, "Events", processor_instance=NanoTestProcessor())

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Example #5
def do_dask_job(client, filelist, compression=0):
    from coffea import processor
    from coffea.processor.test_items import NanoTestProcessor

    executor = processor.DaskExecutor(client=client, compression=compression)
    run = processor.Runner(executor=executor)

    hists = run(filelist, "Events", processor_instance=NanoTestProcessor())

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Example #6
def run(query, chunksize, workers, file):
    # Imports and the `proc` handle are assumed to live at module level in the
    # original benchmark script; they are pulled into the function here so the
    # snippet is self-contained. `proc` is taken to be a psutil.Process() for the
    # current process (its cpu_times() fields match psutil's
    # user/system/children_user/children_system/iowait).
    import os
    import subprocess
    import time

    import psutil

    from coffea import nanoevents, processor

    proc = psutil.Process()

    if not file.startswith("/dev/shm"):
        # https://stackoverflow.com/questions/9551838/how-to-purge-disk-i-o-caches-on-linux
        try:
            subprocess.run("sync", check=True)
            subprocess.run(
                ["sudo", "bash", "-c", "echo 3 > /proc/sys/vm/drop_caches"],
                check=True)
        except PermissionError:
            pass

    tic = time.monotonic()
    cputic = proc.cpu_times()

    if workers > 1:
        executor = processor.FuturesExecutor(workers=workers, status=False)
    else:
        executor = processor.IterativeExecutor(status=False)
    runner = processor.Runner(
        executor=executor,
        schema=nanoevents.NanoAODSchema,
        savemetrics=True,
        chunksize=chunksize,
    )
    output, metrics = runner(
        fileset={"SingleMu": [file]},
        treename="Events",
        processor_instance=query(),
    )

    toc = time.monotonic()
    cputoc = proc.cpu_times()
    metrics["query"] = query.__name__
    metrics["tgt_chunksize"] = chunksize
    metrics["chunksize"] = metrics["entries"] / metrics["chunks"]
    metrics["workers"] = workers
    metrics["walltime"] = toc - tic
    metrics["path"] = os.path.dirname(file)
    metrics.update({
        n: f - i
        for n, f, i in zip(
            "user system children_user children_system iowait".split(),
            cputoc,
            cputic,
        )
    })

    return output, metrics
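A driver for this benchmark might sweep chunk sizes and worker counts and tabulate the returned metrics. The sketch below uses NanoTestProcessor as a stand-in for the benchmark's query classes and a placeholder file path; both are assumptions, not part of the original script.

# Hypothetical benchmark driver; the query class and file path are placeholders.
import pandas as pd

from coffea.processor.test_items import NanoTestProcessor

results = []
for workers in (1, 4):
    for chunksize in (50_000, 200_000):
        _, metrics = run(NanoTestProcessor, chunksize, workers,
                         "/dev/shm/nanoaod_sample.root")
        results.append(metrics)

print(pd.DataFrame(results))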
Example #7
def test_dataframe_analysis(executor, schema, chunksize):
    import os.path as osp

    from coffea import processor
    from coffea.processor.test_items import NanoTestProcessor

    filelist = {
        "ZJets": [osp.abspath("tests/samples/nano_dy.root")],
        "Data": [osp.abspath("tests/samples/nano_dimuon.root")],
    }

    executor = executor()
    run = processor.Runner(executor=executor,
                           schema=schema,
                           chunksize=chunksize)

    hists = run(filelist, "Events", processor_instance=NanoTestProcessor())

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Example #8
def test_nanoevents_analysis(executor, compression, maxchunks):
    import os.path as osp

    from coffea import processor
    from coffea.processor.test_items import NanoEventsProcessor

    filelist = {
        "DummyBad": {
            "treename": "Events",
            "files": [osp.abspath("tests/samples/non_existent.root")],
        },
        "ZJets": {
            "treename": "Events",
            "files": [osp.abspath("tests/samples/nano_dy.root")],
            "metadata": {
                "checkusermeta": True,
                "someusermeta": "hello"
            },
        },
        "Data": {
            "treename": "Events",
            "files": [osp.abspath("tests/samples/nano_dimuon.root")],
            "metadata": {
                "checkusermeta": True,
                "someusermeta2": "world"
            },
        },
    }

    executor = executor(compression=compression)
    run = processor.Runner(
        executor=executor,
        skipbadfiles=True,
        schema=processor.NanoAODSchema,
        maxchunks=maxchunks,
    )

    hists = run(filelist, "Events", processor_instance=NanoEventsProcessor())

    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Example #9
            f"cp nano_dy.parquet /mnt/cephfs/nanoevents/ZJets/nano_dy.{i}.parquet"
        )
        os.system(
            f"cp nano_dimuon.parquet /mnt/cephfs/nanoevents/Data/nano_dimuon.{i}.parquet"
        )

    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(processes=True, threads_per_worker=1)
    client = Client(cluster)

    executor = processor.DaskExecutor(client=client)

    run = processor.Runner(
        executor=executor,
        use_skyhook=True,
        format="parquet",
        schema=schemas.NanoAODSchema,
    )

    hists = run(
        {
            "ZJets": "/mnt/cephfs/nanoevents/ZJets",
            "Data": "/mnt/cephfs/nanoevents/Data",
        },
        "Events",
        processor_instance=NanoEventsProcessor(),
    )

    assert hists["cutflow"]["ZJets_pt"] == 108
    assert hists["cutflow"]["ZJets_mass"] == 36
    assert hists["cutflow"]["Data_pt"] == 504
Example #10
            f"cp nano_dy.parquet /mnt/cephfs/nanoevents/ZJets/nano_dy.{i}.parquet"
        )
        os.system(
            f"cp nano_dimuon.parquet /mnt/cephfs/nanoevents/Data/nano_dimuon.{i}.parquet"
        )

    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(processes=True, threads_per_worker=1)
    client = Client(cluster)

    executor = processor.DaskExecutor(client=client)

    run = processor.Runner(
        executor=executor,
        ceph_config_path="/tmp/testradosparquetjob/ceph.conf",
        format="parquet",
        schema=schemas.NanoAODSchema,
    )

    hists = run(
        {
            "ZJets": "/mnt/cephfs/nanoevents/ZJets",
            "Data": "/mnt/cephfs/nanoevents/Data",
        },
        "Events",
        processor_instance=NanoEventsProcessor(),
    )

    assert hists["cutflow"]["ZJets_pt"] == 108
    assert hists["cutflow"]["ZJets_mass"] == 36
    assert hists["cutflow"]["Data_pt"] == 504