Exemple #1
0
def test_preloaded_nanoevents():
    """Process eagerly loaded branches of nano_dy.root with NanoEventsProcessor.

    The listed branches are read with uproot, wrapped in a
    ``SimplePreloadedColumnSource`` and handed to
    ``NanoEventsFactory.from_preloaded``.  The ``cutflow`` entries returned by
    the processor are compared with fixed reference values, and accessing
    ``events.Muon.matched_jet`` is required to raise ``AttributeError``.
    """
    branch_names = [
        'nMuon',
        'Muon_pt',
        'Muon_eta',
        'Muon_phi',
        'Muon_mass',
        'Muon_charge',
        'nJet',
        'Jet_eta',
    ]
    proc = NanoEventsProcessor(columns=branch_names)

    sample_path = os.path.abspath('tests/samples/nano_dy.root')
    root_file = uproot.open(sample_path)
    event_tree = root_file['Events']
    preloaded = event_tree.arrays(branch_names, how=dict)
    source = SimplePreloadedColumnSource(
        preloaded,
        root_file.file.uuid,
        event_tree.num_entries,
        object_path='/Events',
    )
    print(preloaded)

    factory = NanoEventsFactory.from_preloaded(
        source, metadata={'dataset': 'ZJets'})
    events = factory.events()
    result = proc.process(events)

    print(result)
    for key, expected in (('ZJets_pt', 18), ('ZJets_mass', 6)):
        assert result['cutflow'][key] == expected

    # The attribute lookup itself is what must fail here.
    with pytest.raises(AttributeError):
        print(events.Muon.matched_jet)
Exemple #2
0
def test_preloaded_nanoevents():
    """Check NanoEventsProcessor on events built from a preloaded column source.

    Muon and jet branches of ``tests/samples/nano_dy.root`` are loaded into a
    dict of arrays, exposed through ``SimplePreloadedColumnSource`` and
    converted to events by ``NanoEventsFactory.from_preloaded``.  The test
    asserts two ``cutflow`` reference values and that
    ``events.Muon.matched_jet`` raises ``AttributeError``.
    """
    muon_branches = [
        "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
    ]
    jet_branches = ["nJet", "Jet_eta"]
    wanted = muon_branches + jet_branches
    nano_processor = NanoEventsProcessor(columns=wanted)

    opened = uproot.open(os.path.abspath("tests/samples/nano_dy.root"))
    events_tree = opened["Events"]
    loaded = events_tree.arrays(wanted, how=dict)
    column_source = SimplePreloadedColumnSource(
        loaded,
        opened.file.uuid,
        events_tree.num_entries,
        object_path="/Events",
    )
    print(loaded)

    dataset_info = {"dataset": "ZJets"}
    events = NanoEventsFactory.from_preloaded(
        column_source, metadata=dataset_info
    ).events()
    output = nano_processor.process(events)

    print(output)
    reference = {"ZJets_pt": 18, "ZJets_mass": 6}
    for name, count in reference.items():
        assert output["cutflow"][name] == count

    with pytest.raises(AttributeError):
        print(events.Muon.matched_jet)
Exemple #3
0
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run NanoEventsProcessor via the dask executor and verify the cutflow.

    Calls ``register_columncache(client)``, then runs
    ``processor.run_uproot_job`` on the ``Events`` tree of ``filelist`` with
    ``NanoAODSchema`` and ``savemetrics`` enabled.  ``worker_affinity`` is
    switched on only when ``cachestrategy`` is not ``None``.

    Returns:
        The ``"worker"`` entry of the accumulated histograms.
    """
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    use_affinity = cachestrategy is not None
    exe_args = {
        "client": client,
        "schema": NanoAODSchema,
        "cachestrategy": cachestrategy,
        "savemetrics": True,
        "worker_affinity": use_affinity,
    }

    # Passed verbatim as ``canaries``; presumably column-cache keys the
    # processor looks for -- TODO confirm against NanoEventsProcessor.
    canaries = [
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
    ]
    # savemetrics=True makes the job return a (histograms, metrics) pair.
    hists, _metrics = processor.run_uproot_job(
        filelist,
        "Events",
        processor_instance=NanoEventsProcessor(canaries=canaries),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    reference = (
        ("ZJets_pt", 18),
        ("ZJets_mass", 6),
        ("Data_pt", 84),
        ("Data_mass", 66),
    )
    for key, expected in reference:
        assert hists["cutflow"][key] == expected
    return hists["worker"]
Exemple #4
0
def do_dask_cached(client, filelist, cachestrategy=None):
    """Execute the NanoEvents test processor on dask with the column cache.

    After ``register_columncache(client)`` the job is submitted through
    ``processor.run_uproot_job`` using ``processor.dask_executor``.  Worker
    affinity is enabled exactly when a ``cachestrategy`` is supplied.  Four
    ``cutflow`` reference counts are asserted before returning.

    Returns:
        ``hists['worker']`` from the job output.
    """
    from coffea.nanoevents import NanoAODSchema
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache
    register_columncache(client)

    exe_args = {
        'client': client,
        'schema': NanoAODSchema,
        'cachestrategy': cachestrategy,
        'savemetrics': True,
        'worker_affinity': cachestrategy is not None,
    }

    # Strings handed to the processor as ``canaries``; their exact meaning
    # is defined by NanoEventsProcessor (not visible here).
    canary_keys = [
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content',
        'a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content',
    ]
    nano_processor = NanoEventsProcessor(canaries=canary_keys)

    job_output = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=nano_processor,
        executor=processor.dask_executor,
        executor_args=exe_args,
    )
    # With savemetrics enabled the output is a (histograms, metrics) pair.
    hists, _metrics = job_output

    cutflow_reference = {
        'ZJets_pt': 18,
        'ZJets_mass': 6,
        'Data_pt': 84,
        'Data_mass': 66,
    }
    for name, count in cutflow_reference.items():
        assert hists['cutflow'][name] == count
    return hists['worker']
Exemple #5
0
def test_nanoevents_analysis(executor, compression, maxchunks):
    """Run NanoEventsProcessor with ``run_uproot_job`` and check the cutflow.

    Three datasets are submitted: ``DummyBad`` (a path named
    ``non_existent.root``), ``ZJets`` and ``Data``.  The job runs with
    ``skipbadfiles`` enabled, a single worker and ``NanoAODSchema``;
    ``executor``, ``compression`` and ``maxchunks`` are forwarded unchanged.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    sample_paths = {
        "DummyBad": "tests/samples/non_existent.root",
        "ZJets": "tests/samples/nano_dy.root",
        "Data": "tests/samples/nano_dimuon.root",
    }
    # One single-file list per dataset, keyed by dataset name.
    filelist = {
        dataset: [osp.abspath(path)] for dataset, path in sample_paths.items()
    }

    exe_args = {
        "workers": 1,
        "skipbadfiles": True,
        "schema": processor.NanoAODSchema,
        "compression": compression,
    }

    output = processor.run_uproot_job(
        filelist,
        "Events",
        NanoEventsProcessor(),
        executor,
        executor_args=exe_args,
        maxchunks=maxchunks,
    )

    reference = (
        ("ZJets_pt", 18),
        ("ZJets_mass", 6),
        ("Data_pt", 84),
        ("Data_mass", 66),
    )
    for key, expected in reference:
        assert output["cutflow"][key] == expected
Exemple #6
0
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run NanoEventsProcessor on dask in ``nano`` mode with a column cache.

    ``register_columncache(client)`` is called first; the job is then run via
    ``processor.run_uproot_job`` with ``savemetrics`` on and worker affinity
    enabled only when ``cachestrategy`` is not ``None``.  The ``cutflow``
    reference counts are asserted.

    Returns:
        ``hists['worker']`` from the job output.
    """
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache
    register_columncache(client)

    affinity = cachestrategy is not None
    exe_args = {
        'client': client,
        'nano': True,
        'cachestrategy': cachestrategy,
        'savemetrics': True,
        'worker_affinity': affinity,
    }

    # Single string passed as ``canaries``; interpreted by the processor.
    canary_keys = ['0001a210a3f8364811eaa29ff5b55c90beef;Events;0;40;Muon_pt']
    hists, _metrics = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=NanoEventsProcessor(canaries=canary_keys),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    expected_counts = [
        ('ZJets_pt', 18),
        ('ZJets_mass', 6),
        ('Data_pt', 84),
        ('Data_mass', 66),
    ]
    for key, count in expected_counts:
        assert hists['cutflow'][key] == count
    return hists['worker']
Exemple #7
0
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run NanoEventsProcessor through ``processor.Runner`` on a dask executor.

    Calls ``register_columncache(client)``, builds a ``DaskExecutor`` whose
    ``worker_affinity`` is true only when ``cachestrategy`` is given, and runs
    the ``Events`` tree of ``filelist`` with ``NanoAODSchema`` and
    ``savemetrics`` enabled.  Four ``cutflow`` reference counts are asserted.

    Returns:
        The ``"worker"`` entry of the accumulated histograms.
    """
    from coffea.nanoevents import schemas
    from coffea.processor.test_items import NanoEventsProcessor
    from coffea.processor.dask import register_columncache

    register_columncache(client)

    dask_executor = processor.DaskExecutor(
        client=client,
        worker_affinity=cachestrategy is not None,
    )
    runner = processor.Runner(
        executor=dask_executor,
        schema=schemas.NanoAODSchema,
        cachestrategy=cachestrategy,
        savemetrics=True,
    )

    # Passed verbatim as ``canaries``; presumably column-cache keys the
    # processor looks for -- TODO confirm against NanoEventsProcessor.
    canary_keys = [
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/nMuon%2C%21load%2C%21counts2offsets%2C%21skip/offsets",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_phi%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_pt%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_eta%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_mass%2C%21load%2C%21content",
        "a9490124-3648-11ea-89e9-f5b55c90beef/%2FEvents%3B1/0-40/Muon_charge%2C%21load%2C%21content",
    ]
    # savemetrics=True makes the runner return a (histograms, metrics) pair.
    hists, _metrics = runner(
        filelist,
        "Events",
        processor_instance=NanoEventsProcessor(canaries=canary_keys),
    )

    reference = {
        "ZJets_pt": 18,
        "ZJets_mass": 6,
        "Data_pt": 84,
        "Data_mass": 66,
    }
    for name, count in reference.items():
        assert hists["cutflow"][name] == count
    return hists["worker"]
Exemple #8
0
def test_loadsave():
    """Round-trip a NanoEventsProcessor through ``save``/``load``.

    The processor is written to ``testprocessor.coffea`` in the current
    directory, read back, and its ``pt`` accumulator is checked for
    compatibility with the original one.  The file is removed afterwards,
    whether or not the assertions pass.
    """
    path = 'testprocessor.coffea'
    try:
        original = NanoEventsProcessor()
        save(original, path)
        restored = load(path)
        assert 'pt' in restored.accumulator
        reference_hist = original.accumulator['pt']
        assert restored.accumulator['pt'].compatible(reference_hist)
    finally:
        # save() may have failed before creating the file.
        if os.path.exists(path):
            os.remove(path)
Exemple #9
0
def test_preloaded_nanoevents():
    """Process flat preloaded arrays with NanoEventsProcessor.

    Branches of ``tests/samples/nano_dy.root`` are read as flattened arrays
    with ASCII-decoded names, turned into events via
    ``NanoEvents.from_arrays`` and processed.  Two ``cutflow`` reference
    values are asserted, and accessing ``events.Muon.matched_jet`` is
    required to raise ``RuntimeError``.
    """
    wanted = [
        'nMuon',
        'Muon_pt',
        'Muon_eta',
        'Muon_phi',
        'Muon_mass',
        'Muon_charge',
        'nJet',
        'Jet_eta',
    ]
    nano_processor = NanoEventsProcessor(columns=wanted)

    sample_path = os.path.abspath('tests/samples/nano_dy.root')
    events_tree = uproot.open(sample_path)['Events']
    flat_arrays = events_tree.arrays(wanted, flatten=True, namedecode='ascii')
    # Only the construction is exercised; the frame is not read below.
    _frame = processor.PreloadedDataFrame(events_tree.numentries, flat_arrays)
    print(flat_arrays)

    dataset_info = {'dataset': 'ZJets'}
    events = NanoEvents.from_arrays(flat_arrays, metadata=dataset_info)
    output = nano_processor.process(events)

    print(output)
    for key, expected in (('ZJets_pt', 18), ('ZJets_mass', 6)):
        assert output['cutflow'][key] == expected

    with pytest.raises(RuntimeError):
        print(events.Muon.matched_jet)
Exemple #10
0
def test_nanoevents_analysis(executor, compression, maxchunks, skipbadfiles):
    """Run NanoEventsProcessor through ``processor.Runner`` and check results.

    The fileset holds ``DummyBad`` (a path named ``non_existent.root``) plus
    the ``ZJets`` and ``Data`` samples, the latter two with user metadata.
    With ``skipbadfiles`` false the run must raise ``FileNotFoundError``;
    otherwise the ``cutflow`` reference counts are asserted.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    def dataset(sample, metadata=None):
        # Build one fileset entry; "metadata" is present only when supplied.
        entry = {"treename": "Events", "files": [osp.abspath(sample)]}
        if metadata is not None:
            entry["metadata"] = metadata
        return entry

    filelist = {
        "DummyBad": dataset("tests/samples/non_existent.root"),
        "ZJets": dataset(
            "tests/samples/nano_dy.root",
            {"checkusermeta": True, "someusermeta": "hello"},
        ),
        "Data": dataset(
            "tests/samples/nano_dimuon.root",
            {"checkusermeta": True, "someusermeta2": "world"},
        ),
    }

    # ``executor`` arrives as a class/factory and is instantiated here.
    executor_instance = executor(compression=compression)
    runner = processor.Runner(
        executor=executor_instance,
        skipbadfiles=skipbadfiles,
        schema=processor.NanoAODSchema,
        maxchunks=maxchunks,
    )

    if not skipbadfiles:
        with pytest.raises(FileNotFoundError):
            runner(filelist, "Events", processor_instance=NanoEventsProcessor())
        return

    output = runner(filelist, "Events", processor_instance=NanoEventsProcessor())
    reference = {
        "ZJets_pt": 18,
        "ZJets_mass": 6,
        "Data_pt": 84,
        "Data_mass": 66,
    }
    for name, count in reference.items():
        assert output["cutflow"][name] == count
Exemple #11
0
def do_dask_cached(client, filelist, cachestrategy=None):
    """Run NanoEventsProcessor on dask in ``nano`` mode and check the cutflow.

    The job is submitted through ``processor.run_uproot_job`` on the
    ``Events`` tree with ``savemetrics`` enabled and the given
    ``cachestrategy``.  Nothing is returned; the function only asserts the
    four ``cutflow`` reference counts.
    """
    from coffea.processor.test_items import NanoEventsProcessor

    exe_args = {
        'client': client,
        'nano': True,
        'cachestrategy': cachestrategy,
        'savemetrics': True,
    }
    # savemetrics=True makes the job return a (histograms, metrics) pair.
    hists, _metrics = processor.run_uproot_job(
        filelist,
        'Events',
        processor_instance=NanoEventsProcessor(),
        executor=processor.dask_executor,
        executor_args=exe_args,
    )

    expected_counts = (
        ('ZJets_pt', 18),
        ('ZJets_mass', 6),
        ('Data_pt', 84),
        ('Data_mass', 66),
    )
    for key, count in expected_counts:
        assert hists['cutflow'][key] == count
Exemple #12
0
    executor = processor.DaskExecutor(client=client)

    run = processor.Runner(
        executor=executor,
        use_skyhook=True,
        format="parquet",
        schema=schemas.NanoAODSchema,
    )

    hists = run(
        {
            "ZJets": "/mnt/cephfs/nanoevents/ZJets",
            "Data": "/mnt/cephfs/nanoevents/Data",
        },
        "Events",
        processor_instance=NanoEventsProcessor(),
    )

    assert hists["cutflow"]["ZJets_pt"] == 108
    assert hists["cutflow"]["ZJets_mass"] == 36
    assert hists["cutflow"]["Data_pt"] == 504
    assert hists["cutflow"]["Data_mass"] == 396

    # now run again on parquet files in cephfs (without any pushdown)
    executor_args = {"client": client}

    run = processor.Runner(
        executor=executor,
        format="parquet",
        schema=schemas.NanoAODSchema,
    )
Exemple #13
0
def test_spark_executor():
    """Run the test processors on local ROOT samples through the Spark executor.

    A local Spark session is started, ``NanoTestProcessor`` is run twice with
    the same instance and ``NanoEventsProcessor`` (with ``NanoAODSchema``)
    once.  Every run is checked against the same reference counts.

    The session is stopped in a ``finally`` clause so that a failing job or
    assertion does not leave it running.  The test is skipped when pyspark
    (>= 2.4.1) is not installed.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import _spark_initialize, _spark_stop
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName('spark-executor-test-%s' % guid())
        .master('local[*]')
        .config('spark.sql.execution.arrow.enabled', 'true')
        .config('spark.executor.x509proxyname', 'x509_u12409')
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
    )

    filelist = {
        'ZJets': {
            'files':
            ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename':
            'Events'
        },
        'Data': {
            'files': [
                'file:' +
                osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')
            ],
            'treename':
            'Events'
        }
    }
    columns = [
        'nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge'
    ]
    expected_cutflow = {
        'ZJets_pt': 18,
        'ZJets_mass': 6,
        'Data_pt': 84,
        'Data_mass': 66,
    }

    spark = _spark_initialize(config=spark_config,
                              log_level='ERROR',
                              spark_progress=False)
    try:
        from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
        from coffea.processor.spark.spark_executor import spark_executor

        def _run_and_check(proc, executor_args):
            # Run one Spark job and compare its output with the references.
            hists = run_spark_job(filelist,
                                  processor_instance=proc,
                                  executor=spark_executor,
                                  spark=spark,
                                  thread_workers=1,
                                  executor_args=executor_args)
            assert sum(spark_executor.counts.values()) == 80
            for key, expected in expected_cutflow.items():
                assert hists['cutflow'][key] == expected

        # The same processor instance is deliberately run twice.
        proc = NanoTestProcessor(columns=columns)
        _run_and_check(proc, {'file_type': 'root'})
        _run_and_check(proc, {'file_type': 'root'})

        proc = NanoEventsProcessor(columns=columns)
        _run_and_check(proc, {
            'file_type': 'root',
            'schema': schemas.NanoAODSchema
        })
    finally:
        # Always release the session, even when a job or assertion fails.
        _spark_stop(spark)
Exemple #14
0
def test_spark_executor():
    """Exercise ``run_spark_job`` with both test processors on local samples.

    A local Spark session bound to 127.0.0.1 is created.
    ``NanoTestProcessor`` is run twice with the same instance, then
    ``NanoEventsProcessor`` is run once with ``NanoAODSchema``.  After each
    run the executor's total count and the ``cutflow`` reference values are
    asserted.

    The session is stopped in a ``finally`` clause so a failing job or
    assertion cannot leak it.  The test is skipped when pyspark (>= 2.4.1) is
    not installed.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    filelist = {
        "ZJets": {
            "files":
            ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename":
            "Events",
        },
        "Data": {
            "files": [
                "file:" +
                osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")
            ],
            "treename":
            "Events",
        },
    }
    columns = [
        "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
    ]
    expected_cutflow = (
        ("ZJets_pt", 18),
        ("ZJets_mass", 6),
        ("Data_pt", 84),
        ("Data_mass", 66),
    )

    spark = _spark_initialize(config=spark_config,
                              log_level="ERROR",
                              spark_progress=False)
    try:
        from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
        from coffea.processor.spark.spark_executor import spark_executor

        # Each entry: (processor factory, executor_args).  ``None`` as the
        # factory means "reuse the processor from the previous run".
        plan = (
            (lambda: NanoTestProcessor(columns=columns),
             {"file_type": "root"}),
            (None, {"file_type": "root"}),
            (lambda: NanoEventsProcessor(columns=columns),
             {"file_type": "root", "schema": schemas.NanoAODSchema}),
        )

        proc = None
        for make_processor, executor_args in plan:
            if make_processor is not None:
                proc = make_processor()
            hists = run_spark_job(
                filelist,
                processor_instance=proc,
                executor=spark_executor,
                spark=spark,
                thread_workers=1,
                executor_args=executor_args,
            )

            assert sum(spark_executor.counts.values()) == 80
            for key, expected in expected_cutflow:
                assert hists["cutflow"][key] == expected
    finally:
        # Always release the session, even when a job or assertion fails.
        _spark_stop(spark)