Code example #1
import pytest


def test_spark_executor():
    # Skip the whole test when a suitable pyspark is not installed
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    # Note: pyarrow.compat no longer exists in recent pyarrow; newer versions
    # of this test (see code example #2) import guid from pyarrow.util instead
    from pyarrow.compat import guid

    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs, _spark_stop)
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.executor.x509proxyname', 'x509_u12409') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config,
                              log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename': 'Events'
        },
        'Data': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')],
            'treename': 'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 20
    assert hists['cutflow']['ZJets_pt'] == 4
    assert hists['cutflow']['ZJets_mass'] == 1
    assert hists['cutflow']['Data_pt'] == 15
    assert hists['cutflow']['Data_mass'] == 5
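
For context, `NanoTestProcessor` (imported above from `coffea.processor.test_items`) implements coffea's processor interface. Below is a minimal sketch of that interface in the coffea <= 0.7 style this test targets; `MinimalProcessor` and its single cutflow key are illustrative assumptions, not the actual test processor:

import coffea.processor as processor


class MinimalProcessor(processor.ProcessorABC):
    """Counts events, using the same interface NanoTestProcessor implements."""

    def __init__(self):
        self._accumulator = processor.dict_accumulator({
            'cutflow': processor.defaultdict_accumulator(int),
        })

    @property
    def accumulator(self):
        return self._accumulator

    def process(self, df):
        # df exposes the requested columns (e.g. df['nMuon']) as flat arrays
        out = self.accumulator.identity()
        out['cutflow']['all_events'] += df['nMuon'].size
        return out

    def postprocess(self, accumulator):
        return accumulator

An executor such as spark_executor calls process() once per chunk of events and merges the returned accumulators, which is what run_spark_job collects into `hists` above.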
Code example #2
import pytest


def test_spark_executor():
    # Skip the whole test when a suitable pyspark is not installed
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    spark = _spark_initialize(config=spark_config,
                              log_level="ERROR",
                              spark_progress=False)

    filelist = {
        "ZJets": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename": "Events",
        },
        "Data": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")],
            "treename": "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    # A second identical run should reproduce the same counts and cutflow
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    # The same files can also be processed as NanoEvents by passing a schema
    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root", "schema": schemas.NanoAODSchema},
    )

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
Code example #3
    # Fragment: `samp_info`, `dataset`, the `spark` session, `DimuonProcessor`,
    # and `tick` (a time.time() reference) are defined outside this excerpt
    samp_info.paths = dataset
    samp_info.year = "2018"
    samp_info.load("test", use_dask=False)
    samp_info.lumi_weights["test"] = 1.0

    executor = spark_executor
    executor_args = {"schema": processor.NanoAODSchema, "file_type": "root"}
    processor_args = {
        "samp_info": samp_info,
        "do_timer": False,
        "do_btag_syst": False
    }
    print(samp_info.fileset)

    output = run_spark_job(
        samp_info.fileset,
        DimuonProcessor(**processor_args),
        executor,
        spark=spark,
        thread_workers=32,
        partitionsize=100000,
        executor_args=executor_args,
    )
    _spark_stop(spark)

    df = output.compute()
    print(df)

    elapsed = round(time.time() - tick, 3)
    print(f"Finished everything in {elapsed} s.")