def test_spark_executor():
    """Integration test: run ``NanoTestProcessor`` over the bundled NanoAOD
    samples on a local-mode Spark cluster and verify the cutflow counts.

    Skipped entirely when ``pyspark`` >= 2.4.1 is not installed.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    # FIX: pyarrow.compat was removed upstream (pyarrow >= 0.17);
    # guid() now lives in pyarrow.util.
    from pyarrow.util import guid

    # FIX: dropped the unused _spark_make_dfs import.
    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql

    # Local-mode session with Arrow-based columnar exchange enabled; the
    # unique appName avoids collisions between concurrent test runs.
    spark_config = (
        pyspark.sql.SparkSession.builder
        .appName('spark-executor-test-%s' % guid())
        .master('local[*]')
        .config('spark.sql.execution.arrow.enabled', 'true')
        .config('spark.executor.x509proxyname', 'x509_u12409')
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
    )

    spark = _spark_initialize(config=spark_config, log_level='ERROR',
                              spark_progress=False)

    # Two small input datasets shipped with the test suite.
    filelist = {
        'ZJets': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename': 'Events'
        },
        'Data': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')],
            'treename': 'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist, processor_instance=proc,
                          executor=spark_executor, spark=spark,
                          thread_workers=1,
                          executor_args={'file_type': 'root'})

    # Always tear the session down before asserting so a failure does not
    # leak a running SparkContext into later tests.
    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 20
    assert hists['cutflow']['ZJets_pt'] == 4
    assert hists['cutflow']['ZJets_mass'] == 1
    assert hists['cutflow']['Data_pt'] == 15
    assert hists['cutflow']['Data_mass'] == 5
def test_spark_executor():
    """Run the Spark executor end-to-end on the bundled NanoAOD samples.

    Exercises three passes over the same two files: twice with
    ``NanoTestProcessor`` (the second pass confirms the executor is
    re-entrant on a live session) and once with ``NanoEventsProcessor``
    using the NanoAOD schema. All three passes must produce identical
    event counts and cutflow histograms.

    Skipped entirely when ``pyspark`` >= 2.4.1 is not installed.
    """
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    # Local-mode session pinned to the loopback interface, with Arrow-based
    # columnar exchange enabled and a unique appName per test run.
    builder = (
        pyspark.sql.SparkSession.builder
        .appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )
    spark = _spark_initialize(config=builder, log_level="ERROR",
                              spark_progress=False)

    # Two small input datasets shipped with the test suite.
    fileset = {
        "ZJets": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename": "Events",
        },
        "Data": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")],
            "treename": "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    branches = [
        "nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"
    ]

    def _check(result):
        # Every pass reads the same 80 events and fills identical cutflows.
        assert sum(spark_executor.counts.values()) == 80
        assert result["cutflow"]["ZJets_pt"] == 18
        assert result["cutflow"]["ZJets_mass"] == 6
        assert result["cutflow"]["Data_pt"] == 84
        assert result["cutflow"]["Data_mass"] == 66

    processor_instance = NanoTestProcessor(columns=branches)

    # First pass.
    output = run_spark_job(
        fileset,
        processor_instance=processor_instance,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )
    _check(output)

    # Second pass on the same session: results must be reproducible.
    output = run_spark_job(
        fileset,
        processor_instance=processor_instance,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )
    _check(output)

    # Third pass: NanoEvents-based processor with the NanoAOD schema.
    processor_instance = NanoEventsProcessor(columns=branches)
    output = run_spark_job(
        fileset,
        processor_instance=processor_instance,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={
            "file_type": "root",
            "schema": schemas.NanoAODSchema,
        },
    )

    # Stop the session before asserting so a failure cannot leak a live
    # SparkContext into later tests.
    _spark_stop(spark)
    _check(output)
samp_info.paths = dataset samp_info.year = "2018" samp_info.load("test", use_dask=False) samp_info.lumi_weights["test"] = 1.0 executor = spark_executor executor_args = {"schema": processor.NanoAODSchema, "file_type": "root"} processor_args = { "samp_info": samp_info, "do_timer": False, "do_btag_syst": False } print(samp_info.fileset) output = run_spark_job( samp_info.fileset, DimuonProcessor(**processor_args), executor, spark=spark, thread_workers=32, partitionsize=100000, executor_args=executor_args, ) _spark_stop(spark) df = output.compute() print(df) elapsed = round(time.time() - tick, 3) print(f"Finished everything in {elapsed} s.")