import pytest


def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.compat import guid
    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs,
                                               _spark_stop)
    from coffea.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test-%s' % guid()) \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.executor.x509proxyname', 'x509_u12409') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config, log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.root')],
            'treename': 'Events'
        },
        'Data': {
            'files': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.root')],
            'treename': 'Events'
        }
    }

    from coffea.processor.test_items import NanoTestProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist, processor_instance=proc, executor=spark_executor,
                          spark=spark, thread_workers=1,
                          executor_args={'file_type': 'root'})

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 20
    assert hists['cutflow']['ZJets_pt'] == 4
    assert hists['cutflow']['ZJets_mass'] == 1
    assert hists['cutflow']['Data_pt'] == 15
    assert hists['cutflow']['Data_mass'] == 5
def check_spark_functionality():
    from jinja2 import Environment, PackageLoader, select_autoescape

    from coffea.processor.spark.detail import _spark_initialize, _spark_stop

    spark = _spark_initialize()

    env = Environment(loader=PackageLoader('coffea.processor', 'templates'),
                      autoescape=select_autoescape(['py']))
    template_name = 'spark.py.tmpl'
    tmpl = env.get_template(template_name)

    # the rendered template defines coffea_udf and reads these module globals
    global processor_instance, lz4_clevel, coffea_udf
    processor_instance = DummyProcessor()  # test processor assumed defined elsewhere in this module
    lz4_clevel = 1

    cols = ['dataset']
    output = tmpl.render(cols=cols)
    exec(output)

    dataset = [{'dataset': 'WJets'}, {'dataset': 'WJets'}, {'dataset': 'WJets'}]
    df = spark.createDataFrame(dataset, schema='dataset: string')

    pd_one = df.toPandas()
    df = df.withColumn('histos', coffea_udf(*cols))
    pd_two = df.toPandas()

    _spark_stop(spark)

    return pd_one['dataset'].count(), pd_two['dataset'].count(), pd_two['histos']
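# For context, a minimal, self-contained sketch of the render-and-exec step
# used above, with an inline jinja2 template instead of coffea's packaged
# 'spark.py.tmpl' (the template text here is a made-up stand-in; the real
# template expands the requested columns into a Spark pandas UDF that feeds
# them to the processor):
def demo_template_render():
    from jinja2 import Environment

    env = Environment()
    tmpl = env.from_string(
        "def coffea_udf({{ cols | join(', ') }}):\n"
        "    return {{ cols | join(', ') }}\n"
    )
    output = tmpl.render(cols=['dataset'])

    namespace = {}
    exec(output, namespace)  # defines coffea_udf inside `namespace`
    return namespace['coffea_udf']('WJets')  # -> 'WJets'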
def test_spark_imports():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")

    from coffea.processor.spark.spark_executor import spark_executor
    from coffea.processor.spark.detail import (_spark_initialize,
                                               _spark_make_dfs,
                                               _spark_stop)

    spark = _spark_initialize()
    _spark_stop(spark)
def test_spark_imports():
    pytest.importorskip("pyspark", minversion="2.4.1")

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )

    spark = _spark_initialize(bindAddress="127.0.0.1", host="127.0.0.1")
    _spark_stop(spark)
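# For reference, a rough sketch of the kind of session setup a helper like
# _spark_initialize performs, written in plain pyspark (an illustration under
# assumptions -- coffea's helper additionally wires up Arrow settings, log
# levels, and optional laurelin jars, as seen elsewhere in this file):
def demo_plain_session():
    from pyspark.sql import SparkSession

    spark = (
        SparkSession.builder.appName("plain-session-demo")
        .master("local[*]")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .getOrCreate()
    )
    spark.stop()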
def test_spark_executor():
    pyspark = pytest.importorskip("pyspark", minversion="2.4.1")
    from pyarrow.util import guid

    from coffea.processor.spark.detail import (
        _spark_initialize,
        _spark_stop,
    )
    from coffea.processor import run_spark_job
    from coffea.nanoevents import schemas

    import os
    import os.path as osp

    import pyspark.sql

    spark_config = (
        pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid())
        .master("local[*]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.driver.bindAddress", "127.0.0.1")
        .config("spark.executor.x509proxyname", "x509_u12409")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 200000)
    )

    spark = _spark_initialize(
        config=spark_config, log_level="ERROR", spark_progress=False
    )

    filelist = {
        "ZJets": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dy.root")],
            "treename": "Events",
        },
        "Data": {
            "files": ["file:" + osp.join(os.getcwd(), "tests/samples/nano_dimuon.root")],
            "treename": "Events",
        },
    }

    from coffea.processor.test_items import NanoTestProcessor, NanoEventsProcessor
    from coffea.processor.spark.spark_executor import spark_executor

    columns = ["nMuon", "Muon_pt", "Muon_eta", "Muon_phi", "Muon_mass", "Muon_charge"]
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    # running the same job a second time against the live Spark session must
    # reproduce the results
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root"},
    )

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66

    # repeat with the NanoEvents-based processor and an explicit schema
    proc = NanoEventsProcessor(columns=columns)
    hists = run_spark_job(
        filelist,
        processor_instance=proc,
        executor=spark_executor,
        spark=spark,
        thread_workers=1,
        executor_args={"file_type": "root", "schema": schemas.NanoAODSchema},
    )

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 80
    assert hists["cutflow"]["ZJets_pt"] == 18
    assert hists["cutflow"]["ZJets_mass"] == 6
    assert hists["cutflow"]["Data_pt"] == 84
    assert hists["cutflow"]["Data_mass"] == 66
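# The same filelist and processors can be cross-checked without Spark. A
# minimal sketch using coffea's local iterative executor (assumes the coffea
# 0.7-style run_uproot_job API; "schema": None selects the flat-dataframe
# view that NanoTestProcessor expects):
def demo_local_cross_check(filelist, columns):
    from coffea import processor
    from coffea.processor.test_items import NanoTestProcessor

    proc = NanoTestProcessor(columns=columns)
    return processor.run_uproot_job(
        filelist,
        treename="Events",
        processor_instance=proc,
        executor=processor.iterative_executor,
        executor_args={"schema": None},
    )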
if __name__ == "__main__":
    import os
    import time

    import pyspark.sql
    from pyarrow.util import guid

    from coffea.processor.spark.detail import _spark_initialize
    from coffea.processor.spark.spark_executor import spark_executor

    tick = time.time()

    spark_config = (
        pyspark.sql.SparkSession.builder.appName("spark-executor-test-%s" % guid())
        .master("local[1]")
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.executor.memory", "7g")
        .config("spark.executor.cores", "1")
        .config("spark.driver.memory", "16g")
        .config("spark.driver.maxResultSize", "4g")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", 100000)
        .config("spark.cores.max", "1")
    )

    spark = _spark_initialize(
        config=spark_config,
        log_level="ERROR",
        spark_progress=False,
        laurelin_version="1.0.0",
    )
    print("Spark initialized")

    file_name = "vbf_powheg_dipole_NANOV10_2018.root"
    file_path = f"{os.getcwd()}/tests/samples/{file_name}"
    dataset = {"test": file_path}

    # SamplesInfo is provided by the surrounding analysis package, not coffea
    samp_info = SamplesInfo(xrootd=False)
    samp_info.paths = dataset
    samp_info.year = "2018"
    samp_info.load("test", use_dask=False)
    samp_info.lumi_weights["test"] = 1.0

    executor = spark_executor
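    # A hypothetical tail for this benchmark script: report wall-clock time
    # against the `tick` taken at the top. The original body between these
    # points is not shown above, so this is only a sketch of the pattern.
    tock = time.time()
    print(f"Total elapsed time: {tock - tick:.1f} s")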