Example #1
def check_spark_functionality():
    # Imports used below; the spark.detail helpers match the ones imported in
    # the other examples, and DummyProcessor is assumed to come from the
    # surrounding test suite.
    from jinja2 import Environment, PackageLoader, select_autoescape
    from fnal_column_analysis_tools.processor.spark.detail import (
        _spark_initialize, _spark_stop)

    spark = _spark_initialize()

    env = Environment(loader=PackageLoader(
        'fnal_column_analysis_tools.processor', 'templates'),
                      autoescape=select_autoescape(['py']))

    template_name = 'spark_template.py'
    tmpl = env.get_template(template_name)

    # The rendered template is exec'd below; it defines `coffea_udf` and looks
    # up `processor_instance` and `lz4_clevel`, hence the global declarations.
    global processor_instance, lz4_clevel, coffea_udf
    processor_instance = DummyProcessor()
    lz4_clevel = 1

    cols = ['dataset']
    output = tmpl.render(cols=cols)
    exec(output)

    dataset = [{
        'dataset': 'WJets'
    }, {
        'dataset': 'WJets'
    }, {
        'dataset': 'WJets'
    }]
    df = spark.createDataFrame(dataset, schema='dataset: string')
    pd_one = df.toPandas()

    df = df.withColumn('histos', coffea_udf(*cols))
    pd_two = df.toPandas()

    _spark_stop(spark)

    return (pd_one['dataset'].count(), pd_two['dataset'].count(),
            pd_two['histos'])
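The rendered spark_template.py is not shown in the example. Below is a minimal sketch of the kind of pandas UDF such a template could produce, assuming Spark 3.x type-hinted pandas UDFs and per-batch lz4-compressed pickle output; the names, types and serialization details are illustrative, not the actual template output:

import pickle
import lz4.frame
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import BinaryType

lz4_clevel = 1  # mirrors the global set in the test above

@pandas_udf(BinaryType())
def coffea_udf(dataset: pd.Series) -> pd.Series:
    # Illustrative only: a real template would run the processor here;
    # this sketch just serializes the batch to show the expected shape,
    # returning one compressed blob per input row.
    blob = lz4.frame.compress(pickle.dumps(list(dataset)),
                              compression_level=lz4_clevel)
    return pd.Series([blob] * len(dataset))

The test above only checks that a UDF of this shape can be rendered, exec'd, applied column-wise with withColumn, and collected back to pandas.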
Example #2
def test_spark_executor():
    import warnings

    try:
        import pyspark
    except ImportError:
        warnings.warn('pyspark not installed, skipping tests')
        return
    except Exception as e:
        warnings.warn('other error when trying to import pyspark!')
        raise e

    from fnal_column_analysis_tools.processor.spark.detail import (
        _spark_initialize, _spark_make_dfs, _spark_stop)
    from fnal_column_analysis_tools.processor import run_spark_job

    import os
    import os.path as osp

    import pyspark.sql
    spark_config = pyspark.sql.SparkSession.builder \
        .appName('spark-executor-test') \
        .master('local[*]') \
        .config('spark.sql.execution.arrow.enabled', 'true') \
        .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)

    spark = _spark_initialize(config=spark_config,
                              log_level='ERROR',
                              spark_progress=False)

    filelist = {
        'ZJets': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dy.parquet')],
        'Data': ['file:' + osp.join(os.getcwd(), 'tests/samples/nano_dimuon.parquet')]
    }

    from fnal_column_analysis_tools.processor.test_items import NanoTestProcessor
    from fnal_column_analysis_tools.processor.spark.spark_executor import spark_executor

    columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass']
    proc = NanoTestProcessor(columns=columns)

    hists = run_spark_job(filelist,
                          processor_instance=proc,
                          executor=spark_executor,
                          spark=spark,
                          thread_workers=1)

    _spark_stop(spark)

    assert sum(spark_executor.counts.values()) == 20
    assert hists['cutflow']['ZJets_pt'] == 4
    assert hists['cutflow']['ZJets_mass'] == 1
    assert hists['cutflow']['Data_pt'] == 15
    assert hists['cutflow']['Data_mass'] == 5
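Note that the Arrow option used in the builder above is the Spark 2.x key; in Spark 3.x it was renamed to spark.sql.execution.arrow.pyspark.enabled. An otherwise identical builder for Spark 3.x would look like this (sketch only, config values unchanged):

import pyspark.sql

spark_config = pyspark.sql.SparkSession.builder \
    .appName('spark-executor-test') \
    .master('local[*]') \
    .config('spark.sql.execution.arrow.pyspark.enabled', 'true') \
    .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)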
Example #3
def test_spark_imports():
    import warnings

    try:
        import pyspark
    except ImportError:
        warnings.warn('pyspark not installed, skipping tests')
        return
    except Exception as e:
        warnings.warn('other error when trying to import pyspark!')
        raise e

    from fnal_column_analysis_tools.processor.spark.spark_executor import spark_executor
    from fnal_column_analysis_tools.processor.spark.detail import (
        _spark_initialize, _spark_make_dfs, _spark_stop)

    spark = _spark_initialize()
    _spark_stop(spark)
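If these checks run under pytest, the try/except guard can be written more compactly with pytest.importorskip; a minimal equivalent sketch (assuming pytest is the test runner):

import pytest

def test_spark_imports():
    pytest.importorskip('pyspark')  # skip the test entirely if pyspark is missing

    from fnal_column_analysis_tools.processor.spark.spark_executor import spark_executor
    from fnal_column_analysis_tools.processor.spark.detail import (
        _spark_initialize, _spark_make_dfs, _spark_stop)

    spark = _spark_initialize()
    _spark_stop(spark)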