Example #1
    def test_initialization_method(self):
        """
        Check initialization method in Spark backend.
        Define a method in the ROOT interpreter called getValue which returns
        the value defined by the user on the python side.
        """
        def init(value):
            import ROOT
            cpp_code = '''int userValue = %s ;''' % value
            ROOT.gInterpreter.ProcessLine(cpp_code)

        PyRDF.initialize(init, 123)
        # Spark backend has a limited list of supported methods, so we use
        # Histo1D which is a supported action.
        # The code below creates an RDataFrame instance with one single entry
        # and defines a column 'u' whose value is taken from the variable
        # 'userValue'.
        # This variable is only declared inside the ROOT interpreter, however
        # the value of the variable is passed by the user from the python side.
        # If the init function defined by the user is properly propagated to the
        # Spark backend, each worker will run the init function as a first step
        # and hence the variable 'userValue' will be defined at runtime.
        # As a result the define operation should read the variable 'userValue'
        # and assign it to the entries of the column 'u' (only one entry).
        # Finally, Histo1D returns a histogram filled with one value. The mean
        # of this single value has to be the value itself, independently of
        # the number of spawned workers.
        df = PyRDF.Spark.RDataFrame(1).Define("u", "userValue").Histo1D("u")
        h = df.GetValue()
        self.assertEqual(h.GetMean(), 123)
Example #2
    def test_spark_histograms(self):
        """Check that Spark backend works the same way as local."""
        physics_variables = ['pt1_h', 'pt2_h', 'invMass_h', 'phis_h']

        # Spark execution
        PyRDF.use("spark", {'npartitions': 5})

        SparkResult = namedtuple('SparkResult', physics_variables)
        spark = SparkResult(*self.build_pyrdf_graph())

        spark.pt1_h.Draw("PL PLC PMC")  # Trigger Event-loop, Spark

        # Local execution
        PyRDF.use("local")

        LocalResult = namedtuple('LocalResult', physics_variables)
        local = LocalResult(*self.build_pyrdf_graph())

        local.pt1_h.Draw("PL PLC PMC")  # Trigger Event-loop, Local

        # Assert 'pt1_h' histogram
        self.assertEqual(spark.pt1_h.GetEntries(), local.pt1_h.GetEntries())
        # Assert 'pt2_h' histogram
        self.assertEqual(spark.pt2_h.GetEntries(), local.pt2_h.GetEntries())
        # Assert 'invMass_h' histogram
        self.assertEqual(spark.invMass_h.GetEntries(),
                         local.invMass_h.GetEntries())
        # Assert 'phis_h' histogram
        self.assertEqual(spark.phis_h.GetEntries(), local.phis_h.GetEntries())
Example #3
    def test_includes_function_with_filter_and_histo(self):
        """
        An integration test to check that the filter
        operation is able to use C++ functions that
        were included using header files.

        """
        PyRDF.include("tests/integration/local/test_headers/header1.hxx")
        PyRDF.use("spark")

        rdf = PyRDF.RDataFrame(10)

        # Keep only the entries whose value is less than 5
        rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
        histo = rdf_filtered.Histo1D("tdfentry_")

        # The expected set of numbers after filtering: 0 to 4
        required_numbers = range(5)
        required_size = len(required_numbers)
        required_mean = sum(required_numbers) / float(required_size)
        required_stdDev = math.sqrt(
            sum((x - required_mean)**2
                for x in required_numbers) / required_size)

        # Compare the sizes of equivalent set of numbers
        self.assertEqual(histo.GetEntries(), float(required_size))

        # Compare the means of equivalent set of numbers
        self.assertEqual(histo.GetMean(), required_mean)

        # Compare the standard deviations of equivalent set of numbers
        self.assertEqual(histo.GetStdDev(), required_stdDev)
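Note: the header file referenced above (header1.hxx) is not reproduced in this listing. Judging from the expected counts, check_number_less_than_5 is assumed to be a simple predicate on the entry number; a minimal, hypothetical equivalent could be declared directly in the interpreter as follows (sketch only, the real header may differ):

import ROOT

# Hypothetical equivalent of what header1.hxx is assumed to provide:
# a predicate that is true for entries strictly below 5.
ROOT.gInterpreter.Declare('''
bool check_number_less_than_5(ULong64_t entry) {
    return entry < 5;
}
''')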
Example #4
def main():
    """
    Main function of the skimming step of the analysis.

    The function loops over all required samples, reduces the content to the
    interesting events and writes them to new files.
    """
    PyRDF.use("spark")
    PyRDF.include_headers("skim.h")

    for sample in sampleNames:
        print(">>> Process sample {}:\n".format(sample))

        df = PyRDF.RDataFrame("Events", samplesBasePath + sample + ".root")

        df2 = MinimalSelection(df)
        df3 = FindGoodMuons(df2)
        df4 = FindGoodTaus(df3)
        df5 = FilterGoodEvents(df4)
        df6 = FindMuonTauPair(df5)
        df7 = DeclareVariables(df6)
        df8 = CheckGeneratorTaus(df7, sample)
        df9 = AddEventWeight(df8, sample)

        out_file = sample + "Skim.root"
        df9.Snapshot("Events", out_file, final_variables_vec)
Example #5
    def tearDownClass(cls):
        """
        Restore global current_backend to default Local backend after running
        all tests

        """
        PyRDF.use("local")
Example #6
    def test_extend_ROOT_include_path(self):
        """
        Check that the include path of ROOT is extended with the directories
        specified in `PyRDF.include_headers()` so that references between
        headers are correctly resolved.
        """
        import ROOT

        header_folder = "tests/integration/local/test_headers/headers_folder"

        PyRDF.use("spark")
        PyRDF.include_headers(header_folder)

        # Get list of include paths seen by ROOT
        ROOT_include_path = ROOT.gInterpreter.GetIncludePath().split(" ")

        # Create new include folder token
        new_folder_include = "-I\"{}\"".format(header_folder)

        # Check that new folder is in ROOT include paths
        self.assertTrue(new_folder_include in ROOT_include_path)

        # Create an RDataFrame with 100 integers from 0 to 99
        rdf = PyRDF.RDataFrame(100)

        # Keep only the numbers less than 10 and create a histogram
        rdf_less_than_10 = rdf.Filter("check_number_less_than_10(tdfentry_)")
        histo1 = rdf_less_than_10.Histo1D("tdfentry_")

        # Check that histogram has 10 entries and mean 4.5
        self.assertEqual(histo1.GetEntries(), 10)
        self.assertAlmostEqual(histo1.GetMean(), 4.5)
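The folder passed to PyRDF.include_headers above contains headers that include one another, which is why the folder itself has to end up on ROOT's include path. The same mechanism can be exercised by hand; a minimal sketch (the header name header_a.hxx is hypothetical):

import ROOT

# Once the folder is on the include path, a header can be pulled in by name
# alone, and any '#include "..."' directives inside it resolve against that
# same folder.
ROOT.gInterpreter.AddIncludePath(
    "tests/integration/local/test_headers/headers_folder")
ROOT.gInterpreter.Declare('#include "header_a.hxx"')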
Example #7
    def test_spark_histograms(self):
        """
        Integration test to check that Spark
        backend works the same way as local.

        """
        # Spark execution
        PyRDF.use("spark", {'npartitions': 5})
        pt1_h_spark, pt2_h_spark, invMass_h_spark, phis_h_spark = \
            self.build_pyrdf_graph()
        pt1_h_spark.Draw("PL PLC PMC")  # Trigger Event-loop, Spark

        # Local execution
        PyRDF.use("local")
        pt1_h_local, pt2_h_local, invMass_h_local, phis_h_local = \
            self.build_pyrdf_graph()
        pt1_h_local.Draw("PL PLC PMC")  # Trigger Event-loop, Local

        # Assert 'pt1_h' histogram
        self.assertEqual(pt1_h_spark.GetEntries(), pt1_h_local.GetEntries())
        # Assert 'pt2_h' histogram
        self.assertEqual(pt2_h_spark.GetEntries(), pt2_h_local.GetEntries())
        # Assert 'invMass_h' histogram
        self.assertEqual(invMass_h_spark.GetEntries(),
                         invMass_h_local.GetEntries())
        # Assert 'phis_h' histogram
        self.assertEqual(phis_h_spark.GetEntries(), phis_h_local.GetEntries())
Example #8
    def test_include_dir_and_headers(self):
        """
        Check that the filter operation is able to use C++ functions included
        from a list with a directory and a single header file.
        """
        PyRDF.include_headers([
            "tests/integration/local/test_headers/headers_folder",
            "tests/integration/local/test_headers/header1.hxx"
        ])
        # Creates an RDataFrame with 10 integers [0...9]
        rdf = PyRDF.RDataFrame(10)

        # Keep only the numbers less than 5
        filter1 = rdf.Filter("check_number_less_than_5(tdfentry_)")
        # Keep only the numbers greater than 5
        filter2 = rdf.Filter("check_number_greater_than_5(tdfentry_)")
        # Keep only the numbers less than 10
        filter3 = rdf.Filter("check_number_less_than_10(tdfentry_)")

        count1 = filter1.Count()
        count2 = filter2.Count()
        count3 = filter3.Count()

        # The final counts should be, respectively, 5 integers less than 5,
        # 4 integers greater than 5 and 10 integers less than 10.
        self.assertEqual(count1.GetValue(), 5)
        self.assertEqual(count2.GetValue(), 4)
        self.assertEqual(count3.GetValue(), 10)
Example #9
    def test_default_empty_list_include(self):
        """
        Test case to ensure that 'PyRDF.include' function
        raises a TypeError if no parameter is given.

        """
        with self.assertRaises(TypeError):
            PyRDF.include()
Example #10
    def test_header_declaration_on_current_session(self):
        """Header has to be declared on the current session."""
        # Before the header declaration the function b is not present in the
        # ROOT interpreter
        with self.assertRaises(AttributeError):
            ROOT.b(1)
        PyRDF.include_headers("tests/unit/backend/test_headers/header4.hxx")
        self.assertEqual(ROOT.b(1), True)
Example #11
    def test_default_empty_list_include(self):
        """
        'PyRDF.include' function raises a TypeError if no parameter is
        given.

        """
        with self.assertRaises(TypeError):
            PyRDF.include_headers()
Example #12
    def test_spark_select(self):
        """
        Test to check if 'spark'
        environment gets set correctly.

        """

        PyRDF.use("spark")
        self.assertIsInstance(PyRDF.current_backend, Spark)
Example #13
    def test_local_select(self):
        """
        Test to check if 'local'
        environment gets set correctly.

        """

        PyRDF.use("local")
        self.assertIsInstance(PyRDF.current_backend, Local)
Example #14
    def test_future_env_select(self):
        """
        Test to check if a future environment
        throws a NotImplementedError.

        """

        with self.assertRaises(NotImplementedError):
            PyRDF.use("dask")
Example #15
    def test_string_include(self):
        """
        Test case to check the working of 'PyRDF.include'
        function when a single string is passed to it.

        """
        PyRDF.include("header1")

        self.assertListEqual(PyRDF.includes, ["header1"])
Example #16
    def test_list_include(self):
        """
        Test case to check the working of 'PyRDF.include'
        function when a list of strings is passed to it.

        """
        PyRDF.include(["header1"])

        self.assertListEqual(PyRDF.includes, ["header1"])
Example #17
    def test_list_include(self):
        """'PyRDF.include' with a list of strings."""
        PyRDF.include_headers(["tests/unit/backend/test_headers/header1.hxx"])

        required_header = ["tests/unit/backend/test_headers/header1.hxx"]
        # Feature detection: first try Python 3 function, then Python 2
        try:
            self.assertCountEqual(PyRDF.includes_headers, required_header)
        except AttributeError:
            self.assertItemsEqual(PyRDF.includes_headers, required_header)
Example #18
def main():
    PyRDF.use("spark")

    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")
    variables = ranges.keys()

    # Loop through skimmed datasets and produce histograms of variables
    for name, label in [
        ("GluGluToHToTauTau", "ggH"),
        ("VBF_HToTauTau", "qqH"),
        ("W1JetsToLNu", "W1J"),
        ("W2JetsToLNu", "W2J"),
        ("W3JetsToLNu", "W3J"),
        ("TTbar", "TT"),
        ("DYJetsToLL", "ZLL"),
        ("DYJetsToLL", "ZTT"),
        ("Run2012B_TauPlusX", "dataRunB"),
        ("Run2012C_TauPlusX", "dataRunC"),
    ]:
        print(">>> Process skim {}".format(name))

        filenames = [filename for filename in os.listdir() if name in filename]
        # Load skimmed dataset and apply baseline selection
        df = PyRDF.RDataFrame("Events", filenames).Filter(
            "mt_1<30",
            "Muon transverse mass cut for W+jets suppression")\
            .Filter("iso_1<0.1", "Require isolated muon for signal region")

        # Book histograms for the signal region
        df1 = df.Filter("q_1*q_2<0",
                        "Require opposited charge for signal region")
        df1 = filterGenMatch(df1, label)
        hists = {}
        for variable in variables:
            hists[variable] = bookHistogram(df1, variable, ranges[variable])

        # Book histograms for the control region used to estimate the QCD
        # contribution
        df2 = df.Filter("q_1*q_2>0", "Control region for QCD estimation")
        df2 = filterGenMatch(df2, label)
        hists_cr = {}
        for variable in variables:
            hists_cr[variable] = bookHistogram(df2, variable, ranges[variable])

        # Write histograms to output file
        for variable in variables:
            writeHistogram(hists[variable], "{}_{}".format(label, variable))
        for variable in variables:
            writeHistogram(hists_cr[variable],
                           "{}_{}_cr".format(label, variable))

    tfile.Close()
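The helpers bookHistogram, filterGenMatch and writeHistogram used above are defined elsewhere in the analysis script. As a hedged sketch of the two histogram helpers, assuming ranges maps each variable to an (nbins, low, high) tuple and that the skimmed datasets carry a "weight" column:

def bookHistogram(df, variable, range_):
    """Hypothetical sketch: book a weighted 1D histogram of `variable`."""
    nbins, low, high = range_
    return df.Histo1D((variable, variable, nbins, low, high),
                      variable, "weight")


def writeHistogram(h, name):
    """Hypothetical sketch: rename the histogram and write it to the
    currently open output file."""
    h.SetName(name)
    h.Write()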
Example #19
    def test_initialization(self):
        """
        Check that the user initialization method is assigned to the current
        backend.

        """
        def returnNumber(n):
            return n

        PyRDF.initialize(returnNumber, 123)
        f = PyRDF.current_backend.initialization
        self.assertEqual(f(), 123)
Example #20
    def test_list_extend_include(self):
        """
        Test case to check the working of 'PyRDF.include'
        function when different lists of strings are passed
        to it multiple times.

        """
        PyRDF.include(["header1", "header2"])
        PyRDF.include(["header3", "header4", "header5"])

        self.assertListEqual(
            PyRDF.includes,
            ["header1", "header2", "header3", "header4", "header5"])
Example #21
    def test_initialization_runs_in_current_environment(self):
        """
        User initialization method should be executed in the current user
        session, so actions applied by the user initialization function are
        also visible in the current session.
        """
        def defineIntVariable(name, value):
            import ROOT
            ROOT.gInterpreter.ProcessLine("int %s = %s;" % (name, value))

        varvalue = 2
        PyRDF.initialize(defineIntVariable, "myInt", varvalue)
        self.assertEqual(ROOT.myInt, varvalue)
Example #22
    def test_includes_shared_lib_with_filter_op(self):
        """
        Check that the filter operation is able to use C++ functions that
        were include using header files.
        """
        # Paths to the cpp file that has to be compiled into a shared library
        # and to the output shared library itself.
        # Both are relative to the main PyRDF directory.
        cpp_path = "tests/integration/local/test_shared_libraries/a.cpp"
        library_path = "tests/integration/local/test_shared_libraries/liba.so"

        library_code = ("`root-config --cxx` "
                        "`root-config --cflags --libs` "
                        "-fPIC -shared {cpp}"
                        " -o {lib}").format(cpp=cpp_path, lib=library_path)
        # This creates the shared library
        subprocess.call(library_code, shell=True)

        # Path to the shared library relative to the main PyRDF directory.
        so_path = "tests/integration/local/test_shared_libraries/liba.so"

        PyRDF.include_shared_libraries(so_path)

        # The user can include directly the header related to the library
        # or choose to declare functions or objects later
        header_path = ("tests/integration/local/" "test_shared_libraries/a.h")
        PyRDF.include_headers(header_path)

        # Creates an RDataFrame with 5 integers [0...4]
        rdf = PyRDF.RDataFrame(5)

        # Keep only the entries less than 3
        filter1 = rdf.Filter("tdfentry_ < 3")

        # Define a new column x as the squared entry, then keep only the rows
        # with x less than 3
        filter2 = rdf.Define("x", "f(tdfentry_)").Filter("x < 3")

        count1 = filter1.Count().GetValue()
        count2 = filter2.Count().GetValue()

        # The final answers should be the number of integers less than 3,
        # which is 3, and the number of squared integers less than 3,
        # which is 2.
        self.assertEqual(count1, 3)
        self.assertEqual(count2, 2)

        # Remove unnecessary files at the end
        os.remove(so_path)
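The contents of a.cpp and a.h are not shown in this listing. Given the expected counts (only the entries 0 and 1 have squares below 3), f presumably squares its argument; a hypothetical, roughly equivalent declaration made directly in the interpreter would look like this:

import ROOT

# Hypothetical stand-in for what a.h / liba.so are assumed to provide:
# f squares its argument, so only f(0) and f(1) are below 3.
ROOT.gInterpreter.Declare('''
double f(double x) {
    return x * x;
}
''')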
Example #23
    def test_asnumpy_return_arrays(self):
        """Test support for `AsNumpy` pythonization in local backend"""
        import numpy

        # Let's create a simple dataframe with ten rows and two columns
        df = PyRDF.RDataFrame(10).Define("x", "(int)rdfentry_")\
                                 .Define("y", "1.f/(1.f+rdfentry_)")

        # Build a dictionary of numpy arrays.
        npy = df.AsNumpy()
        self.assertIsInstance(npy, dict)

        # Retrieve the two numpy arrays with the column names of the original
        # RDataFrame as dictionary keys.
        npy_x = npy["x"]
        npy_y = npy["y"]
        self.assertIsInstance(npy_x, numpy.ndarray)
        self.assertIsInstance(npy_y, numpy.ndarray)

        # Check the two arrays are of the same length as the original columns.
        self.assertEqual(len(npy_x), 10)
        self.assertEqual(len(npy_y), 10)

        # Check the types correspond to the ones of the original columns.
        int_32_dtype = numpy.dtype("int32")
        float_32_dtype = numpy.dtype("float32")
        self.assertEqual(npy_x.dtype, int_32_dtype)
        self.assertEqual(npy_y.dtype, float_32_dtype)
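Since AsNumpy returns a plain dictionary of numpy arrays, the result can be handed straight to other Python tools. A small optional usage sketch, assuming pandas is installed and the default local backend is in use:

import pandas
import PyRDF

# Build the same small dataframe and convert the AsNumpy result to pandas:
# a dict of equally-sized numpy arrays maps directly onto a DataFrame,
# one column per original RDataFrame column.
df = PyRDF.RDataFrame(10).Define("x", "(int)rdfentry_")
pdf = pandas.DataFrame(df.AsNumpy())
print(pdf.head())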
Example #24
    def test_distributed_snapshot(self):
        """Test support for `Snapshot` in distributed backend"""
        # A simple dataframe with ten sequential numbers from 0 to 9
        df = PyRDF.RDataFrame(10).Define("x", "rdfentry_")

        # Count rows in the dataframe
        nrows = df.Count()

        # Snapshot writes the data to two partial files, builds a ROOT.TChain
        # with them and returns a new PyRDF.RDataFrame
        snapdf = df.Snapshot("snapTree", "snapFile.root")

        # Count the rows in the snapshotted dataframe
        snapcount = snapdf.Count()

        self.assertEqual(nrows.GetValue(), 10)
        self.assertEqual(snapcount.GetValue(), 10)

        # Retrieve the list of files from the snapshotted PyRDF.RDataFrame
        input_files = snapdf.proxied_node.get_inputfiles()
        # List of expected filenames for the intermediary files
        tmp_files = ["snapFile_0_4.root", "snapFile_5_9.root"]
        # Check that the two lists are the same
        self.assertListEqual(input_files, tmp_files)
        # Check that the intermediary .root files were created with the right
        # names, then remove them because they are not necessary
        for filename in tmp_files:
            self.assertTrue(os.path.exists(filename))
            os.remove(filename)
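The two partial files produced by the distributed Snapshot can also be read back by hand. A short sketch, assuming PyRDF.RDataFrame accepts a ROOT.TChain just like ROOT's own RDataFrame does, and that the files have not been removed yet:

import ROOT
import PyRDF

# Chain the partial snapshot files and rebuild a dataframe on top of them.
chain = ROOT.TChain("snapTree")
chain.Add("snapFile_0_4.root")
chain.Add("snapFile_5_9.root")
readback = PyRDF.RDataFrame(chain)
print(readback.Count().GetValue())  # expected to print 10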
Example #25
    def test_distributed_sum(self):
        """Test support for `Sum` operation in distributed backend"""
        rdf_py = PyRDF.RDataFrame(10)
        rdf_def = rdf_py.Define("x", "rdfentry_")
        rdf_sum = rdf_def.Sum("x")

        self.assertAlmostEqual(rdf_sum.GetValue(), 45.0)
Example #26
    def test_snapshot_nrows(self):
        """Test support for `Snapshot` in local backend"""
        def fill_tree(treeName, fileName):
            rdf = PyRDF.RDataFrame(100)
            return rdf.Define("b1", "rdfentry_")\
                      .Snapshot(treeName, fileName)

        # We prepare an input tree to run on
        fileName = "snapFile.root"
        treeName = "snapTree"

        snapdf = fill_tree(treeName, fileName)

        # We read the tree from the file and create an RDataFrame.
        d = PyRDF.RDataFrame(treeName, fileName)

        # Check on dataframe retrieved from file
        d_cut = d.Filter("b1 % 2 == 0")

        d_count = d_cut.Count()

        self.assertEqual(d_count.GetValue(), 50)

        # Check on dataframe returned by Snapshot operation
        snapdf_cut = snapdf.Filter("b1 % 2 == 0")
        snapdf_count = snapdf_cut.Count()

        self.assertEqual(snapdf_count.GetValue(), 50)

        # Remove unnecessary .root file
        os.remove(fileName)
Example #27
    def build_pyrdf_graph(self):
        """Create a PyRDF graph with a fixed set of operations and return it."""
        treename = "data"
        files = [
            'https://root.cern/files/teaching/CMS_Open_Dataset.root',
        ]
        rdf = PyRDF.RDataFrame(treename, files)

        # Define the analysis cuts
        chargeCutStr = "C1 != C2"
        etaCutStr = "fabs(eta1) < 2.3 && fabs(eta2) < 2.3"
        ptCutStr = "pt1 > 2 && pt2 > 2"
        rdf_f = rdf.Filter(chargeCutStr, "Opposite Charge") \
                   .Filter(etaCutStr, "Central Muons") \
                   .Filter(ptCutStr, "Sane Pt")

        # Create the invariant mass column
        invMassFormulaStr = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                             "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
        rdf_fd = rdf_f.Define("invMass", invMassFormulaStr)

        # Create the histograms
        pt1_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt1")
        pt2_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt2")
        model = ("invMass", "CMS Opendata;#mu#mu mass[GeV];Events", 512, 5,
                 110)
        invMass_h = rdf_fd.Histo1D(model, "invMass")
        import ROOT
        pi = ROOT.TMath.Pi()
        model = ("", "", 64, -pi, pi, 64, -pi, pi)
        phis_h = rdf_fd.Histo2D(model, "phi1", "phi2")

        return pt1_h, pt2_h, invMass_h, phis_h
Example #28
def main(basepath):
    """Skim the datasets and create histograms"""

    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")

    for sample, label in samplesandlabels:

        filenames = [
            basepath + sample + "_{}.root".format(i) for i in range(1, 16)
        ]
        # Uncomment to use only one file per sample
        # sample_file = eosbasepath + sample + "_1.root"

        # Create RDataFrame
        # Uncomment to use only one file per sample
        # df = PyRDF.RDataFrame("Events", sample_file)
        df = PyRDF.RDataFrame("Events", filenames)

        # Skim events
        skimdf = skim(df, sample)

        # Create histograms
        histos(skimdf, label)

    tfile.Close()
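The skim and histos helpers are defined elsewhere in the analysis; skim presumably applies the same chain of selection steps shown in Example #4. A hypothetical sketch based on that example:

def skim(df, sample):
    """Hypothetical sketch: apply the selection chain from Example #4."""
    df = MinimalSelection(df)
    df = FindGoodMuons(df)
    df = FindGoodTaus(df)
    df = FilterGoodEvents(df)
    df = FindMuonTauPair(df)
    df = DeclareVariables(df)
    df = CheckGeneratorTaus(df, sample)
    return AddEventWeight(df, sample)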
Example #29
    def test_initialization_method(self):
        """
        Check initialization method in Local backend.

        Define a method in the ROOT interpreter called getValue which returns
        the value defined by the user on the python side.

        """
        def init(value):
            cpp_code = '''auto getUserValue = [](){return %s ;};''' % value
            ROOT.gInterpreter.Declare(cpp_code)

        PyRDF.initialize(init, 123)
        PyRDF.current_backend = Local()
        df = PyRDF.RDataFrame(1)
        s = df.Define("userValue", "getUserValue()").Sum("userValue")
        self.assertEqual(s.GetValue(), 123)
Example #30
    def test_histo_from_empty_root_file(self):
        """
        Check that when performing operations with the distributed backend on
        an RDataFrame without entries, PyRDF falls back to using the local
        backend and outputs the correct (empty) result.
        """
        PyRDF.use("spark")

        # Create an RDataFrame from the tree 'NOMINAL' in an empty ROOT file
        rdf = PyRDF.RDataFrame("NOMINAL", "tests/unit/backend/emptytree.root")
        histo = rdf.Histo1D("mybranch")

        # Get entries in the histogram, should be zero
        entries = histo.GetEntries()

        self.assertIsInstance(PyRDF.current_backend, Local)
        self.assertEqual(entries, 0)