def test_initialization_method(self):
    """
    Check that a user initialization function is honoured by the Spark
    backend.

    The initialization function declares an integer variable `userValue` in
    the ROOT interpreter; its value is chosen on the Python side.
    """
    def init(value):
        import ROOT
        declaration = '''int userValue = %s ;''' % value
        ROOT.gInterpreter.ProcessLine(declaration)

    PyRDF.initialize(init, 123)

    # The Spark backend supports only a limited set of operations, Histo1D
    # being one of them, so the check is built around a histogram.
    #
    # A dataframe with a single entry gets a column 'u' filled from
    # 'userValue'. That variable exists only inside the ROOT interpreter and
    # only if `init` has been executed there. If the initialization function
    # is propagated correctly, every worker runs it first and the Define
    # expression can be resolved at runtime.
    #
    # The histogram then holds exactly one value, so its mean equals that
    # value regardless of how many workers were spawned.
    dataframe = PyRDF.Spark.RDataFrame(1)
    with_column = dataframe.Define("u", "userValue")
    histo_proxy = with_column.Histo1D("u")

    histo = histo_proxy.GetValue()
    self.assertEqual(histo.GetMean(), 123)
def test_spark_histograms(self):
    """Check that the Spark backend fills histograms like the local one."""
    physics_variables = ['pt1_h', 'pt2_h', 'invMass_h', 'phis_h']
    SparkResult = namedtuple('SparkResult', physics_variables)
    LocalResult = namedtuple('LocalResult', physics_variables)

    # Run the graph on Spark; drawing one histogram triggers the event loop
    PyRDF.use("spark", {'npartitions': 5})
    spark = SparkResult(*self.build_pyrdf_graph())
    spark.pt1_h.Draw("PL PLC PMC")

    # Run the same graph locally; again Draw triggers the event loop
    PyRDF.use("local")
    local = LocalResult(*self.build_pyrdf_graph())
    local.pt1_h.Draw("PL PLC PMC")

    # Every histogram must hold the same number of entries on both backends
    for variable in physics_variables:
        spark_histo = getattr(spark, variable)
        local_histo = getattr(local, variable)
        self.assertEqual(spark_histo.GetEntries(), local_histo.GetEntries())
def test_includes_function_with_filter_and_histo(self):
    """
    An integration test to check that the filter operation is able to use
    C++ functions that were included using header files.
    """
    PyRDF.include("tests/integration/local/test_headers/header1.hxx")
    PyRDF.use("spark")

    rdf = PyRDF.RDataFrame(10)

    # Keep only the entries accepted by the C++ predicate from the header
    rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
    histo = rdf_filtered.Histo1D("tdfentry_")

    # The numbers expected to survive the filter, with their statistics
    required_numbers = range(5)
    required_size = len(required_numbers)
    required_mean = sum(required_numbers) / float(required_size)
    required_stdDev = math.sqrt(
        sum((x - required_mean)**2 for x in required_numbers) /
        required_size)

    # The entry count is an exact quantity
    self.assertEqual(histo.GetEntries(), float(required_size))

    # Mean and standard deviation are floating-point results that the
    # histogram may accumulate differently from the reference computation
    # above, so compare them with a tolerance instead of exact equality.
    self.assertAlmostEqual(histo.GetMean(), required_mean)
    self.assertAlmostEqual(histo.GetStdDev(), required_stdDev)
def main():
    """
    Entry point of the skimming step of the analysis.

    Every sample in `sampleNames` is read, passed through the selection
    steps and written to a new file named `<sample>Skim.root`.
    """
    PyRDF.use("spark")
    PyRDF.include_headers("skim.h")

    for sample in sampleNames:
        print(">>> Process sample {}:\n".format(sample))

        events = PyRDF.RDataFrame("Events",
                                  samplesBasePath + sample + ".root")

        # Steps that only need the dataframe, applied in order
        for step in (MinimalSelection, FindGoodMuons, FindGoodTaus,
                     FilterGoodEvents, FindMuonTauPair, DeclareVariables):
            events = step(events)

        # Steps that additionally depend on the sample name
        for step in (CheckGeneratorTaus, AddEventWeight):
            events = step(events, sample)

        events.Snapshot("Events", sample + "Skim.root", final_variables_vec)
def tearDownClass(cls):
    """Switch the global backend back to "local" once all tests have run."""
    PyRDF.use("local")
def test_extend_ROOT_include_path(self):
    """
    Check that the include path of ROOT is extended with the directories
    specified in `PyRDF.include_headers()` so references between headers
    are correctly solved.
    """
    import ROOT

    header_folder = "tests/integration/local/test_headers/headers_folder"

    PyRDF.use("spark")
    PyRDF.include_headers(header_folder)

    # Include paths seen by ROOT, split into one token per path
    ROOT_include_path = ROOT.gInterpreter.GetIncludePath().split(" ")

    # Token under which the new folder is expected to appear
    new_folder_include = "-I\"{}\"".format(header_folder)

    # assertIn reports the missing token and the searched list on failure,
    # which a bare assertTrue on a membership test does not.
    self.assertIn(new_folder_include, ROOT_include_path)

    # Create an RDataFrame with 100 integers from 0 to 99
    rdf = PyRDF.RDataFrame(100)

    # Filter with a function from the included folder and book a histogram
    rdf_less_than_10 = rdf.Filter("check_number_less_than_10(tdfentry_)")
    histo1 = rdf_less_than_10.Histo1D("tdfentry_")

    # The histogram is expected to have 10 entries and mean 4.5
    self.assertEqual(histo1.GetEntries(), 10)
    self.assertAlmostEqual(histo1.GetMean(), 4.5)
def test_spark_histograms(self):
    """
    Integration test: the Spark backend must produce histograms with the
    same number of entries as the local backend.
    """
    # Spark execution; Draw triggers the event loop
    PyRDF.use("spark", {'npartitions': 5})
    spark_pt1, spark_pt2, spark_mass, spark_phis = self.build_pyrdf_graph()
    spark_pt1.Draw("PL PLC PMC")

    # Local execution; Draw triggers the event loop
    PyRDF.use("local")
    local_pt1, local_pt2, local_mass, local_phis = self.build_pyrdf_graph()
    local_pt1.Draw("PL PLC PMC")

    # Compare 'pt1_h', 'pt2_h', 'invMass_h' and 'phis_h', in that order
    histogram_pairs = (
        (spark_pt1, local_pt1),
        (spark_pt2, local_pt2),
        (spark_mass, local_mass),
        (spark_phis, local_phis),
    )
    for spark_histo, local_histo in histogram_pairs:
        self.assertEqual(spark_histo.GetEntries(), local_histo.GetEntries())
def test_include_dir_and_headers(self):
    """
    Check that filters can call C++ functions made available through a list
    holding both a directory of headers and a single header file.
    """
    PyRDF.include_headers([
        "tests/integration/local/test_headers/headers_folder",
        "tests/integration/local/test_headers/header1.hxx"
    ])

    # An RDataFrame with 10 integers [0...9]
    rdf = PyRDF.RDataFrame(10)

    # Filter expression paired with the number of entries expected to pass:
    # 5 integers below 5, 4 integers above 5 and 10 integers below 10.
    expectations = (
        ("check_number_less_than_5(tdfentry_)", 5),
        ("check_number_greater_than_5(tdfentry_)", 4),
        ("check_number_less_than_10(tdfentry_)", 10),
    )

    # Book all the filters first, then all the counts
    filters = [rdf.Filter(expression) for expression, _ in expectations]
    counts = [filtered.Count() for filtered in filters]

    for count, (_, expected) in zip(counts, expectations):
        self.assertEqual(count.GetValue(), expected)
def test_default_empty_list_include(self):
    """Calling 'PyRDF.include' without arguments must raise a TypeError."""
    self.assertRaises(TypeError, PyRDF.include)
def test_header_declaration_on_current_session(self):
    """Header has to be declared on the current session."""
    # Before the header is included, looking up `b` on the ROOT module is
    # expected to fail with AttributeError. The call is made directly inside
    # the context manager: wrapping it in another assertRaises call would
    # only misuse the result of `ROOT.b(1)` as an exception type.
    with self.assertRaises(AttributeError):
        ROOT.b(1)

    PyRDF.include_headers("tests/unit/backend/test_headers/header4.hxx")

    # Once the header is included, `b` can be called in this session
    self.assertEqual(ROOT.b(1), True)
def test_default_empty_list_include(self):
    """Calling 'PyRDF.include_headers' without arguments raises TypeError."""
    self.assertRaises(TypeError, PyRDF.include_headers)
def test_spark_select(self):
    """Selecting "spark" must make the current backend a Spark instance."""
    PyRDF.use("spark")
    backend = PyRDF.current_backend
    self.assertIsInstance(backend, Spark)
def test_local_select(self):
    """Selecting "local" must make the current backend a Local instance."""
    PyRDF.use("local")
    backend = PyRDF.current_backend
    self.assertIsInstance(backend, Local)
def test_future_env_select(self):
    """Selecting the "dask" environment must raise NotImplementedError."""
    self.assertRaises(NotImplementedError, PyRDF.use, "dask")
def test_string_include(self):
    """'PyRDF.include' called with a single string records that header."""
    expected_includes = ["header1"]

    PyRDF.include("header1")

    self.assertListEqual(PyRDF.includes, expected_includes)
def test_list_include(self):
    """'PyRDF.include' called with a list of strings records its items."""
    expected_includes = ["header1"]

    PyRDF.include(["header1"])

    self.assertListEqual(PyRDF.includes, expected_includes)
def test_list_include(self):
    """'PyRDF.include_headers' called with a list of strings."""
    PyRDF.include_headers(["tests/unit/backend/test_headers/header1.hxx"])

    expected_headers = ["tests/unit/backend/test_headers/header1.hxx"]

    # Feature detection: the order-insensitive comparison is called
    # assertCountEqual on Python 3 and assertItemsEqual on Python 2.
    try:
        assert_same_items = self.assertCountEqual
    except AttributeError:
        assert_same_items = self.assertItemsEqual

    assert_same_items(PyRDF.includes_headers, expected_headers)
def main():
    """
    Book and write histograms for every skimmed dataset.

    For each (skim name, label) pair the files in the current directory whose
    name contains the skim name are loaded, a baseline selection is applied,
    and one histogram per variable in `ranges` is booked for both the signal
    region and the QCD control region. The histograms are handed to
    `writeHistogram` while the output file "histograms.root" is open; the
    file is closed even if one of the steps raises.
    """
    PyRDF.use("spark")

    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")
    try:
        variables = ranges.keys()

        # Skimmed datasets paired with the label used in histogram names
        skims = [
            ("GluGluToHToTauTau", "ggH"),
            ("VBF_HToTauTau", "qqH"),
            ("W1JetsToLNu", "W1J"),
            ("W2JetsToLNu", "W2J"),
            ("W3JetsToLNu", "W3J"),
            ("TTbar", "TT"),
            ("DYJetsToLL", "ZLL"),
            ("DYJetsToLL", "ZTT"),
            ("Run2012B_TauPlusX", "dataRunB"),
            ("Run2012C_TauPlusX", "dataRunC"),
        ]

        for name, label in skims:
            print(">>> Process skim {}".format(name))

            filenames = [
                filename for filename in os.listdir() if name in filename
            ]

            # Load skimmed dataset and apply baseline selection
            df = PyRDF.RDataFrame("Events", filenames).Filter(
                "mt_1<30",
                "Muon transverse mass cut for W+jets suppression")\
                .Filter("iso_1<0.1",
                        "Require isolated muon for signal region")

            # Book histograms for the signal region
            df1 = df.Filter("q_1*q_2<0",
                            "Require opposited charge for signal region")
            df1 = filterGenMatch(df1, label)
            hists = {
                variable: bookHistogram(df1, variable, ranges[variable])
                for variable in variables
            }

            # Book histograms for the control region used to estimate the
            # QCD contribution
            df2 = df.Filter("q_1*q_2>0", "Control region for QCD estimation")
            df2 = filterGenMatch(df2, label)
            hists_cr = {
                variable: bookHistogram(df2, variable, ranges[variable])
                for variable in variables
            }

            # Write histograms to output file: signal region first, then
            # control region
            for variable in variables:
                writeHistogram(hists[variable],
                               "{}_{}".format(label, variable))
            for variable in variables:
                writeHistogram(hists_cr[variable],
                               "{}_{}_cr".format(label, variable))
    finally:
        # Release the output file no matter how the loop ended
        tfile.Close()
def test_initialization(self):
    """
    The user initialization function must be stored on the current backend,
    bound to the arguments given to `PyRDF.initialize`.
    """
    def identity(value):
        return value

    PyRDF.initialize(identity, 123)

    stored_initialization = PyRDF.current_backend.initialization
    self.assertEqual(stored_initialization(), 123)
def test_list_extend_include(self):
    """
    Successive 'PyRDF.include' calls with lists of strings must accumulate
    the headers in call order.
    """
    first_batch = ["header1", "header2"]
    second_batch = ["header3", "header4", "header5"]

    PyRDF.include(first_batch)
    PyRDF.include(second_batch)

    expected_includes = [
        "header1", "header2", "header3", "header4", "header5"
    ]
    self.assertListEqual(PyRDF.includes, expected_includes)
def test_initialization_runs_in_current_environment(self):
    """
    The user initialization function must also run in the current user
    session, so that whatever it sets up is visible right away.
    """
    def declare_int(name, value):
        import ROOT
        ROOT.gInterpreter.ProcessLine("int %s = %s;" % (name, value))

    expected_value = 2
    PyRDF.initialize(declare_int, "myInt", expected_value)

    # The variable declared by the initialization function must be
    # reachable through ROOT in this very session
    self.assertEqual(ROOT.myInt, expected_value)
def test_includes_shared_lib_with_filter_op(self):
    """
    Check that the filter operation is able to use C++ functions that were
    included through a shared library and its header.

    The library is compiled on the fly; a failed compilation aborts the test
    immediately, and the compiled library is removed when the test ends,
    whether it passed or not.
    """
    # Relative paths, resolved against the working directory the tests are
    # launched from: the C++ source, the library built from it and the
    # header declaring the library content.
    cpp_path = "tests/integration/local/test_shared_libraries/a.cpp"
    library_path = "tests/integration/local/test_shared_libraries/liba.so"
    header_path = "tests/integration/local/test_shared_libraries/a.h"

    library_code = ("`root-config --cxx` "
                    "`root-config --cflags --libs` "
                    "-fPIC -shared {cpp}"
                    " -o {lib}").format(cpp=cpp_path, lib=library_path)

    # Build the shared library. The shell is needed for the backtick
    # substitutions; check_call raises if the compiler exits with an error
    # instead of letting the test continue without a library.
    subprocess.check_call(library_code, shell=True)

    # Registered only after a successful build, so the file is known to
    # exist; runs even when an assertion below fails.
    self.addCleanup(os.remove, library_path)

    PyRDF.include_shared_libraries(library_path)

    # The user can include directly the header related to the library
    # or choose to declare functions or objects later
    PyRDF.include_headers(header_path)

    # Creates an RDataFrame with 5 integers [0...4]
    rdf = PyRDF.RDataFrame(5)

    # Keeps the entries smaller than 3
    filter1 = rdf.Filter("tdfentry_ < 3")

    # Defines a column x through the library function `f`, then keeps the
    # entries with x smaller than 3.
    # NOTE(review): `f` is presumably a squaring function, as stated by the
    # previous version of this comment - confirm against a.cpp.
    filter2 = rdf.Define("x", "f(tdfentry_)").Filter("x < 3")

    count1 = filter1.Count().GetValue()
    count2 = filter2.Count().GetValue()

    # Three entries are expected to pass the first filter and two the second
    self.assertEqual(count1, 3)
    self.assertEqual(count2, 2)
def test_asnumpy_return_arrays(self):
    """Test support for the `AsNumpy` pythonization in the local backend."""
    import numpy

    # A simple dataframe with ten rows and two columns
    base = PyRDF.RDataFrame(10)
    df = base.Define("x", "(int)rdfentry_")\
             .Define("y", "1.f/(1.f+rdfentry_)")

    # AsNumpy must give back a dictionary of numpy arrays ...
    npy = df.AsNumpy()
    self.assertIsInstance(npy, dict)

    # ... keyed by the column names of the original dataframe
    arrays = [npy["x"], npy["y"]]
    for array in arrays:
        self.assertIsInstance(array, numpy.ndarray)

    # Each array is as long as the original column
    for array in arrays:
        self.assertEqual(len(array), 10)

    # The element types correspond to the ones of the original columns
    for array, dtype_name in zip(arrays, ("int32", "float32")):
        self.assertEqual(array.dtype, numpy.dtype(dtype_name))
def test_distributed_snapshot(self):
    """Test support for `Snapshot` in the distributed backend."""
    # A simple dataframe with ten sequential numbers from 0 to 9
    source = PyRDF.RDataFrame(10).Define("x", "rdfentry_")
    source_count = source.Count()

    # Snapshot the dataframe; the operation gives back a new dataframe
    snapshotted = source.Snapshot("snapTree", "snapFile.root")
    snapshot_count = snapshotted.Count()

    # Both dataframes must hold all ten rows
    self.assertEqual(source_count.GetValue(), 10)
    self.assertEqual(snapshot_count.GetValue(), 10)

    # The snapshotted dataframe is expected to read from two intermediary
    # files, one per range of entries
    expected_files = ["snapFile_0_4.root", "snapFile_5_9.root"]
    actual_files = snapshotted.proxied_node.get_inputfiles()
    self.assertListEqual(actual_files, expected_files)

    # The intermediary files must exist on disk; they are not needed
    # afterwards, so remove each one once it has been checked
    for path in expected_files:
        self.assertTrue(os.path.exists(path))
        os.remove(path)
def test_distributed_sum(self):
    """Test support for the `Sum` operation in the distributed backend."""
    # Column "x" is defined from the entry number of a ten-entry dataframe
    total = PyRDF.RDataFrame(10).Define("x", "rdfentry_").Sum("x")

    self.assertAlmostEqual(total.GetValue(), 45.0)
def test_snapshot_nrows(self):
    """Test support for `Snapshot` in the local backend."""
    fileName = "snapFile.root"
    treeName = "snapTree"

    # Prepare an input tree: 100 entries with a branch "b1" defined from the
    # entry number, written to file through Snapshot
    snapdf = PyRDF.RDataFrame(100).Define("b1", "rdfentry_")\
                                  .Snapshot(treeName, fileName)

    # Read the tree back from the file into a new dataframe
    from_file = PyRDF.RDataFrame(treeName, fileName)

    # Check on the dataframe retrieved from file
    even_from_file = from_file.Filter("b1 % 2 == 0").Count()
    self.assertEqual(even_from_file.GetValue(), 50)

    # Check on the dataframe returned by the Snapshot operation
    even_from_snapshot = snapdf.Filter("b1 % 2 == 0").Count()
    self.assertEqual(even_from_snapshot.GetValue(), 50)

    # Remove unnecessary .root file
    os.remove(fileName)
def build_pyrdf_graph(self):
    """
    Build a PyRDF graph with a fixed set of operations.

    Returns the booked histograms as the tuple
    (pt1_h, pt2_h, invMass_h, phis_h).
    """
    rdf = PyRDF.RDataFrame(
        "data",
        ['https://root.cern/files/teaching/CMS_Open_Dataset.root'])

    # Analysis cuts as (expression, name) pairs, applied in this order
    cuts = (
        ("C1 != C2", "Opposite Charge"),
        ("fabs(eta1) < 2.3 && fabs(eta2) < 2.3", "Central Muons"),
        ("pt1 > 2 && pt2 > 2", "Sane Pt"),
    )
    selected = rdf
    for expression, cut_name in cuts:
        selected = selected.Filter(expression, cut_name)

    # Add the invariant mass column on top of the selection
    mass_formula = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                    "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
    with_mass = selected.Define("invMass", mass_formula)

    # The two pt histograms are booked on the unfiltered dataframe
    pt_model = ("", "", 128, 1, 1200)
    pt1_h = rdf.Histo1D(pt_model, "pt1")
    pt2_h = rdf.Histo1D(pt_model, "pt2")

    # The remaining histograms are booked on the selected events
    mass_model = ("invMass", "CMS Opendata;#mu#mu mass[GeV];Events",
                  512, 5, 110)
    invMass_h = with_mass.Histo1D(mass_model, "invMass")

    import ROOT
    pi = ROOT.TMath.Pi()
    phis_model = ("", "", 64, -pi, pi, 64, -pi, pi)
    phis_h = with_mass.Histo2D(phis_model, "phi1", "phi2")

    return pt1_h, pt2_h, invMass_h, phis_h
def main(basepath):
    """
    Skim the datasets and create histograms.

    For every (sample, label) pair in `samplesandlabels`, the fifteen files
    `<basepath><sample>_1.root` ... `<basepath><sample>_15.root` are loaded,
    skimmed with `skim` and passed to `histos`. This happens while the output
    file "histograms.root" is open; the file is closed even if a step raises.

    Args:
        basepath: prefix prepended to each sample file name.
    """
    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")
    try:
        for sample, label in samplesandlabels:
            filenames = [
                basepath + sample + "_{}.root".format(i)
                for i in range(1, 16)
            ]

            # Create RDataFrame.
            # To use only one file per sample, pass
            # basepath + sample + "_1.root" instead of `filenames`.
            df = PyRDF.RDataFrame("Events", filenames)

            # Skim events
            skimdf = skim(df, sample)

            # Create histograms
            histos(skimdf, label)
    finally:
        # Release the output file no matter how the loop ended
        tfile.Close()
def test_initialization_method(self):
    """
    Check that a user initialization function is honoured by the Local
    backend.

    The initialization function declares a callable `getUserValue` in the
    ROOT interpreter; it returns a value chosen on the Python side.
    """
    def init(value):
        declaration = '''auto getUserValue = [](){return %s ;};''' % value
        ROOT.gInterpreter.Declare(declaration)

    PyRDF.initialize(init, 123)
    PyRDF.current_backend = Local()

    # One entry whose column is filled by the declared callable: the sum of
    # the column is then the user value itself
    dataframe = PyRDF.RDataFrame(1)
    with_column = dataframe.Define("userValue", "getUserValue()")
    total = with_column.Sum("userValue")

    self.assertEqual(total.GetValue(), 123)
def test_histo_from_empty_root_file(self):
    """
    Check that operations requested through the distributed backend on an
    RDataFrame without entries fall back to the local backend and give the
    correct (empty) result.
    """
    PyRDF.use("spark")

    # An RDataFrame reading the tree "NOMINAL" from the test file
    empty_rdf = PyRDF.RDataFrame("NOMINAL",
                                 "tests/unit/backend/emptytree.root")
    histo = empty_rdf.Histo1D("mybranch")

    # Asking for the number of entries runs the computation
    n_entries = histo.GetEntries()

    # The backend must now be the local one, and the histogram empty
    self.assertIsInstance(PyRDF.current_backend, Local)
    self.assertEqual(n_entries, 0)