def test_includes_function_with_filter_and_histo(self):
    """
    An integration test to check that the filter operation is able to use
    C++ functions that were included using header files.

    The statistics of the filtered histogram are compared against the same
    statistics computed in pure Python over the expected set of numbers.
    """
    PyRDF.include("tests/integration/local/test_headers/header1.hxx")
    PyRDF.use("spark")

    rdf = PyRDF.RDataFrame(10)

    # Keep only the entries accepted by the C++ predicate declared in the
    # included header. Judging by its name and by the expected values below,
    # these are the numbers less than 5.
    rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
    histo = rdf_filtered.Histo1D("tdfentry_")

    # The set of numbers expected to survive the filter
    required_numbers = range(5)
    required_size = len(required_numbers)
    required_mean = sum(required_numbers) / float(required_size)
    required_stdDev = math.sqrt(
        sum((x - required_mean)**2 for x in required_numbers) /
        required_size)

    # The number of entries is an exact count
    self.assertEqual(histo.GetEntries(), float(required_size))

    # Mean and standard deviation are derived floating point quantities:
    # compare them up to rounding instead of requiring bitwise equality
    self.assertAlmostEqual(histo.GetMean(), required_mean)
    self.assertAlmostEqual(histo.GetStdDev(), required_stdDev)
def test_spark_histograms(self):
    """
    Integration test comparing the histograms produced by the Spark backend
    with the ones produced by the local backend for the same graph.
    """
    # Build and run the graph with the Spark backend
    PyRDF.use("spark", {'npartitions': 5})
    spark_pt1, spark_pt2, spark_inv_mass, spark_phis = \
        self.build_pyrdf_graph()
    spark_pt1.Draw("PL PLC PMC")  # Trigger Event-loop, Spark

    # Build and run the same graph with the local backend
    PyRDF.use("local")
    local_pt1, local_pt2, local_inv_mass, local_phis = \
        self.build_pyrdf_graph()
    local_pt1.Draw("PL PLC PMC")  # Trigger Event-loop, Local

    # Histograms paired in the order: pt1_h, pt2_h, invMass_h, phis_h
    histogram_pairs = (
        (spark_pt1, local_pt1),
        (spark_pt2, local_pt2),
        (spark_inv_mass, local_inv_mass),
        (spark_phis, local_phis),
    )

    # Each pair must hold the same number of entries
    for spark_histo, local_histo in histogram_pairs:
        self.assertEqual(spark_histo.GetEntries(),
                         local_histo.GetEntries())
def test_extend_ROOT_include_path(self):
    """
    Check that the include path of ROOT is extended with the directories
    specified in `PyRDF.include_headers()` so references between headers
    are correctly solved.
    """
    import ROOT

    header_folder = "tests/integration/local/test_headers/headers_folder"

    PyRDF.use("spark")
    PyRDF.include_headers(header_folder)

    # The interpreter reports its include path as space separated tokens;
    # the folder must show up among them as a quoted -I token
    include_tokens = ROOT.gInterpreter.GetIncludePath().split(" ")
    expected_token = "-I\"{}\"".format(header_folder)
    self.assertIn(expected_token, include_tokens)

    # RDataFrame over 100 entries, filtered with a function that is
    # expected to come from the headers in the folder
    dataframe = PyRDF.RDataFrame(100)
    below_ten = dataframe.Filter("check_number_less_than_10(tdfentry_)")
    histo1 = below_ten.Histo1D("tdfentry_")

    # Ten entries with mean 4.5 are expected after the filter
    self.assertEqual(histo1.GetEntries(), 10)
    self.assertAlmostEqual(histo1.GetMean(), 4.5)
def test_spark_histograms(self):
    """Check that Spark backend works the same way as local."""
    physics_variables = ['pt1_h', 'pt2_h', 'invMass_h', 'phis_h']
    SparkResult = namedtuple('SparkResult', physics_variables)
    LocalResult = namedtuple('LocalResult', physics_variables)

    # Run the graph with the Spark backend
    PyRDF.use("spark", {'npartitions': 5})
    spark = SparkResult(*self.build_pyrdf_graph())
    spark.pt1_h.Draw("PL PLC PMC")  # Trigger Event-loop, Spark

    # Run the same graph with the local backend
    PyRDF.use("local")
    local = LocalResult(*self.build_pyrdf_graph())
    local.pt1_h.Draw("PL PLC PMC")  # Trigger Event-loop, Local

    # Both tuples share the field order of `physics_variables`, so walking
    # them in lockstep compares each histogram with its counterpart
    for spark_histo, local_histo in zip(spark, local):
        self.assertEqual(spark_histo.GetEntries(),
                         local_histo.GetEntries())
def tearDownClass(cls):
    """Select the 'local' backend again once the tests of this class ran."""
    PyRDF.use("local")
def main():
    """
    Entry point of the skimming step of the analysis.

    For every sample name the "Events" tree is loaded, passed through the
    selection and variable-definition helpers in a fixed order and a
    snapshot of the final variables is booked into `<sample>Skim.root`.
    """
    PyRDF.use("spark")
    PyRDF.include_headers("skim.h")

    for sample in sampleNames:
        print(">>> Process sample {}:\n".format(sample))

        input_file = samplesBasePath + sample + ".root"
        df = PyRDF.RDataFrame("Events", input_file)

        # Event selection steps, applied one after the other
        df = MinimalSelection(df)
        df = FindGoodMuons(df)
        df = FindGoodTaus(df)
        df = FilterGoodEvents(df)
        df = FindMuonTauPair(df)

        # Steps that take the dataframe, two of them also the sample name
        df = DeclareVariables(df)
        df = CheckGeneratorTaus(df, sample)
        df = AddEventWeight(df, sample)

        out_file = sample + "Skim.root"
        df.Snapshot("Events", out_file, final_variables_vec)
def test_future_env_select(self):
    """
    Selecting the "dask" environment is expected to raise a
    NotImplementedError.
    """
    self.assertRaises(NotImplementedError, PyRDF.use, "dask")
def test_local_select(self):
    """
    Selecting 'local' must leave a `Local` instance as current backend.
    """
    PyRDF.use("local")
    selected_backend = PyRDF.current_backend
    self.assertIsInstance(selected_backend, Local)
def test_spark_select(self):
    """
    Selecting 'spark' must leave a `Spark` instance as current backend.
    """
    PyRDF.use("spark")
    selected_backend = PyRDF.current_backend
    self.assertIsInstance(selected_backend, Spark)
def main():
    """
    Produce the histograms of every variable in `ranges` for each skimmed
    dataset, both in the signal region and in the QCD control region, and
    write them to `histograms.root`.
    """
    PyRDF.use("spark")

    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")
    variables = ranges.keys()

    # (dataset name, process label) pairs; a dataset may appear more than
    # once with different labels
    skims = [
        ("GluGluToHToTauTau", "ggH"),
        ("VBF_HToTauTau", "qqH"),
        ("W1JetsToLNu", "W1J"),
        ("W2JetsToLNu", "W2J"),
        ("W3JetsToLNu", "W3J"),
        ("TTbar", "TT"),
        ("DYJetsToLL", "ZLL"),
        ("DYJetsToLL", "ZTT"),
        ("Run2012B_TauPlusX", "dataRunB"),
        ("Run2012C_TauPlusX", "dataRunC"),
    ]

    for name, label in skims:
        print(">>> Process skim {}".format(name))

        # Every entry of the current directory whose name contains the
        # dataset name is taken as input
        filenames = [entry for entry in os.listdir() if name in entry]

        # Load skimmed dataset and apply baseline selection
        baseline = PyRDF.RDataFrame("Events", filenames)
        baseline = baseline.Filter(
            "mt_1<30", "Muon transverse mass cut for W+jets suppression")
        baseline = baseline.Filter(
            "iso_1<0.1", "Require isolated muon for signal region")

        # Book histograms for the signal region
        signal = filterGenMatch(
            baseline.Filter("q_1*q_2<0",
                            "Require opposited charge for signal region"),
            label)
        hists = {
            variable: bookHistogram(signal, variable, ranges[variable])
            for variable in variables
        }

        # Book histograms for the control region used to estimate the QCD
        # contribution
        control = filterGenMatch(
            baseline.Filter("q_1*q_2>0",
                            "Control region for QCD estimation"),
            label)
        hists_cr = {
            variable: bookHistogram(control, variable, ranges[variable])
            for variable in variables
        }

        # Write histograms to output file: signal region first, control
        # region afterwards
        for variable in variables:
            writeHistogram(hists[variable],
                           "{}_{}".format(label, variable))
        for variable in variables:
            writeHistogram(hists_cr[variable],
                           "{}_{}_cr".format(label, variable))

    tfile.Close()
def test_histo_from_empty_root_file(self):
    """
    Check that when performing operations with the distributed backend on
    an RDataFrame without entries, PyRDF falls back to using the local
    backend and outputs the correct (empty) result.
    """
    PyRDF.use("spark")

    # RDataFrame over the "NOMINAL" tree stored in the test file
    empty_rdf = PyRDF.RDataFrame("NOMINAL",
                                 "tests/unit/backend/emptytree.root")
    histo = empty_rdf.Histo1D("mybranch")

    # Read the number of entries before inspecting the backend
    entries = histo.GetEntries()

    # The current backend must now be Local and the histogram empty
    self.assertIsInstance(PyRDF.current_backend, Local)
    self.assertEqual(entries, 0)
def test_local_stops_spark(self):
    """Switching to the local backend must stop the SparkContext."""
    # Select the Spark backend and grab its SparkContext
    PyRDF.use("spark")
    spark_context = PyRDF.current_backend.sparkContext

    # Keep the underlying java context: it can still be queried after the
    # backend has been replaced
    java_context = spark_context._jsc.sc()
    self.assertFalse(java_context.isStopped())

    PyRDF.use("local")

    # `use` must have stopped the Spark context
    self.assertTrue(java_context.isStopped())
def test_friend_tree_histo(self):
    """
    Tests that the computational graph can be issued both on the
    parent tree and the friend tree.

    The `.root` files used by the test are removed in a `finally` clause so
    that a failing assertion does not leave them in the working directory.
    """
    root_files = ("treeparent.root", "treefriend.root")

    try:
        self.create_parent_tree()
        self.create_friend_tree()

        # Parent Tree
        baseTree = ROOT.TChain("T")
        baseTree.Add("treeparent.root")

        # Friend Tree
        friendTree = ROOT.TChain("TF")
        friendTree.Add("treefriend.root")

        # Add friendTree to the parent
        baseTree.AddFriend(friendTree)

        # Create a PyRDF RDataFrame with the parent and the friend trees
        PyRDF.use("spark")
        df = PyRDF.RDataFrame(baseTree)

        # Create histograms
        h_parent = df.Histo1D("x")
        h_friend = df.Histo1D("TF.x")

        # Both trees have the same number of entries, i.e. 10000
        self.assertEqual(h_parent.GetEntries(), 10000)
        self.assertEqual(h_friend.GetEntries(), 10000)

        # Check the mean of the distribution for each tree
        self.assertAlmostEqual(h_parent.GetMean(), 10, delta=0.01)
        self.assertAlmostEqual(h_friend.GetMean(), 20, delta=0.01)

        # Check the standard deviation of the distribution for each tree
        self.assertAlmostEqual(h_parent.GetStdDev(), 1, delta=0.01)
        self.assertAlmostEqual(h_friend.GetStdDev(), 1, delta=0.01)
    finally:
        # Remove unnecessary .root files. The existence check avoids
        # masking the original error when a file was never created.
        for filename in root_files:
            if os.path.exists(filename):
                os.remove(filename)
def test_change_attribute_when_npartitions_greater_than_clusters(self):
    """
    Check that the `npartitions` class attribute is changed when it is
    greater than the number of clusters in the ROOT file.
    """
    PyRDF.use("spark", {"npartitions": 10})

    # Hold on to the backend object selected above: the same object is
    # inspected before and after the computation
    backend = PyRDF.current_backend
    self.assertEqual(backend.npartitions, 10)

    df = PyRDF.RDataFrame("TotemNtuple",
                          ["tests/unit/backend/Slimmed_ntuple.root"])
    histo = df.Histo1D("track_rp_3.x")
    nentries = histo.GetEntries()

    self.assertEqual(nentries, 10)
    # The number of partitions is expected to have dropped to one
    self.assertEqual(backend.npartitions, 1)
def test_write_histo(self):
    """
    Tests that an histogram is correctly written to a .root file created
    before the execution of the event loop.

    The reopened file is closed and the `.root` files are removed in
    `finally` clauses, so that a failing assertion neither leaves an open
    file handle nor leftover files in the working directory.
    """
    root_files = ("tree_gaus.root", "out_file.root")

    try:
        self.create_tree_with_data()

        # Create a new file where the histogram will be written
        outfile = ROOT.TFile("out_file.root", "recreate")

        # Create a PyRDF RDataFrame from the "Events" tree of the input file
        PyRDF.use("spark")
        df = PyRDF.RDataFrame("Events", "tree_gaus.root")

        # Create histogram
        histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

        # Write histogram to out_file.root and close the file
        histo.Write()
        outfile.Close()

        # Reopen file to check that histogram was correctly stored
        reopen_file = ROOT.TFile("out_file.root", "read")
        try:
            reopen_histo = reopen_file.Get("x")

            # Check histogram statistics. This is done while the file is
            # still open, since the histogram was retrieved from it.
            self.assertEqual(reopen_histo.GetEntries(), self.nentries)
            self.assertAlmostEqual(reopen_histo.GetMean(), self.gaus_mean,
                                   delta=self.delta_equal)
            self.assertAlmostEqual(reopen_histo.GetStdDev(),
                                   self.gaus_stdev,
                                   delta=self.delta_equal)
        finally:
            reopen_file.Close()
    finally:
        # Remove unnecessary .root files. The existence check avoids
        # masking the original error when a file was never created.
        for filename in root_files:
            if os.path.exists(filename):
                os.remove(filename)
import PyRDF import ROOT from ROOT import TCanvas from pyspark import SparkContext context = SparkContext.getOrCreate() context.stop() PyRDF.use("spark", {'npartitions': 2}) PyRDF.include("common_definitions.h") PyRDF.include("parameters_global.h") PyRDF.include("common_algorithms.h") PyRDF.include("parameters.h") PyRDF.include("common.h") PyRDF.include("initialize.h") ##################################################### # Prepare input data ##################################################### # Branches clasified by diagonal diagonals = { # return a tuple: ([left] verticals in 45, [right] verticals in 56)) "d45b_56t": (["track_rp_5", "track_rp_21", "track_rp_25"], ["track_rp_104", "track_rp_120", "track_rp_124"]), "ad45b_56b": (["track_rp_5", "track_rp_21", "track_rp_25"], ["track_rp_105", "track_rp_121", "track_rp_125"]),
def setUpClass(cls):
    """Switch PyRDF to the Spark backend ahead of the tests of this class."""
    spark_options = {'npartitions': 2, 'spark.executor.instances': 2}
    PyRDF.use("spark", spark_options)
def test_future_env_select(self):
    """Selecting the "dask" backend must raise a NotImplementedError."""
    self.assertRaises(NotImplementedError, PyRDF.use, "dask")
import PyRDF import ROOT PyRDF.use("spark") # A simple helper function to fill a test tree: this makes the example stand-alone. def fill_tree(treeName, fileName): rdf = PyRDF.RDataFrame(10000) return rdf.Define("b1", "(int) rdfentry_")\ .Define("b2", "(float) rdfentry_ * rdfentry_")\ .Snapshot(treeName, fileName) # We prepare an input tree to run on fileName = "df007_snapshot_py.root" outFileName = "df007_snapshot_output_py.root" outFileNameAllColumns = "df007_snapshot_output_allColumns_py.root" treeName = "myTree" # The tree is snapshotted and we retrieve a new PyRDF.RDataFrame from it d = fill_tree(treeName, fileName) # ## Select entries # We now select some entries in the dataset d_cut = d.Filter("b1 % 2 == 0") # ## Enrich the dataset # Build some temporary columns: we'll write them out PyRDF.include_headers("tutorials/headers/df007_snapshot.h") d2 = d_cut.Define("b1_square", "b1 * b1") \
import PyRDF

# Select the local backend and register the header file. This happens
# before `latinos_reader` is imported.
# NOTE(review): presumably the reader relies on this setup being done
# first - confirm before reordering these statements.
PyRDF.use("local")
PyRDF.include_headers("./headers.hh")

import latinos_reader as lr

# Build the dataframes through the reader, handing it the PyRDF module
# itself, and keep the first element of the result.
trees = lr.build_dataframe("./lowenergy", "VBS", PyRDF, "pyrdf")
tree = trees[0]

# Disabled examples of how to inspect the "mjj_vbs" column of the result:
# print( tree.lowen_ele_looseVBS.rdf_node.AsNumpy(columns=["mjj_vbs"]) )
# m = tree.lowen_ele_looseVBS.rdf_node.Mean("mjj_vbs")
# print( m.GetValue() )
""" Simple sum operation on a column of the reference dataset. Connects to the OpenStack VMs Spark cluster. If there is another cluster change arguments of PyRDF.use() accordingly. """ import ROOT import PyRDF PyRDF.use("spark", conf={ 'npartitions': 3, 'spark.master': 'spark://137.138.55.13:7077', 'spark.driver.port': 40000, 'spark.blockManager.port': 30000, 'spark.app.name': 'PyRDF', 'spark.executor.instances': 3, }) rdf = PyRDF.RDataFrame( "reftree", "root://eosuser.cern.ch//eos/user/v/vpadulan/reftree/reftree_100000000entry.root") s = rdf.Sum("b3") t = ROOT.TStopwatch() s.GetValue() t.Stop() realtime = round(t.RealTime(), 2) with open("tfileprefetch_pyrdf_sum.csv", "a+") as f: f.write(str(realtime)) f.write("\n")
def execute(self, generator):
    """
    Executes the current RDataFrame graph
    in the given distributed environment.

    A `mapper` and a `reducer` function are defined and handed to
    `self.ProcessAndMerge`; the merged values are then stored in the
    `value` attribute of the action nodes of the graph. When the input
    dataset has no entries, the "local" backend is selected instead and the
    execution is delegated to it.

    Args:
        generator (PyRDF.CallableGenerator): An instance of
            :obj:`CallableGenerator` that is responsible for generating
            the callable function.

    Returns:
        The return value of the local backend's `execute` when falling back
        to local execution; otherwise nothing is returned explicitly.
    """
    callable_function = generator.get_callable()
    # Arguments needed to create PyROOT RDF object
    rdf_args = generator.head_node.args
    treename = generator.head_node.get_treename()
    selected_branches = generator.head_node.get_branches()

    # Avoid having references to the instance inside the mapper
    initialization = Backend.initialization

    def mapper(current_range):
        """
        Triggers the event-loop and executes all
        nodes in the computational graph using the
        callable.

        Args:
            current_range (tuple): A pair that contains the starting and
                ending values of the current range.
                NOTE(review): the body reads the attributes `start`, `end`,
                `filelist` and `friend_info` of this object - confirm its
                actual type.

        Returns:
            list: This represents the list of values of all action nodes in
            the computational graph.
        """
        import ROOT

        # We have to decide whether to do this in Dist or in subclasses
        # Utils.declare_headers(worker_includes)  # Declare headers if any

        # Run initialization method to prepare the worker runtime
        # environment
        initialization()

        # Build rdf
        start = int(current_range.start)
        end = int(current_range.end)

        if treename:
            # Build TChain of files for this range:
            chain = ROOT.TChain(treename)
            for f in current_range.filelist:
                chain.Add(str(f))

            # We assume 'end' is exclusive
            chain.SetCacheEntryRange(start, end)

            # Gather information about friend trees
            friend_info = current_range.friend_info
            if friend_info:
                # Zip together the treenames of the friend trees and the
                # respective file names. Each friend treename can have
                # multiple corresponding friend file names.
                tree_files_names = zip(friend_info.friend_names,
                                       friend_info.friend_file_names)
                for friend_treename, friend_filenames in tree_files_names:
                    # Start a TChain with the current friend treename
                    friend_chain = ROOT.TChain(friend_treename)
                    # Add each corresponding file to the TChain
                    for filename in friend_filenames:
                        friend_chain.Add(filename)

                    # Set cache on the same range as the parent TChain
                    friend_chain.SetCacheEntryRange(start, end)
                    # Finally add friend TChain to the parent
                    chain.AddFriend(friend_chain)

            if selected_branches:
                rdf = ROOT.ROOT.RDataFrame(chain, selected_branches)
            else:
                rdf = ROOT.ROOT.RDataFrame(chain)
        else:
            rdf = ROOT.ROOT.RDataFrame(*rdf_args)  # PyROOT RDF object

        # # TODO : If we want to run multi-threaded in a Spark node in
        # # the future, use `TEntryList` instead of `Range`
        # rdf_range = rdf.Range(current_range.start, current_range.end)

        # Output of the callable
        output = callable_function(rdf, rdf_range=current_range)

        for i in range(len(output)):
            # `AsNumpy` and `Snapshot` return respectively `dict` and `list`
            # that don't have the `GetValue` method.
            if isinstance(output[i], (dict, list)):
                continue
            # FIX ME : RResultPtrs aren't serializable,
            # because of which we have to manually find
            # out the types here and copy construct the
            # values.

            # The type of the value of the action node
            value_type = type(output[i].GetValue())
            # The `value_type` is required here because,
            # after a call to `GetValue`, the values die
            # along with the RResultPtrs
            output[i] = value_type(output[i].GetValue())
        return output

    def reducer(values_list1, values_list2):
        """
        Merges two given lists of values that were
        returned by the mapper function for two different
        ranges.

        The merge is done in place: the elements of `values_list1` are
        updated with the corresponding elements of `values_list2`.

        Args:
            values_list1 (list): A list of computed values for a given
                entry range in a dataset.

            values_list2 (list): A list of computed values for a given
                entry range in a dataset.

        Returns:
            list: This is a list of values obtained after merging two
            given lists (the same object as `values_list1`).

        Raises:
            Exception: If merging two TGraph objects reports an error.
            NotImplementedError: If a value has a type for which no merge
                strategy is defined below.
        """
        import ROOT
        for i in range(len(values_list1)):
            # A bunch of if-else conditions to merge two values

            # Create a global list with all the files of the partial
            # snapshots
            if isinstance(values_list1[i], list):
                values_list1[i].extend(values_list2[i])

            # Dictionaries are merged key by key, concatenating the arrays
            # stored under each key of the first dictionary
            elif isinstance(values_list1[i], dict):
                combined = {
                    key: numpy.concatenate(
                        [values_list1[i][key], values_list2[i][key]])
                    for key in values_list1[i]
                }
                values_list1[i] = combined

            elif (isinstance(values_list1[i], ROOT.TH1) or
                  isinstance(values_list1[i], ROOT.TH2)):
                # Merging two objects of type ROOT.TH1D or ROOT.TH2D
                values_list1[i].Add(values_list2[i])

            elif isinstance(values_list1[i], ROOT.TGraph):
                # Prepare a TList
                tlist = ROOT.TList()
                tlist.Add(values_list2[i])

                # Merge the second graph onto the first
                num_points = values_list1[i].Merge(tlist)

                # Check if there was an error in merging
                if num_points == -1:
                    msg = "Error reducing two result values of type TGraph!"
                    raise Exception(msg)

            elif isinstance(values_list1[i], float):
                # Adding values resulting from a Sum() operation
                # Sum() always returns a float in python
                values_list1[i] += values_list2[i]

            elif (isinstance(values_list1[i], int)):  # noqa: Python 2
                # Adding values resulting from a Count() operation
                values_list1[i] += values_list2[i]

            else:
                msg = ("Type \"{}\" is not supported by the reducer yet!".
                       format(type(values_list1[i])))
                raise NotImplementedError(msg)

        return values_list1

    # Get number of entries in the input dataset using
    # arguments passed to RDataFrame constructor
    self.nentries = generator.head_node.get_num_entries()

    # Retrieve the treename used to initialize the RDataFrame
    self.treename = generator.head_node.get_treename()

    # Retrieve the filenames used to initialize the RDataFrame
    self.files = generator.head_node.get_inputfiles()

    # Retrieve the ROOT.TTree instance used to initialize the RDataFrame
    self.tree = generator.head_node.get_tree()

    # Retrieve info about the friend trees
    # NOTE(review): `self.friend_info` is only assigned here when
    # `self.tree` is truthy - confirm it is initialised elsewhere for the
    # other case.
    if self.tree:
        self.friend_info = self._get_friend_info(self.tree)

    if not self.nentries:
        # Fall back to local execution
        # if 'nentries' is '0'
        msg = ("No entries in the Tree, falling back to local execution!")
        warnings.warn(msg, UserWarning, stacklevel=2)
        PyRDF.use("local")
        # Imported only after `use` was called, so that the name refers to
        # the backend selected by the call above
        from .. import current_backend
        return current_backend.execute(generator)

    # Values produced after Map-Reduce
    values = self.ProcessAndMerge(mapper, reducer)
    # List of action nodes in the same order as values
    nodes = generator.get_action_nodes()

    # Set the value of every action node
    for node, value in zip(nodes, values):
        if node.operation.name == "Snapshot":
            # Retrieve treename from operation args and start TChain
            # NOTE(review): `ROOT` is used at method scope here, while the
            # imports visible in this method are local to `mapper` and
            # `reducer` - confirm that the module imports ROOT at file
            # level.
            snapshot_treename = node.operation.args[0]
            snapshot_chain = ROOT.TChain(snapshot_treename)
            # Add partial snapshot files to the chain
            for filename in value:
                snapshot_chain.Add(filename)
            # Create a new rdf with the chain and return that to user
            snapshot_rdf = PyRDF.RDataFrame(snapshot_chain)
            node.value = snapshot_rdf
        else:
            node.value = value
## ## \macro_code ## ## \date February 2017 ## \author Danilo Piparo import ROOT, PyRDF # The second parameter in 'PyRDF.use' call, is the config # dictionary. 'npartitions' represents the number of parts # that the input dataset should be divided into for processing. # 'spark.executor.instances' is a Spark configuration parameter and # it represents the number of spark executors that should be used to # process the partitioned dataset. Learn more about Spark configuration # options from it's official documentation page. PyRDF.use("spark", {'npartitions':4, 'spark.executor.instances':4}) RDataFrame = PyRDF.RDataFrame # A simple helper function to fill a test tree: this makes the example # stand-alone. def fill_tree(treeName, fileName): d = ROOT.ROOT.RDataFrame(25000) d.Define("px", "gRandom->Gaus()")\ .Define("py", "gRandom->Gaus()")\ .Define("pz", "sqrt(px * px + py * py)")\ .Snapshot(treeName, fileName) # We prepare an input tree to run on
help='Name of hist output file, if plotting selected') parser.add_argument('--noPlots', dest='noPlots', action='store_true', help='Disable plotting') args = parser.parse_args() #Call RDataFrame backend, if chosen if args.backend == "RDF": print("\tRDataFrame Implicit Multi-threading {}".format("ENABLED" if args.noIMT == False else "DISABLED")) print("====================") if not args.noIMT: ROOT.ROOT.EnableImplicitMT(args.nThreads) RDF = ROOT.ROOT.RDataFrame if args.backend == "PyRDF": import PyRDF PyRDF.use("spark", {'npartitions': '64'}) #was 32 in example RDF = PyRDF.RDataFrame procstart = collections.OrderedDict() procfinish = collections.OrderedDict() nToLabel = {} labelToN = {} print(args.label) for ln, label in enumerate(args.label): nToLabel[ln] = label labelToN[label] = ln def main(): for fn, fs in enumerate(args.input): print("\tSample '{}'".format(nToLabel[fn]))