Ejemplos de PyRDF.use en Python

Ejemplo n.º 1

0

Mostrar archivo

    def test_includes_function_with_filter_and_histo(self):
        """
        Check that `Filter` can call a C++ function made available through
        an included header when running on the Spark backend.

        The histogram filled from the filtered entries is compared against
        the statistics of the integers 0..4 computed in pure Python.
        """
        PyRDF.include("tests/integration/local/test_headers/header1.hxx")
        PyRDF.use("spark")

        rdf = PyRDF.RDataFrame(10)

        # Keep only the entries for which the C++ predicate holds; the
        # expected values below correspond to the numbers 0..4 surviving
        rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
        histo = rdf_filtered.Histo1D("tdfentry_")

        # Reference statistics of the numbers expected after filtering
        required_numbers = range(5)
        required_size = len(required_numbers)
        required_mean = sum(required_numbers) / float(required_size)
        required_stdDev = math.sqrt(
            sum((x - required_mean)**2
                for x in required_numbers) / required_size)

        # The number of entries is a whole number, so compare it exactly
        self.assertEqual(histo.GetEntries(), float(required_size))

        # Mean and standard deviation are floating-point results obtained
        # through two different computations: compare them with a tolerance
        # instead of requiring bit-identical values
        self.assertAlmostEqual(histo.GetMean(), required_mean)
        self.assertAlmostEqual(histo.GetStdDev(), required_stdDev)

Ejemplo n.º 2

0

Mostrar archivo

    def test_spark_histograms(self):
        """
        Build the same PyRDF graph on the Spark and on the local backend and
        verify that each histogram holds the same number of entries.
        """
        draw_option = "PL PLC PMC"

        # Spark run: drawing the first histogram triggers the event loop
        PyRDF.use("spark", {'npartitions': 5})
        spark_pt1, spark_pt2, spark_inv_mass, spark_phis = (
            self.build_pyrdf_graph())
        spark_pt1.Draw(draw_option)

        # Local run: same graph, same trigger
        PyRDF.use("local")
        local_pt1, local_pt2, local_inv_mass, local_phis = (
            self.build_pyrdf_graph())
        local_pt1.Draw(draw_option)

        # Compared in the order 'pt1_h', 'pt2_h', 'invMass_h', 'phis_h'
        histogram_pairs = (
            (spark_pt1, local_pt1),
            (spark_pt2, local_pt2),
            (spark_inv_mass, local_inv_mass),
            (spark_phis, local_phis),
        )
        for spark_histo, local_histo in histogram_pairs:
            self.assertEqual(spark_histo.GetEntries(),
                             local_histo.GetEntries())

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_include_headers.py Proyecto: yeckang/PyRDF

    def test_extend_ROOT_include_path(self):
        """
        Verify that `PyRDF.include_headers()` adds the given directory to the
        include path of ROOT, so that headers referring to each other are
        resolved correctly.
        """
        import ROOT

        headers_dir = "tests/integration/local/test_headers/headers_folder"

        PyRDF.use("spark")
        PyRDF.include_headers(headers_dir)

        # The include path is split on single spaces; the new directory must
        # show up among the resulting tokens in the form -I"<directory>"
        expected_token = "-I\"{}\"".format(headers_dir)
        include_tokens = ROOT.gInterpreter.GetIncludePath().split(" ")
        self.assertTrue(expected_token in include_tokens)

        # Histogram the entries of a 100-entry dataframe that pass the filter
        numbers = PyRDF.RDataFrame(100)
        below_ten = numbers.Filter("check_number_less_than_10(tdfentry_)")
        histo = below_ten.Histo1D("tdfentry_")

        # Ten entries with mean 4.5 are expected
        self.assertEqual(histo.GetEntries(), 10)
        self.assertAlmostEqual(histo.GetMean(), 4.5)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: test_inv_mass.py Proyecto: yeckang/PyRDF

    def test_spark_histograms(self):
        """Spark and local runs of the same graph give equal entry counts."""
        physics_variables = ['pt1_h', 'pt2_h', 'invMass_h', 'phis_h']
        SparkResult = namedtuple('SparkResult', physics_variables)
        LocalResult = namedtuple('LocalResult', physics_variables)

        # Spark run: drawing 'pt1_h' triggers the event loop
        PyRDF.use("spark", {'npartitions': 5})
        spark = SparkResult(*self.build_pyrdf_graph())
        spark.pt1_h.Draw("PL PLC PMC")

        # Local run: same graph, same trigger
        PyRDF.use("local")
        local = LocalResult(*self.build_pyrdf_graph())
        local.pt1_h.Draw("PL PLC PMC")

        # Every histogram must hold the same number of entries on both sides
        for variable in physics_variables:
            self.assertEqual(getattr(spark, variable).GetEntries(),
                             getattr(local, variable).GetEntries())

Ejemplo n.º 5

0

Mostrar archivo

    @classmethod
    def tearDownClass(cls):
        """
        Restore the global `current_backend` to the default local backend
        after all the tests of the class have run.

        Declared as a classmethod because unittest invokes this hook on the
        class itself, without an instance.
        """
        PyRDF.use("local")

Ejemplo n.º 6

0

Mostrar archivo

def main():
    """
    Run the skimming step of the analysis.

    Every required sample is loaded, reduced to the interesting events by
    the chain of selection steps and written to a new file.
    """
    PyRDF.use("spark")
    PyRDF.include_headers("skim.h")

    # Steps that only take the dataframe, in the order they are applied
    selection_steps = (
        MinimalSelection,
        FindGoodMuons,
        FindGoodTaus,
        FilterGoodEvents,
        FindMuonTauPair,
        DeclareVariables,
    )

    for sample in sampleNames:
        print(">>> Process sample {}:\n".format(sample))

        input_path = samplesBasePath + sample + ".root"
        skim = PyRDF.RDataFrame("Events", input_path)

        for step in selection_steps:
            skim = step(skim)

        # The last two steps also need to know which sample is processed
        skim = CheckGeneratorTaus(skim, sample)
        skim = AddEventWeight(skim, sample)

        skim.Snapshot("Events", sample + "Skim.root", final_variables_vec)

Ejemplo n.º 7

0

Mostrar archivo

Archivo: test_common.py Proyecto: sly2j/PyRDF

    def test_future_env_select(self):
        """
        Selecting an environment that is not implemented yet must raise a
        NotImplementedError.
        """
        self.assertRaises(NotImplementedError, PyRDF.use, "dask")

Ejemplo n.º 8

0

Mostrar archivo

Archivo: test_local.py Proyecto: sly2j/PyRDF

    def test_local_select(self):
        """
        After selecting 'local', the current backend must be an instance of
        `Local`.
        """
        PyRDF.use("local")
        selected_backend = PyRDF.current_backend
        self.assertIsInstance(selected_backend, Local)

Ejemplo n.º 9

0

Mostrar archivo

Archivo: test_spark.py Proyecto: sly2j/PyRDF

    def test_spark_select(self):
        """
        After selecting 'spark', the current backend must be an instance of
        `Spark`.
        """
        PyRDF.use("spark")
        selected_backend = PyRDF.current_backend
        self.assertIsInstance(selected_backend, Spark)

Ejemplo n.º 10

0

Mostrar archivo

def main():
    """
    Book and write the histograms of all skimmed datasets.

    For every (skim name, process label) pair, the files of the current
    working directory whose name contains the skim name are loaded and the
    baseline selection is applied. For each variable in `ranges`, one
    histogram is booked in the signal region and one in the control region
    used to estimate the QCD contribution; all of them are then written out.

    The output file "histograms.root" is closed even if processing one of
    the datasets raises.
    """
    PyRDF.use("spark")

    # Create output file
    tfile = ROOT.TFile("histograms.root", "RECREATE")
    try:
        variables = ranges.keys()

        # Loop through skimmed datasets and produce histograms of variables
        for name, label in [
            ("GluGluToHToTauTau", "ggH"),
            ("VBF_HToTauTau", "qqH"),
            ("W1JetsToLNu", "W1J"),
            ("W2JetsToLNu", "W2J"),
            ("W3JetsToLNu", "W3J"),
            ("TTbar", "TT"),
            ("DYJetsToLL", "ZLL"),
            ("DYJetsToLL", "ZTT"),
            ("Run2012B_TauPlusX", "dataRunB"),
            ("Run2012C_TauPlusX", "dataRunC"),
        ]:
            print(">>> Process skim {}".format(name))

            filenames = [
                filename for filename in os.listdir() if name in filename
            ]

            # Load skimmed dataset and apply baseline selection
            df = PyRDF.RDataFrame("Events", filenames)
            df = df.Filter(
                "mt_1<30",
                "Muon transverse mass cut for W+jets suppression")
            df = df.Filter("iso_1<0.1",
                           "Require isolated muon for signal region")

            # Book histograms for the signal region
            df1 = df.Filter("q_1*q_2<0",
                            "Require opposited charge for signal region")
            df1 = filterGenMatch(df1, label)
            hists = {
                variable: bookHistogram(df1, variable, ranges[variable])
                for variable in variables
            }

            # Book histograms for the control region used to estimate the
            # QCD contribution
            df2 = df.Filter("q_1*q_2>0", "Control region for QCD estimation")
            df2 = filterGenMatch(df2, label)
            hists_cr = {
                variable: bookHistogram(df2, variable, ranges[variable])
                for variable in variables
            }

            # Write histograms to output file: everything is booked before
            # the first write, signal region first, then control region
            for variable in variables:
                writeHistogram(hists[variable],
                               "{}_{}".format(label, variable))
            for variable in variables:
                writeHistogram(hists_cr[variable],
                               "{}_{}_cr".format(label, variable))
    finally:
        # Release the output file whether or not the processing succeeded
        tfile.Close()

Ejemplo n.º 11

0

Mostrar archivo

    def test_histo_from_empty_root_file(self):
        """
        A distributed run over an RDataFrame without entries must fall back
        to the local backend and still deliver the correct (empty) result.
        """
        PyRDF.use("spark")

        # Book a histogram on the tree stored in the empty-tree test file
        empty_df = PyRDF.RDataFrame("NOMINAL",
                                    "tests/unit/backend/emptytree.root")
        booked_histo = empty_df.Histo1D("mybranch")

        # Asking for the number of entries requires the result to be computed
        n_entries = booked_histo.GetEntries()

        # The backend must have been switched and no entry must be reported
        self.assertIsInstance(PyRDF.current_backend, Local)
        self.assertEqual(n_entries, 0)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: test_common.py Proyecto: yeckang/PyRDF

    def test_local_stops_spark(self):
        """Selecting the local backend must stop a running SparkContext."""
        PyRDF.use("spark")

        from PyRDF import current_backend

        # Hold on to the Java-side context object of the Spark backend: it
        # is the one that reports whether the context has been stopped
        spark_context = current_backend.sparkContext
        java_context = spark_context._jsc.sc()

        # Alive while the Spark backend is selected ...
        self.assertFalse(java_context.isStopped())

        PyRDF.use("local")

        # ... and stopped once `use` has switched to the local backend
        self.assertTrue(java_context.isStopped())

Ejemplo n.º 13

0

Mostrar archivo

Archivo: test_friend_trees.py Proyecto: yeckang/PyRDF

    def test_friend_tree_histo(self):
        """
        Tests that the computational graph can be issued both on the
        parent tree and the friend tree.

        The .root files produced for the test are removed through
        `addCleanup`, so they do not stay behind when an assertion fails.
        """
        # Register the removal of each file right after it has been created
        self.create_parent_tree()
        self.addCleanup(os.remove, "treeparent.root")
        self.create_friend_tree()
        self.addCleanup(os.remove, "treefriend.root")

        # Parent Tree
        baseTree = ROOT.TChain("T")
        baseTree.Add("treeparent.root")

        # Friend Tree
        friendTree = ROOT.TChain("TF")
        friendTree.Add("treefriend.root")

        # Add friendTree to the parent
        baseTree.AddFriend(friendTree)

        # Create a PyRDF RDataFrame with the parent and the friend trees
        PyRDF.use("spark")
        df = PyRDF.RDataFrame(baseTree)

        # Create histograms
        h_parent = df.Histo1D("x")
        h_friend = df.Histo1D("TF.x")

        # Both trees have the same number of entries, i.e. 10000
        self.assertEqual(h_parent.GetEntries(), 10000)
        self.assertEqual(h_friend.GetEntries(), 10000)

        # Check the mean of the distribution for each tree
        self.assertAlmostEqual(h_parent.GetMean(), 10, delta=0.01)
        self.assertAlmostEqual(h_friend.GetMean(), 20, delta=0.01)

        # Check the standard deviation of the distribution for each tree
        self.assertAlmostEqual(h_parent.GetStdDev(), 1, delta=0.01)
        self.assertAlmostEqual(h_friend.GetStdDev(), 1, delta=0.01)

Ejemplo n.º 14

0

Mostrar archivo

    def test_change_attribute_when_npartitions_greater_than_clusters(self):
        """
        The `npartitions` attribute of the backend must be changed when it is
        greater than the number of clusters in the ROOT file.
        """
        PyRDF.use("spark", {"npartitions": 10})

        from PyRDF import current_backend

        # Right after the selection the requested value is reported
        self.assertEqual(current_backend.npartitions, 10)

        df = PyRDF.RDataFrame("TotemNtuple",
                              ["tests/unit/backend/Slimmed_ntuple.root"])
        booked_histo = df.Histo1D("track_rp_3.x")
        n_entries = booked_histo.GetEntries()

        # Once the result is available the attribute must have been lowered
        self.assertEqual(n_entries, 10)
        self.assertEqual(current_backend.npartitions, 1)

Ejemplo n.º 15

0

Mostrar archivo

Archivo: test_histo_write.py Proyecto: yeckang/PyRDF

    def test_write_histo(self):
        """
        Tests that an histogram is correctly written to a .root file created
        before the execution of the event loop.

        Files produced by the test are removed, and the reopened output file
        is closed, through `addCleanup`, so nothing is left behind when an
        assertion fails. Cleanups run in reverse registration order: the
        reopened file is closed before the files are deleted.
        """
        self.create_tree_with_data()
        self.addCleanup(os.remove, "tree_gaus.root")

        # Create a new file where the histogram will be written
        outfile = ROOT.TFile("out_file.root", "recreate")
        self.addCleanup(os.remove, "out_file.root")

        # Create a PyRDF RDataFrame on the tree holding the test data
        PyRDF.use("spark")
        df = PyRDF.RDataFrame("Events", "tree_gaus.root")

        # Create histogram
        histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

        # Write histogram to out_file.root and close the file
        histo.Write()
        outfile.Close()

        # Reopen file to check that histogram was correctly stored; the file
        # has to stay open while the retrieved histogram is inspected
        reopen_file = ROOT.TFile("out_file.root", "read")
        self.addCleanup(reopen_file.Close)
        reopen_histo = reopen_file.Get("x")

        # Check histogram statistics
        self.assertEqual(reopen_histo.GetEntries(), self.nentries)
        self.assertAlmostEqual(reopen_histo.GetMean(),
                               self.gaus_mean,
                               delta=self.delta_equal)
        self.assertAlmostEqual(reopen_histo.GetStdDev(),
                               self.gaus_stdev,
                               delta=self.delta_equal)

Ejemplo n.º 16

0

Mostrar archivo

import PyRDF
import ROOT
from ROOT import TCanvas

from pyspark import SparkContext

# Get hold of a SparkContext (an existing one or a newly created one) and
# stop it right away.
# NOTE(review): presumably this guarantees that no previous context is
# running when the Spark backend is selected below — confirm.
context = SparkContext.getOrCreate()
context.stop()

# Select the Spark backend, passing 'npartitions': 2 as configuration
PyRDF.use("spark", {'npartitions': 2})

# Headers handed to PyRDF, in this order
PyRDF.include("common_definitions.h")
PyRDF.include("parameters_global.h")
PyRDF.include("common_algorithms.h")
PyRDF.include("parameters.h")
PyRDF.include("common.h")
PyRDF.include("initialize.h")

#####################################################
# Prepare input data
#####################################################

# Branches classified by diagonal
diagonals = {
    # return a tuple: ([left] verticals in 45, [right] verticals in 56))
    "d45b_56t":
    (["track_rp_5", "track_rp_21",
      "track_rp_25"], ["track_rp_104", "track_rp_120", "track_rp_124"]),
    "ad45b_56b":
    (["track_rp_5", "track_rp_21",
      "track_rp_25"], ["track_rp_105", "track_rp_121", "track_rp_125"]),

Ejemplo n.º 17

0

Mostrar archivo

 @classmethod
 def setUpClass(cls):
     """
     Select the Spark backend before running all the tests.

     Declared as a classmethod because unittest invokes this hook on the
     class itself, without an instance.
     """
     PyRDF.use("spark", {'npartitions': 2, 'spark.executor.instances': 2})

Ejemplo n.º 18

0

Mostrar archivo

 def test_future_env_select(self):
     """Selecting a backend that is not implemented raises an error."""
     self.assertRaises(NotImplementedError, PyRDF.use, "dask")

Ejemplo n.º 19

0

Mostrar archivo

import PyRDF
import ROOT

# Select the Spark backend before any dataframe is created
PyRDF.use("spark")


def fill_tree(treeName, fileName):
    """
    Fill a test tree and snapshot it, so that the example is stand-alone.

    Two columns, `b1` and `b2`, are defined on a dataframe of 10000 entries,
    which is then written as tree `treeName` into the file `fileName`.

    Returns:
        The object returned by `Snapshot`.
    """
    source = PyRDF.RDataFrame(10000)
    with_b1 = source.Define("b1", "(int) rdfentry_")
    with_b2 = with_b1.Define("b2", "(float) rdfentry_ * rdfentry_")
    return with_b2.Snapshot(treeName, fileName)


# We prepare an input tree to run on: names of the input file, of the two
# output files and of the tree
fileName = "df007_snapshot_py.root"
outFileName = "df007_snapshot_output_py.root"
outFileNameAllColumns = "df007_snapshot_output_allColumns_py.root"
treeName = "myTree"

# The tree is snapshotted and we retrieve a new PyRDF.RDataFrame from it
d = fill_tree(treeName, fileName)

# ## Select entries
# We now select some entries in the dataset: those with an even `b1`
d_cut = d.Filter("b1 % 2 == 0")

# ## Enrich the dataset
# Build some temporary columns: we'll write them out
PyRDF.include_headers("tutorials/headers/df007_snapshot.h")
d2 = d_cut.Define("b1_square", "b1 * b1") \

Ejemplo n.º 20

0

Mostrar archivo

import PyRDF

# Select the local backend and hand the header file to PyRDF
PyRDF.use("local")
PyRDF.include_headers("./headers.hh")

# NOTE(review): this import comes after the PyRDF setup above; confirm
# whether the order is required before moving it to the top of the file.
import latinos_reader as lr

# Build the dataframes through the reader, passing the PyRDF module itself,
# and keep the first element of the result
trees = lr.build_dataframe("./lowenergy", "VBS", PyRDF, "pyrdf")
tree = trees[0]

# print( tree.lowen_ele_looseVBS.rdf_node.AsNumpy(columns=["mjj_vbs"]) )
# m = tree.lowen_ele_looseVBS.rdf_node.Mean("mjj_vbs")
# print( m.GetValue() )

Ejemplo n.º 21

0

Mostrar archivo

"""
Simple sum operation on a column of the reference dataset.
Connects to the OpenStack VMs Spark cluster. If there is another cluster change
arguments of PyRDF.use() accordingly.
"""
import ROOT
import PyRDF

# Select the Spark backend. Besides 'npartitions', the dictionary carries
# Spark settings: master URL, driver and block manager ports, application
# name and number of executor instances.
PyRDF.use("spark", conf={
    'npartitions': 3,
    'spark.master': 'spark://137.138.55.13:7077',
    'spark.driver.port': 40000,
    'spark.blockManager.port': 30000,
    'spark.app.name': 'PyRDF',
    'spark.executor.instances': 3,
})

# Dataframe on the tree "reftree" of the remote reference file
rdf = PyRDF.RDataFrame(
    "reftree",
    "root://eosuser.cern.ch//eos/user/v/vpadulan/reftree/reftree_100000000entry.root")
# Book the sum of column "b3"
s = rdf.Sum("b3")

# Measure the real time spent around the GetValue() call, rounded to two
# decimals.
# NOTE(review): presumably GetValue() is what runs the computation — confirm.
t = ROOT.TStopwatch()
s.GetValue()
t.Stop()
realtime = round(t.RealTime(), 2)

# Append the measurement as a new line of the CSV file
with open("tfileprefetch_pyrdf_sum.csv", "a+") as f:
    f.write(str(realtime))
    f.write("\n")

Ejemplo n.º 22

0

Mostrar archivo

    def execute(self, generator):
        """
        Executes the current RDataFrame graph
        in the given distributed environment.

        A `mapper` and a `reducer` closure are built and handed to
        `self.ProcessAndMerge`; the merged values are then assigned to the
        action nodes of the graph. When the dataset reports no entries, the
        local backend is selected and the execution is delegated to it.

        Args:
            generator (PyRDF.CallableGenerator): An instance of
                :obj:`CallableGenerator` that is responsible for generating
                the callable function.

        Returns:
            The result of the local backend's `execute` when falling back to
            local execution. Otherwise nothing is returned explicitly: the
            results are stored in the `value` attribute of the action nodes.
        """
        callable_function = generator.get_callable()
        # Arguments needed to create PyROOT RDF object
        rdf_args = generator.head_node.args
        treename = generator.head_node.get_treename()
        selected_branches = generator.head_node.get_branches()

        # Avoid having references to the instance inside the mapper
        initialization = Backend.initialization

        def mapper(current_range):
            """
            Triggers the event-loop and executes all
            nodes in the computational graph using the
            callable.

            Args:
                current_range (tuple): A pair that contains the starting and
                    ending values of the current range. Besides `start` and
                    `end`, the attributes `filelist` and `friend_info` are
                    read when a treename is available.

            Returns:
                list: This represents the list of values of all action nodes
                in the computational graph.
            """
            import ROOT

            # We have to decide whether to do this in Dist or in subclasses
            # Utils.declare_headers(worker_includes)  # Declare headers if any

            # Run initialization method to prepare the worker runtime
            # environment
            initialization()

            # Build rdf
            start = int(current_range.start)
            end = int(current_range.end)

            if treename:
                # Build TChain of files for this range:
                chain = ROOT.TChain(treename)
                for f in current_range.filelist:
                    chain.Add(str(f))

                # We assume 'end' is exclusive
                chain.SetCacheEntryRange(start, end)

                # Gather information about friend trees
                friend_info = current_range.friend_info
                if friend_info:
                    # Zip together the treenames of the friend trees and the
                    # respective file names. Each friend treename can have
                    # multiple corresponding friend file names.
                    tree_files_names = zip(friend_info.friend_names,
                                           friend_info.friend_file_names)
                    for friend_treename, friend_filenames in tree_files_names:
                        # Start a TChain with the current friend treename
                        friend_chain = ROOT.TChain(friend_treename)
                        # Add each corresponding file to the TChain
                        for filename in friend_filenames:
                            friend_chain.Add(filename)

                        # Set cache on the same range as the parent TChain
                        friend_chain.SetCacheEntryRange(start, end)
                        # Finally add friend TChain to the parent
                        chain.AddFriend(friend_chain)

                if selected_branches:
                    rdf = ROOT.ROOT.RDataFrame(chain, selected_branches)
                else:
                    rdf = ROOT.ROOT.RDataFrame(chain)
            else:
                rdf = ROOT.ROOT.RDataFrame(*rdf_args)  # PyROOT RDF object

            # # TODO : If we want to run multi-threaded in a Spark node in
            # # the future, use `TEntryList` instead of `Range`
            # rdf_range = rdf.Range(current_range.start, current_range.end)

            # Output of the callable. The range object is forwarded as well.
            # NOTE(review): presumably the callable is what restricts the
            # processing to the entries of this range — confirm.
            output = callable_function(rdf, rdf_range=current_range)

            # Replace each result, in place, with a copy of its value
            for i in range(len(output)):
                # `AsNumpy` and `Snapshot` return respectively `dict` and `list`
                # that don't have the `GetValue` method.
                if isinstance(output[i], (dict, list)):
                    continue
                # FIXME : RResultPtrs aren't serializable,
                # because of which we have to manually find
                # out the types here and copy construct the
                # values.

                # The type of the value of the action node
                value_type = type(output[i].GetValue())
                # The `value_type` is required here because,
                # after a call to `GetValue`, the values die
                # along with the RResultPtrs
                output[i] = value_type(output[i].GetValue())
            return output

        def reducer(values_list1, values_list2):
            """
            Merges two given lists of values that were
            returned by the mapper function for two different
            ranges.

            The merge is done position by position and the first list is
            updated in place.

            Args:
                values_list1 (list): A list of computed values for a given
                    entry range in a dataset.

                values_list2 (list): A list of computed values for a given
                    entry range in a dataset.

            Returns:
                list: This is a list of values obtained after merging two
                given lists (the same object as `values_list1`).

            Raises:
                Exception: If merging two TGraph values reports an error.
                NotImplementedError: If a value has a type that none of the
                    branches below handles.
            """
            import ROOT

            for i in range(len(values_list1)):
                # A bunch of if-else conditions to merge two values

                # Create a global list with all the files of the partial
                # snapshots
                if isinstance(values_list1[i], list):
                    values_list1[i].extend(values_list2[i])

                elif isinstance(values_list1[i], dict):
                    # Concatenate the arrays stored under each key; only the
                    # keys of the first dictionary are considered
                    combined = {
                        key: numpy.concatenate(
                            [values_list1[i][key], values_list2[i][key]])
                        for key in values_list1[i]
                    }
                    values_list1[i] = combined
                elif (isinstance(values_list1[i], ROOT.TH1)
                      or isinstance(values_list1[i], ROOT.TH2)):
                    # Merging two objects of type ROOT.TH1D or ROOT.TH2D
                    values_list1[i].Add(values_list2[i])

                elif isinstance(values_list1[i], ROOT.TGraph):
                    # Prepare a TList
                    tlist = ROOT.TList()
                    tlist.Add(values_list2[i])

                    # Merge the second graph onto the first
                    num_points = values_list1[i].Merge(tlist)

                    # Check if there was an error in merging
                    if num_points == -1:
                        msg = "Error reducing two result values of type TGraph!"
                        raise Exception(msg)

                elif isinstance(values_list1[i], float):
                    # Adding values resulting from a Sum() operation
                    # Sum() always returns a float in python
                    values_list1[i] += values_list2[i]

                elif (isinstance(values_list1[i], int)):  # noqa: Python 2
                    # Adding values resulting from a Count() operation
                    values_list1[i] += values_list2[i]

                else:
                    msg = ("Type \"{}\" is not supported by the reducer yet!".
                           format(type(values_list1[i])))
                    raise NotImplementedError(msg)

            return values_list1

        # Get number of entries in the input dataset using
        # arguments passed to RDataFrame constructor
        self.nentries = generator.head_node.get_num_entries()

        # Retrieve the treename used to initialize the RDataFrame
        self.treename = generator.head_node.get_treename()

        # Retrieve the filenames used to initialize the RDataFrame
        self.files = generator.head_node.get_inputfiles()

        # Retrieve the ROOT.TTree instance used to initialize the RDataFrame
        self.tree = generator.head_node.get_tree()

        # Retrieve info about the friend trees
        # NOTE(review): `self.friend_info` is only assigned here when a tree
        # is available — confirm that it is initialised elsewhere.
        if self.tree:
            self.friend_info = self._get_friend_info(self.tree)

        if not self.nentries:
            # Fall back to local execution
            # if 'nentries' is '0'
            msg = ("No entries in the Tree, falling back to local execution!")
            warnings.warn(msg, UserWarning, stacklevel=2)
            # Select the local backend and let it execute the graph; the
            # import is done after `use` to pick up the newly selected backend
            PyRDF.use("local")
            from .. import current_backend
            return current_backend.execute(generator)

        # Values produced after Map-Reduce
        values = self.ProcessAndMerge(mapper, reducer)
        # List of action nodes in the same order as values
        nodes = generator.get_action_nodes()

        # Set the value of every action node
        for node, value in zip(nodes, values):
            if node.operation.name == "Snapshot":
                # Retrieve treename from operation args and start TChain
                # NOTE(review): `ROOT` is used here outside the nested
                # functions that import it locally, so a module-level import
                # (not visible in this excerpt) is required — confirm.
                snapshot_treename = node.operation.args[0]
                snapshot_chain = ROOT.TChain(snapshot_treename)
                # Add partial snapshot files to the chain
                for filename in value:
                    snapshot_chain.Add(filename)
                # Create a new rdf with the chain and return that to user
                snapshot_rdf = PyRDF.RDataFrame(snapshot_chain)
                node.value = snapshot_rdf
            else:
                node.value = value

Ejemplo n.º 23

0

Mostrar archivo

##
## \macro_code
##
## \date February 2017
## \author Danilo Piparo

import ROOT, PyRDF

# The second parameter in the 'PyRDF.use' call is the config
# dictionary. 'npartitions' represents the number of parts
# that the input dataset should be divided into for processing.
# 'spark.executor.instances' is a Spark configuration parameter and
# it represents the number of Spark executors that should be used to
# process the partitioned dataset. Learn more about Spark configuration
# options from its official documentation page.
PyRDF.use("spark", {'npartitions':4, 'spark.executor.instances':4})

# Shorthand for the PyRDF dataframe class
RDataFrame = PyRDF.RDataFrame

def fill_tree(treeName, fileName):
    """
    Fill a test tree and write it to disk, so that the example is
    stand-alone.

    The columns `px`, `py` and `pz` are defined on a plain ROOT dataframe of
    25000 entries, which is then snapshotted as tree `treeName` into the
    file `fileName`. Nothing is returned.
    """
    generated = ROOT.ROOT.RDataFrame(25000)
    with_px = generated.Define("px", "gRandom->Gaus()")
    with_py = with_px.Define("py", "gRandom->Gaus()")
    with_pz = with_py.Define("pz", "sqrt(px * px + py * py)")
    with_pz.Snapshot(treeName, fileName)



# We prepare an input tree to run on

Ejemplo n.º 24

0

Mostrar archivo

Archivo: NanoAOD-Diff.py Proyecto: NJManganelli/FourTopNAOD

                    help='Name of hist output file, if plotting selected')
parser.add_argument('--noPlots', dest='noPlots', action='store_true', 
                    help='Disable plotting')
args = parser.parse_args()


# Bind `RDF` to the RDataFrame implementation of the chosen backend
if args.backend == "RDF":
    # Derive the flag once, so that the printed status always matches what
    # is actually done below
    imt_enabled = not args.noIMT
    print("\tRDataFrame Implicit Multi-threading {}".format("ENABLED" if imt_enabled else "DISABLED"))
    print("====================")
    if imt_enabled:
        ROOT.ROOT.EnableImplicitMT(args.nThreads)
    RDF = ROOT.ROOT.RDataFrame
elif args.backend == "PyRDF":
    import PyRDF
    # NOTE(review): 'npartitions' is passed as the string '64' rather than
    # an integer — confirm that the backend accepts it.
    PyRDF.use("spark", {'npartitions': '64'}) #was 32 in example
    RDF = PyRDF.RDataFrame
            
# Insertion-ordered containers, empty at this point
# NOTE(review): presumably filled per process with start/finish information
# further down — confirm.
procstart = collections.OrderedDict()
procfinish = collections.OrderedDict()
# Two-way mapping between the position of a label on the command line and
# the label itself
nToLabel = {}
labelToN = {}
print(args.label)
for ln, label in enumerate(args.label):
    nToLabel[ln] = label
    labelToN[label] = ln

def main():
  
    for fn, fs in enumerate(args.input):
        print("\tSample '{}'".format(nToLabel[fn]))