def test_tchain_with_friend_tchain_histo(self, connection):
        """
        Tests that the computational graph can be issued both on the
        parent chain and the friend chain.

        A main TChain ("T") and a friend TChain ("TF") are attached to each
        other, a distributed RDataFrame is built on the main chain, and one
        histogram is booked on column "x" of each chain.
        """

        main_filename = "main_chain.root"
        friend_filename = "friend_chain.root"

        # NOTE(review): both files must exist before the chains are read. The
        # code that creates them is not visible in this block -- confirm that
        # a fixture or helper provides them.

        # Main TChain
        mainchain = ROOT.TChain("T")
        mainchain.Add(main_filename)

        # Friend TChain
        friendchain = ROOT.TChain("TF")
        friendchain.Add(friend_filename)

        # Add friend chain to the main one
        mainchain.AddFriend(friendchain)

        # Create a DistRDF RDataFrame with the main and the friend chains
        df = Dask.RDataFrame(mainchain, daskclient=connection)

        # Create histograms: "x" is read from the main chain, "TF.x" from the
        # friend chain (addressed through the friend's name).
        h_parent = df.Histo1D("x")
        h_friend = df.Histo1D("TF.x")

        check_histograms(h_parent, h_friend)

        # Remove unnecessary .root files
        # NOTE(review): no cleanup is done here because this block does not
        # show who owns the two input files -- confirm before deleting them.
Exemple #2
    def test_write_histo(self, connection):
        """
        Tests that an histogram is correctly written to a .root file created
        before the execution of the event loop.

        The histogram is written to the output file, the file is reopened in
        read mode, and the stored histogram statistics are compared with the
        expected values held on the test class.
        """
        import os

        outfile_name = "out_file.root"

        # Create a new file where the histogram will be written
        outfile = ROOT.TFile(outfile_name, "recreate")

        # Create a DistRDF RDataFrame from the "Events" tree
        df = Dask.RDataFrame("Events", "tree_gaus.root", daskclient=connection)

        # Create histogram
        histo = df.Histo1D(("x", "x", 100, 0, 20), "x")

        # Write histogram to out_file.root and close the file
        histo.Write()
        outfile.Close()

        # Reopen file to check that histogram was correctly stored
        reopen_file = ROOT.TFile(outfile_name, "read")
        reopen_histo = reopen_file.Get("x")

        # Check histogram statistics
        assert reopen_histo.GetEntries() == self.nentries
        assert reopen_histo.GetMean() == pytest.approx(
            self.gaus_mean, self.delta_equal)
        assert reopen_histo.GetStdDev() == pytest.approx(
            self.gaus_stdev, self.delta_equal)

        # Remove unnecessary .root files (close the handle first so the file
        # is not removed while still open)
        reopen_file.Close()
        os.remove(outfile_name)
Exemple #3
    def build_distrdf_graph(self, connection):
        """
        Create a DistRDF graph with a fixed set of operations and return it.

        Three filters are chained on the input dataframe, an invariant mass
        column is defined on the filtered data, and four histograms are
        booked.

        Args:
            connection: object forwarded to `Dask.RDataFrame` as `daskclient`.

        Returns:
            The tuple `(pt1_h, pt2_h, invMass_h, phis_h)` of booked histograms.
        """
        treename = "data"
        # NOTE(review): the only file name is an empty string, which looks
        # like a stripped path or URL -- confirm the intended input file.
        files = ["", ]
        rdf = Dask.RDataFrame(treename, files, npartitions=5, daskclient=connection)

        # Define the analysis cuts
        chargeCutStr = "C1 != C2"
        etaCutStr = "fabs(eta1) < 2.3 && fabs(eta2) < 2.3"
        ptCutStr = "pt1 > 2 && pt2 > 2"
        rdf_f = rdf.Filter(chargeCutStr, "Opposite Charge") \
                   .Filter(etaCutStr, "Central Muons") \
                   .Filter(ptCutStr, "Sane Pt")

        # Create the invariant mass column
        invMassFormulaStr = ("sqrt(pow(E1+E2, 2) - (pow(px1+px2, 2) +"
                             "pow(py1+py2, 2) + pow(pz1+pz2, 2)))")
        rdf_fd = rdf_f.Define("invMass", invMassFormulaStr)

        # Create the histograms. pt1/pt2 are booked on the unfiltered
        # dataframe; the other two on the filtered one with the mass column.
        pt1_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt1")
        pt2_h = rdf.Histo1D(("", "", 128, 1, 1200), "pt2")
        invmass_model = (
            "invMass", "CMS Opendata;#mu#mu mass[GeV];Events", 512, 5, 110)
        invMass_h = rdf_fd.Histo1D(invmass_model, "invMass")
        pi = ROOT.TMath.Pi()
        phis_model = ("", "", 64, -pi, pi, 64, -pi, pi)
        phis_h = rdf_fd.Histo2D(phis_model, "phi1", "phi2")

        return pt1_h, pt2_h, invMass_h, phis_h
Exemple #4
    def _extend_ROOT_include_path(self, connection):
        """
        Check that the include path of ROOT is extended with the directories
        specified in `DistRDF.include_headers()` so references between headers
        are correctly solved.
        """

        # Create an RDataFrame with 100 integers from 0 to 99
        rdf = Dask.RDataFrame(100, daskclient=connection)

        # Distribute headers to the workers
        header_folder = "../test_headers/headers_folder"
        # NOTE(review): no call that actually registers `header_folder` is
        # present in this block, so the assertion below depends on it having
        # been registered elsewhere -- confirm and restore the missing call.

        # Get list of include paths seen by ROOT
        ROOT_include_path = ROOT.gInterpreter.GetIncludePath().split(" ")

        # Create new include folder token
        new_folder_include = "-I\"{}\"".format(header_folder)

        # Check that new folder is in ROOT include paths
        assert new_folder_include in ROOT_include_path

        # Filter numbers less than 10 and create an histogram
        rdf_less_than_10 = rdf.Filter("check_number_less_than_10(tdfentry_)")
        histo1 = rdf_less_than_10.Histo1D("tdfentry_")

        # Check that histogram has 10 entries and mean 4.5
        assert histo1.GetEntries() == 10
        assert histo1.GetMean() == pytest.approx(4.5)
Exemple #5
    def test_definepersample_simple(self, connection):
        # NOTE(review): this block appears truncated in this copy. The
        # docstring delimiters, the remaining `Dask.RDataFrame(...)`
        # arguments, the end of the `definepersample_code` string and the
        # closing bracket of `samplescounts` are missing. Restore them from
        # the upstream test before running. The intent visible here is to tag
        # each of three samples with an id (1, 2, 3) via DefinePerSample and
        # to expect 10 entries for each id.
        Test DefinePerSample operation on three samples using a predefined
        string of operations.

        df = Dask.RDataFrame(self.maintreename,

        # Associate a number to each sample
        definepersample_code = """
        if(rdfsampleinfo_.Contains(\"{}\")) return 1;
        else if (rdfsampleinfo_.Contains(\"{}\")) return 2;
        else if (rdfsampleinfo_.Contains(\"{}\")) return 3;
        else return 0;

        df1 = df.DefinePerSample("sampleid", definepersample_code)

        # Filter by the sample number. Each filtered dataframe should contain
        # 10 entries, equal to the number of entries per sample
        samplescounts = [
            df1.Filter("sampleid == {}".format(id)).Count()
            for id in [1, 2, 3]

        for count in samplescounts:
            assert count.GetValue() == 10
Exemple #6
    def _includes_function_with_filter_and_histo(self, connection):
        """
        Check that the filter operation is able to use C++ functions that
        were included using header files.
        """

        rdf = Dask.RDataFrame(10, daskclient=connection)

        # NOTE(review): the step that includes the header declaring
        # `check_number_less_than_5` is not present in this block -- confirm
        # and restore it, otherwise the filter expression cannot be resolved.

        # This filters out all numbers less than 5
        rdf_filtered = rdf.Filter("check_number_less_than_5(tdfentry_)")
        histo = rdf_filtered.Histo1D("tdfentry_")

        # The expected results after filtering
        # The actual set of numbers required after filtering
        required_numbers = range(5)
        required_size = len(required_numbers)
        required_mean = sum(required_numbers) / float(required_size)
        # Population standard deviation of the required numbers
        required_stdDev = math.sqrt(
            sum((x - required_mean)**2 for x in required_numbers) /
            required_size)

        # Compare the sizes of equivalent set of numbers
        assert histo.GetEntries() == required_size
        # Compare the means of equivalent set of numbers
        assert histo.GetMean() == required_mean
        # Compare the standard deviations of equivalent set of numbers
        assert histo.GetStdDev() == required_stdDev
Exemple #7
    def test_distributed_snapshot_columnlist(self, connection):
        """
        Test that distributed Snapshot correctly passes also the third input
        argument "columnList".
        """
        import os

        # A simple dataframe with ten sequential numbers from 0 to 9
        df = Dask.RDataFrame(10, daskclient=connection)\
            .Define("a", "rdfentry_")\
            .Define("b", "rdfentry_")\
            .Define("c", "rdfentry_")\
            .Define("d", "rdfentry_")

        # Only a subset of the four defined columns is requested
        expectedcolumns = ["a", "b"]
        df.Snapshot("snapTree_columnlist", "distrdf_dask_snapfile_columnlist.root", expectedcolumns)

        # Create a traditional RDF from the snapshotted files to retrieve the
        # list of columns
        tmp_files = ["distrdf_dask_snapfile_columnlist_0.root", "distrdf_dask_snapfile_columnlist_1.root"]
        rdf = ROOT.RDataFrame("snapTree_columnlist", tmp_files)
        snapcolumns = [str(column) for column in rdf.GetColumnNames()]

        assert snapcolumns == expectedcolumns

        # Remove the snapshot files read back by this test
        for filename in tmp_files:
            os.remove(filename)
Exemple #8
    def test_definepersample_withinitialization(self, connection):
        # NOTE(review): this block appears truncated in this copy. The
        # docstring delimiters are missing, the C++ snippet inside
        # `declare_definepersample_code` is no longer wrapped in a string (and
        # lacks closing braces), the `Dask.RDataFrame(...)` call is cut after
        # its first argument and the `samplescounts` list is never closed.
        # Restore them from the upstream test before running. The intent
        # visible here is to define a per-sample weight and a per-sample name
        # through C++ helper functions and to expect 10 entries for each
        # (weight, name) pair.
        Test DefinePerSample operation on three samples using C++ functions
        declared to the ROOT interpreter.

        # Write initialization code that will be run in the workers to make the
        # needed functions available
        def declare_definepersample_code():
            #ifndef distrdf_test_definepersample_withinitialization
            #define distrdf_test_definepersample_withinitialization
            float sample1_weight(){
                return 1.0f;

            float sample2_weight(){
                return 2.0f;

            float sample3_weight(){
                return 3.0f;

            float samples_weights(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
                if (id.Contains("sample1")){
                    return sample1_weight();
                } else if (id.Contains("sample2")){
                    return sample2_weight();
                } else if (id.Contains("sample3")){
                    return sample3_weight();
                return -999.0f;

            std::string samples_names(unsigned int slot, const ROOT::RDF::RSampleInfo &id){
                return id.AsString();
            #endif // distrdf_test_definepersample_withinitialization

        df = Dask.RDataFrame(self.maintreename,
        df1 = df.DefinePerSample("sample_weight", "samples_weights(rdfslot_, rdfsampleinfo_)")\
                .DefinePerSample("sample_name", "samples_names(rdfslot_, rdfsampleinfo_)")

        # Filter by the two defined columns per sample: a weight and the sample string representation
        # Each filtered dataset should have 10 entries, equal to the number of entries per sample
        weightsandnames = [("1.0f", "sample1.root/Events"),
                           ("2.0f", "sample2.root/Events"),
                           ("3.0f", "sample3.root/Events")]
        samplescounts = [
            df1.Filter("sample_weight == {} && sample_name == \"{}\"".format(
                weight, name)).Count() for (weight, name) in weightsandnames

        for count in samplescounts:
            assert count.GetValue() == 10
Exemple #9
    def test_initialization_method(self, connection):
        """
        Check `DistRDF.initialize` with Dask backend. Defines an integer value
        to the ROOT interpreter. Check that this value is available in the
        worker processes.
        """
        def init(value):
            """Declare `userValue` with the given value to the ROOT interpreter."""
            import ROOT
            cpp_code = f"int userValue = {value};"
            # Without this declaration `userValue` would not exist when the
            # "u" column below is evaluated.
            ROOT.gInterpreter.Declare(cpp_code)

        DistRDF.initialize(init, 123)
        # Dask backend has a limited list of supported methods, so we use
        # Histo1D which is a supported action.
        # The code below creates an RDataFrame instance with one single entry
        # and defines a column 'u' whose value is taken from the variable
        # 'userValue'.
        # This variable is only declared inside the ROOT interpreter, however
        # the value of the variable is passed by the user from the python side.
        # If the init function defined by the user is properly propagated to the
        # Dask backend, each workers will run the init function as a first step
        # and hence the variable 'userValue' will be defined at runtime.
        # As a result the define operation should read the variable 'userValue'
        # and assign it to the entries of the column 'u' (only one entry).
        # Finally, Histo1D returns a histogram filled with one value. The mean
        # of this single value has to be the value itself, independently of
        # the number of spawned workers.
        df = Dask.RDataFrame(1, daskclient=connection).Define(
            "u", "userValue").Histo1D("u")
        h = df.GetValue()
        assert h.GetMean() == 123
Exemple #10
    def test_distributed_sum(self, connection):
        """Check that the distributed `Sum` action adds up a defined column."""
        # Column "x" mirrors the entry index, so the total is 0 + 1 + ... + 9.
        total = (
            Dask.RDataFrame(10, daskclient=connection)
            .Define("x", "rdfentry_")
            .Sum("x")
        )

        assert total.GetValue() == 45.0
Exemple #11
    def test_distributed_snapshot(self, connection):
        """Check that `Snapshot` is supported by the distributed backend."""
        # Ten entries, with column "x" holding the entry index
        base = Dask.RDataFrame(10, daskclient=connection)
        with_x = base.Define("x", "rdfentry_")

        # Pass the object returned by Snapshot, together with the output file
        # prefix, to the shared checker.
        snapshot_result = with_x.Snapshot("snapTree", "snapFile.root")
        self.check_snapshot_df(snapshot_result, "snapFile")
Exemple #12
    def test_distributed_asnumpy(self, connection):
        """Check that the `AsNumpy` pythonization runs in the distributed backend."""

        # Ten rows and two columns: an integer index and a float derived from it
        base = Dask.RDataFrame(10, daskclient=connection)
        two_columns = base.Define("x", "(int)rdfentry_").Define(
            "y", "1.f/(1.f+rdfentry_)")

        # Request the columns as a dictionary of numpy arrays.
        npy = two_columns.AsNumpy()
Exemple #13
    def test_user_supplied_npartitions_have_precedence(self, connection):
        """
        The class Client object is connected to a LocalCluster with 2 processes.
        The `DaskBackend.optimize_npartitions` method would thus return 2.
        Check that if the user specifies a number of partitions, that is not
        overwritten by the backend.
        """
        df = Dask.RDataFrame(100, daskclient=connection, npartitions=4)

        # The number of partitions was supplied by the user.
        assert df._headnode.npartitions == 4
Exemple #14
    def test_histo1d_merge(self, connection):
        """Verify the reducer's merge of Histo1D against a plain PyROOT result."""
        column = "rdfentry_"

        # Histogram booked through DistRDF
        distributed_histo = Dask.RDataFrame(
            10, daskclient=connection).Histo1D(column)

        # Reference histogram booked with PyROOT
        local_histo = ROOT.ROOT.RDataFrame(10).Histo1D(column)

        # Both results must agree
        self.assertHistoOrProfile(distributed_histo, local_histo)
Exemple #15
    def test_redefine_one_column(self, connection):
        """Check that `Redefine` changes the values of an existing column."""
        # Ten entries; "x" is first the constant 1, then redefined to 2
        original = Dask.RDataFrame(10, daskclient=connection).Define("x", "1")
        redefined = original.Redefine("x", "2")

        # Book both sums before reading any value
        original_total = original.Sum("x")
        redefined_total = redefined.Sum("x")

        # 10 entries * 1 and 10 entries * 2 respectively
        assert original_total.GetValue() == 10.0
        assert redefined_total.GetValue() == 20.0
Exemple #16
    def test_distributed_snapshot_lazy(self, connection):
        """Check that a lazy `Snapshot` holds no result until `GetValue` is called."""
        # Ten entries with column "x" equal to the entry index
        source = Dask.RDataFrame(10, daskclient=connection)
        with_x = source.Define("x", "rdfentry_")

        lazy_options = ROOT.RDF.RSnapshotOptions()
        lazy_options.fLazy = True
        pending = with_x.Snapshot(
            "snapTree_lazy", "snapFile_lazy.root", ["x"], lazy_options)

        # Booking must hand back a proxy whose node holds no result yet
        assert isinstance(pending, ActionProxy)
        assert pending.proxied_node.value is None

        # Requesting the value yields the dataframe handed to the checker
        materialized = pending.GetValue()
        self.check_snapshot_df(materialized, "snapFile_lazy")
Exemple #17
    def test_count_with_same_tree_repeated(self, connection):
        """
        Count entries of a dataset with three times the same tree.
        """
        # Write a 100-entry tree with a single column "x" to one file
        df = ROOT.RDataFrame(100).Define("x", "1")
        treename = "tree"
        filename = "distrdf_roottest_dask_check_backend_same_tree.root"
        # The same file is listed three times on purpose
        filenames = [filename] * 3
        df.Snapshot(treename, filename, ["x"])

        # Each repetition must be counted: 3 * 100 entries
        rdf = Dask.RDataFrame(treename, filenames, daskclient=connection)
        assert rdf.Count().GetValue() == 300

Exemple #18
    def test_profile1d_merge(self, connection):
        """Verify the reducer's merge of Profile1D against a plain PyROOT result."""
        profile_model = ("", "", 64, -4, 4)

        # Profile booked through DistRDF
        distributed_df = Dask.RDataFrame(10, daskclient=connection)
        distributed_profile = self.define_two_columns(
            distributed_df).Profile1D(profile_model, "x", "y")

        # Reference profile booked with PyROOT
        local_df = ROOT.ROOT.RDataFrame(10)
        local_profile = self.define_two_columns(
            local_df).Profile1D(profile_model, "x", "y")

        # Both results must agree
        self.assertHistoOrProfile(distributed_profile, local_profile)
Exemple #19
    def test_varyfiltersum(self, connection):
        """Check Vary -> Filter -> Sum for the nominal value and each variation."""
        base = Dask.RDataFrame(10, daskclient=connection, npartitions=2)
        varied = base.Define("x", "1").Vary(
            "x", "ROOT::RVecI{-1*x, 2*x}", ("down", "up"), "myvariation")
        nominal_sum = varied.Filter("x > 0").Sum("x")

        assert nominal_sum.GetValue() == 10

        varied_sums = DistRDF.VariationsFor(nominal_sum)

        # Expected sum for each variation name, checked in this order
        expected = {
            "nominal": 10,
            "myvariation:down": 0,
            "myvariation:up": 20,
        }
        for tag, value in expected.items():
            assert varied_sums[tag] == value
Exemple #20
    def test_graph(self, connection):
        """Check that `Graph` results are produced for each variation of "x"."""
        base = Dask.RDataFrame(10, daskclient=connection, npartitions=2)
        with_x = base.Define("x", "1")
        nominal_graph = with_x.Vary(
            "x", "ROOT::RVecI{-1, 2}", nVariations=2).Graph("x", "x")
        varied_graphs = DistRDF.VariationsFor(nominal_graph)

        assert nominal_graph.GetMean() == 1

        # Expected mean for each variation name, checked in this order
        expected = {"nominal": 1, "x:0": -1, "x:1": 2}
        for tag, expected_mean in expected.items():
            current = varied_graphs[tag]
            assert isinstance(current, ROOT.TGraph)
            assert current.GetMean() == expected_mean
Exemple #21
    def test_histo(self, connection):
        """Check that `Histo1D` results are produced for each variation of "x"."""
        base = Dask.RDataFrame(10, daskclient=connection, npartitions=2)
        varied = base.Define("x", "1").Vary(
            "x", "ROOT::RVecI{-2,2}", ["down", "up"])
        nominal_histo = varied.Histo1D("x")
        varied_histos = DistRDF.VariationsFor(nominal_histo)

        # Expected mean for each variation name, checked in this order
        expected = {"nominal": 1, "x:up": 2, "x:down": -2}
        for tag, expected_mean in expected.items():
            current = varied_histos[tag]
            assert isinstance(current, ROOT.TH1D)
            assert current.GetEntries() == 10
            assert current.GetMean() == expected_mean
Exemple #22
    def test_rungraphs_sparkanddask_3histos(self, connection):
        # NOTE(review): this block appears truncated in this copy. The
        # docstring delimiters are missing, both list comprehensions
        # (`histoproxies_spark`, `histoproxies_dask`) have lost the line that
        # opens the dataframe constructor as well as their closing bracket,
        # and no statement triggers the computation between the two proxy
        # loops. Restore them from the upstream test before running.
        # `connection` is unpacked below into a Dask and a Spark connection.
        Submit three different RDF graphs concurrently to Spark and Dask
        daskconn, sparkconn = connection
        # Create a test file for processing
        treename = "myTree"
        filename = "2clusters.root"
        nentries = 10000
        opts = ROOT.RDF.RSnapshotOptions()
        opts.fAutoFlush = 5000
        ROOT.RDataFrame(nentries).Define("b1", "42")\
                                 .Define("b2", "42")\
                                 .Define("b3", "42")\
                                 .Snapshot(treename, filename, ["b1", "b2", "b3"], opts)

        histoproxies_spark = [
                             npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in ["b1", "b2", "b3"]

        histoproxies_dask = [
                            npartitions=2).Histo1D((col, col, 1, 40, 45), col)
            for col in ["b1", "b2", "b3"]

        histoproxies = histoproxies_spark + histoproxies_dask

        # Before triggering the computation graphs values are None
        for proxy in histoproxies:
            assert proxy.proxied_node.value is None


        # After RunGraphs all histograms are correctly assigned to the
        # node objects
        for proxy in histoproxies:
            histo = proxy.proxied_node.value
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetEntries() == nentries
            assert histo.GetMean() == 42

Exemple #23
    def test_mixed(self, connection):
        """Check a Histo1D of a varied column "x" weighted by an unvaried column "y"."""
        base = Dask.RDataFrame(10, daskclient=connection, npartitions=2)
        with_columns = base.Define("x", "1").Define("y", "42")
        nominal_histo = with_columns.Vary(
            "x", "ROOT::RVecI{-1, 2}",
            variationTags=["down", "up"]).Histo1D("x", "y")
        varied_histos = DistRDF.VariationsFor(nominal_histo)

        # The same maximum is expected for the nominal and every variation
        expected_maximum = 420
        # Expected mean for each variation name, checked in this order
        expected = {"nominal": 1, "x:down": -1, "x:up": 2}
        for tag, expected_mean in expected.items():
            current = varied_histos[tag]
            assert isinstance(current, ROOT.TH1D)
            assert current.GetMaximum() == expected_maximum
            assert current.GetMean() == expected_mean
Exemple #24
    def test_histo_from_empty_root_file(self, connection):
        # NOTE(review): this block appears truncated in this copy. The
        # docstring delimiters are missing, the `Dask.RDataFrame(...)` call is
        # cut after the tree name, and the `with pytest.raises(...)` statement
        # has no body. Per the comment below, the body presumably reads the
        # histogram entries -- confirm against the upstream test.
        Check that when performing operations with the distributed backend on
        an RDataFrame without entries, DistRDF raises an error.

        # Create an RDataFrame from a file with an empty tree
        rdf = Dask.RDataFrame("NOMINAL",
        histo = rdf.Histo1D(("empty", "empty", 10, 0, 10), "mybranch")

        # Get entries in the histogram, raises error
        with pytest.raises(RuntimeError):
Exemple #25
    def test_distributed_asnumpy_lazy(self, connection):
        """Check that `AsNumpy(lazy=True)` holds no result until `GetValue` is called."""

        # Ten rows and two columns: an integer index and a float derived from it
        base = Dask.RDataFrame(10, daskclient=connection)
        two_columns = base.Define("x", "(int)rdfentry_").Define(
            "y", "1.f/(1.f+rdfentry_)")

        pending = two_columns.AsNumpy(lazy=True)

        # Booking must hand back a proxy whose node holds no result yet
        assert isinstance(pending, ActionProxy)
        assert pending.proxied_node.value is None

        # Request the final result from the proxy
        npy = pending.GetValue()
Exemple #26
    def test_initialization(self, connection):
        """
        Check that the user initialization method is assigned to the current
        backend.
        """
        def returnNumber(n):
            """Return the argument unchanged."""
            return n

        DistRDF.initialize(returnNumber, 123)

        # Dummy df just to retrieve the initialization function
        df = Dask.RDataFrame(10, daskclient=connection)
        f = df._headnode.backend.initialization

        # The stored callable is invoked without arguments and must yield the
        # value passed to `DistRDF.initialize`.
        assert f() == 123
Exemple #27
    def test_histo3d_merge(self, connection):
        """Verify the reducer's merge of Histo3D against a plain PyROOT result."""
        axes = ("x", "y", "z")
        model = ("", "", 64, -4, 4, 64, -4, 4, 64, -4, 4)

        # Histogram booked through DistRDF
        distributed_df = Dask.RDataFrame(10, daskclient=connection)
        distributed_histo = self.define_three_columns(
            distributed_df).Histo3D(model, *axes)

        # Reference histogram booked with PyROOT
        local_df = ROOT.ROOT.RDataFrame(10)
        local_histo = self.define_three_columns(
            local_df).Histo3D(model, *axes)

        # Both results must agree
        self.assertHistoOrProfile(distributed_histo, local_histo)
Exemple #28
    def test_simultaneous(self, connection):
        """
        Check a Histo1D of "x" weighted by "y" when both columns are varied
        simultaneously under the common variation name "xy".
        """
        df = Dask.RDataFrame(10, daskclient=connection,
                             npartitions=2).Define("x", "1").Define("y", "42")
        h = df.Vary(["x", "y"],
                    "ROOT::RVec<ROOT::RVecI>{{-1, 2, 3}, {41, 43, 44}}",
                    ["down", "up", "other"], "xy").Histo1D("x", "y")
        histos = DistRDF.VariationsFor(h)

        expectednames = ["nominal", "xy:down", "xy:up", "xy:other"]
        expectedmeans = [1, -1, 2, 3]
        expectedmax = [420, 410, 430, 440]
        # Walk the three expectation lists in lockstep, one entry per variation
        for varname, mean, maxval in zip(expectednames, expectedmeans,
                                         expectedmax):
            histo = histos[varname]
            assert isinstance(histo, ROOT.TH1D)
            assert histo.GetMaximum() == maxval
            assert histo.GetMean() == mean
Exemple #29
    def test_histond_merge(self, connection):
        """Verify the reducer's merge of HistoND against a plain PyROOT result."""
        columns = ("x0", "x1", "x2", "x3")
        # Four dimensions, each with 10 bins between 0 and 100
        model = (
            "name", "title", 4,
            (10, 10, 10, 10),
            (0., 0., 0., 0.),
            (100., 100., 100., 100.),
        )

        distributed_df = Dask.RDataFrame(100, daskclient=connection)
        local_df = ROOT.RDataFrame(100)

        distributed_with_columns = self.define_four_columns(
            distributed_df, columns)
        local_with_columns = self.define_four_columns(local_df, columns)

        distributed_histo = distributed_with_columns.HistoND(model, columns)
        local_histo = local_with_columns.HistoND(model, columns)

        # Compare entry counts and bin counts of the two results
        assert distributed_histo.GetEntries() == local_histo.GetEntries()
        assert distributed_histo.GetNbins() == local_histo.GetNbins()
    def test_friends_tchain_noname_add_fullpath_addfriend_alias(
            self, connection):
        """
        Test a parent chain and a friend chain created without a name, each
        receiving one tree in the form "filename/treename", with the friend
        added to the parent under an alias.

        NOTE(review): the original docstring referred to a specific issue
        whose reference is not present in this copy.
        """
        import subprocess

        rn1 = "rn1.root"
        rn2 = "rn2.root"
        friendsfilename = "friendtrees_dask.root"

        df_1 = ROOT.RDataFrame(10000)
        df_2 = ROOT.RDataFrame(10000)

        df_1 = df_1.Define("rnd", "gRandom->Gaus(10)")
        df_2 = df_2.Define("rnd", "gRandom->Gaus(20)")

        df_1.Snapshot("randomNumbers", rn1)
        df_2.Snapshot("randomNumbersBis", rn2)

        # Put the two trees together in a common file. The command is passed
        # as an argument list so no shell is involved.
        subprocess.run(["hadd", "-f", friendsfilename, rn1, rn2], check=True)

        # Test the specific case of a parent chain and friend chain with no
        # names, that receive one tree each in the form "filename/treename". The
        # friend is then added to the parent with an alias.
        chain = ROOT.TChain()
        chainFriend = ROOT.TChain()

        chain.Add("{}/randomNumbers".format(friendsfilename))
        chainFriend.Add("{}/randomNumbersBis".format(friendsfilename))

        chain.AddFriend(chainFriend, "myfriend")

        df = Dask.RDataFrame(chain, daskclient=connection)

        # "rnd" is read from the parent, "myfriend.rnd" through the alias
        h_parent = df.Histo1D("rnd")
        h_friend = df.Histo1D("myfriend.rnd")

        check_histograms(h_parent, h_friend)
