Esempio n. 1
0
    def test_three_files_two_partitions_per_file(self):
        """
        Asking for twice as many partitions as input files yields two ranges
        for each file.
        """
        nfiles = 3
        treenames = [f"tree_{i}" for i in range(nfiles)]
        filenames = [f"distrdf_unittests_file_{i}.root" for i in range(nfiles)]
        npartitions = nfiles * 2

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # Two expected tuples per file, listed in file order
        ranges_reqd = []
        for filename in filenames:
            ranges_reqd.append((0, 50, [0], [50], [filename]))
            ranges_reqd.append((50, 100, [50], [100], [filename]))

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 2
0
    def test_clustered_ranges_with_many_clusters_many_partitions(self):
        """
        Request exactly as many partitions as there are clusters in the file
        and expect one single-entry-wide range per partition.
        """
        treenames = ["myTree"]
        filenames = ["backend/1000clusters.root"]
        npartitions = 1000

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # Expected: (0, 1, ...), (1, 2, ...), ..., (999, 1000, ...)
        nranges = 1000
        ranges_reqd = [
            (first, first + 1, [first], [first + 1], filenames)
            for first in range(nranges)
        ]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 3
0
    def test_clustered_ranges_with_four_clusters_four_partitions(self):
        """
        With four partitions on the four-cluster file, the expected ranges
        all span the same number of entries.
        """
        treenames = ["myTree"]
        filenames = ["backend/4clusters.root"]
        npartitions = 4

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # Consecutive boundaries delimit the four expected ranges
        boundaries = [0, 250, 500, 750, 1000]
        ranges_reqd = [
            (low, high, [low], [high], ["backend/4clusters.root"])
            for low, high in zip(boundaries, boundaries[1:])
        ]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 4
0
    def test_clustered_ranges_with_many_clusters_four_partitions(self):
        """
        Create ranges that span many clusters: four partitions are requested
        on the file with 1000 clusters.
        """
        treenames = ["myTree"]
        filenames = ["backend/1000clusters.root"]
        npartitions = 4

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # Consecutive boundaries delimit the four expected ranges
        boundaries = [0, 250, 500, 750, 1000]
        ranges_reqd = [
            (low, high, [low], [high], ["backend/1000clusters.root"])
            for low, high in zip(boundaries, boundaries[1:])
        ]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 5
0
    def test_rdataframe_with_treename_and_filename_with_globbing(self):
        """
        A file name containing a glob pattern must resolve to the actual file
        name, which is then reported in the created ranges.
        """
        treename = "myTree"
        filename = "backend/2cluste*.root"
        npartitions = 2
        headnode = get_headnode(None, npartitions, treename, filename)

        expected_inputfiles = ["backend/2clusters.root"]
        # File names as stored by the head node after processing the pattern
        extracted_inputfiles = headnode.inputfiles

        percentage_ranges = Ranges.get_percentage_ranges(
            [treename], extracted_inputfiles, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        first_range = (0, 777, [0], [777], expected_inputfiles)
        second_range = (777, 1000, [777], [1000], expected_inputfiles)
        ranges_reqd = [first_range, second_range]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 6
0
    def test_npartitions_greater_than_clusters(self):
        """
        Two partitions are requested on a file holding a single cluster. Two
        tasks are returned: one covers the whole file, the other one is None.
        """
        # This tree has 10 entries and 1 cluster
        treenames = ["TotemNtuple"]
        filenames = ["backend/Slimmed_ntuple.root"]
        npartitions = 2

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        tasks = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]

        # There is one task for each requested partition...
        self.assertEqual(len(tasks), npartitions)
        # ...but just one of them actually holds a range
        nonempty_tasks = list(filter(lambda task: task is not None, tasks))
        self.assertEqual(len(nonempty_tasks), 1)

        ranges = treeranges_to_tuples(nonempty_tasks)
        whole_file = (0, 10, [0], [10], ["backend/Slimmed_ntuple.root"])
        ranges_reqd = [whole_file]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 7
0
    def test_clustered_ranges_with_one_cluster(self):
        """
        A single requested partition produces a single range covering the
        whole input file.
        """
        # This tree has 10 entries and 1 cluster
        treenames = ["TotemNtuple"]
        filenames = ["backend/Slimmed_ntuple.root"]
        npartitions = 1

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # As many ranges as requested partitions
        self.assertEqual(len(ranges), npartitions)

        whole_file = (0, 10, [0], [10], ["backend/Slimmed_ntuple.root"])
        ranges_reqd = [whole_file]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 8
0
    def test_three_files_partitions_greater_than_clusters(self):
        """
        Request more partitions than the dataset has clusters: every partition
        gets a task, but only 30 of them hold an actual range.
        """
        nfiles = 3
        treenames = [f"tree_{i}" for i in range(nfiles)]
        filenames = [f"distrdf_unittests_file_{i}.root" for i in range(nfiles)]
        npartitions = 42

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        tasks = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]

        # There is one task for each requested partition...
        self.assertEqual(len(tasks), npartitions)
        # ...but the non-empty ones cannot exceed the clusters in the dataset
        nonempty_tasks = list(filter(lambda task: task is not None, tasks))
        self.assertEqual(len(nonempty_tasks), 30)

        ranges = treeranges_to_tuples(nonempty_tasks)
        # For each file in turn: (0, 10, ...), (10, 20, ...), ..., (90, 100, ...)
        ranges_reqd = [
            (low, low + 10, [low], [low + 10], [filename])
            for filename in filenames
            for low in range(0, 100, 10)
        ]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 9
0
        def build_chain_from_range(
            current_range: Ranges.TreeRangePerc
        ) -> Tuple[Optional[ROOT.TChain], Ranges.TaskTreeEntries]:
            """
            Create the TChain to be processed for 'current_range'.

            'current_range' is first converted through
            'Ranges.get_clustered_range_from_percs'. When that conversion gives
            no range, no chain is returned. Otherwise one TEntryList per
            (tree, file) pair restricts the chain to the local entry range of
            that tree, and 'attach_friend_info_if_present' is invoked on the
            resulting chain.

            Returns a pair made of the chain (or None) and the second value
            returned by 'Ranges.get_clustered_range_from_percs'.
            """

            # Container collecting the entry lists of all trees in this range
            combined_entry_list = ROOT.TEntryList()

            # Chain collecting all the files of this range
            task_chain = ROOT.TChain(maintreename)

            range_and_entries = Ranges.get_clustered_range_from_percs(
                current_range)
            clustered_range, entries_in_trees = range_and_entries
            if clustered_range is None:
                # No valid range for this task, hence no TChain to hand back
                return None, entries_in_trees

            per_tree_info = zip(
                clustered_range.treenames, clustered_range.filenames,
                clustered_range.treesnentries, clustered_range.localstarts,
                clustered_range.localends)
            for tree, path, nentries, first, last in per_tree_info:
                # The default TEntryList constructor is used on purpose: the
                # one taking tree name and file name would strip any url or
                # protocol from the file name.
                sub_list = ROOT.TEntryList()
                sub_list.SetTreeName(tree)
                sub_list.SetFileName(path)
                sub_list.EnterRange(first, last)
                combined_entry_list.AddSubList(sub_list)
                task_chain.Add(path + "?#" + tree, nentries)

            # We assume 'end' is exclusive
            task_chain.SetCacheEntryRange(clustered_range.globalstart,
                                          clustered_range.globalend)

            # Restrict the chain to the entries collected above
            task_chain.SetEntryList(combined_entry_list, "sync")

            # Friends need the same globalstart and globalend as the chain
            # created in this task
            attach_friend_info_if_present(clustered_range, task_chain)

            return task_chain, entries_in_trees
Esempio n. 10
0
    def test_three_files_partitions_equal_clusters(self):
        """
        Request as many partitions as there are clusters in the whole dataset
        and expect one range per cluster.
        """
        nfiles = 3
        treenames = [f"tree_{i}" for i in range(nfiles)]
        filenames = [f"distrdf_unittests_file_{i}.root" for i in range(nfiles)]
        npartitions = nfiles * 10  # trees have 10 clusters

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # For each file in turn: (0, 10, ...), (10, 20, ...), ..., (90, 100, ...)
        ranges_reqd = []
        for filename in filenames:
            ranges_reqd.extend(
                (low, low + 10, [low], [low + 10], [filename])
                for low in range(0, 100, 10))

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 11
0
    def test_three_files_one_partition(self):
        """
        A single partition over three files produces one range that includes
        all of them.
        """
        nfiles = 3
        treenames = [f"tree_{i}" for i in range(nfiles)]
        filenames = [f"distrdf_unittests_file_{i}.root" for i in range(nfiles)]
        npartitions = 1

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # One tuple listing, for every file, its start (0) and end (100)
        single_range = (0, 300, [0, 0, 0], [100, 100, 100], filenames)
        ranges_reqd = [single_range]
        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 12
0
    def test_clustered_ranges_with_two_files(self):
        """
        Two partitions over two files that contain a different number of
        clusters produce one range per file.
        """
        treenames = ["myTree", "myTree"]
        filenames = ["backend/2clusters.root", "backend/4clusters.root"]
        npartitions = 2

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # One (0, 1000, [file]) tuple for each input file, in order
        ranges_reqd = []
        for filename in filenames:
            ranges_reqd.append((0, 1000, [filename]))

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 13
0
        def build_rdf_from_range(
                current_range: Ranges.TreeRangePerc) -> TaskObjects:
            """
            Create the RDataFrame used by a distributed mapper.

            The input range is converted through
            'Ranges.get_clustered_range_from_percs'; the resulting tree names,
            file names and global entry range are described to ROOT with an
            RDatasetSpec. When the conversion gives no range, the returned
            TaskObjects holds None in place of the RDataFrame.
            """

            range_and_entries = Ranges.get_clustered_range_from_percs(
                current_range)
            clustered_range, entries_in_trees = range_and_entries

            if clustered_range is None:
                return TaskObjects(None, entries_in_trees)

            tree_file_pairs = zip(clustered_range.treenames,
                                  clustered_range.filenames)
            entry_range = (clustered_range.globalstart,
                           clustered_range.globalend)
            dataset_spec = ROOT.RDF.Experimental.RDatasetSpec(
                tree_file_pairs, entry_range)

            attach_friend_info_if_present(clustered_range, dataset_spec)

            dataframe = ROOT.RDataFrame(dataset_spec)
            return TaskObjects(dataframe, entries_in_trees)
Esempio n. 14
0
    def test_three_files_one_partition_per_file(self):
        """
        Requesting as many partitions as files produces one range per file.
        """
        nfiles = 3
        treenames = [f"tree_{i}" for i in range(nfiles)]
        filenames = [f"distrdf_unittests_file_{i}.root" for i in range(nfiles)]
        npartitions = nfiles

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # One (0, 100, [file]) tuple for each input file, in order
        ranges_reqd = []
        for filename in filenames:
            ranges_reqd.append((0, 100, [filename]))

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 15
0
    def test_rdataframe_with_notreename_and_chain_with_subtrees(self):
        """
        Check proper handling of a TChain whose subtrees have different names.

        Two small ROOT files are written with Snapshot, added to a TChain
        created without a name, and the ranges built from the head node
        information are compared with the expected ones. The files are
        removed before the final assertion, also when one of the intermediate
        steps raises, so that a failing run does not leave them behind.
        """
        # Names of the two dummy trees and files
        treename1, filename1 = "entries_1", "entries_1.root"
        treename2, filename2 = "entries_2", "entries_2.root"
        npartitions = 2

        try:
            # Create two dummy files
            ROOT.RDataFrame(10).Define("x", "rdfentry_").Snapshot(
                treename1, filename1)
            ROOT.RDataFrame(10).Define("x", "rdfentry_").Snapshot(
                treename2, filename2)

            chain = ROOT.TChain()
            chain.Add(str(filename1 + "?#" + treename1))
            chain.Add(str(filename2 + "?#" + treename2))

            rdf = get_headnode(None, npartitions, chain)
            extracted_subtreenames = rdf.subtreenames
            extracted_filenames = rdf.inputfiles

            percranges = Ranges.get_percentage_ranges(extracted_subtreenames,
                                                      extracted_filenames,
                                                      npartitions,
                                                      friendinfo=None)
            # Keep only the first element of each returned value, i.e. the
            # range
            clusteredranges = [
                Ranges.get_clustered_range_from_percs(percrange)[0]
                for percrange in percranges
            ]
            ranges = treeranges_to_tuples(clusteredranges)
        finally:
            # A file may be missing if its Snapshot call is what failed;
            # checking first avoids masking the original error.
            for path in (filename1, filename2):
                if os.path.exists(path):
                    os.remove(path)

        ranges_reqd = [(0, 10, [0], [10], [filename1]),
                       (0, 10, [0], [10], [filename2])]

        self.assertListEqual(ranges, ranges_reqd)
Esempio n. 16
0
    def test_clustered_ranges_with_two_clusters_two_partitions(self):
        """
        Ranges must follow the cluster boundaries of the file, even when this
        leads to ranges spanning a different number of entries.
        """
        treenames = ["myTree"]
        filenames = ["backend/2clusters.root"]
        npartitions = 2

        percentage_ranges = Ranges.get_percentage_ranges(
            treenames, filenames, npartitions, friendinfo=None)
        # Keep only the first element of each returned value, i.e. the range
        clustered = [
            Ranges.get_clustered_range_from_percs(perc)[0]
            for perc in percentage_ranges
        ]
        ranges = treeranges_to_tuples(clustered)

        # The two expected ranges meet at entry 777
        first_range = (0, 777, [0], [777], ["backend/2clusters.root"])
        second_range = (777, 1000, [777], [1000], ["backend/2clusters.root"])
        ranges_reqd = [first_range, second_range]

        self.assertListEqual(ranges, ranges_reqd)