def test_three_files_two_partitions_per_file(self):
    """
    Splitting three files in six partitions yields two ranges per file.
    """
    nfiles = 3
    file_indices = range(nfiles)
    treenames = [f"tree_{i}" for i in file_indices]
    filenames = [f"distrdf_unittests_file_{i}.root" for i in file_indices]
    npartitions = nfiles * 2

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)

    # Only the clustered range (first element of the result) is inspected
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Each file contributes a first and a second half, in file order
    ranges_reqd = []
    for filename in filenames:
        ranges_reqd.append((0, 50, [0], [50], [filename]))
        ranges_reqd.append((50, 100, [50], [100], [filename]))

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_many_clusters_many_partitions(self):
    """
    Ask for 1000 partitions on the 1000-clusters file and expect one
    single-entry range per partition.
    """
    treenames = ["myTree"]
    filenames = ["backend/1000clusters.root"]
    npartitions = 1000

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Consecutive ranges [0, 1), [1, 2), ..., [999, 1000)
    nentries = 1000
    ranges_reqd = [
        (first, first + 1, [first], [first + 1], filenames)
        for first in range(nentries)
    ]

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_four_clusters_four_partitions(self):
    """
    With four partitions on the four-clusters file, the expected ranges
    all span the same number of entries (250 each).
    """
    treenames = ["myTree"]
    filenames = ["backend/4clusters.root"]
    npartitions = 4

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Boundaries expected at 0, 250, 500, 750 and 1000
    width = 250
    ranges_reqd = [
        (first, first + width, [first], [first + width],
         ["backend/4clusters.root"])
        for first in range(0, 1000, width)
    ]

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_many_clusters_four_partitions(self):
    """
    Create ranges that span many clusters: four partitions requested on
    the 1000-clusters file.
    """
    treenames = ["myTree"]
    filenames = ["backend/1000clusters.root"]
    npartitions = 4

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Four equally sized ranges of 250 entries each
    width = 250
    ranges_reqd = [
        (first, first + width, [first], [first + width],
         ["backend/1000clusters.root"])
        for first in range(0, 1000, width)
    ]

    self.assertListEqual(ranges, ranges_reqd)
def test_rdataframe_with_treename_and_filename_with_globbing(self):
    """
    A file name containing a glob pattern must be resolved by the head
    node to the actual file name, which is then used to create the ranges.
    """
    treename = "myTree"
    filename = "backend/2cluste*.root"
    npartitions = 2
    expected_inputfiles = ["backend/2clusters.root"]

    headnode = get_headnode(None, npartitions, treename, filename)
    extracted_inputfiles = headnode.inputfiles

    percranges = Ranges.get_percentage_ranges(
        [treename], extracted_inputfiles, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # The file names in the ranges are compared with the expanded name
    first_half = (0, 777, [0], [777], expected_inputfiles)
    second_half = (777, 1000, [777], [1000], expected_inputfiles)
    ranges_reqd = [first_half, second_half]

    self.assertListEqual(ranges, ranges_reqd)
def test_npartitions_greater_than_clusters(self):
    """
    Two partitions are requested on a file with a single cluster: two
    tasks are returned, one covering the whole file and one that is None.
    """
    # This tree has 10 entries and 1 cluster
    treenames = ["TotemNtuple"]
    filenames = ["backend/Slimmed_ntuple.root"]
    npartitions = 2

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    # There is one task for each requested partition
    self.assertEqual(len(clusteredranges), npartitions)

    # Discard the empty tasks, only one should be left
    actualtasks = []
    for task in clusteredranges:
        if task is not None:
            actualtasks.append(task)
    self.assertEqual(len(actualtasks), 1)

    ranges = treeranges_to_tuples(actualtasks)
    whole_file = (0, 10, [0], [10], ["backend/Slimmed_ntuple.root"])
    ranges_reqd = [whole_file]

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_one_cluster(self):
    """
    Requesting one partition produces exactly one range, covering the
    whole input file.
    """
    # This tree has 10 entries and 1 cluster
    treenames = ["TotemNtuple"]
    filenames = ["backend/Slimmed_ntuple.root"]
    npartitions = 1

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # As many ranges as partitions, i.e. one
    self.assertEqual(len(ranges), npartitions)

    whole_file = (0, 10, [0], [10], ["backend/Slimmed_ntuple.root"])
    ranges_reqd = [whole_file]
    self.assertListEqual(ranges, ranges_reqd)
def test_three_files_partitions_greater_than_clusters(self):
    """
    Request more partitions (42) than there are clusters in the dataset.

    One task is returned per requested partition, but only 30 of them are
    expected to be non-empty; the remaining ones are None.
    """
    nfiles = 3
    file_indices = range(nfiles)
    treenames = [f"tree_{i}" for i in file_indices]
    filenames = [f"distrdf_unittests_file_{i}.root" for i in file_indices]
    npartitions = 42

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    # We return one task per partition
    self.assertEqual(len(clusteredranges), npartitions)

    # But at most as many as the number of clusters in the dataset are
    # non-empty
    actualtasks = []
    for task in clusteredranges:
        if task is not None:
            actualtasks.append(task)
    self.assertEqual(len(actualtasks), 30)

    ranges = treeranges_to_tuples(actualtasks)

    # Ten ranges of ten entries for each file, listed file by file
    width = 10
    ranges_reqd = []
    for filename in filenames:
        for first in range(0, 100, width):
            last = first + width
            ranges_reqd.append((first, last, [first], [last], [filename]))

    self.assertListEqual(ranges, ranges_reqd)
def build_chain_from_range(
        current_range: Ranges.TreeRangePerc
) -> Tuple[Optional[ROOT.TChain], Ranges.TaskTreeEntries]:
    """
    Create the TChain to be processed by a task from 'current_range'.

    The percentage range is first converted to a clustered range. For each
    tree of that range a TEntryList restricted to the local entries of the
    task is created and the corresponding file is added to the chain. The
    entry lists are then connected to the chain, and friend information,
    if any, is attached through 'attach_friend_info_if_present'.

    Returns a tuple with the chain and the entries-in-trees information
    coming from the range conversion. The chain is None when no clustered
    range could be built for this task.
    """
    # Container of the entry lists of all the trees in this range
    all_entries = ROOT.TEntryList()

    # Chain with the files of this range
    task_chain = ROOT.TChain(maintreename)

    treerange, entries_in_trees = Ranges.get_clustered_range_from_percs(
        current_range)

    # Without a valid clustered range there is nothing to build
    if treerange is None:
        return None, entries_in_trees

    per_tree_info = zip(
        treerange.treenames,
        treerange.filenames,
        treerange.treesnentries,
        treerange.localstarts,
        treerange.localends,
    )
    for treename, fname, nentries, first, last in per_tree_info:
        # The default constructor of TEntryList is used on purpose: the
        # constructor that takes tree name and file name would strip any
        # url or protocol from the file name.
        tree_entries = ROOT.TEntryList()
        tree_entries.SetTreeName(treename)
        tree_entries.SetFileName(fname)
        tree_entries.EnterRange(first, last)
        all_entries.AddSubList(tree_entries)
        task_chain.Add(fname + "?#" + treename, nentries)

    # The end of the range is assumed to be exclusive
    task_chain.SetCacheEntryRange(treerange.globalstart, treerange.globalend)

    # Restrict the processing of the chain to the selected entries
    task_chain.SetEntryList(all_entries, "sync")

    # The friends need the same globalstart and globalend of the chain
    # created in this task
    attach_friend_info_if_present(treerange, task_chain)

    return task_chain, entries_in_trees
def test_three_files_partitions_equal_clusters(self):
    """
    Request exactly as many partitions as there are clusters in the
    dataset: every expected range spans ten entries of one file.
    """
    nfiles = 3
    file_indices = range(nfiles)
    treenames = [f"tree_{i}" for i in file_indices]
    filenames = [f"distrdf_unittests_file_{i}.root" for i in file_indices]
    npartitions = nfiles * 10  # trees have 10 clusters

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Ten ranges of ten entries for each file, listed file by file
    width = 10
    ranges_reqd = []
    for filename in filenames:
        for first in range(0, 100, width):
            last = first + width
            ranges_reqd.append((first, last, [first], [last], [filename]))

    self.assertListEqual(ranges, ranges_reqd)
def test_three_files_one_partition(self):
    """
    A single partition on three files gives one range spanning all of them.
    """
    nfiles = 3
    file_indices = range(nfiles)
    treenames = [f"tree_{i}" for i in file_indices]
    filenames = [f"distrdf_unittests_file_{i}.root" for i in file_indices]
    npartitions = 1

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # Global range [0, 300); each file is taken from entry 0 to entry 100
    local_starts = [0] * nfiles
    local_ends = [100] * nfiles
    ranges_reqd = [(0, 300, local_starts, local_ends, filenames)]

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_two_files(self):
    """
    Two partitions on two files with a different number of clusters give
    one range per file.
    """
    treenames = ["myTree"] * 2
    filenames = ["backend/2clusters.root", "backend/4clusters.root"]
    npartitions = 2

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # NOTE(review): the expected tuples have the shape
    # (start, end, filenames) - confirm this matches what
    # treeranges_to_tuples returns.
    ranges_reqd = []
    for filename in filenames:
        ranges_reqd.append((0, 1000, [filename]))

    self.assertListEqual(ranges, ranges_reqd)
def build_rdf_from_range(
        current_range: Ranges.TreeRangePerc) -> TaskObjects:
    """
    Create the RDataFrame a distributed mapper will run on.

    The input percentage range is converted to a clustered range. Its tree
    names, file names and global entry boundaries are used to create an
    RDatasetSpec; friend information, if any, is attached to it through
    'attach_friend_info_if_present' and an RDataFrame is built from it.

    Returns a TaskObjects holding the RDataFrame and the entries-in-trees
    information coming from the range conversion. When no clustered range
    is available, the TaskObjects holds None in place of the RDataFrame.
    """
    treerange, entries_in_trees = Ranges.get_clustered_range_from_percs(
        current_range)

    if treerange is not None:
        trees_and_files = zip(treerange.treenames, treerange.filenames)
        entry_bounds = (treerange.globalstart, treerange.globalend)
        spec = ROOT.RDF.Experimental.RDatasetSpec(
            trees_and_files, entry_bounds)

        attach_friend_info_if_present(treerange, spec)

        return TaskObjects(ROOT.RDataFrame(spec), entries_in_trees)

    # No dataset could be defined for this task
    return TaskObjects(None, entries_in_trees)
def test_three_files_one_partition_per_file(self):
    """
    With as many partitions as files, each range covers one whole file.
    """
    nfiles = 3
    file_indices = range(nfiles)
    treenames = [f"tree_{i}" for i in file_indices]
    filenames = [f"distrdf_unittests_file_{i}.root" for i in file_indices]
    npartitions = nfiles

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # NOTE(review): the expected tuples have the shape
    # (start, end, filenames) - confirm this matches what
    # treeranges_to_tuples returns.
    ranges_reqd = []
    for filename in filenames:
        ranges_reqd.append((0, 100, [filename]))

    self.assertListEqual(ranges, ranges_reqd)
def test_rdataframe_with_notreename_and_chain_with_subtrees(self):
    """
    Check proper handling of a TChain with different subnames.

    Two files, each with a differently named tree, are written to the
    current directory and added to a TChain created without a name. The
    subtree names and file names extracted by the head node are then used
    to create two ranges, expected to be one per file.

    The files are removed in a 'finally' clause, so that they do not
    outlive the test even when one of the intermediate steps raises.
    """
    treename1, filename1 = "entries_1", "entries_1.root"
    treename2, filename2 = "entries_2", "entries_2.root"
    npartitions = 2

    try:
        # Create two dummy files
        ROOT.RDataFrame(10).Define("x", "rdfentry_").Snapshot(
            treename1, filename1)
        ROOT.RDataFrame(10).Define("x", "rdfentry_").Snapshot(
            treename2, filename2)

        chain = ROOT.TChain()
        chain.Add(str(filename1 + "?#" + treename1))
        chain.Add(str(filename2 + "?#" + treename2))

        rdf = get_headnode(None, npartitions, chain)
        extracted_subtreenames = rdf.subtreenames
        extracted_filenames = rdf.inputfiles

        percranges = Ranges.get_percentage_ranges(
            extracted_subtreenames, extracted_filenames, npartitions,
            friendinfo=None)
        clusteredranges = [
            Ranges.get_clustered_range_from_percs(percrange)[0]
            for percrange in percranges
        ]

        ranges = treeranges_to_tuples(clusteredranges)
    finally:
        # A file may be missing if its creation was what failed: removing
        # it unconditionally would hide the original error.
        for filename in (filename1, filename2):
            if os.path.exists(filename):
                os.remove(filename)

    ranges_reqd = [
        (0, 10, [0], [10], [filename1]),
        (0, 10, [0], [10], [filename2]),
    ]

    self.assertListEqual(ranges, ranges_reqd)
def test_clustered_ranges_with_two_clusters_two_partitions(self):
    """
    The ranges must follow the cluster boundaries, even when this leads to
    ranges with a different number of entries (777 and 223 here).
    """
    treenames = ["myTree"]
    filenames = ["backend/2clusters.root"]
    npartitions = 2

    percranges = Ranges.get_percentage_ranges(
        treenames, filenames, npartitions, friendinfo=None)
    clusteredranges = []
    for percrange in percranges:
        clusteredranges.append(
            Ranges.get_clustered_range_from_percs(percrange)[0])

    ranges = treeranges_to_tuples(clusteredranges)

    # The expected boundary between the two ranges is at entry 777
    first_range = (0, 777, [0], [777], ["backend/2clusters.root"])
    second_range = (777, 1000, [777], [1000], ["backend/2clusters.root"])
    ranges_reqd = [first_range, second_range]

    self.assertListEqual(ranges, ranges_reqd)