def build_ranges(self):
    """
    Create the ranges that partition this dataset.

    The clusters of the input files are retrieved through
    ``Ranges.get_clusters`` and handed to ``Ranges.get_clustered_ranges``
    together with the number of partitions and the friend information.

    As a side effect, ``self.npartitions`` is lowered to the number of
    clusters (and a ``UserWarning`` is issued) whenever more partitions
    than clusters were requested.

    Returns:
        The value returned by ``Ranges.get_clustered_ranges``.

    Raises:
        RuntimeError: If the tree of this dataset has zero entries.
    """
    # A dataset without entries cannot be processed distributedly, so bail
    # out before doing any other work.
    if self.tree.GetEntries() == 0:
        raise RuntimeError(
            "Cannot build a distributed RDataFrame with zero entries. "
            "Distributed computation will fail. ")

    logger.debug(
        "Building ranges from dataset info:\n"
        "main treename: %s\n"
        "names of subtrees: %s\n"
        "input files: %s\n",
        self.maintreename, self.subtreenames, self.inputfiles)

    # Clusters found across all the files of the tree.
    clusters = Ranges.get_clusters(self.subtreenames, self.inputfiles)
    cluster_count = len(clusters)

    # TODO: This shouldn't be triggered if len(clusters) == 1. The current
    # minimum amount of partitions is 2. We need a robust reducer that
    # smartly becomes no-op if npartitions == 1 to avoid this.
    # Cap the number of partitions at the number of available clusters.
    if cluster_count < self.npartitions:
        warnings.warn(
            "Number of partitions is greater than number of clusters "
            "in the dataset. Using {} partition(s)".format(cluster_count),
            UserWarning,
            stacklevel=2)
        self.npartitions = cluster_count

    logger.debug("%s clusters will be split along %s partitions.",
                 cluster_count, self.npartitions)

    return Ranges.get_clustered_ranges(
        clusters, self.npartitions, self.friendinfo)
def test_clustered_ranges_with_one_cluster(self):
    """
    Check that `Ranges.get_clustered_ranges` yields exactly one range when
    the dataset is read with a single partition.

    The expected range spans entries 0 to 10 of the only input file.
    """
    tree = "TotemNtuple"

    # Clusters of the single input file; no friend information is supplied.
    clusters = Ranges.get_clusters(tree, ["backend/Slimmed_ntuple.root"])
    treeranges = Ranges.get_clustered_ranges(clusters, 1, tree, None)

    observed = treeranges_to_tuples(treeranges)
    expected = [(0, 10, ["backend/Slimmed_ntuple.root"])]

    self.assertListEqual(observed, expected)