Esempio n. 1
0
    def build_ranges(self):
        """
        Create the ranges into which this dataset is split.

        The clusters of the input files are retrieved through
        ``Ranges.get_clusters`` and handed, together with the number of
        partitions and the friend information, to
        ``Ranges.get_clustered_ranges``. When more partitions are requested
        than clusters were retrieved, ``self.npartitions`` is lowered to the
        number of clusters and a ``UserWarning`` is emitted.

        Returns:
            Whatever ``Ranges.get_clustered_ranges`` returns for this dataset.

        Raises:
            RuntimeError: If the tree of this dataset has zero entries.
        """
        # A dataset without entries cannot be processed distributedly
        nentries = self.tree.GetEntries()
        if nentries == 0:
            empty_msg = (
                "Cannot build a distributed RDataFrame with zero entries. "
                "Distributed computation will fail. ")
            raise RuntimeError(empty_msg)

        debug_template = ("Building ranges from dataset info:\n"
                          "main treename: %s\n"
                          "names of subtrees: %s\n"
                          "input files: %s\n")
        logger.debug(debug_template, self.maintreename, self.subtreenames,
                     self.inputfiles)

        # Collect the clusters of every file that makes up the tree
        file_clusters = Ranges.get_clusters(self.subtreenames,
                                            self.inputfiles)
        cluster_count = len(file_clusters)

        # TODO: This shouldn't be triggered if len(file_clusters) == 1. The
        # current minimum amount of partitions is 2. We need a robust reducer
        # that smartly becomes no-op if npartitions == 1 to avoid this.
        # Never ask for more partitions than there are clusters in the dataset
        if self.npartitions > cluster_count:
            # Keep the warn call in this frame: stacklevel=2 must keep
            # pointing at the caller of build_ranges.
            warnings.warn(
                "Number of partitions is greater than number of clusters "
                "in the dataset. Using {} partition(s)".format(cluster_count),
                UserWarning,
                stacklevel=2)
            self.npartitions = cluster_count

        logger.debug("%s clusters will be split along %s partitions.",
                     cluster_count, self.npartitions)

        clustered_ranges = Ranges.get_clustered_ranges(
            file_clusters, self.npartitions, self.friendinfo)
        return clustered_ranges
Esempio n. 2
0
    def test_clustered_ranges_with_one_cluster(self):
        """
        Verify that ``Ranges.get_clustered_ranges`` yields exactly one range
        when a single partition is requested for the one-file dataset used
        here (described as containing a single cluster).

        """

        # Dataset description: one tree stored in one file
        rootfile = "backend/Slimmed_ntuple.root"
        treename = "TotemNtuple"
        filelist = [rootfile]

        # Expected outcome: a single (0, 10, files) tuple for the only file
        expected_ranges = [(0, 10, [rootfile])]

        # No friend trees are involved and only one partition is requested
        friendinfo = None
        npartitions = 1

        clustersinfiles = Ranges.get_clusters(treename, filelist)
        clustered = Ranges.get_clustered_ranges(
            clustersinfiles, npartitions, treename, friendinfo)
        obtained_ranges = treeranges_to_tuples(clustered)

        self.assertListEqual(obtained_ranges, expected_ranges)