def test_nentries_not_multipleOf_npartitions(self):
    """
    Balanced ranges when the entries cannot be split evenly among the
    partitions.

    Two inputs are checked against four partitions: 10 entries, where the
    fractional part of (nentries/npartitions) is >= 0.5, and 9 entries,
    where it is < 0.5. In both expected lists the leading ranges hold one
    entry more than the trailing ones.
    """
    npartitions = 4

    # 10 / 4 = 2.5 -> fractional part >= 0.5
    produced_ten = emptysourceranges_to_tuples(
        Ranges.get_balanced_ranges(10, npartitions))

    # 9 / 4 = 2.25 -> fractional part < 0.5
    produced_nine = emptysourceranges_to_tuples(
        Ranges.get_balanced_ranges(9, npartitions))

    # Expected (start, end) pairs for each case
    self.assertListEqual(produced_ten, [(0, 3), (3, 6), (6, 8), (8, 10)])
    self.assertListEqual(produced_nine, [(0, 3), (3, 5), (5, 7), (7, 9)])
def test_nentries_multipleOf_npartitions(self):
    """
    Balanced ranges when the entries split evenly among the partitions.

    Checks 10 entries over 5 partitions (ranges of 2 entries each) and
    100 entries over 10 partitions (ranges of 10 entries each).
    """
    # Small case: 10 entries, 5 partitions
    produced_small = emptysourceranges_to_tuples(
        Ranges.get_balanced_ranges(10, 5))

    # Large case: 100 entries, 10 partitions
    produced_large = emptysourceranges_to_tuples(
        Ranges.get_balanced_ranges(100, 10))

    expected_small = [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
    # (0, 10), (10, 20), ..., (90, 100)
    expected_large = [(start, start + 10) for start in range(0, 100, 10)]

    self.assertListEqual(produced_small, expected_small)
    self.assertListEqual(produced_large, expected_large)
def test_nentries_greater_than_npartitions(self):
    """
    Balanced ranges when there are fewer entries than partitions.

    Despite the method name, the scenario exercised here is 5 entries with
    7 requested partitions. The expected outcome is five ranges holding a
    single entry each.
    """
    produced = emptysourceranges_to_tuples(
        Ranges.get_balanced_ranges(5, 7))

    expected = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]

    self.assertListEqual(produced, expected)
def build_ranges(self):
    """
    Compute the entry ranges for this dataset.

    Returns the result of ``Ranges.get_balanced_ranges`` for the current
    number of entries and partitions. When more partitions than entries
    were requested, ``self.npartitions`` is lowered to ``self.nentries``
    and a ``UserWarning`` is issued before the ranges are computed.

    Raises:
        RuntimeError: if ``self.nentries`` is falsy (e.g. zero), since an
            empty dataset cannot be processed distributedly.
    """
    # An empty dataset cannot be distributed: bail out early.
    if not self.nentries:
        error_text = ("Cannot build a distributed RDataFrame with zero entries. "
                      "Distributed computation will fail. ")
        raise RuntimeError(error_text)

    # TODO: This shouldn't be triggered if entries == 1. The current minimum
    # amount of partitions is 2. We need a robust reducer that smartly
    # becomes no-op if npartitions == 1 to avoid this.
    if self.npartitions > self.nentries:
        # Never use more partitions than there are entries.
        template = ("Number of partitions {0} is greater than number of entries {1} "
                    "in the dataframe. Using {1} partition(s)")
        warnings.warn(
            template.format(self.npartitions, self.nentries),
            category=UserWarning,
            stacklevel=2,
        )
        self.npartitions = self.nentries

    return Ranges.get_balanced_ranges(self.nentries, self.npartitions)