Esempio n. 1
0
    def test_nentries_not_multipleOf_npartitions(self):
        """
        Check the balanced ranges obtained when the entries cannot be split
        evenly among the partitions.

        Two entry counts are tried against the same number of partitions:
        one where the fractional part of (nentries/npartitions) is >= 0.5
        and one where it is < 0.5.
        """
        npartitions = 4

        # 10 / 4 = 2.5 -> fractional part >= 0.5
        actual_high_fraction = emptysourceranges_to_tuples(
            Ranges.get_balanced_ranges(10, npartitions))

        # 9 / 4 = 2.25 -> fractional part < 0.5
        actual_low_fraction = emptysourceranges_to_tuples(
            Ranges.get_balanced_ranges(9, npartitions))

        # Expected (start, end) pairs; in both cases the larger ranges
        # come first in the list.
        expected_high_fraction = [(0, 3), (3, 6), (6, 8), (8, 10)]
        expected_low_fraction = [(0, 3), (3, 5), (5, 7), (7, 9)]

        self.assertListEqual(actual_high_fraction, expected_high_fraction)
        self.assertListEqual(actual_low_fraction, expected_low_fraction)
Esempio n. 2
0
    def test_nentries_multipleOf_npartitions(self):
        """
        Check the balanced ranges obtained when the entries split evenly
        among the partitions.

        Covers a small case (10 entries, 5 partitions) and a larger one
        (100 entries, 10 partitions); every expected range has the same
        width.
        """
        # Small case: 10 entries over 5 partitions
        actual_small = emptysourceranges_to_tuples(
            Ranges.get_balanced_ranges(10, 5))

        # Large case: 100 entries over 10 partitions
        actual_large = emptysourceranges_to_tuples(
            Ranges.get_balanced_ranges(100, 10))

        expected_small = [(0, 2), (2, 4), (4, 6), (6, 8), (8, 10)]
        expected_large = [
            (0, 10), (10, 20), (20, 30), (30, 40), (40, 50),
            (50, 60), (60, 70), (70, 80), (80, 90), (90, 100),
        ]

        self.assertListEqual(actual_small, expected_small)
        self.assertListEqual(actual_large, expected_large)
Esempio n. 3
0
    def test_nentries_greater_than_npartitions(self):
        """
        Check the balanced ranges obtained when there are fewer entries
        than partitions.

        With 5 entries and 7 requested partitions, the expected result is
        5 ranges holding a single entry each.

        NOTE(review): the method name says "greater_than", but the scenario
        exercised here is nentries < npartitions.
        """
        expected = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]

        # 5 entries requested over 7 partitions
        actual = emptysourceranges_to_tuples(
            Ranges.get_balanced_ranges(5, 7))

        self.assertListEqual(actual, expected)
Esempio n. 4
0
 def build_ranges(self):
     """
     Build the ranges for this dataset.

     Returns the result of ``Ranges.get_balanced_ranges`` called with the
     current ``nentries`` and ``npartitions``. If more partitions than
     entries were requested, a ``UserWarning`` is issued and
     ``self.npartitions`` is lowered to the number of entries beforehand.

     Raises:
         RuntimeError: if ``self.nentries`` is falsy (e.g. zero).
     """
     entries = self.nentries

     # Empty datasets cannot be processed distributedly
     if not entries:
         raise RuntimeError(
             ("Cannot build a distributed RDataFrame with zero entries. "
              "Distributed computation will fail. "))

     # TODO: This shouldn't be triggered if entries == 1. The current minimum
     # amount of partitions is 2. We need a robust reducer that smartly
     # becomes no-op if npartitions == 1 to avoid this.
     too_many_partitions = self.npartitions > entries
     if too_many_partitions:
         # Cap 'npartitions' at 'nentries' and tell the caller about it;
         # the warning is emitted from this frame so stacklevel=2 keeps
         # pointing at the caller of build_ranges.
         template = ("Number of partitions {0} is greater than number of entries {1} "
                     "in the dataframe. Using {1} partition(s)")
         warnings.warn(template.format(self.npartitions, entries), UserWarning, stacklevel=2)
         self.npartitions = entries

     return Ranges.get_balanced_ranges(entries, self.npartitions)