Exemple #1
0
 def test_build_tier_tree(self):
     ret = build_tier_tree(self.test_devs)
     self.assertEqual(len(ret), 8)
     self.assertEqual(ret[()], set([(1,)]))
     self.assertEqual(ret[(1,)], set([(1, 1), (1, 2)]))
     self.assertEqual(ret[(1, 1)],
                      set([(1, 1, '192.168.1.2'),
                           (1, 1, '192.168.1.1')]))
     self.assertEqual(ret[(1, 2)],
                      set([(1, 2, '192.168.2.2'),
                           (1, 2, '192.168.2.1')]))
     self.assertEqual(ret[(1, 1, '192.168.1.1')],
                      set([(1, 1, '192.168.1.1', 0),
                           (1, 1, '192.168.1.1', 1),
                           (1, 1, '192.168.1.1', 2)]))
     self.assertEqual(ret[(1, 1, '192.168.1.2')],
                      set([(1, 1, '192.168.1.2', 3),
                           (1, 1, '192.168.1.2', 4),
                           (1, 1, '192.168.1.2', 5)]))
     self.assertEqual(ret[(1, 2, '192.168.2.1')],
                      set([(1, 2, '192.168.2.1', 6),
                           (1, 2, '192.168.2.1', 7),
                           (1, 2, '192.168.2.1', 8)]))
     self.assertEqual(ret[(1, 2, '192.168.2.2')],
                      set([(1, 2, '192.168.2.2', 9),
                           (1, 2, '192.168.2.2', 10),
                           (1, 2, '192.168.2.2', 11)]))
Exemple #2
0
 def test_build_tier_tree(self):
     ret = build_tier_tree(self.test_devs)
     self.assertEqual(len(ret), 8)
     self.assertEqual(ret[()], set([(1, )]))
     self.assertEqual(ret[(1, )], set([(1, 1), (1, 2)]))
     self.assertEqual(ret[(1, 1)],
                      set([(1, 1, '192.168.1.2'), (1, 1, '192.168.1.1')]))
     self.assertEqual(ret[(1, 2)],
                      set([(1, 2, '192.168.2.2'), (1, 2, '192.168.2.1')]))
     self.assertEqual(
         ret[(1, 1, '192.168.1.1')],
         set([(1, 1, '192.168.1.1', 0), (1, 1, '192.168.1.1', 1),
              (1, 1, '192.168.1.1', 2)]))
     self.assertEqual(
         ret[(1, 1, '192.168.1.2')],
         set([(1, 1, '192.168.1.2', 3), (1, 1, '192.168.1.2', 4),
              (1, 1, '192.168.1.2', 5)]))
     self.assertEqual(
         ret[(1, 2, '192.168.2.1')],
         set([(1, 2, '192.168.2.1', 6), (1, 2, '192.168.2.1', 7),
              (1, 2, '192.168.2.1', 8)]))
     self.assertEqual(
         ret[(1, 2, '192.168.2.2')],
         set([(1, 2, '192.168.2.2', 9), (1, 2, '192.168.2.2', 10),
              (1, 2, '192.168.2.2', 11)]))
Exemple #3
0
    def _build_max_replicas_by_tier(self):
        """
	返回一个字典(tier:replica_count),给ring上的所有tiers层
        Returns a dict of (tier: replica_count) for all tiers in the ring.

        There will always be a () entry as the root of the structure, whose
        replica_count will equal the ring's replica_count.

        Then there will be (dev_id,) entries for each device, indicating the
        maximum number of replicas the device might have for any given
        partition. Anything greater than 1 indicates a partition at serious
        risk, as the data on that partition will not be stored distinctly at
        the ring's replica_count.????????????

        Next there will be (dev_id, ip_port) entries for each device,
        indicating the maximum number of replicas the device shares with other
        devices on the same ip_port for any given partition. Anything greater
        than 1 indicates a partition at elevated risk, as if that ip_port were
        to fail multiple replicas of that partition would be unreachable.

        Last there will be (dev_id, ip_port, zone) entries for each device,
        indicating the maximum number of replicas the device shares with other
        devices within the same zone for any given partition. Anything greater
        than 1 indicates a partition at slightly elevated risk, as if that zone
        were to fail multiple replicas of that partition would be unreachable.

        Example return dict for the common SAIO setup::

            {(): 3,
             (1,): 1.0,
             (1, '127.0.0.1:6010'): 1.0,
             (1, '127.0.0.1:6010', 0): 1.0,
             (2,): 1.0,
             (2, '127.0.0.1:6020'): 1.0,
             (2, '127.0.0.1:6020', 1): 1.0,
             (3,): 1.0,
             (3, '127.0.0.1:6030'): 1.0,
             (3, '127.0.0.1:6030', 2): 1.0,
             (4,): 1.0,
             (4, '127.0.0.1:6040'): 1.0,
             (4, '127.0.0.1:6040', 3): 1.0}
        """
        # Used by walk_tree to know what entries to create for each recursive
        # call.
        tier2children = build_tier_tree(self._iter_devs())

        def walk_tree(tier, replica_count):
            mr = {tier: replica_count}
            if tier in tier2children:
	    #下一层
                subtiers = tier2children[tier]
                for subtier in subtiers:
                    submax = math.ceil(float(replica_count) / len(subtiers))
	#update 就是将词典内容继续追加
                    mr.update(walk_tree(subtier, submax))
            return mr
        return walk_tree((), self.replicas)
Exemple #4
0
    def _build_max_replicas_by_tier(self):
        """
        Returns a dict of (tier: replica_count) for all tiers in the ring.

        There will always be a () entry as the root of the structure, whose
        replica_count will equal the ring's replica_count.

        Then there will be (dev_id,) entries for each device, indicating the
        maximum number of replicas the device might have for any given
        partition. Anything greater than 1 indicates a partition at serious
        risk, as the data on that partition will not be stored distinctly at
        the ring's replica_count.

        Next there will be (dev_id, ip_port) entries for each device,
        indicating the maximum number of replicas the device shares with other
        devices on the same ip_port for any given partition. Anything greater
        than 1 indicates a partition at elevated risk, as if that ip_port were
        to fail multiple replicas of that partition would be unreachable.

        Last there will be (dev_id, ip_port, zone) entries for each device,
        indicating the maximum number of replicas the device shares with other
        devices within the same zone for any given partition. Anything greater
        than 1 indicates a partition at slightly elevated risk, as if that zone
        were to fail multiple replicas of that partition would be unreachable.

        Example return dict for the common SAIO setup::

            {(): 3,
             (1,): 1.0,
             (1, '127.0.0.1:6010'): 1.0,
             (1, '127.0.0.1:6010', 0): 1.0,
             (2,): 1.0,
             (2, '127.0.0.1:6020'): 1.0,
             (2, '127.0.0.1:6020', 1): 1.0,
             (3,): 1.0,
             (3, '127.0.0.1:6030'): 1.0,
             (3, '127.0.0.1:6030', 2): 1.0,
             (4,): 1.0,
             (4, '127.0.0.1:6040'): 1.0,
             (4, '127.0.0.1:6040', 3): 1.0}
        """
        # Used by walk_tree to know what entries to create for each recursive
        # call.
        tier2children = build_tier_tree(self._iter_devs())

        def walk_tree(tier, replica_count):
            mr = {tier: replica_count}
            if tier in tier2children:
                subtiers = tier2children[tier]
                for subtier in subtiers:
                    submax = math.ceil(float(replica_count) / len(subtiers))
                    mr.update(walk_tree(subtier, submax))
            return mr

        return walk_tree((), self.replicas)
Exemple #5
0
 def test_build_tier_tree(self):
     ret = build_tier_tree(self.test_devs)
     self.assertEqual(len(ret), 8)
     self.assertEqual(ret[()], set([(1,)]))
     self.assertEqual(ret[(1,)], set([(1, 1), (1, 2)]))
     self.assertEqual(ret[(1, 1)], set([(1, 1, "192.168.1.2:6000"), (1, 1, "192.168.1.1:6000")]))
     self.assertEqual(ret[(1, 2)], set([(1, 2, "192.168.2.2:6000"), (1, 2, "192.168.2.1:6000")]))
     self.assertEqual(
         ret[(1, 1, "192.168.1.1:6000")],
         set([(1, 1, "192.168.1.1:6000", 0), (1, 1, "192.168.1.1:6000", 1), (1, 1, "192.168.1.1:6000", 2)]),
     )
     self.assertEqual(
         ret[(1, 1, "192.168.1.2:6000")],
         set([(1, 1, "192.168.1.2:6000", 3), (1, 1, "192.168.1.2:6000", 4), (1, 1, "192.168.1.2:6000", 5)]),
     )
     self.assertEqual(
         ret[(1, 2, "192.168.2.1:6000")],
         set([(1, 2, "192.168.2.1:6000", 6), (1, 2, "192.168.2.1:6000", 7), (1, 2, "192.168.2.1:6000", 8)]),
     )
     self.assertEqual(
         ret[(1, 2, "192.168.2.2:6000")],
         set([(1, 2, "192.168.2.2:6000", 9), (1, 2, "192.168.2.2:6000", 10), (1, 2, "192.168.2.2:6000", 11)]),
     )
Exemple #6
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different zones are considered the
        farthest-apart things, followed by different ip/port pairs within a
        zone; the least-far-apart things are different devices with the same
        ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2children = build_tier_tree(available_devs)

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        tiers_by_depth = defaultdict(set)
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                tiers_by_depth[len(tier)].add(tier)

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (zones, ip_ports, and devices) the
            # replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1

            def find_home_for_replica(tier=(), depth=1):
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                candidate_tiers = tier2children[tier]
                min_count = min(other_replicas[t] for t in candidate_tiers)
                candidate_tiers = [
                    t for t in candidate_tiers
                    if other_replicas[t] == min_count
                ]
                candidate_tiers.sort(key=lambda t: tier2sort_key[t][-1])

                if depth == max(tiers_by_depth.keys()):
                    return tier2devs[candidate_tiers[-1]][-1]

                return find_home_for_replica(tier=candidate_tiers[-1],
                                             depth=depth + 1)

            for replica in replace_replicas:
                dev = find_home_for_replica()
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
Exemple #7
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=lambda t: tier2sort_key[t][-1])
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(
                    lambda t: tier2sort_key[t][-1], child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.
                    candidate_tiers = tier2children[tier]
                    candidates_with_replicas = \
                        unique_tiers_by_tier_len[len(tier) + 1]
                    if len(candidate_tiers) > len(candidates_with_replicas):
                        # There exists at least one tier with 0 other replicas,
                        # so work backward among the candidates, accepting the
                        # first which isn't in other_replicas.
                        #
                        # This optimization is to avoid calling the min()
                        # below, which is expensive if you've got thousands of
                        # drives.
                        for t in reversed(candidate_tiers):
                            if other_replicas[t] == 0:
                                tier = t
                                break
                    else:
                        min_count = min(other_replicas[t]
                                        for t in candidate_tiers)
                        tier = (t for t in reversed(candidate_tiers)
                                if other_replicas[t] == min_count).next()
                    depth += 1
                dev = tier2devs[tier][-1]
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                    # Now jiggle tier2children values to keep them sorted
                    new_last_sort_key = tier2sort_key[tier][-1]
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(
                        new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
Exemple #8
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        parts_available_in_tier = defaultdict(int)
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
            tiers = tiers_for_dev(dev)
            dev['tiers'] = tiers
            for tier in tiers:
                # Note: this represents how many partitions may be assigned to
                # a given tier (region/zone/server/disk). It does not take
                # into account how many partitions a given tier wants to shed.
                #
                # If we did not do this, we could have a zone where, at some
                # point during assignment, number-of-parts-to-gain equals
                # number-of-parts-to-shed. At that point, no further placement
                # into that zone would occur since its parts_available_in_tier
                # would be 0. This would happen any time a zone had any device
                # with partitions to shed, which is any time a device is being
                # removed, which is a pretty frequent operation.
                parts_available_in_tier[tier] += max(dev['parts_wanted'], 0)

        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in dev['tiers']:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev['sort_key'])
                tier2sort_key[tier] = dev['sort_key']
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(
                    tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            occupied_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev['tiers']:
                        other_replicas[tier] += 1
                        occupied_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                # Find a new home for this replica
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas and that have
                    # room to accept more partitions, pick the tier with
                    # the hungriest drive and then continue searching in
                    # that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator prioritizes even device
                    # filling over dispersion, so if your layout is
                    # extremely unbalanced, you may not get the replica
                    # dispersion that you expect, and your durability
                    # may be lessened.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.

                    # We sort the tiers here so that, when we look for a tier
                    # with the lowest number of replicas, the first one we
                    # find is the one with the hungriest drive (i.e. drive
                    # with the largest sort_key value). This lets us
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = \
                        occupied_tiers_by_tier_len[len(tier) + 1]

                    # Among the tiers with room for more partitions,
                    # find one with the smallest possible number of
                    # replicas already in it, breaking ties by which one
                    # has the hungriest drive.
                    candidates_with_room = [
                        t for t in tier2children[tier]
                        if parts_available_in_tier[t] > 0]

                    if len(candidates_with_room) > \
                            len(candidates_with_replicas):
                        # There exists at least one tier with room for
                        # another partition and 0 other replicas already
                        # in it, so we can use a faster search. The else
                        # branch's search would work here, but it's
                        # significantly slower.
                        tier = max((t for t in candidates_with_room
                                    if other_replicas[t] == 0),
                                   key=tier2sort_key.__getitem__)
                    else:
                        tier = max(candidates_with_room,
                                   key=lambda t: (-other_replicas[t],
                                                  tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in dev['tiers']:
                    parts_available_in_tier[tier] -= 1
                    other_replicas[tier] += 1
                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(
                        tier2children_sort_key[parent_tier],
                        new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(
                        new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']
            del dev['tiers']
Exemple #9
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev["sort_key"] = self._sort_key_for(dev)

        available_devs = sorted((d for d in self._iter_devs() if d["weight"]), key=lambda x: x["sort_key"])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            dev["tiers"] = tiers_for_dev(dev)
            for tier in dev["tiers"]:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev["sort_key"])
                tier2sort_key[tier] = dev["sort_key"]
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            unique_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev["tiers"]:
                        other_replicas[tier] += 1
                        unique_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas, pick the
                    # tier with the hungriest drive and then continue
                    # searching in that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator will balance things as
                    # evenly as possible at each level of the device
                    # layout. If your layout is extremely unbalanced,
                    # this may produce poor results.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.

                    # We sort the tiers here so that, when we look for a tier
                    # with the lowest number of replicas, the first one we
                    # find is the one with the hungriest drive (i.e. drive
                    # with the largest sort_key value). This lets us
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = unique_tiers_by_tier_len[len(tier) + 1]
                    # Find a tier with the minimal replica count and the
                    # hungriest drive among all the tiers with the minimal
                    # replica count.
                    if len(tier2children[tier]) > len(candidates_with_replicas):
                        # There exists at least one tier with 0 other replicas
                        tier = max(
                            (t for t in tier2children[tier] if other_replicas[t] == 0), key=tier2sort_key.__getitem__
                        )
                    else:
                        tier = max(tier2children[tier], key=lambda t: (-other_replicas[t], tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev["parts_wanted"] -= 1
                dev["parts"] += 1
                old_sort_key = dev["sort_key"]
                new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
                for tier in dev["tiers"]:
                    other_replicas[tier] += 1
                    unique_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier], old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier], new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(tier2children_sort_key[parent_tier], old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(tier2children_sort_key[parent_tier], new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev["id"]

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev["sort_key"]
            dev.pop("tiers", None)  # May be absent for devices w/o weight
Exemple #10
0
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different regions are considered the
        farthest-apart things, followed by zones, then different ip/port pairs
        within a zone; the least-far-apart things are different devices with
        the same ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        parts_available_in_tier = defaultdict(int)
        for dev in self._iter_devs():
            dev["sort_key"] = self._sort_key_for(dev)
            tiers = tiers_for_dev(dev)
            dev["tiers"] = tiers
            for tier in tiers:
                # Note: this represents how many partitions may be assigned to
                # a given tier (region/zone/server/disk). It does not take
                # into account how many partitions a given tier wants to shed.
                #
                # If we did not do this, we could have a zone where, at some
                # point during assignment, number-of-parts-to-gain equals
                # number-of-parts-to-shed. At that point, no further placement
                # into that zone would occur since its parts_available_in_tier
                # would be 0. This would happen any time a zone had any device
                # with partitions to shed, which is any time a device is being
                # removed, which is a pretty frequent operation.
                parts_available_in_tier[tier] += max(dev["parts_wanted"], 0)

        available_devs = sorted((d for d in self._iter_devs() if d["weight"]), key=lambda x: x["sort_key"])

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(tuple)
        tier2dev_sort_key = defaultdict(list)
        max_tier_depth = 0
        for dev in available_devs:
            for tier in dev["tiers"]:
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2dev_sort_key[tier].append(dev["sort_key"])
                tier2sort_key[tier] = dev["sort_key"]
                if len(tier) > max_tier_depth:
                    max_tier_depth = len(tier)

        tier2children_sets = build_tier_tree(available_devs)
        tier2children = defaultdict(list)
        tier2children_sort_key = {}
        tiers_list = [()]
        depth = 1
        while depth <= max_tier_depth:
            new_tiers_list = []
            for tier in tiers_list:
                child_tiers = list(tier2children_sets[tier])
                child_tiers.sort(key=tier2sort_key.__getitem__)
                tier2children[tier] = child_tiers
                tier2children_sort_key[tier] = map(tier2sort_key.__getitem__, child_tiers)
                new_tiers_list.extend(child_tiers)
            tiers_list = new_tiers_list
            depth += 1

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (regions, zones, ip/ports, and
            # devices) the replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(int)
            occupied_tiers_by_tier_len = defaultdict(set)
            for replica in self._replicas_for_part(part):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in dev["tiers"]:
                        other_replicas[tier] += 1
                        occupied_tiers_by_tier_len[len(tier)].add(tier)

            for replica in replace_replicas:
                # Find a new home for this replica
                tier = ()
                depth = 1
                while depth <= max_tier_depth:
                    # Order the tiers by how many replicas of this
                    # partition they already have. Then, of the ones
                    # with the smallest number of replicas and that have
                    # room to accept more partitions, pick the tier with
                    # the hungriest drive and then continue searching in
                    # that subtree.
                    #
                    # There are other strategies we could use here,
                    # such as hungriest-tier (i.e. biggest
                    # sum-of-parts-wanted) or picking one at random.
                    # However, hungriest-drive is what was used here
                    # before, and it worked pretty well in practice.
                    #
                    # Note that this allocator prioritizes even device
                    # filling over dispersion, so if your layout is
                    # extremely unbalanced, you may not get the replica
                    # dispersion that you expect, and your durability
                    # may be lessened.
                    #
                    # This used to be a cute, recursive function, but it's been
                    # unrolled for performance.

                    # We sort the tiers here so that, when we look for a tier
                    # with the lowest number of replicas, the first one we
                    # find is the one with the hungriest drive (i.e. drive
                    # with the largest sort_key value). This lets us
                    # short-circuit the search while still ensuring we get the
                    # right tier.
                    candidates_with_replicas = occupied_tiers_by_tier_len[len(tier) + 1]

                    # Among the tiers with room for more partitions,
                    # find one with the smallest possible number of
                    # replicas already in it, breaking ties by which one
                    # has the hungriest drive.
                    candidates_with_room = [t for t in tier2children[tier] if parts_available_in_tier[t] > 0]

                    if len(candidates_with_room) > len(candidates_with_replicas):
                        # There exists at least one tier with room for
                        # another partition and 0 other replicas already
                        # in it, so we can use a faster search. The else
                        # branch's search would work here, but it's
                        # significantly slower.
                        tier = max(
                            (t for t in candidates_with_room if other_replicas[t] == 0), key=tier2sort_key.__getitem__
                        )
                    else:
                        tier = max(candidates_with_room, key=lambda t: (-other_replicas[t], tier2sort_key[t]))
                    depth += 1
                dev = tier2devs[tier][-1]
                dev["parts_wanted"] -= 1
                dev["parts"] += 1
                old_sort_key = dev["sort_key"]
                new_sort_key = dev["sort_key"] = self._sort_key_for(dev)
                for tier in dev["tiers"]:
                    parts_available_in_tier[tier] -= 1
                    other_replicas[tier] += 1
                    occupied_tiers_by_tier_len[len(tier)].add(tier)

                    index = bisect.bisect_left(tier2dev_sort_key[tier], old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2dev_sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2dev_sort_key[tier], new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2dev_sort_key[tier].insert(new_index, new_sort_key)

                    new_last_sort_key = tier2dev_sort_key[tier][-1]
                    tier2sort_key[tier] = new_last_sort_key

                    # Now jiggle tier2children values to keep them sorted
                    parent_tier = tier[0:-1]
                    index = bisect.bisect_left(tier2children_sort_key[parent_tier], old_sort_key)
                    popped = tier2children[parent_tier].pop(index)
                    tier2children_sort_key[parent_tier].pop(index)

                    new_index = bisect.bisect_left(tier2children_sort_key[parent_tier], new_last_sort_key)
                    tier2children[parent_tier].insert(new_index, popped)
                    tier2children_sort_key[parent_tier].insert(new_index, new_last_sort_key)

                self._replica2part2dev[replica][part] = dev["id"]

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev["sort_key"]
            del dev["tiers"]
    def _reassign_parts(self, reassign_parts):
        """
        For an existing ring data set, partitions are reassigned similarly to
        the initial assignment. The devices are ordered by how many partitions
        they still want and kept in that order throughout the process. The
        gathered partitions are iterated through, assigning them to devices
        according to the "most wanted" while keeping the replicas as "far
        apart" as possible. Two different zones are considered the
        farthest-apart things, followed by different ip/port pairs within a
        zone; the least-far-apart things are different devices with the same
        ip/port pair in the same zone.

        If you want more replicas than devices, you won't get all your
        replicas.

        :param reassign_parts: An iterable of (part, replicas_to_replace)
                               pairs. replicas_to_replace is an iterable of the
                               replica (an int) to replace for that partition.
                               replicas_to_replace may be shared for multiple
                               partitions, so be sure you do not modify it.
        """
        for dev in self._iter_devs():
            dev['sort_key'] = self._sort_key_for(dev)
        available_devs = \
            sorted((d for d in self._iter_devs() if d['weight']),
                   key=lambda x: x['sort_key'])

        tier2children = build_tier_tree(available_devs)

        tier2devs = defaultdict(list)
        tier2sort_key = defaultdict(list)
        tiers_by_depth = defaultdict(set)
        for dev in available_devs:
            for tier in tiers_for_dev(dev):
                tier2devs[tier].append(dev)  # <-- starts out sorted!
                tier2sort_key[tier].append(dev['sort_key'])
                tiers_by_depth[len(tier)].add(tier)

        for part, replace_replicas in reassign_parts:
            # Gather up what other tiers (zones, ip_ports, and devices) the
            # replicas not-to-be-moved are in for this part.
            other_replicas = defaultdict(lambda: 0)
            for replica in xrange(self.replicas):
                if replica not in replace_replicas:
                    dev = self.devs[self._replica2part2dev[replica][part]]
                    for tier in tiers_for_dev(dev):
                        other_replicas[tier] += 1

            def find_home_for_replica(tier=(), depth=1):
                # Order the tiers by how many replicas of this
                # partition they already have. Then, of the ones
                # with the smallest number of replicas, pick the
                # tier with the hungriest drive and then continue
                # searching in that subtree.
                #
                # There are other strategies we could use here,
                # such as hungriest-tier (i.e. biggest
                # sum-of-parts-wanted) or picking one at random.
                # However, hungriest-drive is what was used here
                # before, and it worked pretty well in practice.
                #
                # Note that this allocator will balance things as
                # evenly as possible at each level of the device
                # layout. If your layout is extremely unbalanced,
                # this may produce poor results.
                candidate_tiers = tier2children[tier]
                min_count = min(other_replicas[t] for t in candidate_tiers)
                candidate_tiers = [t for t in candidate_tiers
                                   if other_replicas[t] == min_count]
                candidate_tiers.sort(
                    key=lambda t: tier2sort_key[t][-1])

                if depth == max(tiers_by_depth.keys()):
                    return tier2devs[candidate_tiers[-1]][-1]

                return find_home_for_replica(tier=candidate_tiers[-1],
                                             depth=depth + 1)

            for replica in replace_replicas:
                dev = find_home_for_replica()
                dev['parts_wanted'] -= 1
                dev['parts'] += 1
                old_sort_key = dev['sort_key']
                new_sort_key = dev['sort_key'] = self._sort_key_for(dev)
                for tier in tiers_for_dev(dev):
                    other_replicas[tier] += 1

                    index = bisect.bisect_left(tier2sort_key[tier],
                                               old_sort_key)
                    tier2devs[tier].pop(index)
                    tier2sort_key[tier].pop(index)

                    new_index = bisect.bisect_left(tier2sort_key[tier],
                                                   new_sort_key)
                    tier2devs[tier].insert(new_index, dev)
                    tier2sort_key[tier].insert(new_index, new_sort_key)

                self._replica2part2dev[replica][part] = dev['id']

        # Just to save memory and keep from accidental reuse.
        for dev in self._iter_devs():
            del dev['sort_key']