Example #1
    def schedule(self, deadline, chunk=1000):
        # TODO: This doesn't lead to a fair balancing of workers; ideally each
        # scheduling task would be executed by a different process for each
        # host. There is also no failure isolation here, so a single shard
        # failure prevents the remaining shards from being scheduled.
        for host in self.cluster.hosts:
            connection = self.cluster.get_local_client(host)

            with Lock('{0}:s:{1}'.format(self.namespace, host), nowait=True, timeout=30):
                # Prevent a runaway loop by setting a maximum number of
                # iterations. Note that this limits the total number of
                # expected items in any specific scheduling interval to chunk *
                # maximum_iterations.
                maximum_iterations = 1000
                for i in xrange(maximum_iterations):
                    items = connection.zrangebyscore(
                        make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING),
                        min=0,
                        max=deadline,
                        withscores=True,
                        start=0,
                        num=chunk,
                    )

                    # XXX: Redis will error if we try to execute an empty
                    # transaction. If there are no items to move between states, we
                    # need to exit the loop now. (This can happen on the first
                    # iteration of the loop if there is nothing to do, or on a
                    # subsequent iteration if there was exactly the same number of
                    # items to change states as the chunk size.)
                    if not items:
                        break

                    with connection.pipeline() as pipeline:
                        pipeline.multi()

                        pipeline.zrem(
                            make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING),
                            *[key for key, timestamp in items]
                        )

                        pipeline.zadd(
                            make_schedule_key(self.namespace, SCHEDULE_STATE_READY),
                            *itertools.chain.from_iterable([(timestamp, key) for (key, timestamp) in items])
                        )

                        for key, timestamp in items:
                            yield ScheduleEntry(key, timestamp)

                        pipeline.execute()

                    # If we retrieved fewer items than the chunk size, we don't
                    # need to try to retrieve more items.
                    if len(items) < chunk:
                        break
                else:
                    raise RuntimeError('loop exceeded maximum iterations (%s)' % (maximum_iterations,))
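Each pass of the loop above moves due timelines from the WAITING sorted set to the READY sorted set inside a single Redis transaction. A minimal, self-contained sketch of that hand-off against plain redis-py 3.x (an assumption; the snippets use an older zadd calling convention and the project's own cluster client), with illustrative key names standing in for make_schedule_key:

    import time

    import redis

    client = redis.Redis()
    WAITING = "digests:s:w"  # stand-in for make_schedule_key(namespace, SCHEDULE_STATE_WAITING)
    READY = "digests:s:r"    # stand-in for make_schedule_key(namespace, SCHEDULE_STATE_READY)

    # Seed two timelines; the sorted-set score is the next scheduled run time.
    client.zadd(WAITING, {"timeline:1": time.time() - 5, "timeline:2": time.time() + 60})

    deadline = time.time()
    due = client.zrangebyscore(WAITING, min=0, max=deadline, withscores=True, start=0, num=1000)

    if due:  # never execute an empty transaction, per the XXX comment above
        # redis-py pipelines are transactional (MULTI/EXEC) by default.
        with client.pipeline() as pipeline:
            pipeline.zrem(WAITING, *(key for key, _ in due))
            pipeline.zadd(READY, {key: score for key, score in due})
            pipeline.execute()

Only timeline:1 lands in READY here, since timeline:2 is scheduled past the deadline.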
Example #2
    def __schedule_partition(self, host, deadline, chunk):
        connection = self.cluster.get_local_client(host)

        lock = self.locks.get(
            '{0}:s:{1}'.format(self.namespace, host),
            duration=30,
            routing_key=host,
        )

        with lock.acquire():
            # Prevent a runaway loop by setting a maximum number of
            # iterations. Note that this limits the total number of
            # expected items in any specific scheduling interval to chunk *
            # maximum_iterations.
            maximum_iterations = 1000
            for i in range(maximum_iterations):
                items = connection.zrangebyscore(
                    make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING),
                    min=0,
                    max=deadline,
                    withscores=True,
                    start=0,
                    num=chunk,
                )

                # XXX: Redis will error if we try to execute an empty
                # transaction. If there are no items to move between states, we
                # need to exit the loop now. (This can happen on the first
                # iteration of the loop if there is nothing to do, or on a
                # subsequent iteration if there was exactly the same number of
                # items to change states as the chunk size.)
                if not items:
                    break

                with connection.pipeline() as pipeline:
                    pipeline.multi()

                    pipeline.zrem(
                        make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING),
                        *[key for key, timestamp in items]
                    )

                    pipeline.zadd(
                        make_schedule_key(self.namespace, SCHEDULE_STATE_READY),
                        *itertools.chain.from_iterable([(timestamp, key) for (key, timestamp) in items])
                    )

                    for key, timestamp in items:
                        yield ScheduleEntry(key, timestamp)

                    pipeline.execute()

                # If we retrieved fewer items than the chunk size, we don't
                # need to try to retrieve more items.
                if len(items) < chunk:
                    break
            else:
                raise RuntimeError('loop exceeded maximum iterations (%s)' % (maximum_iterations,))
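Example #2 replaces the inline Lock(...) with a lock manager (self.locks.get(...)) whose acquire() is used as a context manager and which routes the lock to the host it guards. As a rough sketch of the semantics the snippet relies on, assuming nothing about the project's actual Lock implementation, a minimal Redis SET NX EX based mutex could look like this:

    import contextlib
    import uuid

    import redis

    class SimpleLock:
        """Toy mutual-exclusion lock; an assumption, not the project's Lock class."""

        def __init__(self, client, key, duration):
            self.client = client
            self.key = key
            self.duration = duration

        @contextlib.contextmanager
        def acquire(self):
            token = uuid.uuid4().hex
            # NX: only set if the key does not already exist; EX: auto-expire
            # after ``duration`` seconds so a crashed holder cannot wedge the lock.
            if not self.client.set(self.key, token, nx=True, ex=self.duration):
                raise RuntimeError("could not acquire lock on %r" % self.key)
            try:
                yield
            finally:
                # Best-effort release: only delete the key if we still hold it.
                # (A production lock would do this check-and-delete atomically.)
                if self.client.get(self.key) == token.encode():
                    self.client.delete(self.key)

    # Usage mirrors the snippet above:
    # with SimpleLock(redis.Redis(), "namespace:s:host", duration=30).acquire():
    #     ...  # perform the per-partition scheduling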
Example #3
    def schedule(self, deadline, timestamp=None):
        if timestamp is None:
            timestamp = time.time()

        for host in self.cluster.hosts:
            try:
                for key, timestamp in self.__schedule_partition(
                        host, deadline, timestamp):
                    yield ScheduleEntry(key, float(timestamp))
            except Exception as error:
                logger.error(
                    'Failed to perform scheduling for partition %r due to error: %r',
                    host,
                    error,
                    exc_info=True)
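Example #3 adds the failure isolation that the TODO in Example #1 asks for: an exception while scheduling one partition is logged and the loop simply moves on to the next host. A hypothetical caller just iterates the generator; deliver_digest below is a stand-in for whatever consumes a due timeline, and no entries are moved to READY until the generator is actually consumed:

    import time

    # ``scheduler`` is assumed to be an instance of the scheduling backend the
    # methods above belong to; ``deliver_digest`` is a hypothetical consumer.
    for entry in scheduler.schedule(deadline=time.time()):
        deliver_digest(entry.key)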
Example #4
    def schedule(
        self,
        deadline: float,
        timestamp: Optional[float] = None
    ) -> Optional[Iterable[ScheduleEntry]]:
        if timestamp is None:
            timestamp = time.time()

        for host in self.cluster.hosts:
            try:
                for key, timestamp in self.__schedule_partition(
                        host, deadline, timestamp):
                    yield ScheduleEntry(key.decode("utf-8"), float(timestamp))
            except Exception as error:
                logger.error(
                    f"Failed to perform scheduling for partition {host} due to error: {error}",
                    exc_info=True,
                )
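Example #4 is the typed variant of the same method; the extra key.decode("utf-8") is there because redis-py returns sorted-set members as bytes unless the client was created with decode_responses=True. A small standalone check of that assumption:

    import redis

    client = redis.Redis()  # default client: decode_responses=False
    client.zadd("demo:zset", {"timeline:1": 1.0})
    (member, score), = client.zrangebyscore("demo:zset", min=0, max=2, withscores=True)
    assert isinstance(member, bytes)
    assert member.decode("utf-8") == "timeline:1"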
Example #5
    def __maintenance_partition(self, host, deadline, chunk):
        connection = self.cluster.get_local_client(host)

        extra = 0
        start = 0
        maximum_iterations = 1000
        for i in range(maximum_iterations):
            fetch_size = chunk + extra
            entries = [
                ScheduleEntry(*x) for x in (connection.zrangebyscore(
                    make_schedule_key(self.namespace, SCHEDULE_STATE_READY),
                    min=start,
                    max=deadline,
                    withscores=True,
                    start=0,
                    num=fetch_size,
                ))
            ]

            def try_lock(entry):
                """
                Attempt to immediately acquire a lock on the timeline at
                ``entry.key``, returning a ``(lock, entry)`` pair where
                ``lock`` is ``None`` if the lock could not be acquired.
                """
                timeline_key = make_timeline_key(self.namespace, entry.key)
                lock = self.locks.get(
                    timeline_key,
                    duration=5,
                    routing_key=timeline_key,
                )
                try:
                    lock.acquire()
                except Exception:
                    lock = None
                return lock, entry

            # Try to take out a lock on each entry. If we can't acquire the
            # lock, that means this is currently being digested and cannot be
            # rescheduled.
            can_reschedule = ([], [])  # indexed by whether the lock was acquired (False, True)
            for result in map(try_lock, entries):
                can_reschedule[result[0] is not None].append(result)

            logger.debug('Fetched %s items, able to reschedule %s.',
                         len(entries), len(can_reschedule[True]))

            # Set the start position for the next query. (If there are no
            # items, we don't need to worry about this, since there won't
            # be a next query.) If all items share the same score and are
            # locked, the iterator will never advance (we will keep trying
            # to schedule the same locked items over and over) and either
            # eventually progress slowly as items are unlocked, or hit the
            # maximum iterations boundary. A possible solution to this
            # would be to count the number of items that have the maximum
            # score in this page that we assume we can't acquire (since we
            # couldn't acquire the lock this iteration) and add that count
            # to the next query limit. (This unfortunately could also
            # lead to unbounded growth too, so we have to limit it as well.)
            if entries:
                start = entries[-1].timestamp
                extra = min(
                    ilen(
                        itertools.takewhile(
                            # Count trailing (lock, entry) pairs that share
                            # the final score but could not be locked.
                            lambda x: x[1].timestamp == start,
                            can_reschedule[False][::-1],
                        ),
                    ),
                    chunk,
                )

            # XXX: We need to perform this check before the transaction to
            # ensure that we don't execute an empty transaction. (We'll
            # need to perform a similar check after the completion of the
            # transaction as well.)
            if not can_reschedule[True]:
                if len(entries) == fetch_size:
                    # There is nothing to reschedule in this chunk, but we
                    # need to check whether there are more after this chunk.
                    continue
                else:
                    # There is nothing to unlock, and we've exhausted all items.
                    break

            try:
                with connection.pipeline() as pipeline:
                    pipeline.multi()

                    pipeline.zrem(
                        make_schedule_key(self.namespace,
                                          SCHEDULE_STATE_READY),
                        *[entry.key for (lock, entry) in can_reschedule[True]])

                    should_reschedule = ([], [])  # indexed by whether the entry should be rescheduled (False, True)
                    timeout = time.time() - self.ttl
                    for lock, entry in can_reschedule[True]:
                        should_reschedule[entry.timestamp > timeout].append(
                            entry)

                    logger.debug(
                        'Identified %s items that should be rescheduled, and %s that will be removed.',
                        len(should_reschedule[True]),
                        len(should_reschedule[False]),
                    )

                    # Move items that should be rescheduled to the waiting state.
                    if should_reschedule[True]:
                        pipeline.zadd(
                            make_schedule_key(self.namespace,
                                              SCHEDULE_STATE_WAITING),
                            *itertools.chain.from_iterable([
                                (entry.timestamp, entry.key)
                                for entry in should_reschedule[True]
                            ]))

                    # Clear out timelines that should not be rescheduled.
                    # Ideally this path is never actually hit, but this can
                    # happen if the queue is **extremely** backlogged, or if a
                    # cluster size change caused partition ownership to change
                    # and timelines are now stuck within partitions that they
                    # no longer should be. (For more details, see GH-2479.)
                    for entry in should_reschedule[False]:
                        logger.warning(
                            'Clearing expired timeline %r from host %s, schedule timestamp was %s.',
                            entry.key,
                            host,
                            entry.timestamp,
                        )
                        clear_timeline_contents(
                            pipeline,
                            make_timeline_key(self.namespace, entry.key),
                        )

                    pipeline.execute()
            finally:
                # Regardless of the outcome of the transaction, we should
                # try to unlock the items for processing.
                for lock, entry in can_reschedule[True]:
                    try:
                        lock.release()
                    except Exception as error:
                        # XXX: This shouldn't be hit (the ``Lock`` code
                        # should swallow the exception) but this is here
                        # for safety anyway.
                        logger.warning('Could not unlock %r: %s', entry, error)

            # If we retrieved fewer items than the chunk size, we don't
            # need to try to retrieve more items.
            if len(entries) < fetch_size:
                break
        else:
            raise RuntimeError('loop exceeded maximum iterations (%s)' %
                               (maximum_iterations,))
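The maintenance pass in Example #5 leans on two small helpers that are not shown in the listing: ScheduleEntry, a (key, timestamp) record built from each (member, score) pair returned by ZRANGEBYSCORE, and ilen, which counts the items produced by an iterator. Rough stand-ins for reading the snippet in isolation (assumptions, not the project's actual definitions):

    from collections import namedtuple

    # Each entry pairs a timeline key with the score it was scheduled under.
    ScheduleEntry = namedtuple("ScheduleEntry", ["key", "timestamp"])

    def ilen(iterable):
        """Count the items of an iterable without materializing a list."""
        return sum(1 for _ in iterable)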