Exemple #1
0
    def with_retries(self, method, *args, **kwargs):
        """Call ``method(*args, **kwargs)``, retrying on non-fatal errors.

        A ``FatalError`` is logged, recorded in ``self.error_buffer`` and
        re-raised immediately. Any other exception triggers a retry after an
        exponentially growing pause (2s, 4s, 8s, ... capped at 60s).

        The 20 minute budget is only checked after a failed attempt: once
        more than 20 minutes have elapsed since the first attempt, the next
        failure raises ``error.TimeoutError`` instead of retrying.

        Returns:
            Whatever ``method`` returns on its first successful call.

        Raises:
            FatalError: propagated unchanged from ``method``.
            error.TimeoutError: when failures persist past the time budget.
        """
        timeout = 20 * 60
        start = time.time()

        attempt = 0
        while True:
            try:
                return method(*args, **kwargs)
            except FatalError as e:
                logger.error('[%s] %s', self.label, e)
                self.error_buffer.record(e)
                raise
            except Exception as e:
                delta = time.time() - start
                if delta > timeout:
                    raise error.TimeoutError(
                        'Have been unable to connect to the allocator at {} for {}s. Giving up. Last error: {}'
                        .format(self.base_url, delta, e))
                attempt += 1

                sleep = min(2**attempt, 60)
                # Log *before* sleeping: the message announces the upcoming
                # wait and the remaining budget, so it must be emitted while
                # both are still in the future.
                logger.error(
                    '[%s] Error making request to allocator: %s. Will retry in %ss (and timeout in %.0fs)',
                    self.label, e, sleep, start + timeout - time.time())
                time.sleep(sleep)
Exemple #2
0
    def _poll(self):
        """Refresh the allocation state of all pending envs.

        Increments ``self._sleep`` by 2 (capped at 20). If any envs are
        pending, asks the allocator for their current state. Pending envs
        that the allocator no longer reports are removed from
        ``self.pending`` and a replacement is requested for each one. The
        returned allocation is then handed to ``self._handle_allocation``.

        Raises:
            error.TimeoutError: if any pending env has waited longer than
                ``self.start_timeout`` seconds since its ``received_at``.
        """
        self._sleep = min(20, self._sleep + 2)

        if not self.pending:
            return

        for name, spec in self.pending.items():
            delta = time.time() - spec['received_at']
            if delta > self.start_timeout:
                raise error.TimeoutError('Waited {}s for {} to get an IP, which exceeds start_timeout of {}'.format(delta, name, self.start_timeout))

        names = list(self.pending)
        # This really should be an allocation_get, but it's possible
        # the pods list will be long. So it's either GET with a body,
        # or POST what should really be a GET. We do the latter.
        allocation = self.with_retries(self._requestor.allocation_refresh, self.client_id, names=names)
        assert len(allocation['env_n']) <= len(names), "Received more envs than requested: allocation={} names={}".format(allocation, names)

        # Handle any envs which have gone missing
        result = {env['name'] for env in allocation['env_n']}
        dropped = [p for p in self.pending if p not in result]
        if dropped:
            logger.info('Pending remote envs %s were not returned by the allocator (only %s were returned). Assuming the missing ones have gone down and requesting replacements.', dropped, list(result))
            for name in dropped:
                spec = self.pending.pop(name)
                # Request a replacement for this env only, using its own
                # params. Passing the whole `dropped` list here would ask
                # for every missing env once per loop iteration.
                # NOTE(review): assumes _allocate accepts a list of the same
                # identifiers used as keys of self.pending -- confirm.
                self._allocate([name], False, spec['params'])

        # Handle successful allocations
        self._handle_allocation(allocation, pop=True)