def with_retries(self, method, *args, **kwargs):
    """Call ``method(*args, **kwargs)``, retrying with backoff until it succeeds.

    A ``FatalError`` is logged, recorded in ``self.error_buffer`` and
    re-raised immediately. Any other exception triggers a retry after an
    exponentially growing delay (2s, 4s, 8s, ... capped at 60s).

    Args:
        method: Callable to invoke.
        *args: Positional arguments forwarded to ``method``.
        **kwargs: Keyword arguments forwarded to ``method``.

    Returns:
        Whatever ``method`` returns on its first successful call.

    Raises:
        FatalError: Propagated unchanged from ``method``.
        error.TimeoutError: If ``method`` fails more than 20 minutes after
            the first attempt started.
    """
    timeout = 20 * 60  # total retry budget, in seconds
    start = time.time()
    attempt = 0
    while True:
        try:
            return method(*args, **kwargs)
        except FatalError as e:
            # Not retryable: surface it to the log and the error buffer,
            # then let the caller deal with it.
            logger.error('[%s] %s', self.label, e)
            self.error_buffer.record(e)
            raise
        except Exception as e:
            delta = time.time() - start
            if delta > timeout:
                raise error.TimeoutError(
                    'Have been unable to connect to the allocator at {} for {}s. Giving up. Last error: {}'
                    .format(self.base_url, delta, e))
            attempt += 1
            sleep = min(2 ** attempt, 60)
            # Log *before* sleeping so the failure is visible immediately and
            # "Will retry in Ns" describes the wait that is about to happen.
            logger.error(
                '[%s] Error making request to allocator: %s. Will retry in %ss (and timeout in %.0fs)',
                self.label, e, sleep, timeout - delta)
            time.sleep(sleep)
def _poll(self):
    """Refresh the state of all pending envs against the allocator.

    Steps, in order:
      1. Grow ``self._sleep`` by 2, capped at 20.
      2. Return early if nothing is pending.
      3. Raise if any pending entry has waited longer than
         ``self.start_timeout`` since its ``received_at`` timestamp.
      4. Ask the allocator (through ``with_retries``) to refresh the
         pending names.
      5. Pop any pending names missing from the response and request
         replacements via ``self._allocate``.
      6. Hand the allocation to ``self._handle_allocation`` with ``pop=True``.

    Raises:
        error.TimeoutError: If a pending env exceeded ``start_timeout``.
        AssertionError: If the allocator returned more envs than requested.
    """
    self._sleep = min(20, self._sleep + 2)

    if not self.pending:
        return

    for env_name, entry in self.pending.items():
        waited = time.time() - entry['received_at']
        if waited > self.start_timeout:
            message = 'Waited {}s for {} to get an IP, which exceeds start_timeout of {}'.format(
                waited, env_name, self.start_timeout)
            raise error.TimeoutError(message)

    names = list(self.pending)
    # Semantically this is a read (an allocation_get), but the list of names
    # may be long, so the choice is between a GET carrying a body and a POST
    # that is really a GET. The refresh endpoint takes the latter approach.
    allocation = self.with_retries(
        self._requestor.allocation_refresh,
        self.client_id,
        names=names,
    )
    returned_envs = allocation['env_n']
    assert len(returned_envs) <= len(names), \
        "Received more envs than requested: allocation={} names={}".format(allocation, names)

    # Anything still pending locally but absent from the response is treated
    # as having gone down.
    returned = {env['name'] for env in returned_envs}
    dropped = [env_name for env_name in self.pending if env_name not in returned]
    if dropped:
        logger.info('Pending remote envs %s were not returned by the allocator (only %s were returned). Assuming the missing ones have gone down and requesting replacements.', dropped, list(returned))
        last_entry = None
        for env_name in dropped:
            last_entry = self.pending.pop(env_name)
        # NOTE(review): only the params of the last dropped entry are used
        # for the whole replacement request -- confirm that all pending
        # entries are expected to share the same params.
        self._allocate(dropped, False, last_entry['params'])

    # Envs that did come back are processed by the shared allocation handler.
    self._handle_allocation(allocation, pop=True)