def retriable_error(e, error_message):
    """React to a connection error: schedule another ``_connect`` or give up.

    Args:
        e: The error. A ``failure.Failure`` is unwrapped to its ``value``
            before being logged or recorded.
        error_message: Short description included in the log lines.

    ``self``, ``factory``, the retry bookkeeping (``attempt``,
    ``elapsed_sleep_time``, ``start_timeout``) and every argument forwarded
    to ``self._connect`` are read from the enclosing scope.
    """
    exc = e.value if isinstance(e, failure.Failure) else e

    # The connection was closed in the meantime, so do not reconnect.
    if self._already_closed(factory.i):
        logger.error('[%s] Got error, but giving up on reconnecting, since %d already disconnected', factory.label, factory.i)
        return

    # No filtering by error type here: every kind of error (DNS failures
    # included) takes the same retry path.
    if elapsed_sleep_time < start_timeout:
        # Delay grows with the attempt number and is capped at 10 seconds.
        delay = min((2 * attempt+1), 10)
        logger.error('[%s] Waiting on rewarder: %s. Retry in %ds (slept %ds/%ds): %s', factory.label, error_message, delay, elapsed_sleep_time, start_timeout, exc)
        retry_kwargs = dict(
            name=name,
            address=address,
            env_id=env_id,
            seed=seed,
            fps=fps,
            i=i,
            network=network,
            env_status=env_status,
            reward_buffer=reward_buffer,
            label=label,
            attempt=attempt+1,
            elapsed_sleep_time=elapsed_sleep_time+delay,
            start_timeout=start_timeout,
            password=password,
            observer=observer,
            skip_network_calibration=skip_network_calibration,
        )
        reactor.callLater(delay, self._connect, **retry_kwargs)
        return

    # Retry budget used up: log and hand the error to record_error.
    logger.error('[%s] %s. Retries exceeded (slept %ds/%ds): %s', factory.label, error_message, elapsed_sleep_time, start_timeout, exc)
    record_error(exc)
def websocket_failed(e):
    """React to a failed websocket connection: retry ``_connect`` or give up.

    Args:
        e: The error. A ``failure.Failure`` is unwrapped to its ``value``
            before being logged or recorded.

    The text used in the log lines is read from the function attribute
    ``websocket_failed.error_message``. ``self``, ``factory``, the retry
    bookkeeping and all ``_connect`` arguments come from the enclosing scope.
    """
    exc = e.value if isinstance(e, failure.Failure) else e

    # The connection was closed in the meantime, so do not reconnect.
    if self._already_closed(factory.i):
        logger.error('[%s] Giving up on reconnecting, since %d already disconnected', factory.label, factory.i)
        return

    # Every kind of error (DNS failures included) is handled the same way;
    # nothing is filtered by error type.
    message = websocket_failed.error_message
    can_retry = elapsed_sleep_time < start_timeout

    if not can_retry:
        logger.error('[%s] %s. Retries exceeded (slept %ds/%ds): %s', factory.label, message, elapsed_sleep_time, start_timeout, exc)
        record_error(exc)
        return

    # Delay grows with the attempt number and is capped at 10 seconds.
    pause = min((2 * attempt+1), 10)
    logger.error('[%s] Waiting on rewarder: %s. Retry in %ds (slept %ds/%ds): %s', factory.label, message, pause, elapsed_sleep_time, start_timeout, exc)
    reactor.callLater(
        pause,
        self._connect,
        name=name,
        address=address,
        env_id=env_id,
        seed=seed,
        fps=fps,
        i=i,
        network=network,
        env_status=env_status,
        reward_buffer=reward_buffer,
        label=label,
        attempt=attempt+1,
        elapsed_sleep_time=elapsed_sleep_time+pause,
        start_timeout=start_timeout,
        password=password,
        observer=observer,
        skip_network_calibration=skip_network_calibration,
    )
def _connect_errback(reason):
    """Handle a failed upstream connection: retry with backoff or disconnect.

    Args:
        reason: The failure passed to this errback; it is only logged.

    ``tries``, ``max_attempts``, ``remote`` and ``self`` are read from the
    enclosing scope.
    """
    if tries < max_attempts:
        # Somewhat arbitrary exponential backoff: failures here should be
        # pretty rare, and indicate that we're just starting up.
        delay = 1.5 ** tries
        logger.info('[RewardProxyServer] [%d] Connection to %s failed: %s. Try %d/%d; going to retry in %fs', self.id, remote, reason, tries, max_attempts, delay)
        reactor.callLater(
            delay, self.connect_upstream,
            tries=tries+1, max_attempts=max_attempts)
    else:
        # Out of attempts: drop our own connection. (Log text fixed: it
        # previously misspelled "attempts".)
        logger.error('[RewardProxyServer] [%d] Connection to %s failed: %s. Completed %d/%d attempts; disconnecting.', self.id, remote, reason, tries, max_attempts)
        self.transport.loseConnection()
def measure_clock_skew(label, host):
    """Spawn ``ntpdate -q`` against *host* and return the resulting deferred.

    Args:
        label: Prefix used in log messages; also passed to ``Clockskew``.
        host: Host name or address appended to the ``ntpdate`` command line.

    Returns:
        ``skew.deferred`` of the ``Clockskew`` object handed to
        ``reactor.spawnProcess``.

    The subprocess is killed if it still reports a pid once the timeout
    elapses. The timeout is read, in seconds, from the environment variable
    ``UNIVERSE_NTPDATE_TIMEOUT`` and defaults to 20.
    """
    cmd = ['ntpdate', '-q', '-p', '8', host]
    extra_logger.info('[%s] Starting network calibration with %s', label, ' '.join(cmd))
    skew = Clockskew(label, cmd)
    # TODO: search PATH for this?
    process = reactor.spawnProcess(skew, '/usr/sbin/ntpdate', cmd, {})

    t = float(os.environ.get('UNIVERSE_NTPDATE_TIMEOUT', 20))

    def timeout():
        # Only act when the subprocess still reports a pid.
        if process.pid:
            # The hint names the current timeout rather than a fixed example
            # value, which used to be smaller than the default.
            logger.error('[%s] %s call timed out after %ss; killing the subprocess. This is ok, but you could have more accurate timings by enabling UDP port 123 traffic to your env. (Alternatively, you can try increasing the timeout by setting environment variable UNIVERSE_NTPDATE_TIMEOUT to a value greater than %s.)', label, ' '.join(cmd), t, t)
            process.signalProcess(signal.SIGKILL)
            process.reapProcess()

    # TODO: make this part of the connection string
    reactor.callLater(t, timeout)
    return skew.deferred
def _connect_errback(reason):
    """Handle a failed upstream connection: retry with backoff or disconnect.

    Args:
        reason: The failure passed to this errback; it is only logged.

    ``tries``, ``max_attempts``, ``remote`` and ``self`` are read from the
    enclosing scope.
    """
    if tries < max_attempts:
        # Somewhat arbitrary exponential backoff: failures here should be
        # pretty rare, and indicate that we're just starting up.
        delay = 1.5**tries
        logger.info(
            '[RewardProxyServer] [%d] Connection to %s failed: %s. Try %d/%d; going to retry in %fs',
            self.id, remote, reason, tries, max_attempts, delay)
        reactor.callLater(delay,
                          self.connect_upstream,
                          tries=tries + 1,
                          max_attempts=max_attempts)
    else:
        # Out of attempts: drop our own connection. (Log text fixed: it
        # previously misspelled "attempts".)
        logger.error(
            '[RewardProxyServer] [%d] Connection to %s failed: %s. Completed %d/%d attempts; disconnecting.',
            self.id, remote, reason, tries, max_attempts)
        self.transport.loseConnection()
def _start(self):
    """Schedule the next network recalibration five minutes from now.

    The delayed call is stored on ``self.recalibrate``. When it fires it
    creates a fresh Deferred with a logging errback, passes it to
    ``self._start_measure_connection_time`` and then calls ``_start`` again
    to schedule the following run.
    """
    def log_failure(reason):
        # Errback: report the failure and nothing more.
        logger.error('[%s] Could not recalibrate network: %s', self.client.factory.label, reason)

    def calibrate():
        pending = defer.Deferred()
        pending.addErrback(log_failure)
        self._start_measure_connection_time(pending)
        # Re-arm the timer for the next round.
        self._start()

    interval_seconds = 5 * 60
    self.recalibrate = reactor.callLater(interval_seconds, calibrate)
def measure_clock_skew(label, host):
    """Spawn ``ntpdate -q`` against *host* and return the resulting deferred.

    Args:
        label: Prefix used in log messages; also passed to ``Clockskew``.
        host: Host name or address appended to the ``ntpdate`` command line.

    Returns:
        ``skew.deferred`` of the ``Clockskew`` object handed to
        ``reactor.spawnProcess``.

    The subprocess is killed if it still reports a pid once the timeout
    elapses. The timeout is read, in seconds, from the environment variable
    ``UNIVERSE_NTPDATE_TIMEOUT`` and defaults to 20.
    """
    cmd = ['ntpdate', '-q', '-p', '8', host]
    extra_logger.info('[%s] Starting network calibration with %s', label,
                      ' '.join(cmd))
    skew = Clockskew(label, cmd)
    # TODO: search PATH for this?
    process = reactor.spawnProcess(skew, '/usr/sbin/ntpdate', cmd, {})

    t = float(os.environ.get('UNIVERSE_NTPDATE_TIMEOUT', 20))

    def timeout():
        # Only act when the subprocess still reports a pid.
        if process.pid:
            # The hint names the current timeout rather than a fixed example
            # value, which used to be smaller than the default.
            logger.error(
                '[%s] %s call timed out after %ss; killing the subprocess. This is ok, but you could have more accurate timings by enabling UDP port 123 traffic to your env. (Alternatively, you can try increasing the timeout by setting environment variable UNIVERSE_NTPDATE_TIMEOUT to a value greater than %s.)',
                label, ' '.join(cmd), t, t)
            process.signalProcess(signal.SIGKILL)
            process.reapProcess()

    # TODO: make this part of the connection string
    reactor.callLater(t, timeout)
    return skew.deferred