Ejemplo n.º 1
0
    def _connect(
        self,
        name,
        address,
        env_id,
        seed,
        fps,
        i,
        network,
        env_status,
        reward_buffer,
        label,
        password,
        start_timeout,
        observer,
        skip_network_calibration,
        attempt=0,
        elapsed_sleep_time=0,
    ):
        """Connect to one rewarder and register the resulting client.

        This is a generator that yields Deferreds (presumably driven by
        ``defer.inlineCallbacks``; the decorator is outside this view --
        TODO confirm). The stages are, in order:

        1. establish the TCP connection,
        2. wait for the WebSocket handshake,
        3. run network calibration (unless ``skip_network_calibration``),
        4. send an initial env reset (only when ``env_id`` is not None),
        5. store the client in ``self.clients[i]``.

        A failure in stages 1-3 is retried: ``self._connect`` is rescheduled
        through ``reactor.callLater`` after ``min(2 * attempt + 1, 10)``
        seconds, for as long as ``elapsed_sleep_time < start_timeout``.
        Once that budget is used up, or for any failure in stages 4-5, the
        error is stored in ``self.errors[i]`` instead.

        Args:
            name: Not used here except to be forwarded on retry.
            address: Appended to ``'tcp:'`` and ``'ws://'`` to build the
                endpoint and the WebSocket URL.
            env_id: Stored on the factory; ``None`` means no reset is sent.
            seed: Forwarded to ``self._send_env_reset``.
            fps: Stored on the factory as ``arg_fps``.
            i: Key of this connection in ``self.clients``, ``self.errors``
                and ``self.names_by_id``.
            network: Object whose ``calibrate(client)`` is yielded on.
            env_status: Stored on the factory.
            reward_buffer: Stored on the factory.
            label: Prefix used in log messages.
            password: Must be truthy; encoded into the ``authorization``
                header.
            start_timeout: Upper bound on the accumulated retry sleep time.
            observer: Selects the value of the ``openai-observer`` header.
            skip_network_calibration: When true, stage 3 is skipped.
            attempt: Number of retries already performed.
            elapsed_sleep_time: Seconds already slept across retries.
        """
        endpoint = endpoints.clientFromString(reactor, 'tcp:' + address)
        factory = websocket.WebSocketClientFactory('ws://' + address)
        factory.protocol = rewarder_client.RewarderClient

        assert password, "Missing password: {} for rewarder session".format(
            password)
        factory.headers = {
            'authorization': utils.basic_auth_encode(password),
            'openai-observer': 'true' if observer else 'false'
        }
        factory.i = i

        # Various important objects
        factory.endpoint = endpoint
        factory.env_status = env_status
        factory.reward_buffer = reward_buffer

        # Helpful strings
        factory.label = label
        factory.address = address

        # Arguments to always send to the remote reset call
        factory.arg_env_id = env_id
        factory.arg_fps = fps

        def record_error(e):
            """Store ``e`` as the error for this connection unless it is closed."""
            if isinstance(e, failure.Failure):
                e = e.value

            with self.lock:
                # drop error on the floor if we're already closed
                if self._already_closed(factory.i):
                    extra_logger.info(
                        '[%s] Ignoring error for already closed connection: %s',
                        label, e)
                elif factory.i not in self.clients:
                    extra_logger.info(
                        '[%s] Received error for connection which has not been fully initialized: %s',
                        label, e)
                    # We could handle this better, but right now we
                    # just mark this as a fatal error for the
                    # backend. Often it actually is.
                    self.errors[factory.i] = e
                else:
                    extra_logger.info(
                        '[%s] Recording fatal error for connection: %s', label,
                        e)
                    self.errors[factory.i] = e

        def retriable_error(e, error_message):
            """Reschedule the whole connection attempt, or give up and record ``e``."""
            if isinstance(e, failure.Failure):
                e = e.value

            if self._already_closed(factory.i):
                logger.error(
                    '[%s] Got error, but giving up on reconnecting, since %d already disconnected',
                    factory.label, factory.i)
                return

            # Also need to handle DNS errors, so let's just handle everything for now.
            #
            # reason.trap(twisted.internet.error.ConnectError, error.ConnectionError)
            if elapsed_sleep_time < start_timeout:
                # Linear backoff (1s, 3s, 5s, ...) capped at 10s.
                sleep = min((2 * attempt + 1), 10)
                logger.error(
                    '[%s] Waiting on rewarder: %s. Retry in %ds (slept %ds/%ds): %s',
                    factory.label, error_message, sleep, elapsed_sleep_time,
                    start_timeout, e)
                reactor.callLater(
                    sleep,
                    self._connect,
                    name=name,
                    address=address,
                    env_id=env_id,
                    seed=seed,
                    fps=fps,
                    i=i,
                    network=network,
                    env_status=env_status,
                    reward_buffer=reward_buffer,
                    label=label,
                    attempt=attempt + 1,
                    elapsed_sleep_time=elapsed_sleep_time + sleep,
                    start_timeout=start_timeout,
                    password=password,
                    observer=observer,
                    skip_network_calibration=skip_network_calibration,
                )
            else:
                logger.error('[%s] %s. Retries exceeded (slept %ds/%ds): %s',
                             factory.label, error_message, elapsed_sleep_time,
                             start_timeout, e)
                record_error(e)

        factory.record_error = record_error

        try:
            # retry_msg names the stage in progress; a non-empty value means
            # a failure at this point is still retriable.
            retry_msg = 'establish rewarder TCP connection'
            client = yield endpoint.connect(factory)
            extra_logger.info('[%s] Rewarder TCP connection established',
                              factory.label)

            retry_msg = 'complete WebSocket handshake'
            yield client.waitForWebsocketConnection()
            extra_logger.info('[%s] Websocket client successfully connected',
                              factory.label)

            if not skip_network_calibration:
                retry_msg = 'run network calibration'
                yield network.calibrate(client)
                extra_logger.info('[%s] Network calibration complete',
                                  factory.label)

            # From here on, failures are recorded rather than retried.
            retry_msg = ''

            if factory.arg_env_id is not None:
                # We aren't picky about episode ID: we may have
                # already received an env.describe message
                # telling us about a resetting environment, which
                # we don't need to bump post.
                #
                # tl;dr hardcoding episode_id='0' here avoids a double reset.
                reply = yield self._send_env_reset(client,
                                                   seed=seed,
                                                   episode_id='0')
            else:
                # No env_id requested, so we just proceed without a reset
                reply = None
            # We're connected and have measured the
            # network. Mark everything as ready to go.
            with self.lock:
                if factory.i not in self.names_by_id:
                    # ID has been popped while we were connecting: close the
                    # client and do NOT register it, otherwise a closed
                    # client would linger in self.clients.
                    logger.info(
                        '[%s] Rewarder %d started, but has already been closed',
                        factory.label, factory.i)
                    client.close()
                else:
                    if reply is None:
                        logger.info(
                            '[%s] Attached to running environment without reset',
                            factory.label)
                    else:
                        context, req, rep = reply
                        logger.info(
                            '[%s] Initial reset complete: episode_id=%s',
                            factory.label, rep['headers']['episode_id'])
                    self.clients[factory.i] = client
        except Exception as e:
            if retry_msg:
                retriable_error(e, 'failed to ' + retry_msg)
            else:
                record_error(e)
Ejemplo n.º 2
0
    def _register_rewarder(self, address, start_time=None):
        """Open a TCP socket to a rewarder and send a WebSocket handshake.

        The connect is retried once per second while the failure is one of
        the expected "not up yet" errors (ECONNREFUSED, ETIMEDOUT, or a
        ``socket.gaierror``) and ``self.start_timeout`` is set. On success
        the handshake request is written and the socket is stored in
        ``self.sockets`` mapped to ``('rewarder', address)``.

        Args:
            address: Rewarder address, parsed by ``host_port`` with a
                default port of 15900.
            start_time: Timestamp (as from ``time.time()``) the timeout is
                measured from; defaults to the time of the call.

        Raises:
            error.Error: If the server is still unreachable more than
                ``self.start_timeout`` seconds after ``start_time``.
            Whatever ``reraise`` raises: for unexpected connect errors, or
                for any connect error when ``self.start_timeout`` is None.
        """
        # Imported locally: the original reached this module through
        # ``socket.errno``, which is not a documented attribute of ``socket``.
        import errno

        if start_time is None:
            start_time = time.time()

        host, port = host_port(address, default_port=15900)

        while True:
            # In WebSockets, the server sends bytes once we've upgraded the protocol
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            try:
                sock.connect((host, port))
            except (socket.error, socket.gaierror) as e:
                # A fresh socket is created on every attempt, so release the
                # failed one here instead of leaking a descriptor per retry.
                sock.close()
                # ECONNREFUSED: VNC env hasn't come up yet
                # ETIMEDOUT: the packets can't be delivered yet, such as can happen on kubernetes
                # gaierror: can't resolve the address yet, which can also happen on kubernetes
                expected = (
                    e.errno == errno.ECONNREFUSED
                    or e.errno == errno.ETIMEDOUT
                    or isinstance(e, socket.gaierror)
                )
                if self.start_timeout is None or not expected:
                    reraise(suffix='while connecting to Rewarder server {}'.format(address))
                logger.info('Rewarder server %s did not come up yet (error: %s). Sleeping for 1s.', address, e)
                time.sleep(1)
            else:
                break

            # Only reached after a failed attempt with a timeout configured.
            if time.time() - start_time > self.start_timeout:
                raise error.Error('Rewarder server {} did not come up within {}s'.format(address, self.start_timeout))

        # Send a websocket handshake.
        # https://developer.mozilla.org/en-US/docs/Web/API/WebSockets_API/Writing_WebSocket_servers
        #
        # The port 10003 is an arbitrary port that we don't actually connect to, but needs to be a valid part
        # e.g Host: 127.0.0.1:GARBAGE results in the following error: (invalid port 'GARBAGE' in HTTP Host header '127.0.0.1:GARBAGE')
        #
        # sendall (not send) so the whole request is written even if the
        # kernel accepts only part of it in one call.
        sock.sendall(b'GET / HTTP/1.1\r\nHost: 127.0.0.1:10003\r\nUpgrade: WebSocket\r\nConnection:Upgrade\r\nSec-WebSocket-Key: dGhlIHNhbXBsZSBub25jZQ==\r\nSec-WebSocket-Version: 13\r\nauthorization: ' + utils.basic_auth_encode('openai').encode('utf-8') + b'\r\nopenai-observer: true\r\n\r\n')
        self.sockets[sock] = ('rewarder', address)
Ejemplo n.º 3
0
    def _connect(
        self,
        name,
        address,
        env_id,
        seed,
        fps,
        i,
        network,
        env_status,
        reward_buffer,
        label,
        password,
        start_timeout,
        observer,
        skip_network_calibration,
        attempt=0,
        elapsed_sleep_time=0,
    ):
        """Connect to one rewarder using explicit Deferred callback chains.

        Two Deferreds are wired up:

        * ``endpoint.connect(factory)`` for the TCP connection, and
        * ``factory.deferred``, whose callback ``connected`` continues with
          network calibration (unless ``skip_network_calibration``), an
          optional env reset (when ``env_id`` is not None) and finally the
          registration of the client in ``self.clients[i]``.
          (Presumably fired by the protocol once the WebSocket handshake
          finishes -- that code is outside this view, TODO confirm.)

        Failures of the TCP connect, the handshake chain, or calibration go
        to ``websocket_failed``, which reschedules ``self._connect`` via
        ``reactor.callLater`` after ``min(2 * attempt + 1, 10)`` seconds
        while ``elapsed_sleep_time < start_timeout`` and records the error
        otherwise. A failed reset is recorded directly.

        Args:
            name: Not used here except to be forwarded on retry.
            address: Appended to ``'tcp:'`` and ``'ws://'`` to build the
                endpoint and the WebSocket URL.
            env_id: Stored on the factory; ``None`` means no reset is sent.
            seed: Forwarded to ``self._send_env_reset``.
            fps: Stored on the factory as ``arg_fps``.
            i: Key of this connection in ``self.clients``, ``self.errors``
                and ``self.names_by_id``.
            network: Object whose ``calibrate(client)`` returns a Deferred.
            env_status: Stored on the factory.
            reward_buffer: Stored on the factory.
            label: Prefix used in log messages.
            password: Must be truthy; encoded into the ``authorization``
                header.
            start_timeout: Upper bound on the accumulated retry sleep time.
            observer: Selects the value of the ``openai-observer`` header.
            skip_network_calibration: When true, calibration is skipped.
            attempt: Number of retries already performed.
            elapsed_sleep_time: Seconds already slept across retries.
        """
        endpoint = endpoints.clientFromString(reactor, 'tcp:' + address)
        factory = websocket.WebSocketClientFactory('ws://' + address)
        factory.protocol = rewarder_client.RewarderClient

        assert password, "Missing password: {} for rewarder session".format(
            password)
        factory.headers = {
            'authorization': utils.basic_auth_encode(password),
            'openai-observer': 'true' if observer else 'false'
        }
        factory.i = i

        # Various important objects
        factory.endpoint = endpoint
        factory.env_status = env_status
        factory.reward_buffer = reward_buffer

        # Helpful strings
        factory.label = label
        factory.address = address

        # Arguments to always send to the remote reset call
        factory.arg_env_id = env_id
        factory.arg_fps = fps

        def record_error(e):
            """Store ``e`` as the error for this connection unless it is closed."""
            if isinstance(e, failure.Failure):
                e = e.value

            with self.lock:
                # drop error on the floor if we're already closed
                if self._already_closed(factory.i):
                    extra_logger.info(
                        '[%s] Ignoring error for already closed connection: %s',
                        label, e)
                else:
                    extra_logger.info(
                        '[%s] Recording fatal error for connection: %s', label,
                        e)
                    self.errors[factory.i] = e

        def websocket_failed(e, error_message):
            """Retry the connection, or record ``e`` once retries are used up.

            ``error_message`` describes the stage that failed. It is bound
            per errback registration, so each stage logs its own message
            instead of whichever message was assigned last.
            """
            if isinstance(e, failure.Failure):
                e = e.value

            if self._already_closed(factory.i):
                logger.error(
                    '[%s] Giving up on reconnecting, since %d already disconnected',
                    factory.label, factory.i)
                return

            # Also need to handle DNS errors, so let's just handle everything for now.
            #
            # reason.trap(twisted.internet.error.ConnectError, error.ConnectionError)
            if elapsed_sleep_time < start_timeout:
                # Linear backoff (1s, 3s, 5s, ...) capped at 10s.
                sleep = min((2 * attempt + 1), 10)
                logger.error(
                    '[%s] Waiting on rewarder: %s. Retry in %ds (slept %ds/%ds): %s',
                    factory.label, error_message, sleep,
                    elapsed_sleep_time, start_timeout, e)
                reactor.callLater(
                    sleep,
                    self._connect,
                    name=name,
                    address=address,
                    env_id=env_id,
                    seed=seed,
                    fps=fps,
                    i=i,
                    network=network,
                    env_status=env_status,
                    reward_buffer=reward_buffer,
                    label=label,
                    attempt=attempt + 1,
                    elapsed_sleep_time=elapsed_sleep_time + sleep,
                    start_timeout=start_timeout,
                    password=password,
                    observer=observer,
                    skip_network_calibration=skip_network_calibration,
                )
            else:
                logger.error('[%s] %s. Retries exceeded (slept %ds/%ds): %s',
                             factory.label, error_message,
                             elapsed_sleep_time, start_timeout, e)
                record_error(e)

        def retriable_record_error(e):
            """Record an error reported through ``factory.record_error``.

            Every case ends in ``record_error``; the only extra behavior is
            an additional log line when the connection is still known but
            has no registered client yet.
            """
            if isinstance(e, failure.Failure):
                e = e.value

            # NOTE(review): record_error acquires self.lock again while it is
            # held here, so this assumes self.lock is reentrant -- confirm.
            with self.lock:
                if factory.i in self.names_by_id and factory.i not in self.clients:
                    extra_logger.info(
                        '[%s] Received error for connection which has not been fully initialized: %s',
                        label, e)
                    # We could handle this better, but right now we
                    # just mark this as a fatal error for the
                    # backend. Often it actually is.
                record_error(e)

        factory.record_error = retriable_record_error

        def fail(reason):
            factory.record_error(reason)

        def connected(client):
            extra_logger.info('[%s] Websocket client successfully connected',
                              factory.label)

            # Websocket client has come up fully. Time to start on the
            # next level of our callback chain. (There must be a
            # better way to write this.)
            def calibrate_success(network):
                extra_logger.info('[%s] Network calibration complete',
                                  factory.label)

                def reset_success(reply):
                    # We're connected and have measured the
                    # network. Mark everything as ready to go.
                    with self.lock:
                        if factory.i not in self.names_by_id:
                            # ID has been popped while we were connecting:
                            # close the client and do NOT register it,
                            # otherwise a closed client would linger in
                            # self.clients.
                            logger.info(
                                '[%s] Rewarder %d started, but has already been closed',
                                factory.label, factory.i)
                            client.close()
                        else:
                            if reply is None:
                                logger.info(
                                    '[%s] Attached to running environment without reset',
                                    factory.label)
                            else:
                                context, req, rep = reply
                                logger.info(
                                    '[%s] Initial reset complete: episode_id=%s',
                                    factory.label,
                                    rep['headers']['episode_id'])
                            self.clients[factory.i] = client

                if factory.arg_env_id is not None:
                    # We aren't picky about episode ID: we may have
                    # already received an env.describe message
                    # telling us about a resetting environment, which
                    # we don't need to bump post.
                    #
                    # tl;dr hardcoding episode_id='0' here avoids a double reset.
                    reset_d = self._send_env_reset(
                        client, seed=seed, episode_id='0')
                    reset_d.addCallback(reset_success)
                    reset_d.addErrback(fail)
                else:
                    # No env_id requested, so we just proceed without a reset
                    reset_success(None)

            if skip_network_calibration:
                calibrate_success(network)
            else:
                calibration_d = network.calibrate(client)
                calibration_d.addCallback(calibrate_success)
                calibration_d.addErrback(
                    websocket_failed,
                    'WebSocket handshake established but calibration failed')
                # Only reached if websocket_failed itself raises.
                calibration_d.addErrback(fail)

        d = defer.Deferred()
        d.addCallbacks(connected)
        d.addErrback(
            websocket_failed,
            'TCP connection established but WebSocket handshake failed')
        # Only reached if websocket_failed itself raises.
        d.addErrback(fail)
        factory.deferred = d

        def connection_succeeded(conn):
            extra_logger.info('[%s] Rewarder TCP connection established',
                              factory.label)

        def connection_failed(reason):
            reason = error.Error('[{}] Connection failed: {}'.format(
                factory.label, reason.value))

            # defer.AlreadyCalledError, if raised, propagates to the caller.
            d.errback(utils.format_error(reason))

        res = endpoint.connect(factory)
        res.addCallback(connection_succeeded)
        res.addErrback(websocket_failed,
                       'Could not establish rewarder TCP connection')
        # An errback that returns normally marks the failure as handled, so
        # connection_failed only runs if websocket_failed itself raises.
        res.addErrback(connection_failed)
Ejemplo n.º 4
0
    def _connect(self, name, address, env_id, seed, fps, i, network, env_status, reward_buffer,
                 label, password, start_timeout,
                 observer, skip_network_calibration,
                 attempt=0, elapsed_sleep_time=0,
    ):
        """Connect to one rewarder and register the resulting client.

        This is a generator that yields Deferreds (presumably driven by
        ``defer.inlineCallbacks``; the decorator is outside this view --
        TODO confirm). The stages are, in order:

        1. establish the TCP connection,
        2. wait for the WebSocket handshake,
        3. run network calibration (unless ``skip_network_calibration``),
        4. send an initial env reset (only when ``env_id`` is not None),
        5. store the client in ``self.clients[i]``.

        A failure in stages 1-3 is retried: ``self._connect`` is rescheduled
        through ``reactor.callLater`` after ``min(2 * attempt + 1, 10)``
        seconds, for as long as ``elapsed_sleep_time < start_timeout``.
        Once that budget is used up, or for any failure in stages 4-5, the
        error is stored in ``self.errors[i]`` instead.

        Args:
            name: Not used here except to be forwarded on retry.
            address: Appended to ``'tcp:'`` and ``'ws://'`` to build the
                endpoint and the WebSocket URL.
            env_id: Stored on the factory; ``None`` means no reset is sent.
            seed: Forwarded to ``self._send_env_reset``.
            fps: Stored on the factory as ``arg_fps``.
            i: Key of this connection in ``self.clients``, ``self.errors``
                and ``self.names_by_id``.
            network: Object whose ``calibrate(client)`` is yielded on.
            env_status: Stored on the factory.
            reward_buffer: Stored on the factory.
            label: Prefix used in log messages.
            password: Must be truthy; encoded into the ``authorization``
                header.
            start_timeout: Upper bound on the accumulated retry sleep time.
            observer: Selects the value of the ``openai-observer`` header.
            skip_network_calibration: When true, stage 3 is skipped.
            attempt: Number of retries already performed.
            elapsed_sleep_time: Seconds already slept across retries.
        """
        endpoint = endpoints.clientFromString(reactor, 'tcp:'+address)
        factory = websocket.WebSocketClientFactory('ws://'+address)
        factory.protocol = rewarder_client.RewarderClient

        assert password, "Missing password: {} for rewarder session".format(password)
        factory.headers = {'authorization': utils.basic_auth_encode(password), 'openai-observer': 'true' if observer else 'false'}
        factory.i = i

        # Various important objects
        factory.endpoint = endpoint
        factory.env_status = env_status
        factory.reward_buffer = reward_buffer

        # Helpful strings
        factory.label = label
        factory.address = address

        # Arguments to always send to the remote reset call
        factory.arg_env_id = env_id
        factory.arg_fps = fps

        def record_error(e):
            """Store ``e`` as the error for this connection unless it is closed."""
            if isinstance(e, failure.Failure):
                e = e.value

            with self.lock:
                # drop error on the floor if we're already closed
                if self._already_closed(factory.i):
                    extra_logger.info('[%s] Ignoring error for already closed connection: %s', label, e)
                elif factory.i not in self.clients:
                    extra_logger.info('[%s] Received error for connection which has not been fully initialized: %s', label, e)
                    # We could handle this better, but right now we
                    # just mark this as a fatal error for the
                    # backend. Often it actually is.
                    self.errors[factory.i] = e
                else:
                    extra_logger.info('[%s] Recording fatal error for connection: %s', label, e)
                    self.errors[factory.i] = e

        def retriable_error(e, error_message):
            """Reschedule the whole connection attempt, or give up and record ``e``."""
            if isinstance(e, failure.Failure):
                e = e.value

            if self._already_closed(factory.i):
                logger.error('[%s] Got error, but giving up on reconnecting, since %d already disconnected', factory.label, factory.i)
                return

            # Also need to handle DNS errors, so let's just handle everything for now.
            #
            # reason.trap(twisted.internet.error.ConnectError, error.ConnectionError)
            if elapsed_sleep_time < start_timeout:
                # Linear backoff (1s, 3s, 5s, ...) capped at 10s.
                sleep = min((2 * attempt+1), 10)
                logger.error('[%s] Waiting on rewarder: %s. Retry in %ds (slept %ds/%ds): %s', factory.label, error_message, sleep, elapsed_sleep_time, start_timeout, e)
                reactor.callLater(
                    sleep, self._connect, name=name, address=address,
                    env_id=env_id, seed=seed, fps=fps, i=i, network=network,
                    env_status=env_status, reward_buffer=reward_buffer, label=label,
                    attempt=attempt+1, elapsed_sleep_time=elapsed_sleep_time+sleep,
                    start_timeout=start_timeout, password=password,
                    observer=observer, skip_network_calibration=skip_network_calibration,
                )
            else:
                logger.error('[%s] %s. Retries exceeded (slept %ds/%ds): %s', factory.label, error_message, elapsed_sleep_time, start_timeout, e)
                record_error(e)

        factory.record_error = record_error

        try:
            # retry_msg names the stage in progress; a non-empty value means
            # a failure at this point is still retriable.
            retry_msg = 'establish rewarder TCP connection'
            client = yield endpoint.connect(factory)
            extra_logger.info('[%s] Rewarder TCP connection established', factory.label)

            retry_msg = 'complete WebSocket handshake'
            yield client.waitForWebsocketConnection()
            extra_logger.info('[%s] Websocket client successfully connected', factory.label)

            if not skip_network_calibration:
                retry_msg = 'run network calibration'
                yield network.calibrate(client)
                extra_logger.info('[%s] Network calibration complete', factory.label)

            # From here on, failures are recorded rather than retried.
            retry_msg = ''

            if factory.arg_env_id is not None:
                # We aren't picky about episode ID: we may have
                # already received an env.describe message
                # telling us about a resetting environment, which
                # we don't need to bump post.
                #
                # tl;dr hardcoding episode_id='0' here avoids a double reset.
                reply = yield self._send_env_reset(client, seed=seed, episode_id='0')
            else:
                # No env_id requested, so we just proceed without a reset
                reply = None
            # We're connected and have measured the
            # network. Mark everything as ready to go.
            with self.lock:
                if factory.i not in self.names_by_id:
                    # ID has been popped while we were connecting: close the
                    # client and do NOT register it, otherwise a closed
                    # client would linger in self.clients.
                    logger.info('[%s] Rewarder %d started, but has already been closed', factory.label, factory.i)
                    client.close(reason='RewarderSession: double-closing, client was closed while RewarderSession was starting')
                else:
                    if reply is None:
                        logger.info('[%s] Attached to running environment without reset', factory.label)
                    else:
                        context, req, rep = reply
                        logger.info('[%s] Initial reset complete: episode_id=%s', factory.label, rep['headers']['episode_id'])
                    self.clients[factory.i] = client
        except Exception as e:
            if retry_msg:
                retriable_error(e, 'failed to ' + retry_msg)
            else:
                record_error(e)