def _channel_connectivity_changed(connectivity):
    """React to a connectivity change on the worker gRPC channel.

    Updates `_host_state.channel_state` while holding
    `_host_state.channel_condition`, and wakes waiters on that condition
    once the channel reports READY.

    Args:
        connectivity: The new `grpc.ChannelConnectivity` value.

    Raises:
        untrusted.HostException: The channel reported SHUTDOWN while
            `_host_state.expect_shutdown` was falsy.
    """
    try:
        with _host_state.channel_condition:
            if connectivity == grpc.ChannelConnectivity.READY:
                # READY is only trusted if the state check passes as well;
                # otherwise the channel is flagged as inconsistent.
                if _check_state():
                    logs.log('Connected to worker.')
                    new_state = ChannelState.READY
                else:
                    new_state = ChannelState.INCONSISTENT

                _host_state.channel_state = new_state
                _host_state.channel_condition.notify_all()
                return

            # Every non-READY connectivity value maps to NOT_READY.
            _host_state.channel_state = ChannelState.NOT_READY

        if connectivity == grpc.ChannelConnectivity.SHUTDOWN:
            if not _host_state.expect_shutdown:
                raise untrusted.HostException('Unrecoverable error.')

            # We requested a shutdown to update the source.
            logs.log('Worker shutting down.')
            return
    except AttributeError:
        # Python sets all globals to None on shutdown. Ignore.
        logs.log('Shutting down.')
        return

    if connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
        logs.log_warn('Transient failure detected on worker channel.')
    elif connectivity == grpc.ChannelConnectivity.CONNECTING:
        logs.log('Reconnecting to worker.')
def _connect():
    """Initial connect to the worker.

    Looks up this host's worker assignment, opens a TLS-secured gRPC channel
    to the worker, subscribes to connectivity changes, checks the resulting
    channel state and, when it is READY, starts the heartbeat thread.

    Side effects visible here: sets the `QUEUE_OVERRIDE` and
    `WORKER_BOT_NAME` environment values and populates `_host_state`
    (`worker_bot_name`, `channel`, `stub`, `heartbeat_thread`).

    If no root certificate is available yet, the function sleeps for
    `WAIT_TLS_CERT_SECONDS` and then exits the process with status 0.

    Raises:
        AssertionError: The worker assignment, its `worker_name` or its
            `project_name` is None.
        untrusted.HostException: The channel state after the initial connect
            check is not READY (the INCONSISTENT case goes through
            `host_exit_no_return(return_code=0)`).
    """
    worker_assignment = _get_host_worker_assignment()
    # Each precondition carries its own message so the failing one can be
    # identified from the traceback alone.
    if worker_assignment is None:
        raise AssertionError('_get_host_worker_assignment() returned None.')
    if worker_assignment.worker_name is None:
        raise AssertionError('Worker assignment has no worker_name.')
    if worker_assignment.project_name is None:
        raise AssertionError('Worker assignment has no project_name.')

    root_cert = _get_root_cert(worker_assignment.project_name)
    if not root_cert:
        # Wait a while before exiting cleanly rather than failing.
        # NOTE(review): presumably an outer supervisor restarts the process
        # so the connect is retried - confirm.
        logs.log_warn("TLS certs not yet generated.")
        time.sleep(WAIT_TLS_CERT_SECONDS)
        sys.exit(0)

    environment.set_value(
        "QUEUE_OVERRIDE",
        untrusted.platform_name(worker_assignment.project_name, "linux"),
    )

    server_name = worker_assignment.worker_name
    if not environment.get_value("LOCAL_DEVELOPMENT"):
        # Outside local development the internal network domain is appended
        # to the worker name to form the server name.
        server_name += untrusted.internal_network_domain()

    _host_state.worker_bot_name = worker_assignment.worker_name

    credentials = grpc.ssl_channel_credentials(root_cert)
    _host_state.channel = grpc.secure_channel(
        "%s:%d" % (server_name, config.PORT),
        credentials=credentials,
        options=config.GRPC_OPTIONS,
    )
    _host_state.stub = UntrustedRunnerStub(_host_state.channel)

    logs.log("Connecting to worker %s..." % server_name)
    # try_to_connect=True asks gRPC to start connecting immediately instead
    # of waiting for the first RPC.
    _host_state.channel.subscribe(
        _channel_connectivity_changed, try_to_connect=True)

    channel_state = _check_channel_state(
        config.INITIAL_CONNECT_TIMEOUT_SECONDS)
    if channel_state == ChannelState.INCONSISTENT:
        logs.log_warn("Worker inconsistent on initial connect.")
        monitoring_metrics.HOST_INCONSISTENT_COUNT.increment()
        host_exit_no_return(return_code=0)

    if channel_state != ChannelState.READY:
        raise untrusted.HostException("Failed to connect to worker.")

    environment.set_value("WORKER_BOT_NAME", worker_assignment.worker_name)

    # The heartbeat runs on a daemon thread, so it does not keep the process
    # alive on exit.
    _host_state.heartbeat_thread = threading.Thread(target=_do_heartbeat)
    _host_state.heartbeat_thread.daemon = True
    _host_state.heartbeat_thread.start()
def host_exit_no_return(return_code=1):
    """Shut the host down after a host error; never returns normally.

    Args:
        return_code: Code describing the failure. A truthy value is counted
            in `monitoring_metrics.HOST_ERROR_COUNT`; it is also attached to
            the shutdown log entry and passed to the raised exception.

    Raises:
        untrusted.HostException: Always, constructed with `return_code`.
    """
    if return_code:
        error_labels = {'return_code': return_code}
        monitoring_metrics.HOST_ERROR_COUNT.increment(error_labels)

    # Always try to get the worker to exit too.
    update_worker()

    # Stop receiving connectivity callbacks to prevent exceptions during
    # shutdown.
    channel = _host_state.channel
    channel.unsubscribe(_channel_connectivity_changed)

    logs.log('Shutting down host.', return_code=return_code)

    # NOTE(review): the original comment says this should bypass most
    # exception handlers so callers do not catch it incorrectly. Whether it
    # does depends on how untrusted.HostException is defined - confirm.
    raise untrusted.HostException(return_code)