def get_bot(): """Returns a valid Bot instance. Should only be called once in the process lifetime. """ # This variable is used to bootstrap the initial bot.Bot object, which then is # used to get the dimensions and state. attributes = { 'dimensions': { u'id': ['none'] }, 'state': {}, 'version': generate_version(), } config = get_config() assert not config['server'].endswith('/'), config base_dir = os.path.dirname(THIS_FILE) # Use temporary Bot object to call get_attributes. Attributes are needed to # construct the "real" bot.Bot. attributes = get_attributes( bot.Bot(remote_client.createRemoteClient(config['server'], None), attributes, config['server'], config['server_version'], base_dir, on_shutdown_hook)) # Make remote client callback use the returned bot object. We assume here # RemoteClient doesn't call its callback in the constructor (since 'botobj' is # undefined during the construction). botobj = bot.Bot( remote_client.createRemoteClient( config['server'], lambda: get_authentication_headers(botobj)), attributes, config['server'], config['server_version'], base_dir, on_shutdown_hook) return botobj
def make_bot(self, auth_headers_cb=None):
  return bot.Bot(
      remote_client.createRemoteClient(
          'https://localhost:1', auth_headers_cb, 'localhost', self.root_dir,
          False),
      copy.deepcopy(self.attributes), 'https://localhost:1', 'version1',
      self.root_dir, self.fail)
def _run_command(self, task_details, headers_cb=None):
  start = time.time()
  self.mock(time, 'time', lambda: start + 10)
  remote = remote_client.createRemoteClient('https://localhost:1', headers_cb)
  return task_runner.run_command(
      remote, task_details, self.work_dir, 3600., start, 1, '/path/to/file')
def run_command(server_url, work_dir, task_details, headers_cb):
  """Runs a command with an initialized client."""
  remote = remote_client.createRemoteClient(
      server_url, headers_cb, 'localhost', work_dir, False)
  with luci_context.stage(local_auth=None) as ctx_file:
    return task_runner.run_command(
        remote, task_details, work_dir, 3600., time.time(),
        ['--min-free-space', '1'], '/path/to/file', ctx_file)
def _run_command(self, task_details):
  # Do not mock time since this test class is testing timeouts.
  remote = remote_client.createRemoteClient(
      'https://localhost:1', None, 'localhost', self.work_dir, False)
  with luci_context.stage(local_auth=None) as ctx_file:
    return task_runner.run_command(
        remote, task_details, self.work_dir, 3600., time.time(),
        ['--min-free-space', '1'], '/path/to/file', ctx_file)
def _run_command(self, task_details, headers_cb=None):
  start = time.time()
  self.mock(time, 'time', lambda: start + 10)
  remote = remote_client.createRemoteClient(
      'https://localhost:1', headers_cb, 'localhost', self.work_dir, False)
  with luci_context.stage(local_auth=None) as ctx_file:
    return task_runner.run_command(
        remote, task_details, self.work_dir, 3600., start,
        ['--min-free-space', '1'], '/path/to/file', ctx_file)
def load_and_run(
    in_file, swarming_server, is_grpc, cost_usd_hour, start, out_file,
    run_isolated_flags, bot_file, auth_params_file):
  """Loads the task's metadata, prepares the auth environment and executes
  the task.

  This may throw all sorts of exceptions in case of failure. It's up to the
  caller to trap them. These shall be considered 'internal_failure' instead
  of 'failure' from a TaskRunResult standpoint.
  """
  auth_system = None
  local_auth_context = None
  task_result = None
  work_dir = os.path.dirname(out_file)

  def handler(sig, _):
    logging.info('Got signal %s', sig)
    raise ExitSignal(sig)

  try:
    with subprocess42.set_signal_handler([SIG_BREAK_OR_TERM], handler):
      # The work directory is guaranteed to exist since it was created by
      # bot_main.py and contains the manifest. Temporary files will be
      # downloaded there. It's bot_main.py that will delete the directory
      # afterward. Tests are not run from there.
      if not os.path.isdir(work_dir):
        raise InternalError('%s expected to exist' % work_dir)

      # Raises InternalError on errors.
      task_details = TaskDetails.load(in_file)

      # This will start a thread that occasionally reads bot authentication
      # headers from 'auth_params_file'. It will also optionally launch a
      # local HTTP server that serves OAuth tokens to the task processes. We
      # put the location of this service into a file referenced by the
      # LUCI_CONTEXT env var below.
      if auth_params_file:
        try:
          auth_system = bot_auth.AuthSystem(auth_params_file)
          local_auth_context = auth_system.start()
        except bot_auth.AuthSystemError as e:
          raise InternalError('Failed to init auth: %s' % e)

      # Override LUCI_CONTEXT['local_auth']. If the task is not using auth,
      # do NOT inherit existing local_auth (if it's there). Kick it out by
      # passing None.
      context_edits = {
        'local_auth': local_auth_context,
      }

      # Extend existing LUCI_CONTEXT['swarming'], if any.
      if task_details.secret_bytes is not None:
        swarming = luci_context.read('swarming') or {}
        swarming['secret_bytes'] = task_details.secret_bytes
        context_edits['swarming'] = swarming

      # Returns the bot authentication headers dict or raises InternalError.
      def headers_cb():
        try:
          if auth_system:
            return auth_system.get_bot_headers()
          return (None, None)  # A timeout of "None" means "don't use auth"
        except bot_auth.AuthSystemError as e:
          raise InternalError('Failed to grab bot auth headers: %s' % e)

      # Make a client that can send requests to Swarming using bot auth
      # headers.
      grpc_proxy = ''
      if is_grpc:
        grpc_proxy = swarming_server
        swarming_server = ''
      # The hostname and work dir provided here don't really matter, since
      # the task runner is always called with a specific versioned URL.
      remote = remote_client.createRemoteClient(
          swarming_server, headers_cb, os_utilities.get_hostname_short(),
          work_dir, grpc_proxy)
      remote.initialize()

      # Let AuthSystem know it can now send RPCs to Swarming (to grab OAuth
      # tokens). There's a circular dependency here! AuthSystem will be
      # indirectly relying on its own 'get_bot_headers' method to
      # authenticate RPCs it sends through the provided client.
      if auth_system:
        auth_system.set_remote_client(remote)

      # The auth environment is up, start the command. task_result is dumped
      # to disk in the 'finally' block.
      with luci_context.stage(_tmpdir=work_dir, **context_edits) as ctx_file:
        task_result = run_command(
            remote, task_details, work_dir, cost_usd_hour, start,
            run_isolated_flags, bot_file, ctx_file)
  except (ExitSignal, InternalError, remote_client.InternalError) as e:
    # This normally means run_command() didn't get the chance to run, as it
    # itself traps exceptions and will report accordingly. In this case, we
    # want the parent process to send the message instead.
    if not task_result:
      task_result = {
        u'exit_code': -1,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure': str(e.message or 'unknown error'),
        u'version': OUT_VERSION,
      }
  finally:
    # We've seen tests delete the work directory work_dir when quitting,
    # causing an exception here. Try to recreate the directory if necessary.
    if not os.path.isdir(work_dir):
      os.mkdir(work_dir)
    if auth_system:
      auth_system.stop()
    with open(out_file, 'wb') as f:
      json.dump(task_result, f)
def load_and_run(
    in_file, swarming_server, cost_usd_hour, start, out_file, min_free_space,
    bot_file, auth_params_file):
  """Loads the task's metadata, prepares the auth environment and executes
  the task.

  This may throw all sorts of exceptions in case of failure. It's up to the
  caller to trap them. These shall be considered 'internal_failure' instead
  of 'failure' from a TaskRunResult standpoint.
  """
  auth_system = None
  task_result = None
  work_dir = os.path.dirname(out_file)

  def handler(sig, _):
    logging.info('Got signal %s', sig)
    raise ExitSignal(sig)

  try:
    with subprocess42.set_signal_handler([SIG_BREAK_OR_TERM], handler):
      # The work directory is guaranteed to exist since it was created by
      # bot_main.py and contains the manifest. Temporary files will be
      # downloaded there. It's bot_main.py that will delete the directory
      # afterward. Tests are not run from there.
      if not os.path.isdir(work_dir):
        raise InternalError('%s expected to exist' % work_dir)

      # Raises InternalError on errors.
      task_details = TaskDetails.load(in_file)

      # This will start a thread that occasionally reads bot authentication
      # headers from 'auth_params_file'. It will also optionally launch a
      # local HTTP server that serves OAuth tokens to the task processes. We
      # put the location of this service into a file referenced by the
      # LUCI_CONTEXT env var below.
      if auth_params_file:
        try:
          auth_system = bot_auth.AuthSystem(auth_params_file)
          auth_system.start()
        except bot_auth.AuthSystemError as e:
          raise InternalError('Failed to init auth: %s' % e)

      context_edits = {}

      # If the task is using service accounts, add local_auth details to
      # LUCI_CONTEXT.
      if auth_system and auth_system.local_auth_context:
        context_edits['local_auth'] = auth_system.local_auth_context

      # Returns the bot authentication headers dict or raises InternalError.
      def headers_cb():
        try:
          if auth_system:
            # The second parameter is the time until which the remote client
            # should cache the headers. Since auth_system is doing the
            # caching, we're just sending "0", which is to say the Epoch
            # (Jan 1 1970), which effectively means "never cache".
            return (auth_system.bot_headers, 0)
          return (None, None)  # A timeout of "None" means "don't use auth"
        except bot_auth.AuthSystemError as e:
          raise InternalError('Failed to grab bot auth headers: %s' % e)

      # The auth environment is up, start the command. task_result is dumped
      # to disk in the 'finally' block.
      remote = remote_client.createRemoteClient(swarming_server, headers_cb)
      with luci_context.write(_tmpdir=work_dir, **context_edits):
        task_result = run_command(
            remote, task_details, work_dir, cost_usd_hour, start,
            min_free_space, bot_file)
  except (ExitSignal, InternalError) as e:
    # This normally means run_command() didn't get the chance to run, as it
    # itself traps exceptions and will report accordingly. In this case, we
    # want the parent process to send the message instead.
    if not task_result:
      task_result = {
        u'exit_code': -1,
        u'hard_timeout': False,
        u'io_timeout': False,
        u'must_signal_internal_failure': str(e.message or 'unknown error'),
        u'version': OUT_VERSION,
      }
  finally:
    # We've seen tests delete the work directory work_dir when quitting,
    # causing an exception here. Try to recreate the directory if necessary.
    if not os.path.isdir(work_dir):
      os.mkdir(work_dir)
    if auth_system:
      auth_system.stop()
    with open(out_file, 'wb') as f:
      json.dump(task_result, f)
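# A hedged sketch (not part of the bot code) of the headers_cb contract used
# above: the callback returns a (headers, expiration) tuple, where expiration
# is a Unix timestamp up to which the remote client may cache the headers
# (0 means "never cache"; (None, None) means "don't use auth"). The token
# value and the 15-minute lifetime below are made up for illustration; how
# the expiration is honored depends on the RemoteClient implementation.
import time


def example_headers_cb():
  headers = {'Authorization': 'Bearer fake-token-for-illustration'}
  expiration = time.time() + 15 * 60  # let the client cache for ~15 minutes
  return (headers, expiration)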
def _run_command(self, task_details):
  # Do not mock time since this test class is testing timeouts.
  remote = remote_client.createRemoteClient('https://localhost:1', None)
  return task_runner.run_command(
      remote, task_details, self.work_dir, 3600., time.time(), 1,
      '/path/to/file')
def run_bot(arg_error):
  """Runs the bot until it reboots, self-updates, or a signal is received.

  When a signal is received, simply exit.
  """
  quit_bit = threading.Event()
  def handler(sig, _):
    logging.info('Got signal %s', sig)
    quit_bit.set()

  # TODO(maruel): Set quit_bit when stdin is closed on Windows.

  with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler):
    config = get_config()
    try:
      # First thing is to get an arbitrary url. This also ensures the network
      # is up and running, which is necessary before trying to get the FQDN
      # below. There's no need to do error handling here - the "ping" is just
      # to "wake up" the network; if there's something seriously wrong, the
      # handshake will fail and we'll handle it there.
      remote = remote_client.createRemoteClient(config['server'], None)
      remote.ping()
    except Exception:
      # url_read() already traps pretty much every exception. This except
      # clause is kept there "just in case".
      logging.exception('server_ping threw')

    # If we are on GCE, we want to make sure the GCE metadata server
    # responds, since we use the metadata to derive the bot ID, dimensions
    # and state.
    if platforms.is_gce():
      logging.info('Running on GCE, waiting for the metadata server')
      platforms.gce.wait_for_metadata(quit_bit)
      if quit_bit.is_set():
        logging.info('Early quit 1')
        return 0

    # Next we make sure the bot can make authenticated calls by grabbing the
    # auth headers, retrying on errors a bunch of times. We don't give up if
    # it fails though (maybe the bot will "fix itself" later).
    botobj = get_bot()
    try:
      botobj.remote.initialize(quit_bit)
    except remote_client.InitializationError as exc:
      botobj.post_error('failed to grab auth headers: %s' % exc.last_error)
      logging.error('Can\'t grab auth headers, continuing anyway...')

    if arg_error:
      botobj.post_error('Bootstrapping error: %s' % arg_error)

    if quit_bit.is_set():
      logging.info('Early quit 2')
      return 0

    call_hook(botobj, 'on_bot_startup')

    # Initial attributes passed to bot.Bot in get_bot above were constructed
    # for the 'fake' bot ID ('none'). Refresh them to match the real bot ID,
    # now that we have a fully initialized bot.Bot object. Note that
    # 'get_dimensions' and 'get_state' may depend on actions done by the
    # 'on_bot_startup' hook; that's why we do it here and not in 'get_bot'.
    botobj._update_dimensions(get_dimensions(botobj))
    botobj._update_state(get_state(botobj, 0))

    if quit_bit.is_set():
      logging.info('Early quit 3')
      return 0

    # This is the first authenticated request to the server. If the bot is
    # misconfigured, the request may fail with HTTP 401 or HTTP 403. Instead
    # of dying right away, spin in a loop, hoping the bot will "fix itself"
    # eventually. Authentication errors in /handshake are logged on the
    # server and generate error reports, so bots stuck in this state are
    # discoverable.
    sleep_time = 5
    while not quit_bit.is_set():
      resp = botobj.remote.do_handshake(botobj._attributes)
      if resp:
        logging.info('Connected to %s', resp.get('server_version'))
        if resp.get('bot_version') != botobj._attributes['version']:
          logging.warning(
              'Found out we\'ll need to update: server said %s; we\'re %s',
              resp.get('bot_version'), botobj._attributes['version'])
        # Remember the server-provided per-bot configuration. '/handshake' is
        # the only place where the server returns it. The bot will be sending
        # the 'bot_group_cfg_version' back in each /poll (as part of
        # 'state'), so that the server can instruct the bot to restart itself
        # when the config changes.
        cfg_version = resp.get('bot_group_cfg_version')
        if cfg_version:
          botobj._update_bot_group_cfg(
              cfg_version, resp.get('bot_group_cfg'))
        break
      logging.error(
          'Failed to contact for handshake, retrying in %d sec...',
          sleep_time)
      quit_bit.wait(sleep_time)
      sleep_time = min(300, sleep_time * 2)

    if quit_bit.is_set():
      logging.info('Early quit 4')
      return 0

    # Let the bot finish the initialization, now that it knows its
    # server-defined dimensions.
    call_hook(botobj, 'on_handshake')

    cleanup_bot_directory(botobj)
    clean_cache(botobj)

    if quit_bit.is_set():
      logging.info('Early quit 5')
      return 0

    # This environment variable is accessible to the tasks executed by this
    # bot.
    os.environ['SWARMING_BOT_ID'] = botobj.id.encode('utf-8')

    consecutive_sleeps = 0
    last_action = time.time()
    while not quit_bit.is_set():
      try:
        botobj._update_dimensions(get_dimensions(botobj))
        botobj._update_state(get_state(botobj, consecutive_sleeps))
        did_something = poll_server(botobj, quit_bit, last_action)
        if did_something:
          last_action = time.time()
          consecutive_sleeps = 0
        else:
          consecutive_sleeps += 1
      except Exception as e:
        logging.exception('poll_server failed')
        msg = '%s\n%s' % (e, traceback.format_exc()[-2048:])
        botobj.post_error(msg)
        consecutive_sleeps = 0
    logging.info('Quitting')

    # Tell the server we are going away.
    botobj.post_event('bot_shutdown', 'Signal was received')
    return 0
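# A small sketch (not part of the bot code) of the retry schedule used by the
# handshake loop in run_bot() above: capped exponential backoff starting at 5
# seconds and topping out at 300 seconds. The helper name and the 'attempts'
# parameter are made up for illustration.
def example_backoff_schedule(initial=5, cap=300, attempts=8):
  delays = []
  sleep_time = initial
  for _ in range(attempts):
    delays.append(sleep_time)
    sleep_time = min(cap, sleep_time * 2)
  return delays  # [5, 10, 20, 40, 80, 160, 300, 300]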