def main(args):
    """Runs the bot until run_bot() returns.

    Flags the environment as headless, refuses to start when another bot
    already holds the singleton for this directory, rotates the stdout/stderr
    logs, then hands control to run_bot(). The 'on_bot_shutdown' hook is
    called on the way out, whether run_bot() returned or raised.

    Args:
      args: command line arguments, handed to optparse.

    Returns:
      1 if the singleton could not be acquired, otherwise the value returned
      by run_bot().
    """
    # Subcommands read SWARMING_HEADLESS to learn that they are running in a
    # headless (non-interactive) mode.
    os.environ['SWARMING_HEADLESS'] = '1'

    # Option parsing is only kept so the unit test can use --help to quit the
    # process.
    option_parser = optparse.OptionParser(
        description=sys.modules[__name__].__doc__)
    _, args = option_parser.parse_args(args)

    # Only one process with a bot in this directory may run on this host at
    # once.
    #
    # launchd is a bit too 'restart-happy' and is the usual cause of 2 bots
    # running concurrently on the host, but it was observed on linux too.
    if not SINGLETON.acquire():
        print >> sys.stderr, 'Found a previous bot, %d exiting.' % os.getpid()
        return 1

    bot_dir = os.path.dirname(THIS_FILE)
    for stream in ('out', 'err'):
        std_log = os.path.join(bot_dir, 'logs', 'bot_std%s.log' % stream)
        os_utilities.roll_log(std_log)
        os_utilities.trim_rolled_log(std_log)

    # Leftover positional arguments are not fatal here; they are forwarded to
    # run_bot() as an error message.
    error = 'Unexpected arguments: %s' % args if args else None

    try:
        return run_bot(error)
    finally:
        shutdown_bot = bot.Bot(None, None, None, None, bot_dir, None)
        call_hook(shutdown_bot, 'on_bot_shutdown')
def main(args):
    """Runs the bot until run_bot() returns.

    Disables OS error reporting through subprocess42, flags the environment
    as headless, refuses to start when another bot already holds the
    singleton for this directory, rotates the stdout/stderr logs, then hands
    control to run_bot(). The 'on_bot_shutdown' hook is called on the way
    out, whether run_bot() returned or raised.

    Args:
      args: command line arguments, handed to optparse.

    Returns:
      1 if the singleton could not be acquired, otherwise the value returned
      by run_bot().
    """
    subprocess42.inhibit_os_error_reporting()

    # Subcommands read SWARMING_HEADLESS to learn that they are running in a
    # headless (non-interactive) mode.
    os.environ['SWARMING_HEADLESS'] = '1'

    # Option parsing is only kept so the unit test can use --help to quit the
    # process.
    option_parser = optparse.OptionParser(
        description=sys.modules[__name__].__doc__)
    _, args = option_parser.parse_args(args)

    # Only one process with a bot in this directory may run on this host at
    # once.
    if not SINGLETON.acquire():
        pid = os.getpid()
        if sys.platform != 'darwin':
            print >> sys.stderr, 'Found a previous bot, %d exiting.' % pid
        else:
            # On OSX the host is restarted instead of merely exiting; see the
            # bug referenced in the message.
            reason = (
                'Found a previous bot, %d rebooting as a workaround for '
                'https://crbug.com/569610.') % pid
            print >> sys.stderr, reason
            os_utilities.restart(reason)
        return 1

    bot_dir = os.path.dirname(THIS_FILE)
    for stream in ('out', 'err'):
        std_log = os.path.join(bot_dir, 'logs', 'bot_std%s.log' % stream)
        os_utilities.roll_log(std_log)
        os_utilities.trim_rolled_log(std_log)

    # Leftover positional arguments are not fatal here; they are forwarded to
    # run_bot() as an error message.
    error = 'Unexpected arguments: %s' % args if args else None

    try:
        return run_bot(error)
    finally:
        shutdown_bot = bot.Bot(None, None, None, None, bot_dir, None)
        call_hook(shutdown_bot, 'on_bot_shutdown')
        logging.info('main() returning')
def main(args):
    """Runs the bot until run_bot() returns.

    Flags the environment as headless, refuses to start when another bot
    already holds the singleton for this directory, rotates the stdout/stderr
    logs, then hands control to run_bot(). The 'on_bot_shutdown' hook is
    called on the way out, whether run_bot() returned or raised.

    Args:
      args: command line arguments, handed to optparse.

    Returns:
      1 if the singleton could not be acquired, otherwise the value returned
      by run_bot().
    """
    # Subcommands read SWARMING_HEADLESS to learn that they are running in a
    # headless (non-interactive) mode.
    os.environ['SWARMING_HEADLESS'] = '1'

    # Option parsing is only kept so the unit test can use --help to quit the
    # process.
    option_parser = optparse.OptionParser(
        description=sys.modules[__name__].__doc__)
    _, args = option_parser.parse_args(args)

    # Only one process with a bot in this directory may run on this host at
    # once.
    if not SINGLETON.acquire():
        pid = os.getpid()
        if sys.platform != 'darwin':
            print >> sys.stderr, 'Found a previous bot, %d exiting.' % pid
        else:
            # On OSX the host is restarted instead of merely exiting; see the
            # bug referenced in the message.
            reason = (
                'Found a previous bot, %d rebooting as a workaround for '
                'https://crbug.com/569610.') % pid
            print >> sys.stderr, reason
            os_utilities.restart(reason)
        return 1

    logs_dir = os.path.join(os.path.dirname(THIS_FILE), 'logs')
    for stream in ('out', 'err'):
        std_log = os.path.join(logs_dir, 'bot_std%s.log' % stream)
        os_utilities.roll_log(std_log)
        os_utilities.trim_rolled_log(std_log)

    # Leftover positional arguments are not fatal here; they are forwarded to
    # run_bot() as an error message.
    error = 'Unexpected arguments: %s' % args if args else None

    try:
        return run_bot(error)
    finally:
        shutdown_bot = bot.Bot(
            None, None, None, os.path.dirname(THIS_FILE), None)
        call_hook(shutdown_bot, 'on_bot_shutdown')
        logging.info('main() returning')
def run_manifest(botobj, manifest, start):
    """Executes one task by spawning the 'task_runner' subcommand.

    The manifest is written to a fresh work directory, a detached child
    process is started on it, and the JSON file the child writes is read back
    to decide the outcome. Before returning, the 'on_after_task' hook is
    called, an error is posted for the task if an internal failure was
    detected, and the work directory is deleted.

    Args:
      botobj: bot object; its base_dir, server and state attributes are read
          here, and it is handed to the hooks and error reporting helpers.
      manifest: dict describing the task. Keys read here: task_id,
          hard_timeout, grace_period, command, io_timeout, dimensions and,
          optionally, host.
      start: value forwarded to task_runner through --start.

    Returns:
      True if the task succeeded, False if it failed or task_runner had to be
      terminated. When an exception is caught inside the main body the
      function falls off the end and returns None.

    Raises:
      ValueError: if the manifest is empty.
    """
    # An empty manifest is rejected right away.
    if not manifest:
        raise ValueError('Empty manifest')

    # Everything needed to signal an internal_failure (which occurs when
    # task_runner fails to execute the command) is extracted before any I/O
    # is done, like writing the manifest to disk.
    task_id = manifest['task_id']
    hard_timeout = manifest['hard_timeout'] or None
    # The 30s default only applies to the wait done here, it doesn't affect
    # the grace period for the actual task.
    grace_period = manifest['grace_period'] or 30
    if hard_timeout is not None:
        # One grace period for the child process, one for run_isolated, one
        # for task_runner.
        hard_timeout += 3 * manifest['grace_period']
        if not manifest['command']:
            # For isolated task, download time is not counted for hard
            # timeout so add more time.
            hard_timeout += manifest['io_timeout'] or 600

    url = manifest.get('host', botobj.server)
    task_dimensions = manifest['dimensions']
    msg = None
    internal_failure = False
    failure = False
    task_result = {}
    work_dir = os.path.join(botobj.base_dir, 'work')
    try:
        try:
            if os.path.isdir(work_dir):
                file_path.rmtree(work_dir)
        except OSError:
            # A previous task left an undeleteable file/directory inside
            # 'work'. Following tasks must not be affected, so a temporary
            # directory is used instead. This is not normal behavior. The bot
            # will report a failure on start.
            work_dir = tempfile.mkdtemp(dir=botobj.base_dir, prefix='work')
        else:
            os.makedirs(work_dir)

        child_env = os.environ.copy()
        # Windows in particular does not tolerate unicode strings in
        # environment variables.
        child_env['SWARMING_TASK_ID'] = task_id.encode('ascii')

        task_in_file = os.path.join(work_dir, 'task_runner_in.json')
        with open(task_in_file, 'wb') as manifest_file:
            serialized = json.dumps(manifest)
            manifest_file.write(serialized)
        call_hook(botobj, 'on_before_task')

        task_result_file = os.path.join(work_dir, 'task_runner_out.json')
        if os.path.exists(task_result_file):
            os.remove(task_result_file)

        command = [
            sys.executable,
            THIS_FILE,
            'task_runner',
            '--swarming-server',
            url,
            '--in-file',
            task_in_file,
            '--out-file',
            task_result_file,
            '--cost-usd-hour',
            str(botobj.state.get('cost_usd_hour') or 0.),
            # Include the time taken to poll the task in the cost.
            '--start',
            str(start),
            '--min-free-space',
            str(get_min_free_space()),
        ]
        logging.debug('Running command: %s', command)

        # The child's output goes to a log file under the bot's base
        # directory, which should be the one containing swarming_bot.zip.
        log_path = os.path.join(
            botobj.base_dir, 'logs', 'task_runner_stdout.log')
        os_utilities.roll_log(log_path)
        os_utilities.trim_rolled_log(log_path)
        with open(log_path, 'a+b') as log_file:
            proc = subprocess42.Popen(
                command,
                detached=True,
                cwd=botobj.base_dir,
                env=child_env,
                stdin=subprocess42.PIPE,
                stdout=log_file,
                stderr=subprocess42.STDOUT,
                close_fds=sys.platform != 'win32')
            try:
                proc.wait(hard_timeout)
            except subprocess42.TimeoutExpired:
                # That's the last ditch effort; task_runner should have
                # completed a while ago and had enforced the timeout itself
                # (or run_isolated for hard_timeout for isolated task).
                msg = 'task_runner hung'
                internal_failure = True
                logging.error('Sending SIGTERM to task_runner')
                proc.terminate()
                try:
                    proc.wait(grace_period)
                except subprocess42.TimeoutExpired:
                    logging.error('Sending SIGKILL to task_runner')
                    proc.kill()
                proc.wait()
                return False

        logging.info('task_runner exit: %d', proc.returncode)
        if os.path.exists(task_result_file):
            with open(task_result_file, 'rb') as result_file:
                task_result = json.load(result_file)

        # The checks below are ordered: a non-zero exit code wins over
        # missing metadata, which wins over a failure reported in metadata.
        if proc.returncode:
            internal_failure = True
            msg = 'Execution failed: internal error (%d).' % proc.returncode
        elif not task_result:
            logging.warning('task_runner failed to write metadata')
            internal_failure = True
            msg = 'Execution failed: internal error (no metadata).'
        elif task_result[u'must_signal_internal_failure']:
            msg = (
                'Execution failed: %s' %
                task_result[u'must_signal_internal_failure'])
            internal_failure = True

        failure = bool(task_result and task_result.get('exit_code'))
        return not (internal_failure or failure)
    except Exception as exc:
        # Failures include IOError when writing if the disk is full, OSError
        # if swarming_bot.zip doesn't exist anymore, etc.
        logging.exception('run_manifest failed')
        msg = 'Internal exception occured: %s\n%s' % (
            exc, traceback.format_exc()[-2048:])
        internal_failure = True
    finally:
        if internal_failure:
            post_error_task(botobj, msg, task_id)
        call_hook(
            botobj, 'on_after_task', failure, internal_failure,
            task_dimensions, task_result)
        if os.path.isdir(work_dir):
            try:
                file_path.rmtree(work_dir)
            except Exception as exc:
                botobj.post_error(
                    'Failed to delete work directory %s: %s' %
                    (work_dir, exc))
def run_manifest(botobj, manifest, start):
    """Executes one task by spawning the 'task_runner' subcommand.

    The manifest is written to a fresh work directory, a detached child
    process is started on it, and the JSON file the child writes is read back
    to decide the outcome. Before returning, the 'on_after_task' hook is
    called, an error is posted for the task if an internal failure was
    detected, and the work directory is deleted.

    Args:
      botobj: bot object; its base_dir, remote.url and state attributes are
          read here, and it is handed to the hooks and error reporting
          helpers.
      manifest: dict describing the task. Keys read here: task_id,
          hard_timeout, grace_period, command, io_timeout, dimensions and,
          optionally, host.
      start: value forwarded to task_runner through --start.

    Returns:
      True if the task succeeded, False if it failed or task_runner had to be
      terminated. When an exception is caught inside the main body the
      function falls off the end and returns None.

    Raises:
      ValueError: if the manifest is empty.
    """
    # An empty manifest is rejected right away.
    if not manifest:
        raise ValueError('Empty manifest')

    # Everything needed to signal an internal_failure (which occurs when
    # task_runner fails to execute the command) is extracted before any I/O
    # is done, like writing the manifest to disk.
    task_id = manifest['task_id']
    hard_timeout = manifest['hard_timeout'] or None
    # The 30s default only applies to the wait done here, it doesn't affect
    # the grace period for the actual task.
    grace_period = manifest['grace_period'] or 30
    if hard_timeout is not None:
        # One grace period for the child process, one for run_isolated, one
        # for task_runner.
        hard_timeout += 3 * manifest['grace_period']
        if not manifest['command']:
            # For isolated task, download time is not counted for hard
            # timeout so add more time.
            hard_timeout += manifest['io_timeout'] or 600

    url = manifest.get('host', botobj.remote.url)
    task_dimensions = manifest['dimensions']
    msg = None
    internal_failure = False
    failure = False
    task_result = {}
    work_dir = os.path.join(botobj.base_dir, 'work')
    try:
        try:
            if os.path.isdir(work_dir):
                file_path.rmtree(work_dir)
        except OSError:
            # A previous task left an undeleteable file/directory inside
            # 'work'. Following tasks must not be affected, so a temporary
            # directory is used instead. This is not normal behavior. The bot
            # will report a failure on start.
            work_dir = tempfile.mkdtemp(dir=botobj.base_dir, prefix='work')
        else:
            os.makedirs(work_dir)

        child_env = os.environ.copy()
        # Windows in particular does not tolerate unicode strings in
        # environment variables.
        child_env['SWARMING_TASK_ID'] = task_id.encode('ascii')

        task_in_file = os.path.join(work_dir, 'task_runner_in.json')
        with open(task_in_file, 'wb') as manifest_file:
            serialized = json.dumps(manifest)
            manifest_file.write(serialized)
        call_hook(botobj, 'on_before_task')

        task_result_file = os.path.join(work_dir, 'task_runner_out.json')
        if os.path.exists(task_result_file):
            os.remove(task_result_file)

        command = [
            sys.executable,
            THIS_FILE,
            'task_runner',
            '--swarming-server',
            url,
            '--in-file',
            task_in_file,
            '--out-file',
            task_result_file,
            '--cost-usd-hour',
            str(botobj.state.get('cost_usd_hour') or 0.),
            # Include the time taken to poll the task in the cost.
            '--start',
            str(start),
        ]
        logging.debug('Running command: %s', command)

        # The child's output goes to a log file under the bot's base
        # directory, which should be the one containing swarming_bot.zip.
        log_path = os.path.join(
            botobj.base_dir, 'logs', 'task_runner_stdout.log')
        os_utilities.roll_log(log_path)
        os_utilities.trim_rolled_log(log_path)
        with open(log_path, 'a+b') as log_file:
            proc = subprocess42.Popen(
                command,
                detached=True,
                cwd=botobj.base_dir,
                env=child_env,
                stdout=log_file,
                stderr=subprocess42.STDOUT)
            try:
                proc.wait(hard_timeout)
            except subprocess42.TimeoutExpired:
                # That's the last ditch effort; task_runner should have
                # completed a while ago and had enforced the timeout itself
                # (or run_isolated for hard_timeout for isolated task).
                msg = 'task_runner hung'
                internal_failure = True
                logging.error('Sending SIGTERM to task_runner')
                proc.terminate()
                try:
                    proc.wait(grace_period)
                except subprocess42.TimeoutExpired:
                    logging.error('Sending SIGKILL to task_runner')
                    proc.kill()
                proc.wait()
                return False

        logging.info('task_runner exit: %d', proc.returncode)
        if os.path.exists(task_result_file):
            with open(task_result_file, 'rb') as result_file:
                task_result = json.load(result_file)

        # The checks below are ordered: a non-zero exit code wins over
        # missing metadata, which wins over a failure reported in metadata.
        if proc.returncode:
            internal_failure = True
            msg = 'Execution failed: internal error (%d).' % proc.returncode
        elif not task_result:
            logging.warning('task_runner failed to write metadata')
            internal_failure = True
            msg = 'Execution failed: internal error (no metadata).'
        elif task_result[u'must_signal_internal_failure']:
            msg = (
                'Execution failed: %s' %
                task_result[u'must_signal_internal_failure'])
            internal_failure = True

        failure = bool(task_result and task_result.get('exit_code'))
        return not (internal_failure or failure)
    except Exception as exc:
        # Failures include IOError when writing if the disk is full, OSError
        # if swarming_bot.zip doesn't exist anymore, etc.
        logging.exception('run_manifest failed')
        msg = 'Internal exception occured: %s\n%s' % (
            exc, traceback.format_exc()[-2048:])
        internal_failure = True
    finally:
        if internal_failure:
            post_error_task(botobj, msg, task_id)
        call_hook(
            botobj, 'on_after_task', failure, internal_failure,
            task_dimensions, task_result)
        if os.path.isdir(work_dir):
            try:
                file_path.rmtree(work_dir)
            except Exception as exc:
                botobj.post_error(
                    'Failed to delete work directory: %s' % exc)