def start(self): """Starts the local Swarming bot.""" assert not self._proc bot_zip = os.path.join(self._tmpdir, 'swarming_bot.zip') urllib.urlretrieve(self._swarming_server_url + '/bot_code', bot_zip) cmd = [sys.executable, bot_zip, 'start_slave'] if self._redirect: logs = os.path.join(self._tmpdir, 'logs') if not os.path.isdir(logs): os.mkdir(logs) with open(os.path.join(logs, 'bot_stdout.log'), 'wb') as f: self._proc = subprocess42.Popen( cmd, cwd=self._tmpdir, stdout=f, stderr=f, detached=True) else: self._proc = subprocess42.Popen(cmd, cwd=self._tmpdir, detached=True)
def test_run_bot_signal(self): # Test SIGTERM signal handling. Run it as an external process to not mess # things up. proc = subprocess42.Popen( [sys.executable, self._zip_file, 'start_slave'], stdout=subprocess42.PIPE, stderr=subprocess42.PIPE, detached=True) # Wait for the grand-child process to poll the server. self._server.has_polled.wait(60) self.assertEqual(True, self._server.has_polled.is_set()) proc.terminate() out, _ = proc.communicate() self.assertEqual(0, proc.returncode) self.assertEqual('', out) events = self._server.get_events() for event in events: event.pop('dimensions') event.pop('state') event.pop('version') expected = [ { u'event': u'bot_shutdown', u'message': u'Signal was received', }, ] if sys.platform == 'win32': # Sadly, the signal handler generate an error. # TODO(maruel): Fix one day. self.assertEqual(u'bot_error', events.pop(0)['event']) self.assertEqual(expected, events)
def exec_python(args): """Executes a python process, replacing the current process if possible. On Windows, it returns the child process code. The caller must exit at the earliest opportunity. """ cmd = [sys.executable] + args if sys.platform not in ('cygwin', 'win32'): os.execv(cmd[0], cmd) return 1 try: # On Windows, we cannot sanely exec() so shell out the child process # instead. But we need to forward any signal received that the bot may care # about. This means processes accumulate, sadly. # TODO(maruel): If stdin closes, it tells the child process that the parent # process died. proc = subprocess42.Popen(cmd, detached=True, stdin=subprocess42.PIPE) def handler(sig, _): logging.info('Got signal %s', sig) # Always send SIGTERM, which is properly translated. proc.send_signal(signal.SIGTERM) sig = signal.SIGBREAK if sys.platform == 'win32' else signal.SIGTERM with subprocess42.set_signal_handler([sig], handler): proc.wait() return proc.returncode except Exception as e: logging.exception('failed to start: %s', e) # Swallow the exception. return 1
def _run(self, command, args): """Runs swarming.py and capture the stdout to a log file. The log file will be printed by the test framework in case of failure or verbose mode. Returns: The process exit code. """ name = os.path.join(self._tmpdir, 'client_%d.log' % self._index) self._index += 1 cmd = [ sys.executable, 'swarming.py', command, '-S', self._swarming_server, '--verbose', ] + args with open(name, 'wb') as f: f.write('\nRunning: %s\n' % ' '.join(cmd)) f.flush() p = subprocess42.Popen(cmd, stdout=f, stderr=subprocess42.STDOUT, cwd=CLIENT_DIR) p.communicate() return p.returncode
def test_containment_auto_limit_process(self): # Process creates a children process. It should fail, throwing not enough # quota. cmd = self._cmd_print_good() containment = subprocess42.Containment( containment_type=subprocess42.Containment.JOB_OBJECT, limit_processes=1) start = lambda: subprocess42.Popen(cmd, stdout=subprocess42.PIPE, stderr=subprocess42.PIPE, containment=containment) if sys.platform == 'win32': p = start() out, err = p.communicate() self.assertEqual(1, p.returncode) self.assertEqual('', out) self.assertIn('WindowsError', err) # Value for ERROR_NOT_ENOUGH_QUOTA. See # https://docs.microsoft.com/windows/desktop/debug/system-error-codes--1700-3999- self.assertIn('1816', err) else: # JOB_OBJECT is not usable on non-Windows. with self.assertRaises(NotImplementedError): start()
def test_kill_and_wait(self): # Test the case where the script swallows the SIGTERM/SIGBREAK signal and # hangs. script = os.path.join(self.root_dir, 'ignore_sigterm.py') with open(script, 'wb') as f: # The warning signal is received as SIGTERM on posix and SIGBREAK on # Windows. sig = 'SIGBREAK' if sys.platform == 'win32' else 'SIGTERM' f.write(( 'import signal, sys, time\n' 'def handler(_signum, _frame):\n' ' sys.stdout.write("got it\\n")\n' 'signal.signal(signal.%s, handler)\n' 'sys.stdout.write("ok\\n")\n' 'while True:\n' ' try:\n' ' time.sleep(1)\n' ' except IOError:\n' ' pass\n') % sig) cmd = [sys.executable, '-u', script] # detached=True is required on Windows for SIGBREAK to propagate properly. p = subprocess42.Popen(cmd, detached=True, stdout=subprocess42.PIPE) # Wait for it to write 'ok', so we know it's handling signals. It's # important because otherwise SIGTERM/SIGBREAK could be sent before the # signal handler is installed, and this is not what we're testing here. self.assertEqual('ok\n', p.stdout.readline()) # Send a SIGTERM/SIGBREAK, the process ignores it, send a SIGKILL. exit_code = task_runner.kill_and_wait(p, 0.1, 'testing purposes') expected = 1 if sys.platform == 'win32' else -signal.SIGKILL self.assertEqual(expected, exit_code) self.assertEqual('got it\n', p.stdout.readline())
def test_attached(self): is_win = (sys.platform == 'win32') cmd = [sys.executable, '-c', SCRIPT] # TODO(maruel): Make universal_newlines=True work and not hang. proc = subprocess42.Popen(cmd, detached=False, stdout=subprocess42.PIPE) try: if is_win: # There's something extremely weird happening on the Swarming bots in # the sense that they return 'hi\n' on Windows, while running the script # locally on Windows results in 'hi\r\n'. Author raises fist. self.assertIn(proc.recv_out(), ('hi\r\n', 'hi\n')) else: self.assertEqual('hi\n', proc.recv_out()) proc.terminate() if is_win: # If attached, it's hard killed. self.assertEqual(1, proc.wait()) self.assertEqual((None, None), proc.recv_any()) else: self.assertEqual(0, proc.wait()) self.assertEqual(('stdout', 'got signal 15\nbye\n'), proc.recv_any()) finally: # In case the test fails. proc.kill() proc.wait()
def get_output_sleep_proc(flush, unbuffered, sleep_duration): """Returns process with universal_newlines=True that prints to stdout before after a sleep. It also optionally sys.stdout.flush() before the sleep and optionally enable unbuffered output in python. """ command = [ 'import sys,time', 'print(\'A\')', ] if flush: # Sadly, this doesn't work otherwise in some combination. command.append('sys.stdout.flush()') command.extend(( 'time.sleep(%s)' % sleep_duration, 'print(\'B\')', )) cmd = [sys.executable, '-c', ';'.join(command)] if unbuffered: cmd.append('-u') return subprocess42.Popen(cmd, env=ENV, stdout=subprocess42.PIPE, universal_newlines=True)
def _Start(self): logging.info('Starting process %s', self) self.proc = subprocess42.Popen(self.cmd, stdout=subprocess42.PIPE, stderr=subprocess42.PIPE, detached=self.detached, cwd=self.cwd) threading.Thread(target=self._reader).start()
def test_communicate_input(self): cmd = [ sys.executable, '-u', '-c', 'import sys; sys.stdout.write(sys.stdin.read(5))', ] proc = subprocess42.Popen( cmd, stdin=subprocess42.PIPE, stdout=subprocess42.PIPE) out, err = proc.communicate(input='12345') self.assertEqual('12345', out) self.assertEqual(None, err)
def test_communicate_input_timeout(self): cmd = [sys.executable, '-u', '-c', 'import time; time.sleep(60)'] proc = subprocess42.Popen(cmd, stdin=subprocess42.PIPE) try: proc.communicate(input='12345', timeout=0.5) self.fail() except subprocess42.TimeoutExpired as e: self.assertEqual(None, e.output) self.assertEqual(None, e.stderr) self.assertTrue(proc.kill()) proc.wait() self.assertLess(0.5, proc.duration())
def _proc(self, err, **kwargs): # Do not use the -u flag here, we want to test when it is buffered by # default. See reference above about PYTHONUNBUFFERED. # That's why the two scripts uses .flush(). Sadly, the flush() call is # needed on Windows even for sys.stderr (!) cmd = [sys.executable, '-c', SCRIPT_ERR if err else SCRIPT_OUT] # TODO(maruel): Make universal_newlines=True work and not hang. if err: kwargs['stderr'] = subprocess42.PIPE else: kwargs['stdout'] = subprocess42.PIPE return subprocess42.Popen(cmd, **kwargs)
def test_communicate_timeout_no_pipe(self): # In this case, it's effectively a wait() call. cmd = [sys.executable, '-u', '-c', 'import time; time.sleep(60)'] proc = subprocess42.Popen(cmd) try: proc.communicate(timeout=0.5) self.fail() except subprocess42.TimeoutExpired as e: self.assertEqual(None, e.output) self.assertEqual(None, e.stderr) self.assertTrue(proc.kill()) proc.wait() self.assertLess(0.5, proc.duration())
def get_output_sleep_proc_err(sleep_duration): """Returns process with universal_newlines=True that prints to stderr before and after a sleep. """ command = [ 'import sys,time', 'sys.stderr.write(\'A\\n\')', ] command.extend(( 'time.sleep(%s)' % sleep_duration, 'sys.stderr.write(\'B\\n\')', )) cmd = [sys.executable, '-c', ';'.join(command)] return subprocess42.Popen( cmd, env=ENV, stderr=subprocess42.PIPE, universal_newlines=True)
def test_yield_any_returncode(self): proc = subprocess42.Popen( [sys.executable, '-c', 'import sys;sys.stdout.write("yo");sys.exit(1)'], stdout=subprocess42.PIPE) for p, d in proc.yield_any(): self.assertEqual('stdout', p) self.assertEqual('yo', d) # There was a bug where the second call to wait() would overwrite # proc.returncode with 0 when timeout is not None. self.assertEqual(1, proc.wait()) self.assertEqual(1, proc.wait(timeout=0)) self.assertEqual(1, proc.poll()) self.assertEqual(1, proc.returncode) # On Windows, the clock resolution is 15ms so Popen.duration() will likely # be 0. self.assertLessEqual(0, proc.duration())
def _test_lower_priority(self, lower_priority): if sys.platform == 'win32': cmd = [ sys.executable, '-u', '-c', 'import ctypes,sys; v=ctypes.windll.kernel32.GetPriorityClass(-1);' 'sys.stdout.write(hex(v))' ] else: cmd = [ sys.executable, '-u', '-c', 'import os,sys;sys.stdout.write(str(os.nice(0)))', ] proc = subprocess42.Popen( cmd, stdout=subprocess42.PIPE, lower_priority=lower_priority) out, err = proc.communicate() self.assertEqual(None, err) return out
def clean_cache(botobj): """Asks run_isolated to clean its cache. This may take a while but it ensures that in the case of a run_isolated run failed and it temporarily used more space than min_free_disk, it can cleans up the mess properly. It will remove unexpected files, remove corrupted files, trim the cache size based on the policies and update state.json. """ cmd = [ sys.executable, THIS_FILE, 'run_isolated', '--clean', '--log-file', os.path.join(botobj.base_dir, 'logs', 'run_isolated.log'), '--cache', os.path.join(botobj.base_dir, 'isolated_cache'), '--named-cache-root', os.path.join(botobj.base_dir, 'c'), '--min-free-space', str(get_min_free_space(botobj)), ] logging.info('Running: %s', cmd) try: # Intentionally do not use a timeout, it can take a while to hash 50gb but # better be safe than sorry. proc = subprocess42.Popen(cmd, stdin=subprocess42.PIPE, stdout=subprocess42.PIPE, stderr=subprocess42.STDOUT, cwd=botobj.base_dir, detached=True, close_fds=sys.platform != 'win32') output, _ = proc.communicate(None) logging.info('Result:\n%s', output) if proc.returncode: botobj.post_error( 'swarming_bot.zip failure during run_isolated --clean:\n%s' % output) except OSError: botobj.post_error( 'swarming_bot.zip internal failure during run_isolated --clean')
def test_kill_background(self): # Test process group killing. # Leaking a pipe through to the grandchild process is the only way to be # sure that we have a way to detect that this grandchild process is still # running or not. When all handles to the `w` end of the pipe have been # dropped, the `r` end will unblock. r, w = os.pipe() # Use fcntl instead of os.set_blocking because of python2 import fcntl fcntl.fcntl(r, fcntl.F_SETFL, fcntl.fcntl(r, fcntl.F_GETFL) | os.O_NONBLOCK) p = subprocess42.Popen( [sys.executable, self.output_script, 'out_leak'], stdout=w, detached=True) os.close(w) # close so we don't think that a child is hanging onto it. self.assertEqual(p.wait(), 0) # our immediate child has exited! with self.assertRaises(OSError): # oops, something still has a handle to this pipe! it's the grandchild! os.read(r, 1) # kill the group! That'll show 'em (unless the child actually daemonized, in # which case we're hosed). p.kill() # sleepy-loop until the pipe is closed, should take O(ms) but we generously # wait up to 5s. The sub-child will wait 30s and should outlive this loop if # somehow it survived the kill. now = time.time() while True: try: self.assertEqual(os.read(r, 1), b'') # i.e. EOF return except OSError as ex: if ex.errno != errno.EWOULDBLOCK: raise time.sleep(0.1) if time.time() - now > 5: raise Exception('pipe not unblocked after 5s, bailing')
def _start_task_runner(args, work_dir, ctx_file): """Starts task_runner process and returns its handle. Raises: _FailureOnStart if a problem occurred. """ cmd = get_run_isolated() args_path = os.path.join(work_dir, 'run_isolated_args.json') cmd.extend(['-a', args_path]) env = os.environ.copy() if ctx_file: env['LUCI_CONTEXT'] = ctx_file logging.info('cmd=%s', cmd) logging.info('cwd=%s', work_dir) logging.info('args=%s', args) # We write args to a file since there may be more of them than the OS # can handle. Since it is inside work_dir, it'll be automatically be deleted # upon task termination. try: with open(args_path, 'wb') as f: json.dump(args, f) except (IOError, OSError) as e: raise _FailureOnStart( e.errno or -1, 'Could not write args to %s: %s' % (args_path, e)) try: # TODO(maruel): Support separate streams for stdout and stderr. proc = subprocess42.Popen( cmd, env=env, cwd=work_dir, detached=True, stdout=subprocess42.PIPE, stderr=subprocess42.STDOUT, stdin=subprocess42.PIPE) except OSError as e: raise _FailureOnStart( e.errno or -1, 'Command "%s" failed to start.\nError: %s' % (' '.join(cmd), e)) return proc
def _get_output_sleep_proc(flush, env, duration): command = [ 'import sys,time', 'print(\'A\')', ] if flush: # Sadly, this doesn't work otherwise in some combination. command.append('sys.stdout.flush()') command.extend(( 'time.sleep(%s)' % duration, 'print(\'B\')', )) return subprocess42.Popen([ sys.executable, '-c', ';'.join(command), ], stdout=subprocess42.PIPE, universal_newlines=True, env=env)
def test_containment_auto_kill(self): # Test process killing. cmd = [ sys.executable, '-u', '-c', 'import sys,time; print("hi");time.sleep(60)', ] containment = subprocess42.Containment( containment_type=subprocess42.Containment.AUTO, limit_processes=1, limit_total_committed_memory=1024*1024*1024) p = subprocess42.Popen( cmd, stdout=subprocess42.PIPE, containment=containment) itr = p.yield_any_line() self.assertEqual(('stdout', 'hi'), next(itr)) p.kill() p.wait() if sys.platform != 'win32': # signal.SIGKILL is not defined on Windows. Validate our assumption here. self.assertEqual(9, signal.SIGKILL) self.assertEqual(-9, p.returncode)
def test_containment_auto_limit_memory(self): # Process allocates a lot of memory. It should fail due to quota. cmd = self._cmd_large_memory() containment = subprocess42.Containment( containment_type=subprocess42.Containment.JOB_OBJECT, # 20 MiB. limit_total_committed_memory=20*1024*1024) start = lambda: subprocess42.Popen( cmd, stdout=subprocess42.PIPE, stderr=subprocess42.PIPE, containment=containment) if sys.platform == 'win32': p = start() out, err = p.communicate() self.assertEqual(1, p.returncode) self.assertEqual('', out) self.assertIn('MemoryError', err) else: # JOB_OBJECT is not usable on non-Windows. with self.assertRaises(NotImplementedError): start()
def test_communicate_input_stdout_timeout(self): cmd = [ sys.executable, '-u', '-c', """ import sys, time sys.stdout.write(sys.stdin.read(5)) sys.stdout.flush() time.sleep(60) """, ] proc = subprocess42.Popen(cmd, stdin=subprocess42.PIPE, stdout=subprocess42.PIPE) try: proc.communicate(input=b'12345', timeout=2) self.fail() except subprocess42.TimeoutExpired as e: self.assertEqual(b'12345', e.output) self.assertEqual(None, e.stderr) self.assertTrue(proc.kill()) proc.wait() self.assertLessEqual(0.5, proc.duration())
def test_signal(self): # Tests when task_runner gets a SIGTERM. os.mkdir(os.path.join(self.root_dir, 'work')) signal_file = os.path.join(self.root_dir, 'work', 'signal') open(signal_file, 'wb').close() manifest = get_manifest( script='import os,time;os.remove(%r);time.sleep(60)' % signal_file, hard_timeout=60., io_timeout=60.) task_in_file = os.path.join(self.root_dir, 'task_runner_in.json') task_result_file = os.path.join(self.root_dir, 'task_runner_out.json') with open(task_in_file, 'wb') as f: json.dump(manifest, f) bot = os.path.join(self.root_dir, 'swarming_bot.1.zip') code, _ = fake_swarming.gen_zip(self._server.url) with open(bot, 'wb') as f: f.write(code) cmd = [ sys.executable, bot, 'task_runner', '--swarming-server', self._server.url, '--in-file', task_in_file, '--out-file', task_result_file, '--cost-usd-hour', '1', # Include the time taken to poll the task in the cost. '--start', str(time.time()), ] logging.info('%s', cmd) proc = subprocess42.Popen(cmd, cwd=self.root_dir, detached=True) # Wait for the child process to be alive. while os.path.isfile(signal_file): time.sleep(0.01) # Send SIGTERM to task_runner itself. Ensure the right thing happen. # Note that on Windows, this is actually sending a SIGBREAK since there's no # such thing as SIGTERM. proc.send_signal(signal.SIGTERM) proc.wait() task_runner_log = os.path.join(self.root_dir, 'logs', 'task_runner.log') with open(task_runner_log, 'rb') as f: logging.info('task_runner.log:\n---\n%s---', f.read()) expected = { u'exit_code': 0, u'hard_timeout': False, u'io_timeout': False, u'must_signal_internal_failure': None, u'version': task_runner.OUT_VERSION, } self.assertEqual([], self._server.get_events()) tasks = self._server.get_tasks() for task in tasks.itervalues(): for event in task: event.pop('cost_usd') event.pop('duration', None) expected = { '23': [ { u'id': u'localhost', u'task_id': 23, }, { u'exit_code': 1 if sys.platform == 'win32' else -signal.SIGTERM, u'hard_timeout': False, u'id': u'localhost', u'io_timeout': False, u'task_id': 23, }, ], } self.assertEqual(expected, tasks) expected = { 'swarming_bot.1.zip', '092b5bd4562f579711823f61e311de37247c853a-cacert.pem', 'work', 'logs', # TODO(maruel): Move inside work. 'task_runner_in.json', 'task_runner_out.json', } self.assertEqual(expected, set(os.listdir(self.root_dir))) expected = { u'exit_code': 1 if sys.platform == 'win32' else -signal.SIGTERM, u'hard_timeout': False, u'io_timeout': False, u'must_signal_internal_failure': u'task_runner received signal %d' % task_runner.SIG_BREAK_OR_TERM, u'version': 3, } with open(task_result_file, 'rb') as f: self.assertEqual(expected, json.load(f)) self.assertEqual(0, proc.returncode)
def ensure(self, site_root, packages, cache_dir=None, tmp_dir=None, timeout=None): """Ensures that packages installed in |site_root| equals |packages| set. Blocking call. Args: site_root (str): where to install packages. packages: dict of subdir -> list of (package_template, version) tuples. cache_dir (str): if set, cache dir for cipd binary own cache. Typically contains packages and tags. tmp_dir (str): if not None, dir for temp files. timeout (int): if not None, timeout in seconds for this function to run. Returns: Pinned packages in the form of {subdir: [(package_name, package_id)]}, which correspond 1:1 with the input packages argument. Raises: Error if could not install packages or timed out. """ timeoutfn = tools.sliding_timeout(timeout) logging.info('Installing packages %r into %s', packages, site_root) ensure_file_handle, ensure_file_path = tempfile.mkstemp( dir=tmp_dir, prefix=u'cipd-ensure-file-', suffix='.txt') json_out_file_handle, json_file_path = tempfile.mkstemp( dir=tmp_dir, prefix=u'cipd-ensure-result-', suffix='.json') os.close(json_out_file_handle) try: try: for subdir, pkgs in sorted(packages.iteritems()): if '\n' in subdir: raise Error( 'Could not install packages; subdir %r contains newline' % subdir) os.write(ensure_file_handle, '@Subdir %s\n' % (subdir, )) for pkg, version in pkgs: os.write(ensure_file_handle, '%s %s\n' % (pkg, version)) finally: os.close(ensure_file_handle) cmd = [ self.binary_path, 'ensure', '-root', site_root, '-ensure-file', ensure_file_path, '-verbose', # this is safe because cipd-ensure does not print a lot '-json-output', json_file_path, ] if cache_dir: cmd += ['-cache-dir', cache_dir] if self.service_url: cmd += ['-service-url', self.service_url] logging.debug('Running %r', cmd) process = subprocess42.Popen(cmd, stdout=subprocess42.PIPE, stderr=subprocess42.PIPE) output = [] for pipe_name, line in process.yield_any_line(timeout=0.1): to = timeoutfn() if to is not None and to <= 0: raise Error( 'Could not install packages; took more than %d seconds' % timeout) if not pipe_name: # stdout or stderr was closed, but yield_any_line still may have # something to yield. continue output.append(line) if pipe_name == 'stderr': logging.debug('cipd client: %s', line) else: logging.info('cipd client: %s', line) exit_code = process.wait(timeout=timeoutfn()) if exit_code != 0: raise Error( 'Could not install packages; exit code %d\noutput:%s' % (exit_code, '\n'.join(output))) with open(json_file_path) as jfile: result_json = json.load(jfile) return { subdir: [(x['package'], x['instance_id']) for x in pins] for subdir, pins in result_json['result'].iteritems() } finally: fs.remove(ensure_file_path) fs.remove(json_file_path)
def run_command(swarming_server, task_details, root_dir, cost_usd_hour, task_start, json_file): """Runs a command and sends packets to the server to stream results back. Implements both I/O and hard timeouts. Sends the packets numbered, so the server can ensure they are processed in order. Returns: Child process exit code. """ # Signal the command is about to be started. last_packet = start = now = monotonic_time() params = { 'cost_usd': cost_usd_hour * (now - task_start) / 60. / 60., 'id': task_details.bot_id, 'task_id': task_details.task_id, } post_update(swarming_server, params, None, '', 0) logging.info('Executing: %s', task_details.command) # TODO(maruel): Support both channels independently and display stderr in red. env = None if task_details.env: env = os.environ.copy() env.update(task_details.env) try: proc = subprocess42.Popen(task_details.command, env=env, cwd=root_dir, detached=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) except OSError as e: stdout = 'Command "%s" failed to start.\nError: %s' % (' '.join( task_details.command), e) now = monotonic_time() params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60. params['duration'] = now - start params['io_timeout'] = False params['hard_timeout'] = False post_update(swarming_server, params, 1, stdout, 0) return 1 output_chunk_start = 0 stdout = '' exit_code = None had_hard_timeout = False had_io_timeout = False timed_out = None try: calc = lambda: calc_yield_wait(task_details, start, last_io, timed_out, stdout) maxsize = lambda: MAX_CHUNK_SIZE - len(stdout) last_io = monotonic_time() for _, new_data in proc.yield_any(maxsize=maxsize, soft_timeout=calc): now = monotonic_time() if new_data: stdout += new_data last_io = now # Post update if necessary. if should_post_update(stdout, now, last_packet): last_packet = monotonic_time() params['cost_usd'] = (cost_usd_hour * (last_packet - task_start) / 60. / 60.) post_update(swarming_server, params, None, stdout, output_chunk_start) output_chunk_start += len(stdout) stdout = '' # Send signal on timeout if necessary. Both are failures, not # internal_failures. # Eventually kill but return 0 so bot_main.py doesn't cancel the task. if not timed_out: if now - last_io > task_details.io_timeout: had_io_timeout = True logging.warning('I/O timeout') proc.terminate() timed_out = monotonic_time() elif now - start > task_details.hard_timeout: had_hard_timeout = True logging.warning('Hard timeout') proc.terminate() timed_out = monotonic_time() else: # During grace period. if now >= timed_out + task_details.grace_period: # Now kill for real. The user can distinguish between the following # states: # - signal but process exited within grace period, # (hard_|io_)_timed_out will be set but the process exit code will # be script provided. # - processed exited late, exit code will be -9 on posix. try: logging.warning('proc.kill() after grace') proc.kill() except OSError: pass logging.info('Waiting for proces exit') exit_code = proc.wait() logging.info('Waiting for proces exit - done') finally: # Something wrong happened, try to kill the child process. if exit_code is None: had_hard_timeout = True try: logging.warning('proc.kill() in finally') proc.kill() except OSError: # The process has already exited. pass # TODO(maruel): We'd wait only for X seconds. logging.info('Waiting for proces exit in finally') exit_code = proc.wait() logging.info('Waiting for proces exit in finally - done') # This is the very last packet for this command. now = monotonic_time() params['cost_usd'] = cost_usd_hour * (now - task_start) / 60. / 60. params['duration'] = now - start params['io_timeout'] = had_io_timeout params['hard_timeout'] = had_hard_timeout # At worst, it'll re-throw, which will be caught by bot_main.py. post_update(swarming_server, params, exit_code, stdout, output_chunk_start) output_chunk_start += len(stdout) stdout = '' with open(json_file, 'w') as fd: json.dump({'exit_code': exit_code, 'version': 1}, fd) logging.info('run_command() = %s', exit_code) assert not stdout return exit_code
def run_command(command, cwd, env, hard_timeout, grace_period): """Runs the command. Returns: tuple(process exit code, bool if had a hard timeout) """ logging.info('run_command(%s, %s)' % (command, cwd)) exit_code = None had_hard_timeout = False with tools.Profiler('RunTest'): proc = None had_signal = [] try: # TODO(maruel): This code is imperfect. It doesn't handle well signals # during the download phase and there's short windows were things can go # wrong. def handler(signum, _frame): if proc and not had_signal: logging.info('Received signal %d', signum) had_signal.append(True) raise subprocess42.TimeoutExpired(command, None) proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True) with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler): try: exit_code = proc.wait(hard_timeout or None) except subprocess42.TimeoutExpired: if not had_signal: logging.warning('Hard timeout') had_hard_timeout = True logging.warning('Sending SIGTERM') proc.terminate() # Ignore signals in grace period. Forcibly give the grace period to the # child process. if exit_code is None: ignore = lambda *_: None with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, ignore): try: exit_code = proc.wait(grace_period or None) except subprocess42.TimeoutExpired: # Now kill for real. The user can distinguish between the # following states: # - signal but process exited within grace period, # hard_timed_out will be set but the process exit code will be # script provided. # - processed exited late, exit code will be -9 on posix. logging.warning('Grace exhausted; sending SIGKILL') proc.kill() logging.info('Waiting for proces exit') exit_code = proc.wait() except OSError: # This is not considered to be an internal error. The executable simply # does not exit. sys.stderr.write( '<The executable does not exist or a dependent library is missing>\n' '<Check for missing .so/.dll in the .isolate or GN file>\n' '<Command: %s>\n' % command) if os.environ.get('SWARMING_TASK_ID'): # Give an additional hint when running as a swarming task. sys.stderr.write( '<See the task\'s page for commands to help diagnose this issue ' 'by reproducing the task locally>\n') exit_code = 1 logging.info('Command finished with exit code %d (%s)', exit_code, hex(0xffffffff & exit_code)) return exit_code, had_hard_timeout
def test_recv_any(self): # Test all pipe direction and output scenarios. combinations = [ { 'cmd': ['out_print', 'err_print'], 'stdout': None, 'stderr': None, 'expected': {}, }, { 'cmd': ['out_print', 'err_print'], 'stdout': None, 'stderr': subprocess42.STDOUT, 'expected': {}, }, { 'cmd': ['out_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.PIPE, 'expected': {'stdout': 'printing'}, }, { 'cmd': ['out_print'], 'stdout': subprocess42.PIPE, 'stderr': None, 'expected': {'stdout': 'printing'}, }, { 'cmd': ['out_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.STDOUT, 'expected': {'stdout': 'printing'}, }, { 'cmd': ['err_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.PIPE, 'expected': {'stderr': 'printing'}, }, { 'cmd': ['err_print'], 'stdout': None, 'stderr': subprocess42.PIPE, 'expected': {'stderr': 'printing'}, }, { 'cmd': ['err_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.STDOUT, 'expected': {'stdout': 'printing'}, }, { 'cmd': ['out_print', 'err_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.PIPE, 'expected': {'stderr': 'printing', 'stdout': 'printing'}, }, { 'cmd': ['out_print', 'err_print'], 'stdout': subprocess42.PIPE, 'stderr': subprocess42.STDOUT, 'expected': {'stdout': 'printingprinting'}, }, ] for i, testcase in enumerate(combinations): cmd = [sys.executable, self.output_script] + testcase['cmd'] p = subprocess42.Popen( cmd, env=ENV, stdout=testcase['stdout'], stderr=testcase['stderr']) actual = {} while p.poll() is None: pipe, data = p.recv_any() if data: actual.setdefault(pipe, '') actual[pipe] += data # The process exited, read any remaining data in the pipes. while True: pipe, data = p.recv_any() if pipe is None: break actual.setdefault(pipe, '') actual[pipe] += data self.assertEqual( testcase['expected'], actual, (i, testcase['cmd'], testcase['expected'], actual)) self.assertEqual((None, None), p.recv_any()) self.assertEqual(0, p.returncode)
def test_communicate_timeout(self): timedout = 1 if sys.platform == 'win32' else -9 # Format is: # ( (cmd, stderr_pipe, timeout), (stdout, stderr, returncode) ), ... # See OUTPUT script for the meaning of the commands. test_data = [ # 0 means no timeout, like None. ( (['out_sleeping', '0.001', 'out_slept', 'err_print'], None, 0), ('Sleeping.\nSlept.\n', None, 0), ), ( (['err_print'], subprocess42.STDOUT, 0), ('printing', None, 0), ), ( (['err_print'], subprocess42.PIPE, 0), ('', 'printing', 0), ), # On a loaded system, this can be tight. ( (['out_sleeping', 'out_flush', '60', 'out_slept'], None, 0.5), ('Sleeping.\n', None, timedout), ), ( ( # Note that err_flush is necessary on Windows but not on the other # OSes. This means the likelihood of missing stderr output from a # killed child process on Windows is much higher than on other OSes. [ 'out_sleeping', 'out_flush', 'err_print', 'err_flush', '60', 'out_slept', ], subprocess42.PIPE, 0.5), ('Sleeping.\n', 'printing', timedout), ), ( (['out_sleeping', '0.001', 'out_slept'], None, 60), ('Sleeping.\nSlept.\n', None, 0), ), ] for i, ((args, errpipe, timeout), expected) in enumerate(test_data): proc = subprocess42.Popen( [sys.executable, self.output_script] + args, env=ENV, stdout=subprocess42.PIPE, stderr=errpipe) try: stdout, stderr = proc.communicate(timeout=timeout) code = proc.returncode except subprocess42.TimeoutExpired as e: stdout = e.output stderr = e.stderr self.assertTrue(proc.kill()) code = proc.wait() finally: duration = proc.duration() expected_duration = 0.0001 if not timeout or timeout == 60 else timeout self.assertTrue(duration > expected_duration, (i, expected_duration)) self.assertEqual( (i, stdout, stderr, code), (i, to_native_eol(expected[0]), to_native_eol(expected[1]), expected[2])) # Try again with universal_newlines=True. proc = subprocess42.Popen( [sys.executable, self.output_script] + args, env=ENV, stdout=subprocess42.PIPE, stderr=errpipe, universal_newlines=True) try: stdout, stderr = proc.communicate(timeout=timeout) code = proc.returncode except subprocess42.TimeoutExpired as e: stdout = e.output stderr = e.stderr self.assertTrue(proc.kill()) code = proc.wait() finally: duration = proc.duration() self.assertTrue(duration > expected_duration, (i, expected_duration)) self.assertEqual( (i, stdout, stderr, code), (i,) + expected)
def run_command(command, cwd, tmp_dir, hard_timeout, grace_period): """Runs the command. Returns: tuple(process exit code, bool if had a hard timeout) """ logging.info('run_command(%s, %s)' % (command, cwd)) sys.stdout.flush() env = os.environ.copy() if sys.platform == 'darwin': env['TMPDIR'] = tmp_dir.encode('ascii') elif sys.platform == 'win32': # Temporarily disable this behavior on Windows while investigating # https://crbug.com/533552. # env['TEMP'] = tmp_dir.encode('ascii') pass else: env['TMP'] = tmp_dir.encode('ascii') exit_code = None had_hard_timeout = False with tools.Profiler('RunTest'): proc = None had_signal = [] try: # TODO(maruel): This code is imperfect. It doesn't handle well signals # during the download phase and there's short windows were things can go # wrong. def handler(signum, _frame): if proc and not had_signal: logging.info('Received signal %d', signum) had_signal.append(True) raise subprocess42.TimeoutExpired(command, None) proc = subprocess42.Popen(command, cwd=cwd, env=env, detached=True) with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, handler): try: exit_code = proc.wait(hard_timeout or None) except subprocess42.TimeoutExpired: if not had_signal: logging.warning('Hard timeout') had_hard_timeout = True logging.warning('Sending SIGTERM') proc.terminate() # Ignore signals in grace period. Forcibly give the grace period to the # child process. if exit_code is None: ignore = lambda *_: None with subprocess42.set_signal_handler(subprocess42.STOP_SIGNALS, ignore): try: exit_code = proc.wait(grace_period or None) except subprocess42.TimeoutExpired: # Now kill for real. The user can distinguish between the # following states: # - signal but process exited within grace period, # hard_timed_out will be set but the process exit code will be # script provided. # - processed exited late, exit code will be -9 on posix. logging.warning('Grace exhausted; sending SIGKILL') proc.kill() logging.info('Waiting for proces exit') exit_code = proc.wait() except OSError: # This is not considered to be an internal error. The executable simply # does not exit. exit_code = 1 logging.info( 'Command finished with exit code %d (%s)', exit_code, hex(0xffffffff & exit_code)) return exit_code, had_hard_timeout