def _connect_transport(self, pkey): for _ in xrange(self.CONNECT_TIMEOUT_RETRIES): transport = paramiko.Transport(self._connect_socket()) completed = threading.Event() transport.start_client(completed) completed.wait(self.CONNECT_TIMEOUT_SECONDS) if completed.isSet(): self._check_transport_error(transport) completed.clear() transport.auth_publickey(self.user, pkey, completed) completed.wait(self.CONNECT_TIMEOUT_SECONDS) if completed.isSet(): self._check_transport_error(transport) if not transport.is_authenticated(): transport.close() raise paramiko.AuthenticationException() return transport logging.warn("SSH negotiation (%s:%d) timed out, retrying", self.hostname, self.port) # HACK: we can't count on transport.join not hanging now, either transport.join = lambda: None transport.close() logging.error( "SSH negotation (%s:%d) has timed out %s times, " "giving up", self.hostname, self.port, self.CONNECT_TIMEOUT_RETRIES) raise error.AutoservSSHTimeout("SSH negotiation timed out")
def ssh_ping(self, timeout=60, base_cmd='true'): """ Pings remote host via ssh. @param timeout: Time in seconds before giving up. Defaults to 60 seconds. @param base_cmd: The base command to run with the ssh ping. Defaults to true. @raise AutoservSSHTimeout: If the ssh ping times out. @raise AutoservSshPermissionDeniedError: If ssh ping fails due to permissions. @raise AutoservSshPingHostError: For other AutoservRunErrors. """ try: self.run(base_cmd, timeout=timeout, connect_timeout=timeout) except error.AutoservSSHTimeout: msg = "Host (ssh) verify timed out (timeout = %d)" % timeout raise error.AutoservSSHTimeout(msg) except error.AutoservSshPermissionDeniedError: #let AutoservSshPermissionDeniedError be visible to the callers raise except error.AutoservRunError, e: # convert the generic AutoservRunError into something more # specific for this context raise error.AutoservSshPingHostError(e.description + '\n' + repr(e.result_obj))
def testTriggerUpdate(self): """Tests that we correctly handle updater errors.""" update_url = 'http://server/test/url' host = self.mox.CreateMockAnything() self.mox.StubOutWithMock(host, 'run') host.hostname = 'test_host' expected_cmd = ('/usr/bin/update_engine_client --check_for_update ' '--omaha_url=http://server/test/url') updater = autoupdater.ChromiumOSUpdater(update_url, host=host) # Test with success. host.run(expected_cmd) # SSH Timeout host.run(expected_cmd).AndRaise( error.AutoservSSHTimeout("ssh timed out", 255)) # SSH Permission Error host.run(expected_cmd).AndRaise( error.AutoservSshPermissionDeniedError("ssh timed out", 255)) # Command Failed Error cmd_result_1 = self.mox.CreateMockAnything() cmd_result_1.exit_status = 1 host.run(expected_cmd).AndRaise( error.AutoservRunError("ssh timed out", cmd_result_1)) # Generic SSH Error (maybe) cmd_result_255 = self.mox.CreateMockAnything() cmd_result_255.exit_status = 255 host.run(expected_cmd).AndRaise( error.AutoservRunError("Sometimes SSH specific result.", cmd_result_255)) self.mox.ReplayAll() # Verify Success. updater.trigger_update() # Verify each type of error listed above. self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.mox.VerifyAll()
def _execute_daemon(self, timeout, stderr_redirector, client_disconnect_timeout): monitor_dir = self.host.get_tmp_dir() daemon_cmd = self.get_daemon_cmd(monitor_dir) # grab the location for the server-side client log file client_log_prefix = self.get_client_log() client_log_path = os.path.join(self.results_dir, 'debug', client_log_prefix + '.log') client_log = open(client_log_path, 'w', 0) self.copy_client_config_file(client_log_prefix) stdout_read = stderr_read = 0 self.host.job.push_execution_context(self.results_dir) try: self.host.run(daemon_cmd, ignore_status=True, timeout=timeout) disconnect_warnings = [] while True: monitor_cmd = self.get_monitor_cmd(monitor_dir, stdout_read, stderr_read) try: result = self.host.run(monitor_cmd, ignore_status=True, timeout=timeout, stdout_tee=client_log, stderr_tee=stderr_redirector) except error.AutoservRunError, e: result = e.result_obj result.exit_status = None disconnect_warnings.append(e.description) stderr_redirector.log_warning( "Autotest client was disconnected: %s" % e.description, "NETWORK") except error.AutoservSSHTimeout: result = utils.CmdResult(monitor_cmd, "", "", None, 0) stderr_redirector.log_warning( "Attempt to connect to Autotest client timed out", "NETWORK") stdout_read += len(result.stdout) stderr_read += len(self._strip_stderr_prologue(result.stderr)) if result.exit_status is not None: # TODO (crosbug.com/38224)- sbasi: Remove extra logging. logging.debug('Result exit status is %d.', result.exit_status) return result elif not self.host.wait_up(client_disconnect_timeout): raise error.AutoservSSHTimeout( "client was disconnected, reconnect timed out")
def ssh_ping(self, timeout=60): try: self.run("true", timeout=timeout, connect_timeout=timeout) except error.AutoservSSHTimeout: msg = "Host (ssh) verify timed out (timeout = %d)" % timeout raise error.AutoservSSHTimeout(msg) except error.AutoservSshPermissionDeniedError: #let AutoservSshPermissionDeniedError be visible to the callers raise except error.AutoservRunError, e: # convert the generic AutoservRunError into something more # specific for this context raise error.AutoservSshPingHostError(e.description + '\n' + repr(e.result_obj))
def run(self, command, timeout=3600, ignore_status=False, stdout_tee=utils.TEE_TO_LOGS, stderr_tee=utils.TEE_TO_LOGS, connect_timeout=30, stdin=None, verbose=True, args=(), ignore_timeout=False): """ Run a command on the remote host. @see common_lib.hosts.host.run() @param connect_timeout: connection timeout (in seconds) @param options: string with additional ssh command options @param verbose: log the commands @param ignore_timeout: bool True command timeouts should be ignored. Will return None on command timeout. @raises AutoservRunError: if the command failed @raises AutoservSSHTimeout: ssh connection has timed out """ stdout = utils.get_stream_tee_file(stdout_tee, utils.DEFAULT_STDOUT_LEVEL, prefix=utils.STDOUT_PREFIX) stderr = utils.get_stream_tee_file( stderr_tee, utils.get_stderr_level(ignore_status), prefix=utils.STDERR_PREFIX) for arg in args: command += ' "%s"' % utils.sh_escape(arg) if verbose: logging.debug("Running (ssh-paramiko) '%s'", command) # start up the command start_time = time.time() try: channel = self._open_channel(timeout) channel.exec_command(command) except (socket.error, paramiko.SSHException, EOFError), e: # This has to match the string from paramiko *exactly*. if str(e) != 'Channel closed.': raise error.AutoservSSHTimeout("ssh failed: %s" % e)
def _open_channel(self, timeout): start_time = time.time() if os.getpid() != self.pid: if self.pid is not None: # HACK: paramiko tries to join() on its worker thread # and this just hangs on linux after a fork() self.transport.join = lambda: None self.transport.atfork() join_hook = lambda cmd: self._close_transport() subcommand.subcommand.register_join_hook(join_hook) logging.debug("Reopening SSH connection after a process fork") self._init_transport() channel = None try: channel = self.transport.open_session() except (socket.error, paramiko.SSHException, EOFError), e: logging.warn("Exception occured while opening session: %s", e) if time.time() - start_time >= timeout: raise error.AutoservSSHTimeout("ssh failed: %s" % e)
def _run(self, command, timeout, ignore_status, stdout, stderr, connect_timeout, env, options, stdin, args): """Helper function for run().""" ssh_cmd = self.ssh_command(connect_timeout, options) if not env.strip(): env = "" else: env = "export %s;" % env for arg in args: command += ' "%s"' % utils.sh_escape(arg) full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command)) result = utils.run(full_cmd, timeout, True, stdout, stderr, verbose=False, stdin=stdin, stderr_is_expected=ignore_status) # The error messages will show up in band (indistinguishable # from stuff sent through the SSH connection), so we have the # remote computer echo the message "Connected." before running # any command. Since the following 2 errors have to do with # connecting, it's safe to do these checks. if result.exit_status == 255: if re.search( r'^ssh: connect to host .* port .*: ' r'Connection timed out\r$', result.stderr): raise error.AutoservSSHTimeout("ssh timed out", result) if "Permission denied." in result.stderr: msg = "ssh permission denied" raise error.AutoservSshPermissionDeniedError(msg, result) if not ignore_status and result.exit_status > 0: raise error.AutoservRunError("command execution error", result) return result
def _run(self, command, timeout, ignore_status, stdout, stderr, connect_timeout, env, options, stdin, args, ignore_timeout): """Helper function for run().""" ssh_cmd = self.ssh_command(connect_timeout, options) if not env.strip(): env = "" else: env = "export %s;" % env for arg in args: command += ' "%s"' % utils.sh_escape(arg) full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command)) # TODO(jrbarnette): crbug.com/484726 - When we're in an SSP # container, sometimes shortly after reboot we will see DNS # resolution errors on ssh commands; the problem never # occurs more than once in a row. This especially affects # the autoupdate_Rollback test, but other cases have been # affected, too. # # We work around it by detecting the first DNS resolution error # and retrying exactly one time. dns_retry_count = 2 while True: result = utils.run(full_cmd, timeout, True, stdout, stderr, verbose=False, stdin=stdin, stderr_is_expected=ignore_status, ignore_timeout=ignore_timeout) dns_retry_count -= 1 if (result and result.exit_status == 255 and re.search( r'^ssh: .*: Name or service not known', result.stderr)): if dns_retry_count: logging.debug('Retrying because of DNS failure') continue logging.debug('Retry failed.') autotest_stats.Counter('dns_retry_hack.fail').increment() elif not dns_retry_count: logging.debug('Retry succeeded.') autotest_stats.Counter('dns_retry_hack.pass').increment() break if ignore_timeout and not result: return None # The error messages will show up in band (indistinguishable # from stuff sent through the SSH connection), so we have the # remote computer echo the message "Connected." before running # any command. Since the following 2 errors have to do with # connecting, it's safe to do these checks. if result.exit_status == 255: if re.search( r'^ssh: connect to host .* port .*: ' r'Connection timed out\r$', result.stderr): raise error.AutoservSSHTimeout("ssh timed out", result) if "Permission denied." in result.stderr: msg = "ssh permission denied" raise error.AutoservSshPermissionDeniedError(msg, result) if not ignore_status and result.exit_status > 0: raise error.AutoservRunError("command execution error", result) return result
def testTriggerUpdate(self): """Tests that we correctly handle updater errors.""" update_url = 'http://server/test/url' self.host = self.mox.CreateMockAnything() self.mox.StubOutWithMock(self.host, 'run') self.mox.StubOutWithMock(autoupdater.ChromiumOSUpdater, 'get_last_update_error') self.host.hostname = 'test_host' updater_control_bin = '/usr/bin/update_engine_client' test_url = 'http://server/test/url' expected_wait_cmd = ('%s -status | grep CURRENT_OP' % updater_control_bin) expected_cmd = ('%s --check_for_update --omaha_url=%s' % (updater_control_bin, test_url)) self.mox.StubOutWithMock(time, "sleep") UPDATE_ENGINE_RETRY_WAIT_TIME = 5 # Generic SSH Error. cmd_result_255 = self.mox.CreateMockAnything() cmd_result_255.exit_status = 255 # Command Failed Error cmd_result_1 = self.mox.CreateMockAnything() cmd_result_1.exit_status = 1 # Error 37 cmd_result_37 = self.mox.CreateMockAnything() cmd_result_37.exit_status = 37 updater = autoupdater.ChromiumOSUpdater(update_url, host=self.host) # (SUCCESS) Expect one wait command and one status command. self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd) # (SUCCESS) Test with one retry to wait for update-engine. self._host_run_for_update(expected_wait_cmd, exception=error.AutoservRunError( 'non-zero status', cmd_result_1)) time.sleep(UPDATE_ENGINE_RETRY_WAIT_TIME) self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd) # (SUCCESS) One-time SSH timeout, then success on retry. self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd, exception=error.AutoservSSHTimeout( 'ssh timed out', cmd_result_255)) self._host_run_for_update(expected_cmd) # (SUCCESS) One-time ERROR 37, then success. self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd, exception=error.AutoservRunError( 'ERROR_CODE=37', cmd_result_37)) self._host_run_for_update(expected_cmd) # (FAILURE) Bad status of update engine. self._host_run_for_update(expected_wait_cmd) self._host_run_for_update( expected_cmd, bad_update_status=True, exception=error.InstallError('host is not in installable state')) # (FAILURE) Two-time SSH timeout. self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd, exception=error.AutoservSSHTimeout( 'ssh timed out', cmd_result_255)) self._host_run_for_update(expected_cmd, exception=error.AutoservSSHTimeout( 'ssh timed out', cmd_result_255)) # (FAILURE) SSH Permission Error self._host_run_for_update(expected_wait_cmd) self._host_run_for_update( expected_cmd, exception=error.AutoservSshPermissionDeniedError( 'no permission', cmd_result_255)) # (FAILURE) Other ssh failure self._host_run_for_update(expected_wait_cmd) self._host_run_for_update( expected_cmd, exception=error.AutoservSshPermissionDeniedError( 'no permission', cmd_result_255)) # (FAILURE) Other error self._host_run_for_update(expected_wait_cmd) self._host_run_for_update(expected_cmd, exception=error.AutoservRunError( "unknown error", cmd_result_1)) self.mox.ReplayAll() # Expect success updater.trigger_update() updater.trigger_update() updater.trigger_update() updater.trigger_update() # Expect errors as listed above self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.assertRaises(autoupdater.RootFSUpdateError, updater.trigger_update) self.mox.VerifyAll()
def _run(self, command, timeout, ignore_status, stdout, stderr, connect_timeout, env, options, stdin, args, ignore_timeout, ssh_failure_retry_ok): """Helper function for run().""" ssh_cmd = self.ssh_command(connect_timeout, options) if not env.strip(): env = "" else: env = "export %s;" % env for arg in args: command += ' "%s"' % utils.sh_escape(arg) full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command)) # TODO(jrbarnette): crbug.com/484726 - When we're in an SSP # container, sometimes shortly after reboot we will see DNS # resolution errors on ssh commands; the problem never # occurs more than once in a row. This especially affects # the autoupdate_Rollback test, but other cases have been # affected, too. # # We work around it by detecting the first DNS resolution error # and retrying exactly one time. dns_error_retry_count = 1 def counters_inc(counter_name, failure_name): """Helper function to increment metrics counters. @param counter_name: string indicating which counter to use @param failure_name: string indentifying an error, or 'success' """ if counter_name == 'call': # ssh_counter records the outcome of each ssh invocation # inside _run(), including exceptions. ssh_counter = metrics.Counter('chromeos/autotest/ssh/calls') fields = { 'error': failure_name or 'success', 'attempt': ssh_call_count } ssh_counter.increment(fields=fields) if counter_name == 'run': # run_counter records each call to _run() with its result # and how many tries were made. Calls are recorded when # _run() exits (including exiting with an exception) run_counter = metrics.Counter('chromeos/autotest/ssh/runs') fields = { 'error': failure_name or 'success', 'attempt': ssh_call_count } run_counter.increment(fields=fields) # If ssh_failure_retry_ok is True, retry twice on timeouts and generic # error 255: if a simple retry doesn't work, kill the ssh master # connection and try again. (Note that either error could come from # the command running in the DUT, in which case the retry may be # useless but, in theory, also harmless.) if ssh_failure_retry_ok: # Ignore ssh command timeout, even though it could be a timeout due # to the command executing in the remote host. Note that passing # ignore_timeout = True makes utils.run() return None on timeouts # (and only on timeouts). original_ignore_timeout = ignore_timeout ignore_timeout = True ssh_failure_retry_count = 2 else: ssh_failure_retry_count = 0 ssh_call_count = 0 while True: try: # Increment call count first, in case utils.run() throws an # exception. ssh_call_count += 1 result = utils.run(full_cmd, timeout, True, stdout, stderr, verbose=False, stdin=stdin, stderr_is_expected=ignore_status, ignore_timeout=ignore_timeout) except Exception as e: # No retries on exception. counters_inc('call', 'exception') counters_inc('run', 'exception') raise e failure_name = None if result: if result.exit_status == 255: if re.search(r'^ssh: .*: Name or service not known', result.stderr): failure_name = 'dns_failure' else: failure_name = 'error_255' elif result.exit_status > 0: failure_name = 'nonzero_status' else: # result == None failure_name = 'timeout' # Record the outcome of the ssh invocation. counters_inc('call', failure_name) if failure_name: # There was a failure: decide whether to retry. if failure_name == 'dns_failure': if dns_error_retry_count > 0: logging.debug('retrying ssh because of DNS failure') dns_error_retry_count -= 1 continue else: if ssh_failure_retry_count == 2: logging.debug('retrying ssh command after %s', failure_name) ssh_failure_retry_count -= 1 continue elif ssh_failure_retry_count == 1: # After two failures, restart the master connection # before the final try. logging.debug('retry 2: restarting master connection') self.restart_master_ssh() # Last retry: reinstate timeout behavior. ignore_timeout = original_ignore_timeout ssh_failure_retry_count -= 1 continue # No retry conditions occurred. Exit the loop. break # The outcomes of ssh invocations have been recorded. Now record # the outcome of this function. if ignore_timeout and not result: counters_inc('run', 'ignored_timeout') return None # The error messages will show up in band (indistinguishable # from stuff sent through the SSH connection), so we have the # remote computer echo the message "Connected." before running # any command. Since the following 2 errors have to do with # connecting, it's safe to do these checks. if result.exit_status == 255: if re.search( r'^ssh: connect to host .* port .*: ' r'Connection timed out\r$', result.stderr): counters_inc('run', 'final_timeout') raise error.AutoservSSHTimeout("ssh timed out", result) if "Permission denied." in result.stderr: msg = "ssh permission denied" counters_inc('run', 'final_eperm') raise error.AutoservSshPermissionDeniedError(msg, result) if not ignore_status and result.exit_status > 0: counters_inc('run', 'final_run_error') raise error.AutoservRunError("command execution error", result) counters_inc('run', failure_name) return result