def kill_process(self, name):
    """
    Stop the registered process whose name matches C{name}.

    NOTE: a killed process will continue to show up as active until
    the process monitor thread has caught that it has died.

    @param name: Process name
    @type  name: str
    @return: True if a process named name was found and its stop()
      method was called (even if stop() raised); False if no such
      process is registered.
    @rtype: bool
    @raise RLException: if name is not a string
    """
    if not isinstance(name, basestring):
        raise RLException(
            "kill_process takes in a process name but was given: %s" % name)

    logger.debug("ProcessMonitor.kill_process[%s]" % name)
    printlog("[%s] kill requested" % name)

    with self.plock:
        target = self.get_process(name)
        if not target:
            return False
        try:
            # errors are not collected here, hence the throwaway list
            target.stop([])
        except:
            # a failing stop() is only logged; the process still counts as killed
            logger.error(traceback.format_exc())
        return True
def kill_process(self, name):
    """
    Kill the process that matches name.

    NOTE: the process remains listed as active until the process
    monitor thread has noticed that it has died.

    @param name: Process name
    @type  name: str
    @return: True if a process named name was found and had its
      stop() method called, False otherwise.
    @rtype: bool
    @raise RLException: if name is not a string
    """
    if not isinstance(name, basestring):
        raise RLException("kill_process takes in a process name but was given: %s"%name)
    logger.debug("ProcessMonitor.kill_process[%s]"%name)
    printlog("[%s] kill requested"%name)

    found = False
    with self.plock:
        proc = self.get_process(name)
        if proc:
            found = True
            try:
                # stop() wants an error accumulator; we discard it
                proc.stop([])
            except:
                # log and carry on: the kill request itself was honoured
                logger.error(traceback.format_exc())
    return found
def _start_child(self, server_node_uri, machine, counter):
    """
    Launch a remote roslaunch child on C{machine} and register it.

    @param server_node_uri: URI passed through to the child process
    @param machine: machine to launch on; its C{address} is used for naming
    @param counter: value appended to the address to build the child name
    @return: the started child process
    @raise RLException: if the child fails to start
    """
    # the config key is too long to display comfortably, so the child
    # is named after the machine address plus the counter
    name = "%s-%s" % (machine.address, counter)

    start_msg = "remote[%s] starting roslaunch"
    self.logger.info(start_msg, name)
    printlog(start_msg % name)

    child = SSHChildROSLaunchProcess(self.run_id, name, server_node_uri, machine)
    started = child.start()
    # registered with the process monitor whether or not start() succeeded
    self.pm.register(child)
    if started:
        self.server.add_child(name, child)
        return child
    # treat as fatal
    raise RLException("unable to start remote roslaunch child: %s" % name)
def test_printlog(self):
    """
    Exercise printlog, printlog_bold and printerrlog with handlers installed.

    Registers printlog_cb and printlog_cb_exception (defined elsewhere in
    this module) as both printlog and printerrlog handlers, then checks
    that the module-level C{_lastmsg} holds the most recent message after
    each call.
    """
    from roslaunch.core import add_printlog_handler, add_printerrlog_handler, printlog, printlog_bold, printerrlog
    add_printlog_handler(printlog_cb)
    add_printlog_handler(printlog_cb_exception)
    add_printerrlog_handler(printlog_cb)
    add_printerrlog_handler(printlog_cb_exception)

    #can't really test functionality, just make sure it doesn't crash
    global _lastmsg
    _lastmsg = None
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12
    printlog('foo')
    self.assertEqual('foo', _lastmsg)
    printlog_bold('bar')
    self.assertEqual('bar', _lastmsg)
    printerrlog('baz')
    self.assertEqual('baz', _lastmsg)
def stop(self, errors=None):
    """
    Terminate this process, including the SSH connection.

    Does nothing if there is no open SSH connection. Otherwise asks the
    remote side to shut down through its API, then closes the SSH streams
    and client and resets the corresponding attributes to None.

    @param errors: accepted for interface compatibility with other
      stop() implementations; it is not read or modified here.
    @type  errors: [str]
    """
    # 'with' acquires before entering the block, so a failed acquire can
    # no longer reach a release() of a lock that was never held
    with self.lock:
        if not self.ssh:
            return

        # call the shutdown API first as closing the SSH connection
        # won't actually kill the process unless it triggers SIGPIPE
        try:
            api = self.getapi()
            if api is not None:
                #TODO: probably need a timeout on this
                api.shutdown()
        except socket.error:
            # normal if process is already dead
            address = self.machine.address
            if not self.is_dead:
                printerrlog("remote[%s]: unable to contact [%s] to shutdown remote processes!" % (self.name, address))
            else:
                printlog("remote[%s]: unable to contact [%s] to shutdown cleanly. The remote roslaunch may have exited already." % (self.name, address))
        except:
            # temporary: don't really want to log here as this
            # may occur during shutdown
            traceback.print_exc()

        _logger.info("remote[%s]: closing ssh connection", self.name)
        self.sshin.close()
        self.sshout.close()
        self.ssherr.close()
        self.ssh.close()

        self.sshin = None
        self.sshout = None
        self.ssherr = None
        self.ssh = None
        _logger.info("remote[%s]: ssh connection closed", self.name)
def _kill_process(p, errors):
    """
    Stop process p, reporting the kill to both the screen and the logfile.

    Any exception raised while logging or stopping is printed and
    logged rather than propagated.

    @param p: process to kill
    @type  p: Process
    @param errors: list that p.stop() may append error messages to
    @type  errors: [str]
    """
    try:
        logger.info("ProcessMonitor exit: killing %s", p.name)
        printlog("[%s] killing on exit" % p.name)
        # the caller's error list is passed on so that messages from
        # all processes can be printed together at the end
        p.stop(errors)
    except:
        # report on stderr first, then in the logfile
        traceback.print_exc()
        logger.error(traceback.format_exc())
def _start_child(self, server_node_uri, machine, counter):
    """
    Launch a remote roslaunch child on C{machine} and register it.

    The child is constructed with this launcher's run id, the server
    node URI, the machine and the master URI from C{self.rosconfig}.

    @param server_node_uri: URI passed through to the child process
    @param machine: machine to launch on; its C{address} is used for naming
    @param counter: value appended to the address to build the child name
    @return: the started child process
    @raise RLException: if the child fails to start
    """
    # the config key is too long to display comfortably, so the child
    # is named after the machine address plus the counter
    name = "%s-%s" % (machine.address, counter)

    start_msg = "remote[%s] starting roslaunch"
    self.logger.info(start_msg, name)
    printlog(start_msg % name)

    child = SSHChildROSLaunchProcess(
        self.run_id, name, server_node_uri, machine,
        self.rosconfig.master.uri)
    started = child.start()
    # registered with the process monitor whether or not start() succeeded
    self.pm.register(child)
    if started:
        self.server.add_child(name, child)
        return child
    # treat as fatal
    raise RLException("unable to start remote roslaunch child: %s" % name)
def stop(self, errors=None):
    """
    Terminate this process, including the SSH connection.

    Returns immediately when no SSH connection is open. Otherwise the
    remote shutdown API is invoked first, after which the SSH streams
    and client are closed and their attributes reset to None.

    @param errors: accepted for interface compatibility with other
      stop() implementations; it is not read or modified here.
    @type  errors: [str]
    """
    # the context manager only releases a lock it actually acquired,
    # unlike acquire() placed inside a try/finally
    with self.lock:
        if not self.ssh:
            return

        # call the shutdown API first as closing the SSH connection
        # won't actually kill the process unless it triggers SIGPIPE
        try:
            api = self.getapi()
            if api is not None:
                #TODO: probably need a timeout on this
                api.shutdown()
        except socket.error:
            # normal if process is already dead
            address = self.machine.address
            if not self.is_dead:
                printerrlog("remote[%s]: unable to contact [%s] to shutdown remote processes!"%(self.name, address))
            else:
                printlog("remote[%s]: unable to contact [%s] to shutdown cleanly. The remote roslaunch may have exited already."%(self.name, address))
        except:
            # temporary: don't really want to log here as this
            # may occur during shutdown
            traceback.print_exc()

        _logger.info("remote[%s]: closing ssh connection", self.name)
        self.sshin.close()
        self.sshout.close()
        self.ssherr.close()
        self.ssh.close()

        self.sshin = None
        self.sshout = None
        self.ssherr = None
        self.ssh = None
        _logger.info("remote[%s]: ssh connection closed", self.name)
def start(self):
    """
    Start the remote process. This will create an SSH connection
    to the remote host.

    @return: True if the SSH command was launched, False if
      C{_ssh_exec} reported a failure (the failure is printed).
    @rtype: bool
    """
    self.started = False #won't set to True until we are finished
    self.ssh = self.sshin = self.sshout = self.ssherr = None
    # 'with' guarantees release only after a successful acquire
    with self.lock:
        name = self.name
        m = self.machine
        if m.user is not None:
            printlog("remote[%s]: creating ssh connection to %s:%s, user[%s]"%(name, m.address, m.ssh_port, m.user))
        else:
            printlog("remote[%s]: creating ssh connection to %s:%s"%(name, m.address, m.ssh_port))
        command = ' '.join(self.args)
        _logger.info("remote[%s]: invoking with ssh exec args [%s], env: %s", name, command, self.env)
        sshvals, msg = self._ssh_exec(command, self.env, m.address, m.ssh_port, m.user, m.password)
        if sshvals is None:
            printerrlog("remote[%s]: failed to launch on %s:\n\n%s\n\n"%(name, m.name, msg))
            return False
        self.ssh, self.sshin, self.sshout, self.ssherr = sshvals
        printlog("remote[%s]: ssh connection created"%name)
        self.started = True
        return True
def start(self):
    """
    Start the remote process by opening an SSH connection to the
    remote host and executing the joined C{self.args} there.

    @return: True if the SSH command was launched, False if
      C{_ssh_exec} reported a failure (the failure is printed).
    @rtype: bool
    """
    # stays False until the connection has been fully set up
    self.started = False
    self.ssh = self.sshin = self.sshout = self.ssherr = None

    with self.lock:
        label = self.name
        host = self.machine

        if host.user is None:
            printlog("remote[%s]: creating ssh connection to %s:%s" %
                     (label, host.address, host.ssh_port))
        else:
            printlog("remote[%s]: creating ssh connection to %s:%s, user[%s]" %
                     (label, host.address, host.ssh_port, host.user))

        remote_cmd = ' '.join(self.args)
        _logger.info("remote[%s]: invoking with ssh exec args [%s]" %
                     (label, remote_cmd))
        sshvals, msg = self._ssh_exec(remote_cmd, host.address,
                                      host.ssh_port, host.user, host.password)

        if sshvals is None:
            printerrlog("remote[%s]: failed to launch on %s:\n\n%s\n\n" %
                        (label, host.name, msg))
            return False

        self.ssh, self.sshin, self.sshout, self.ssherr = sshvals
        printlog("remote[%s]: ssh connection created" % label)
        self.started = True
        return True
def log(self, client, level, message):
    """
    Report a log message to the server.

    Messages at C{Log.ERROR} or above go to printerrlog; everything
    else goes to printlog, except messages containing
    'started with pid', which are printed in bold.

    @param client: name of client
    @type  client: str
    @param level: log level (uses roslib.msg.Log levels)
    @type  level: int
    @param message: message to log
    @type  message: str
    @return: always C{(1, '', 1)}, even if printing failed
    """
    try:
        if level >= Log.ERROR:
            emit = printerrlog
        elif 'started with pid' in message:
            # hack due to the fact that we only have one INFO level
            emit = printlog_bold
        else:
            emit = printlog
        emit("[%s]: %s"%(client, message))
    except:
        # can't trust the logging system at this point, so just dump to screen
        traceback.print_exc()
    return 1, '', 1
def log(self, client, level, message):
    """
    Report a log message to the server.

    Messages at C{Log.ERROR} or above are sent to printerrlog. Lower
    levels are sent to printlog, or to printlog_bold when the message
    contains 'started with pid'.

    @param client: name of client
    @type  client: str
    @param level: log level (uses rosgraph_msgs.msg.Log levels)
    @type  level: int
    @param message: message to log
    @type  message: str
    @return: always C{(1, '', 1)}, even if printing failed
    """
    try:
        if level >= Log.ERROR:
            printer = printerrlog
        elif 'started with pid' in message:
            # hack due to the fact that we only have one INFO level
            printer = printlog_bold
        else:
            printer = printlog
        printer("[%s]: %s" % (client, message))
    except:
        # can't trust the logging system at this point, so just dump to screen
        traceback.print_exc()
    return 1, '', 1
def _ssh_exec(self, command, address, port, username=None, password=None):
    """
    Connect to a remote host over SSH and execute C{command} there.

    If C{self.master_uri} is set, the command is prefixed with an
    C{env} assignment of the ROS master URI. Host name, user and
    identity file are looked up in the user's ~/.ssh/config when
    available; an explicit C{username} argument takes precedence.

    :param command: command line to execute remotely
    :param address: host name or address to connect to
    :param port: SSH port
    :param username: login name, or None to use the ssh config value
    :param password: password; if empty/None, key-based login is used
    :returns: (ssh pipes, message).  If error occurs, returns (None, error message).
    """
    # local import: only needed to classify connection errors below
    import errno

    if self.master_uri:
        env_command = 'env %s=%s' % (rosgraph.ROS_MASTER_URI, self.master_uri)
        command = '%s %s' % (env_command, command)
    try:
        # imported only to verify that pycrypto is available
        import Crypto
    except ImportError:
        _logger.error("cannot use SSH: pycrypto is not installed")
        return None, "pycrypto is not installed"
    try:
        import paramiko
    except ImportError:
        _logger.error("cannot use SSH: paramiko is not installed")
        return None, "paramiko is not installed"

    #load user's ssh configuration
    config_block = {'hostname': None, 'user': None, 'identityfile': None}
    ssh_config = paramiko.SSHConfig()
    try:
        with open(os.path.join(os.path.expanduser('~'), '.ssh','config')) as f:
            ssh_config.parse(f)
            config_block.update(ssh_config.lookup(address))
    except Exception:
        # best effort: a missing or unreadable config is not an error,
        # but leave a trace instead of swallowing it silently
        _logger.debug("could not load ~/.ssh/config: %s", traceback.format_exc())
    address = config_block['hostname'] or address
    username = username or config_block['user']
    identity_file = None
    if config_block.get('identityfile', None):
        if isinstance(config_block['identityfile'], list):
            identity_file = [os.path.expanduser(path) for path in config_block['identityfile']]
        else:
            identity_file = os.path.expanduser(config_block['identityfile'])

    #load ssh client and connect
    ssh = paramiko.SSHClient()
    err_msg = ssh_check_known_hosts(ssh, address, port, username=username, logger=_logger)

    if not err_msg:
        username_str = '%s@'%username if username else ''
        try:
            if not password: #use SSH agent
                ssh.connect(address, port, username, timeout=TIMEOUT_SSH_CONNECT, key_filename=identity_file)
            else: #use SSH with login/pass
                ssh.connect(address, port, username, password, timeout=TIMEOUT_SSH_CONNECT)
        except paramiko.BadHostKeyException:
            _logger.error(traceback.format_exc())
            err_msg = "Unable to verify host key for remote computer[%s:%s]"%(address, port)
        except paramiko.AuthenticationException:
            _logger.error(traceback.format_exc())
            err_msg = "Authentication to remote computer[%s%s:%s] failed.\nA common cause of this error is a missing key in your authorized_keys file."%(username_str, address, port)
        except paramiko.SSHException as e:
            _logger.error(traceback.format_exc())
            err_msg = "Unable to establish ssh connection to [%s%s:%s]: %s"%(username_str, address, port, e)
        except socket.error as e:
            # #1824
            # use the errno attribute: indexing the exception (e[0]) fails on
            # Python 3, and the numeric value of ECONNREFUSED is platform-specific
            if getattr(e, 'errno', None) == errno.ECONNREFUSED:
                err_msg = "network connection refused by [%s:%s]"%(address, port)
            else:
                err_msg = "network error connecting to [%s:%s]: %s"%(address, port, str(e))
    if err_msg:
        return None, err_msg
    else:
        printlog("launching remote roslaunch child with command: [%s]"%(str(command)))
        sshin, sshout, ssherr = ssh.exec_command(command)
        return (ssh, sshin, sshout, ssherr), "executed remotely"
def _run(self): """ Internal run loop of ProcessMonitor """ plock = self.plock dead = [] respawn = [] while not self._registrations_complete: logger.info("mirko hack") time.sleep(0.1) #yield thread while not self.is_shutdown: with plock: #copy self.procs procs = self.procs[:] if self.is_shutdown: break # check current signal handlers to see if children have stolen them away # TODO: this code may not be necessary anymore (have to test) for s in _signal_list: if signal.getsignal(s) != rl_signal: self.reacquire_signals.add(s) for p in procs: try: if not p.is_alive(): logger.debug( "Process[%s] has died, respawn=%s, required=%s, exit_code=%s", p.name, "True(%f)" % p.respawn_delay if p.respawn else p.respawn, p.required, p.exit_code) exit_code_str = p.get_exit_description() if p.required: printerrlog( '=' * 80 + "REQUIRED process [%s] has died!\n%s\nInitiating shutdown!\n" % (p.name, exit_code_str) + '=' * 80) self.is_shutdown = True elif not p in respawn: if p.exit_code: printerrlog("[%s] %s" % (p.name, exit_code_str)) else: printlog_bold("[%s] %s" % (p.name, exit_code_str)) dead.append(p) ## no need for lock as we require listeners be ## added before process monitor is launched for l in self.listeners: l.process_died(p.name, p.exit_code) except Exception as e: traceback.print_exc() #don't respawn as this is an internal error dead.append(p) if self.is_shutdown: break #stop polling for d in dead: try: # when should_respawn() returns 0.0, bool(0.0) evaluates to False # work around this by checking if the return value is False if d.should_respawn() is not False: respawn.append(d) else: self.unregister(d) # stop process, don't accumulate errors d.stop([]) # save process data to dead list with plock: self.dead_list.append(DeadProcess(d)) except: logger.error(traceback.format_exc()) # dead check is to make sure that ProcessMonitor at least # waits until its had at least one process before exiting if self._registrations_complete and dead and not self.procs and not respawn: 
printlog( "all processes on machine have died, roslaunch will exit") self.is_shutdown = True del dead[:] _respawn = [] for r in respawn: try: if self.is_shutdown: break if r.should_respawn() <= 0.0: printlog("[%s] restarting process" % r.name) # stop process, don't accumulate errors r.stop([]) r.start() else: # not ready yet, keep it around _respawn.append(r) except: traceback.print_exc() logger.error("Restart failed %s", traceback.format_exc()) respawn = _respawn time.sleep(0.1) #yield thread
def _run(self): """ Internal run loop of ProcessMonitor """ plock = self.plock dead = [] respawn = [] while not self.is_shutdown: with plock: # copy self.procs procs = self.procs[:] if self.is_shutdown: break # check current signal handlers to see if children have stolen them away # TODO: this code may not be necessary anymore (have to test) for s in _signal_list: if signal.getsignal(s) != rl_signal: self.reacquire_signals.add(s) for p in procs: try: if not p.is_alive(): logger.debug( "Process[%s] has died, respawn=%s, required=%s, exit_code=%s", p.name, "True(%f)" % p.respawn_delay if p.respawn else p.respawn, p.required, p.exit_code, ) exit_code_str = p.get_exit_description() if p.required: printerrlog( "=" * 80 + "REQUIRED process [%s] has died!\n%s\nInitiating shutdown!\n" % (p.name, exit_code_str) + "=" * 80 ) self.is_shutdown = True elif not p in respawn: if p.exit_code: printerrlog("[%s] %s" % (p.name, exit_code_str)) else: printlog_bold("[%s] %s" % (p.name, exit_code_str)) dead.append(p) ## no need for lock as we require listeners be ## added before process monitor is launched for l in self.listeners: l.process_died(p.name, p.exit_code) except Exception as e: traceback.print_exc() # don't respawn as this is an internal error dead.append(p) if self.is_shutdown: break # stop polling for d in dead: try: if d.should_respawn(): respawn.append(d) else: self.unregister(d) # stop process, don't accumulate errors d.stop([]) # save process data to dead list with plock: self.dead_list.append(DeadProcess(d)) except: logger.error(traceback.format_exc()) # dead check is to make sure that ProcessMonitor at least # waits until its had at least one process before exiting if self._registrations_complete and dead and not self.procs and not respawn: printlog("all processes on machine have died, roslaunch will exit") self.is_shutdown = True del dead[:] _respawn = [] for r in respawn: try: if self.is_shutdown: break if r.should_respawn() <= 0.0: printlog("[%s] restarting 
process" % r.name) # stop process, don't accumulate errors r.stop([]) r.start() else: # not ready yet, keep it around _respawn.append(r) except: traceback.print_exc() logger.error("Restart failed %s", traceback.format_exc()) respawn = _respawn time.sleep(0.1) # yield thread
def _ssh_exec(self, command, address, port, username=None, password=None):
    """
    Connect to a remote host over SSH and execute C{command} there.

    If C{self.master_uri} is set, the command is prefixed with an
    C{env} assignment of the ROS master URI. Host name, user and
    identity file are looked up in the user's ~/.ssh/config when
    available; an explicit C{username} argument takes precedence.

    :param command: command line to execute remotely
    :param address: host name or address to connect to
    :param port: SSH port
    :param username: login name, or None to use the ssh config value
    :param password: password; if None, key-based login is used
    :returns: (ssh pipes, message).  If error occurs, returns (None, error message).
    """
    # local import: only needed to classify connection errors below
    import errno

    if self.master_uri:
        env_command = 'env %s=%s' % (rosgraph.ROS_MASTER_URI, self.master_uri)
        command = '%s %s' % (env_command, command)
    try:
        # imported only to verify that pycrypto is available
        import Crypto
    except ImportError:
        _logger.error("cannot use SSH: pycrypto is not installed")
        return None, "pycrypto is not installed"
    try:
        import paramiko
    except ImportError:
        _logger.error("cannot use SSH: paramiko is not installed")
        return None, "paramiko is not installed"

    #load user's ssh configuration
    config_block = {'hostname': None, 'user': None, 'identityfile': None}
    ssh_config = paramiko.SSHConfig()
    try:
        with open(os.path.join(os.path.expanduser('~'), '.ssh', 'config')) as f:
            ssh_config.parse(f)
            config_block.update(ssh_config.lookup(address))
    except Exception:
        # best effort: a missing or unreadable config is not an error,
        # but leave a trace instead of swallowing it silently
        _logger.debug("could not load ~/.ssh/config: %s", traceback.format_exc())
    address = config_block['hostname'] or address
    username = username or config_block['user']
    identity_file = None
    if config_block.get('identityfile', None):
        if isinstance(config_block['identityfile'], list):
            identity_file = [
                os.path.expanduser(path)
                for path in config_block['identityfile']
            ]
        else:
            identity_file = os.path.expanduser(
                config_block['identityfile'])

    #load ssh client and connect
    ssh = paramiko.SSHClient()
    err_msg = ssh_check_known_hosts(ssh, address, port, username=username, logger=_logger)

    if not err_msg:
        username_str = '%s@' % username if username else ''
        try:
            if password is None:  #use SSH agent
                ssh.connect(address, port, username, timeout=TIMEOUT_SSH_CONNECT, key_filename=identity_file)
            else:  #use SSH with login/pass
                ssh.connect(address, port, username, password, timeout=TIMEOUT_SSH_CONNECT)
        except paramiko.BadHostKeyException:
            _logger.error(traceback.format_exc())
            err_msg = "Unable to verify host key for remote computer[%s:%s]" % (
                address, port)
        except paramiko.AuthenticationException:
            _logger.error(traceback.format_exc())
            err_msg = "Authentication to remote computer[%s%s:%s] failed.\nA common cause of this error is a missing key in your authorized_keys file." % (
                username_str, address, port)
        except paramiko.SSHException as e:
            _logger.error(traceback.format_exc())
            err_msg = "Unable to establish ssh connection to [%s%s:%s]: %s" % (
                username_str, address, port, e)
        except socket.error as e:
            # #1824
            # use the errno attribute: indexing the exception (e[0]) fails on
            # Python 3, and the numeric value of ECONNREFUSED is platform-specific
            if getattr(e, 'errno', None) == errno.ECONNREFUSED:
                err_msg = "network connection refused by [%s:%s]" % (
                    address, port)
            else:
                err_msg = "network error connecting to [%s:%s]: %s" % (
                    address, port, str(e))
    if err_msg:
        return None, err_msg
    else:
        printlog("launching remote roslaunch child with command: [%s]" %
                 (str(command)))
        sshin, sshout, ssherr = ssh.exec_command(command)
        return (ssh, sshin, sshout, ssherr), "executed remotely"
def launch_main(argv=sys.argv, real_args=None, fn_to_call=None):
    """
    Command-line entry point: parse roslaunch options and run in the
    requested mode.

    Depending on the parsed options this either prints node/argument
    information and returns, runs as a roslaunch child, or runs as a
    roslaunch parent (server). In parent mode the parent is started and
    then either spun, or C{fn_to_call(real_args)} is invoked instead.

    @param argv: full argument vector; C{argv[1:]} is handed to the
      option parser. NOTE: the default is bound to C{sys.argv} at
      definition time.
    @param real_args: value passed to C{fn_to_call}; unused otherwise
    @param fn_to_call: optional callable run in place of C{p.spin()}
      once the parent has been started
    @return: None. Calls C{sys.exit(1)} if an exception escapes the
      launch sequence.
    """
    options = None
    logger = None
    try:
        from roslaunch import rlutil
        parser = roslaunch._get_optparse()

        (options, args) = parser.parse_args(argv[1:])
        args = rlutil.resolve_launch_arguments(args)
        roslaunch._validate_args(parser, options, args)

        # node args doesn't require any roslaunch infrastructure, so process it first
        if any([
                options.node_args, options.node_list, options.find_node,
                options.dump_params, options.file_list, options.ros_args
        ]):
            if options.node_args and not args:
                parser.error("please specify a launch file")

            from roslaunch import node_args
            if options.node_args:
                node_args.print_node_args(options.node_args, args)
            elif options.find_node:
                node_args.print_node_filename(options.find_node, args)
            # Dump parameters, #2685
            elif options.dump_params:
                roslaunch_param_dump.dump_params(args)
            elif options.file_list:
                rlutil.print_file_list(args)
            elif options.ros_args:
                import arg_dump as roslaunch_arg_dump
                roslaunch_arg_dump.dump_args(args)
            else:
                node_args.print_node_list(args)
            # informational modes never launch anything
            return

        # we have to wait for the master here because we don't have the run_id yet
        if options.wait_for_master:
            if options.core:
                parser.error("--wait cannot be used with roscore")
            rlutil._wait_for_master()

        # write the pid to a file
        roslaunch.write_pid_file(options.pid_fn, options.core, options.port)

        # spin up the logging infrastructure. have to wait until we can read options.run_id
        uuid = rlutil.get_or_generate_uuid(options.run_id,
                                           options.wait_for_master)
        roslaunch.configure_logging(uuid)

        # #3088: don't check disk usage on remote machines
        if not options.child_name and not options.skip_log_check:
            # #2761
            rlutil.check_log_disk_usage()

        logger = logging.getLogger('roslaunch')
        logger.info("roslaunch starting with args %s" % str(argv))
        logger.info("roslaunch env is %s" % os.environ)

        if options.child_name:
            logger.info('starting in child mode')

            # This is a roslaunch child, spin up client server.
            # client spins up an XML-RPC server that waits for
            # commands and configuration from the server.
            from roslaunch import child as roslaunch_child
            c = roslaunch_child.ROSLaunchChild(uuid, options.child_name,
                                               options.server_uri)
            c.run()
        else:
            logger.info('starting in server mode')

            # #1491 change terminal name
            if not options.disable_title:
                rlutil.change_terminal_name(args, options.core)

            # Read roslaunch string from stdin when - is passed as launch filename.
            roslaunch_strs = []
            if '-' in args:
                roslaunch_core.printlog(
                    "Passed '-' as file argument, attempting to read roslaunch XML from stdin."
                )
                roslaunch_strs.append(sys.stdin.read())
                roslaunch_core.printlog("... %d bytes read successfully.\n" %
                                        len(roslaunch_strs[-1]))
                # '-' is not a real file, so drop it before loading launch files
                args.remove('-')

            # This is a roslaunch parent, spin up parent server and launch processes.
            # args are the roslaunch files to load
            from roslaunch import parent as roslaunch_parent
            try:
                # force a port binding spec if we are running a core
                if options.core:
                    options.port = options.port or DEFAULT_MASTER_PORT
                p = roslaunch_parent.ROSLaunchParent(
                    uuid,
                    args,
                    roslaunch_strs=roslaunch_strs,
                    is_core=options.core,
                    port=options.port,
                    local_only=options.local_only,
                    verbose=options.verbose,
                    force_screen=options.force_screen,
                    num_workers=options.num_workers,
                    timeout=options.timeout)
                p.start()
                # either block in the parent's spin loop or hand control
                # to the caller-supplied function
                if fn_to_call is None:
                    p.spin()
                else:
                    fn_to_call(real_args)
            finally:
                # remove the pid file
                if options.pid_fn:
                    try:
                        os.unlink(options.pid_fn)
                    except os.error:
                        # best effort: failure to remove the pid file is ignored
                        pass

    except RLException as e:
        roslaunch_core.printerrlog(str(e))
        roslaunch_core.printerrlog(
            'The traceback for the exception was written to the log file')
        # logger is still None if the failure happened before logging was configured
        if logger:
            logger.error(traceback.format_exc())
        sys.exit(1)
    except ValueError as e:
        # TODO: need to trap better than this high-level trap
        roslaunch_core.printerrlog(str(e))
        roslaunch_core.printerrlog(
            'The traceback for the exception was written to the log file')
        if logger:
            logger.error(traceback.format_exc())
        sys.exit(1)
    except Exception as e:
        # anything unexpected: dump the traceback to the screen and exit
        traceback.print_exc()
        sys.exit(1)