Beispiel #1
0
    def _init_runner(self):
        """
        Initialize the roslaunch runner
        """
        if self.config is None:
            raise RLException("config is not initialized")
        if self.pm is None:
            raise RLException("pm is not initialized")
        if self.server is None:
            raise RLException("server is not initialized")
        self.runner = roslaunch.launch.ROSLaunchRunner(
            self.run_id,
            self.config,
            server_uri=self.server.uri,
            pmon=self.pm,
            is_core=self.is_core,
            remote_runner=self.remote_runner,
            is_rostest=self.is_rostest)

        # print runner info to user, put errors last to make the more visible
        if self.is_core:
            print("ros_comm version %s" %
                  (self.config.params['/rosversion'].value))

        print(self.config.summary(local=self.remote_runner is None))
        if self.config:
            for err in self.config.config_errors:
                printerrlog("WARNING: %s" % err)
Beispiel #2
0
 def start(self):
     """
     Start the remote process. This will create an SSH connection
     to the remote host.
     """
     self.started = False  #won't set to True until we are finished
     self.ssh = self.sshin = self.sshout = self.ssherr = None
     with self.lock:
         name = self.name
         m = self.machine
         if m.user is not None:
             printlog(
                 "remote[%s]: creating ssh connection to %s:%s, user[%s]" %
                 (name, m.address, m.ssh_port, m.user))
         else:
             printlog("remote[%s]: creating ssh connection to %s:%s" %
                      (name, m.address, m.ssh_port))
         _logger.info("remote[%s]: invoking with ssh exec args [%s]" %
                      (name, ' '.join(self.args)))
         sshvals, msg = self._ssh_exec(' '.join(self.args), m.address,
                                       m.ssh_port, m.user, m.password)
         if sshvals is None:
             printerrlog("remote[%s]: failed to launch on %s:\n\n%s\n\n" %
                         (name, m.name, msg))
             return False
         self.ssh, self.sshin, self.sshout, self.ssherr = sshvals
         printlog("remote[%s]: ssh connection created" % name)
         self.started = True
         return True
Beispiel #3
0
 def start(self):
     """
     Start the remote process. This will create an SSH connection
     to the remote host.
     """
     self.started = False #won't set to True until we are finished
     self.ssh = self.sshin = self.sshout = self.ssherr = None        
     try:
         self.lock.acquire()
         name = self.name
         m = self.machine
         if m.user is not None:
             printlog("remote[%s]: creating ssh connection to %s:%s, user[%s]"%(name, m.address, m.ssh_port, m.user))
         else:
             printlog("remote[%s]: creating ssh connection to %s:%s"%(name, m.address, m.ssh_port))
         _logger.info("remote[%s]: invoking with ssh exec args [%s], env: %s"%(name, ' '.join(self.args), self.env))
         sshvals, msg = self._ssh_exec(' '.join(self.args), self.env, m.address, m.ssh_port, m.user, m.password)
         if sshvals is None:
             printerrlog("remote[%s]: failed to launch on %s:\n\n%s\n\n"%(name, m.name, msg))
             return False
         self.ssh, self.sshin, self.sshout, self.ssherr = sshvals
         printlog("remote[%s]: ssh connection created"%name)
         self.started = True            
         return True
     finally:
         self.lock.release()
Beispiel #4
0
    def _init_runner(self):
        """
        Initialize the roslaunch runner
        """
        if self.config is None:
            raise RLException("config is not initialized")
        if self.pm is None:
            raise RLException("pm is not initialized")
        if self.server is None:
            raise RLException("server is not initialized")
        self.runner = roslaunch.launch.ROSLaunchRunner(
            self.run_id,
            self.config,
            server_uri=self.server.uri,
            pmon=self.pm,
            is_core=self.is_core,
            remote_runner=self.remote_runner,
            is_rostest=self.is_rostest,
        )

        # print runner info to user, put errors last to make the more visible
        if self.is_core:
            print "ros_comm version %s" % (self.config.params["/rosversion"].value)

        print self.config.summary(local=self.remote_runner is None)
        if self.config:
            for err in self.config.config_errors:
                printerrlog("WARNING: %s" % err)
Beispiel #5
0
    def _post_run(self):
        logger.info("ProcessMonitor._post_run %s" % self)
        # this is already true entering, but go ahead and make sure
        self.is_shutdown = True
        # killall processes on run exit

        q = Queue()
        q.join()

        with self.plock:
            # make copy of core_procs for threadsafe usage
            core_procs = self.core_procs[:]
            logger.info("ProcessMonitor._post_run %s: remaining procs are %s" %
                        (self, self.procs))

            # enqueue all non-core procs in reverse order for parallel kill
            # #526/885: ignore core procs
            [q.put(p) for p in reversed(self.procs) if not p in core_procs]

        # use 10 workers
        killers = []
        for i in range(10):
            t = _ProcessKiller(q, i)
            killers.append(t)
            t.start()

        # wait for workers to finish
        q.join()
        shutdown_errors = []

        # accumulate all the shutdown errors
        for t in killers:
            shutdown_errors.extend(t.errors)
        del killers[:]

        # #526/885: kill core procs last
        # we don't want to parallelize this as the master has to be last
        for p in reversed(core_procs):
            _kill_process(p, shutdown_errors)

        # delete everything except dead_list
        logger.info(
            "ProcessMonitor exit: cleaning up data structures and signals")
        with self.plock:
            del core_procs[:]
            del self.procs[:]
            del self.core_procs[:]

        reacquire_signals = self.reacquire_signals
        if reacquire_signals:
            reacquire_signals.clear()
        logger.info("ProcessMonitor exit: pmon has shutdown")
        self.done = True

        if shutdown_errors:
            printerrlog("Shutdown errors:\n" +
                        '\n'.join([" * %s" % e for e in shutdown_errors]))
Beispiel #6
0
    def _assume_failed(self, nodes, failed):
        """
        Utility routine for logging/recording nodes that failed

        :param nodes: list of nodes that are assumed to have failed, ``Node``
        :param failed: list of names of nodes that have failed to extend, ``[str]``
        """
        str_nodes = ["%s/%s"%(n.package, n.type) for n in nodes]
        failed.extend(str_nodes)
        printerrlog("Launch of the following nodes most likely failed: %s"%', '.join(str_nodes))
Beispiel #7
0
    def _assume_failed(self, nodes, failed):
        """
        Utility routine for logging/recording nodes that failed

        :param nodes: list of nodes that are assumed to have failed, ``Node``
        :param failed: list of names of nodes that have failed to extend, ``[str]``
        """
        str_nodes = ["%s/%s" % (n.package, n.type) for n in nodes]
        failed.extend(str_nodes)
        printerrlog("Launch of the following nodes most likely failed: %s" %
                    ', '.join(str_nodes))
Beispiel #8
0
    def _post_run(self):
        logger.info("ProcessMonitor._post_run %s" % self)
        # this is already true entering, but go ahead and make sure
        self.is_shutdown = True
        # killall processes on run exit

        q = Queue()
        q.join()

        with self.plock:
            # make copy of core_procs for threadsafe usage
            core_procs = self.core_procs[:]
            logger.info("ProcessMonitor._post_run %s: remaining procs are %s" % (self, self.procs))

            # enqueue all non-core procs in reverse order for parallel kill
            # #526/885: ignore core procs
            [q.put(p) for p in reversed(self.procs) if not p in core_procs]

        # use 10 workers
        killers = []
        for i in range(10):
            t = _ProcessKiller(q, i)
            killers.append(t)
            t.start()

        # wait for workers to finish
        q.join()
        shutdown_errors = []

        # accumulate all the shutdown errors
        for t in killers:
            shutdown_errors.extend(t.errors)
        del killers[:]

        # #526/885: kill core procs last
        # we don't want to parallelize this as the master has to be last
        for p in reversed(core_procs):
            _kill_process(p, shutdown_errors)

        # delete everything except dead_list
        logger.info("ProcessMonitor exit: cleaning up data structures and signals")
        with self.plock:
            del core_procs[:]
            del self.procs[:]
            del self.core_procs[:]

        reacquire_signals = self.reacquire_signals
        if reacquire_signals:
            reacquire_signals.clear()
        logger.info("ProcessMonitor exit: pmon has shutdown")
        self.done = True

        if shutdown_errors:
            printerrlog("Shutdown errors:\n" + "\n".join([" * %s" % e for e in shutdown_errors]))
Beispiel #9
0
    def relaunch_nodes(self):
        succeeded = []
        failed = []
        api = self.getapi()
        if self.nodes_xml is not None:
            try:
                _logger.debug("sending [%s] XML [\n%s\n]" %
                              (self.uri, self.nodes_xml))
                code, msg, val = api.launch(self.nodes_xml)
                if code == 1:
                    c_succ, c_fail = val
                    succeeded.extend(c_succ)
                    failed.extend(c_fail)
                else:
                    printerrlog('error launching on [%s, uri %s]: %s' %
                                (self.name, self.uri, msg))
            except socket.error as e:
                errno, msg = e
                printerrlog('error launching on [%s, uri %s]: %s' %
                            (self.name, self.uri, str(msg)))
            except socket.gaierror as e:
                errno, msg = e
                # usually errno == -2. See #815.
                child_host, _ = network.parse_http_host_and_port(self.uri)
                printerrlog(
                    "Unable to contact remote roslaunch at [%s]. This is most likely due to a network misconfiguration with host lookups. Please make sure that you can contact '%s' from this machine"
                    % (child.uri, child_host))

            except Exception as e:
                printerrlog('error launching on [%s, uri %s]: %s' %
                            (self.name, self.uri, str(e)))
        else:
            _logger.error("xml provided is not defined")
 def test_printlog(self):
     from roslaunch.core import add_printlog_handler, add_printerrlog_handler, printlog, printlog_bold, printerrlog
     add_printlog_handler(printlog_cb)
     add_printlog_handler(printlog_cb_exception)        
     add_printerrlog_handler(printlog_cb)
     add_printerrlog_handler(printlog_cb_exception)        
         
     #can't really test functionality, just make sure it doesn't crash
     global _lastmsg
     _lastmsg = None
     printlog('foo')
     self.assertEquals('foo', _lastmsg)
     printlog_bold('bar')
     self.assertEquals('bar', _lastmsg)        
     printerrlog('baz')
     self.assertEquals('baz', _lastmsg)        
Beispiel #11
0
 def test_printlog(self):
     from roslaunch.core import add_printlog_handler, add_printerrlog_handler, printlog, printlog_bold, printerrlog
     add_printlog_handler(printlog_cb)
     add_printlog_handler(printlog_cb_exception)        
     add_printerrlog_handler(printlog_cb)
     add_printerrlog_handler(printlog_cb_exception)        
         
     #can't really test functionality, just make sure it doesn't crash
     global _lastmsg
     _lastmsg = None
     printlog('foo')
     self.assertEquals('foo', _lastmsg)
     printlog_bold('bar')
     self.assertEquals('bar', _lastmsg)        
     printerrlog('baz')
     self.assertEquals('baz', _lastmsg)        
Beispiel #12
0
    def stop(self, errors=None):
        """
        Terminate this process, including the SSH connection.
        """
        if errors is None:
            errors = []
        try:
            self.lock.acquire()
            if not self.ssh:
                return

            # call the shutdown API first as closing the SSH connection
            # won't actually kill the process unless it triggers SIGPIPE
            try:
                api = self.getapi()
                if api is not None:
                    #TODO: probably need a timeout on this
                    api.shutdown()
            except socket.error:
                # normal if process is already dead
                address, port = self.machine.address, self.machine.ssh_port
                if not self.is_dead:
                    printerrlog(
                        "remote[%s]: unable to contact [%s] to shutdown remote processes!"
                        % (self.name, address))
                else:
                    printlog(
                        "remote[%s]: unable to contact [%s] to shutdown cleanly. The remote roslaunch may have exited already."
                        % (self.name, address))
            except:
                # temporary: don't really want to log here as this
                # may occur during shutdown
                traceback.print_exc()

            _logger.info("remote[%s]: closing ssh connection", self.name)
            self.sshin.close()
            self.sshout.close()
            self.ssherr.close()
            self.ssh.close()

            self.sshin = None
            self.sshout = None
            self.ssherr = None
            self.ssh = None
            _logger.info("remote[%s]: ssh connection closed", self.name)
        finally:
            self.lock.release()
Beispiel #13
0
    def is_alive(self):
        """
        @return: True if the process is alive. is_alive needs to be
        called periodically as it drains the SSH buffer
        @rtype: bool
        """
        if self.started and not self.ssh:
            return False
        elif not self.started:
            return True #not started is equivalent to alive in our logic
        s = self.ssherr
        s.channel.settimeout(0)
        try:
            #drain the pipes
            data = s.read(2048)
            if not len(data):
                self.is_dead = True
                return False
            # #2012 il8n: ssh *should* be UTF-8, but often isn't
            # (e.g. Japan)
            data = data.decode('utf-8')
            printerrlog("remote[%s]: %s"%(self.name, data))
        except socket.timeout:
            pass
        except IOError:
            return False
        except UnicodeDecodeError:
            # #2012: soft fail, printing is not essential. This occurs
            # with older machines that don't send utf-8 over ssh
            pass

        s = self.sshout
        s.channel.settimeout(0)
        try:
            #drain the pipes
            #TODO: write to log file
            data = s.read(2048)
            if not len(data):
                self.is_dead = True
                return False
            # data = data.decode('utf-8')
            #print "DATA", data
        except socket.timeout:
            pass
        except IOError:
            return False
        return True
Beispiel #14
0
    def is_alive(self):
        """
        @return: True if the process is alive. is_alive needs to be
        called periodically as it drains the SSH buffer
        @rtype: bool
        """
        if self.started and not self.ssh:
            return False
        elif not self.started:
            return True #not started is equivalent to alive in our logic
        s = self.ssherr
        s.channel.settimeout(0)
        try:
            #drain the pipes
            data = s.read(2048)
            if not len(data):
                self.is_dead = True
                return False
            # #2012 il8n: ssh *should* be UTF-8, but often isn't
            # (e.g. Japan)
            data = data.decode('utf-8')
            printerrlog("remote[%s]: %s"%(self.name, data))
        except socket.timeout:
            pass
        except IOError:
            return False
        except UnicodeDecodeError:
            # #2012: soft fail, printing is not essential. This occurs
            # with older machines that don't send utf-8 over ssh
            pass

        s = self.sshout
        s.channel.settimeout(0)
        try:
            #drain the pipes
            #TODO: write to log file
            data = s.read(2048)
            if not len(data):
                self.is_dead = True
                return False
            # data = data.decode('utf-8')
            #print "DATA", data
        except socket.timeout:
            pass
        except IOError:
            return False
        return True
Beispiel #15
0
    def launch_remote_nodes(self):
        """
        Contact each child to launch remote nodes
        """
        succeeded = []
        failed = []
        
        # initialize remote_nodes. we use the machine config key as
        # the key for the dictionary so that we can bin the nodes.
        self.remote_nodes = {}
        for m in self.machine_list:
            self.remote_nodes[m.config_key()] = []
            
        # build list of nodes that will be launched by machine
        nodes = [x for x in self.rosconfig.nodes if not is_machine_local(x.machine)]
        for n in nodes:
            self.remote_nodes[n.machine.config_key()].append(n)
            
        for child in self.remote_processes:
            nodes = self.remote_nodes[child.machine.config_key()]
            body = '\n'.join([n.to_remote_xml() for n in nodes])
            # #3799: force utf-8 encoding 
            xml = '<?xml version="1.0" encoding="utf-8"?>\n<launch>\n%s</launch>'%body 
                
            api = child.getapi()
            # TODO: timeouts
            try:
                self.logger.debug("sending [%s] XML [\n%s\n]"%(child.uri, xml))
                code, msg, val = api.launch(xml)
                if code == 1:
                    c_succ, c_fail = val
                    succeeded.extend(c_succ)
                    failed.extend(c_fail)
                else:
                    printerrlog('error launching on [%s, uri %s]: %s'%(child.name, child.uri, msg))
                    self._assume_failed(nodes, failed)
            except socket.error as e:
                errno, msg = e
                printerrlog('error launching on [%s, uri %s]: %s'%(child.name, child.uri, str(msg)))
                self._assume_failed(nodes, failed)

            except socket.gaierror as e:
                errno, msg = e
                # usually errno == -2. See #815. 
                child_host, _ = network.parse_http_host_and_port(child.uri)
                printerrlog("Unable to contact remote roslaunch at [%s]. This is most likely due to a network misconfiguration with host lookups. Please make sure that you can contact '%s' from this machine"%(child.uri, child_host))
                self._assume_failed(nodes, failed)

            except Exception as e:
                printerrlog('error launching on [%s, uri %s]: %s'%(child.name, child.uri, str(e)))
                self._assume_failed(nodes, failed)

        return succeeded, failed
Beispiel #16
0
    def stop(self, errors=None):
        """
        Terminate this process, including the SSH connection.
        """
        if errors is None:
            errors = []
        try:
            self.lock.acquire()            
            if not self.ssh:
                return

            # call the shutdown API first as closing the SSH connection
            # won't actually kill the process unless it triggers SIGPIPE
            try:
                api = self.getapi()
                if api is not None:
                    #TODO: probably need a timeout on this
                    api.shutdown()
            except socket.error:
                # normal if process is already dead
                address, port = self.machine.address, self.machine.ssh_port
                if not self.is_dead:
                    printerrlog("remote[%s]: unable to contact [%s] to shutdown remote processes!"%(self.name, address))
                else:
                    printlog("remote[%s]: unable to contact [%s] to shutdown cleanly. The remote roslaunch may have exited already."%(self.name, address))
            except:
                # temporary: don't really want to log here as this 
                # may occur during shutdown
                traceback.print_exc()

            _logger.info("remote[%s]: closing ssh connection", self.name)
            self.sshin.close()
            self.sshout.close()
            self.ssherr.close()                        
            self.ssh.close()

            self.sshin  = None
            self.sshout = None
            self.ssherr = None            
            self.ssh = None
            _logger.info("remote[%s]: ssh connection closed", self.name)            
        finally:
            self.lock.release()
Beispiel #17
0
 def log(self, client, level, message):
     """
     Report a log message to the server
     @param client: name of client
     @type  client: str
     @param level: log level (uses rosgraph_msgs.msg.Log levels)
     @type  level: int
     @param message: message to log
     @type  message: str
     """
     try:
         if level >= Log.ERROR:
             printerrlog("[%s]: %s" % (client, message))
         else:
             #hack due to the fact that we only have one INFO level
             if 'started with pid' in message:
                 printlog_bold("[%s]: %s" % (client, message))
             else:
                 printlog("[%s]: %s" % (client, message))
     except:
         # can't trust the logging system at this point, so just dump to screen
         traceback.print_exc()
     return 1, '', 1
Beispiel #18
0
 def log(self, client, level, message):
     """
     Report a log message to the server
     @param client: name of client
     @type  client: str
     @param level: log level (uses roslib.msg.Log levels)
     @type  level: int
     @param message: message to log
     @type  message: str
     """        
     try:
         if level >= Log.ERROR:
             printerrlog("[%s]: %s"%(client, message))
         else:
             #hack due to the fact that we only have one INFO level
             if 'started with pid' in message:
                 printlog_bold("[%s]: %s"%(client, message))
             else:
                 printlog("[%s]: %s"%(client, message))
     except:
         # can't trust the logging system at this point, so just dump to screen
         traceback.print_exc()
     return 1, '', 1
Beispiel #19
0
    def launch_remote_nodes(self):
        """
        Contact each child to launch remote nodes
        """
        succeeded = []
        failed = []

        # initialize remote_nodes. we use the machine config key as
        # the key for the dictionary so that we can bin the nodes.
        self.remote_nodes = {}
        for m in self.machine_list:
            self.remote_nodes[m.config_key()] = []

        # build list of nodes that will be launched by machine
        nodes = [
            x for x in self.rosconfig.nodes if not is_machine_local(x.machine)
        ]
        for n in nodes:
            self.remote_nodes[n.machine.config_key()].append(n)

        for child in self.remote_processes:
            nodes = self.remote_nodes[child.machine.config_key()]
            body = '\n'.join([n.to_remote_xml() for n in nodes])
            # force utf-8 encoding, #3799
            xml = '<?xml version="1.0" encoding="utf-8"?>\n<launch>\n%s</launch>' % body
            if 0:
                print xml

            api = child.getapi()
            # TODO: timeouts
            try:
                self.logger.debug("sending [%s] XML [\n%s\n]" %
                                  (child.uri, xml))
                code, msg, val = api.launch(xml)
                if code == 1:
                    c_succ, c_fail = val
                    succeeded.extend(c_succ)
                    failed.extend(c_fail)
                else:
                    printerrlog('error launching on [%s, uri %s]: %s' %
                                (child.name, child.uri, msg))
                    self._assume_failed(nodes, failed)
            except socket.error, (errno, msg):
                printerrlog('error launching on [%s, uri %s]: %s' %
                            (child.name, child.uri, str(msg)))
                self._assume_failed(nodes, failed)

            except socket.gaierror, (errno, msg):
                # usually errno == -2. See #815.
                child_host, _ = network.parse_http_host_and_port(child.uri)
                printerrlog(
                    "Unable to contact remote roslaunch at [%s]. This is most likely due to a network misconfiguration with host lookups. Please make sure that you can contact '%s' from this machine"
                    % (child.uri, child_host))
                self._assume_failed(nodes, failed)
Beispiel #20
0
    def _run(self):
        """
        Internal run loop of ProcessMonitor
        """
        plock = self.plock
        dead = []
        respawn = []
        while not self._registrations_complete:
            logger.info("mirko hack")
            time.sleep(0.1)  #yield thread

        while not self.is_shutdown:
            with plock:  #copy self.procs
                procs = self.procs[:]
            if self.is_shutdown:
                break

            # check current signal handlers to see if children have stolen them away
            # TODO: this code may not be necessary anymore (have to test)
            for s in _signal_list:
                if signal.getsignal(s) != rl_signal:
                    self.reacquire_signals.add(s)

            for p in procs:
                try:
                    if not p.is_alive():
                        logger.debug(
                            "Process[%s] has died, respawn=%s, required=%s, exit_code=%s",
                            p.name, "True(%f)" %
                            p.respawn_delay if p.respawn else p.respawn,
                            p.required, p.exit_code)
                        exit_code_str = p.get_exit_description()
                        if p.required:
                            printerrlog(
                                '=' * 80 +
                                "REQUIRED process [%s] has died!\n%s\nInitiating shutdown!\n"
                                % (p.name, exit_code_str) + '=' * 80)
                            self.is_shutdown = True
                        elif not p in respawn:
                            if p.exit_code:
                                printerrlog("[%s] %s" %
                                            (p.name, exit_code_str))
                            else:
                                printlog_bold("[%s] %s" %
                                              (p.name, exit_code_str))
                            dead.append(p)

                        ## no need for lock as we require listeners be
                        ## added before process monitor is launched
                        for l in self.listeners:
                            l.process_died(p.name, p.exit_code)

                except Exception as e:
                    traceback.print_exc()
                    #don't respawn as this is an internal error
                    dead.append(p)
                if self.is_shutdown:
                    break  #stop polling
            for d in dead:
                try:
                    # when should_respawn() returns 0.0, bool(0.0) evaluates to False
                    # work around this by checking if the return value is False
                    if d.should_respawn() is not False:
                        respawn.append(d)
                    else:
                        self.unregister(d)
                        # stop process, don't accumulate errors
                        d.stop([])
                        # save process data to dead list
                        with plock:
                            self.dead_list.append(DeadProcess(d))
                except:
                    logger.error(traceback.format_exc())

            # dead check is to make sure that ProcessMonitor at least
            # waits until its had at least one process before exiting
            if self._registrations_complete and dead and not self.procs and not respawn:
                printlog(
                    "all processes on machine have died, roslaunch will exit")
                self.is_shutdown = True
            del dead[:]
            _respawn = []
            for r in respawn:
                try:
                    if self.is_shutdown:
                        break
                    if r.should_respawn() <= 0.0:
                        printlog("[%s] restarting process" % r.name)
                        # stop process, don't accumulate errors
                        r.stop([])
                        r.start()
                    else:
                        # not ready yet, keep it around
                        _respawn.append(r)
                except:
                    traceback.print_exc()
                    logger.error("Restart failed %s", traceback.format_exc())
            respawn = _respawn
            time.sleep(0.1)  #yield thread
Beispiel #21
0
    def _run(self):
        """
        Internal run loop of ProcessMonitor
        """
        plock = self.plock
        dead = []
        respawn = []
        while not self.is_shutdown:
            with plock:  # copy self.procs
                procs = self.procs[:]
            if self.is_shutdown:
                break

            # check current signal handlers to see if children have stolen them away
            # TODO: this code may not be necessary anymore (have to test)
            for s in _signal_list:
                if signal.getsignal(s) != rl_signal:
                    self.reacquire_signals.add(s)

            for p in procs:
                try:
                    if not p.is_alive():
                        logger.debug(
                            "Process[%s] has died, respawn=%s, required=%s, exit_code=%s",
                            p.name,
                            "True(%f)" % p.respawn_delay if p.respawn else p.respawn,
                            p.required,
                            p.exit_code,
                        )
                        exit_code_str = p.get_exit_description()
                        if p.required:
                            printerrlog(
                                "=" * 80
                                + "REQUIRED process [%s] has died!\n%s\nInitiating shutdown!\n"
                                % (p.name, exit_code_str)
                                + "=" * 80
                            )
                            self.is_shutdown = True
                        elif not p in respawn:
                            if p.exit_code:
                                printerrlog("[%s] %s" % (p.name, exit_code_str))
                            else:
                                printlog_bold("[%s] %s" % (p.name, exit_code_str))
                            dead.append(p)

                        ## no need for lock as we require listeners be
                        ## added before process monitor is launched
                        for l in self.listeners:
                            l.process_died(p.name, p.exit_code)

                except Exception as e:
                    traceback.print_exc()
                    # don't respawn as this is an internal error
                    dead.append(p)
                if self.is_shutdown:
                    break  # stop polling
            for d in dead:
                try:
                    if d.should_respawn():
                        respawn.append(d)
                    else:
                        self.unregister(d)
                        # stop process, don't accumulate errors
                        d.stop([])
                        # save process data to dead list
                        with plock:
                            self.dead_list.append(DeadProcess(d))
                except:
                    logger.error(traceback.format_exc())

            # dead check is to make sure that ProcessMonitor at least
            # waits until its had at least one process before exiting
            if self._registrations_complete and dead and not self.procs and not respawn:
                printlog("all processes on machine have died, roslaunch will exit")
                self.is_shutdown = True
            del dead[:]
            _respawn = []
            for r in respawn:
                try:
                    if self.is_shutdown:
                        break
                    if r.should_respawn() <= 0.0:
                        printlog("[%s] restarting process" % r.name)
                        # stop process, don't accumulate errors
                        r.stop([])
                        r.start()
                    else:
                        # not ready yet, keep it around
                        _respawn.append(r)
                except:
                    traceback.print_exc()
                    logger.error("Restart failed %s", traceback.format_exc())
            respawn = _respawn
            time.sleep(0.1)  # yield thread
Beispiel #22
0
class ROSRemoteRunner(roslaunch.launch.ROSRemoteRunnerIF):
    """
    Manages the running of remote roslaunch children
    """
    def __init__(self, run_id, rosconfig, pm, server):
        """
        @param run_id: roslaunch run_id of this runner
        @type  run_id: str
        @param config: launch configuration
        @type  config: L{ROSConfig}
        @param pm process monitor
        @type  pm: L{ProcessMonitor}
        @param server: roslaunch parent server
        @type  server: L{ROSLaunchParentNode}
        """
        self.run_id = run_id
        self.rosconfig = rosconfig
        self.server = server
        self.pm = pm
        self.logger = logging.getLogger('roslaunch.remote')
        self.listeners = []

        self.machine_list = []
        self.remote_processes = []

    def add_process_listener(self, l):
        """
        Listen to events about remote processes dying. Not
        threadsafe. Must be called before processes started.
        @param l: ProcessListener
        @type  l: L{ProcessListener}
        """
        self.listeners.append(l)

    def _start_child(self, server_node_uri, machine, counter):
        # generate a name for the machine. don't use config key as
        # it's too long to easily display
        name = "%s-%s" % (machine.address, counter)

        self.logger.info("remote[%s] starting roslaunch", name)
        print "remote[%s] starting roslaunch" % name

        env_dict = setup_env(None, machine, self.rosconfig.master.uri)
        p = SSHChildROSLaunchProcess(self.run_id, name, server_node_uri,
                                     env_dict, machine)
        success = p.start()
        self.pm.register(p)
        if not success:  #treat as fatal
            raise RLException("unable to start remote roslaunch child: %s" %
                              name)
        self.server.add_child(name, p)
        return p

    def start_children(self):
        """
        Start the child roslaunch processes
        """
        server_node_uri = self.server.uri
        if not server_node_uri:
            raise RLException("server URI is not initialized")

        # TODOXXX: break out table building code into a separate
        # routine so we can unit test it _start_child() should not be
        # determining the process name

        # Build table of unique machines that we are going to launch on
        machines = {}
        for n in self.rosconfig.nodes:
            if not is_machine_local(n.machine):
                machines[n.machine.config_key()] = n.machine

        # Launch child roslaunch processes on remote machines
        counter = 0
        #  - keep a list of procs so we can check for those that failed to launch
        procs = []
        for m in machines:
            p = self._start_child(server_node_uri, machines[m], counter)
            procs.append(p)
            counter += 1

        # Wait for all children to call register() callback. The machines can have
        # non-uniform registration timeouts. We consider the failure to occur once
        # one of the machines has failed to meet it's timeout.
        start_t = time.time()
        while True:
            pending = []
            for p in procs:
                if not p.is_alive():
                    raise RLException("remote roslaunch failed to launch: %s" %
                                      p.machine.name)
                elif not p.uri:
                    pending.append(p.machine)
            if not pending:
                break
            # timeout is the minimum of the remaining timeouts of the machines
            timeout_t = start_t + min([m.timeout for m in pending])
            if time.time() > timeout_t:
                break
            time.sleep(0.1)
        if pending:
            raise RLException(
                """The following roslaunch remote processes failed to register: 
%s

If this is a network latency issue, you may wish to consider setting 
  <machine timeout="NUMBER OF SECONDS" ... />
in your launch""" % '\n'.join([
                    " * %s (timeout %ss)" % (m.name, m.timeout)
                    for m in pending
                ]))

        # convert machine dictionary to a list
        self.machine_list = machines.values()
        # save a list of the remote processes
        self.remote_processes = procs

    def _assume_failed(self, nodes, failed):
        """
        Utility routine for logging/recording nodes that failed
        @param nodes: list of nodes that are assumed to have failed
        @type  nodes: [L{Node}]
        @param failed: list of names of nodes that have failed to extend
        @type  failed: [str]
        """
        str_nodes = ["%s/%s" % (n.package, n.type) for n in nodes]
        failed.extend(str_nodes)
        printerrlog("Launch of the following nodes most likely failed: %s" %
                    ', '.join(str_nodes))

    def launch_remote_nodes(self):
        """
        Contact each child to launch remote nodes
        """
        succeeded = []
        failed = []

        # initialize remote_nodes. we use the machine config key as
        # the key for the dictionary so that we can bin the nodes.
        self.remote_nodes = {}
        for m in self.machine_list:
            self.remote_nodes[m.config_key()] = []

        # build list of nodes that will be launched by machine
        nodes = [
            x for x in self.rosconfig.nodes if not is_machine_local(x.machine)
        ]
        for n in nodes:
            self.remote_nodes[n.machine.config_key()].append(n)

        for child in self.remote_processes:
            nodes = self.remote_nodes[child.machine.config_key()]
            body = '\n'.join([n.to_remote_xml() for n in nodes])
            # force utf-8 encoding, #3799
            xml = '<?xml version="1.0" encoding="utf-8"?>\n<launch>\n%s</launch>' % body
            if 0:
                print xml

            api = child.getapi()
            # TODO: timeouts
            try:
                self.logger.debug("sending [%s] XML [\n%s\n]" %
                                  (child.uri, xml))
                code, msg, val = api.launch(xml)
                if code == 1:
                    c_succ, c_fail = val
                    succeeded.extend(c_succ)
                    failed.extend(c_fail)
                else:
                    printerrlog('error launching on [%s, uri %s]: %s' %
                                (child.name, child.uri, msg))
                    self._assume_failed(nodes, failed)
            except socket.error, (errno, msg):
                printerrlog('error launching on [%s, uri %s]: %s' %
                            (child.name, child.uri, str(msg)))
                self._assume_failed(nodes, failed)

            except socket.gaierror, (errno, msg):
                # usually errno == -2. See #815.
                child_host, _ = network.parse_http_host_and_port(child.uri)
                printerrlog(
                    "Unable to contact remote roslaunch at [%s]. This is most likely due to a network misconfiguration with host lookups. Please make sure that you can contact '%s' from this machine"
                    % (child.uri, child_host))
                self._assume_failed(nodes, failed)

            except Exception, e:
                printerrlog('error launching on [%s, uri %s]: %s' %
                            (child.name, child.uri, str(e)))
                self._assume_failed(nodes, failed)
Beispiel #23
0
def launch_main(argv=sys.argv, real_args=None, fn_to_call=None):
    options = None
    logger = None
    try:
        from roslaunch import rlutil
        parser = roslaunch._get_optparse()

        (options, args) = parser.parse_args(argv[1:])
        args = rlutil.resolve_launch_arguments(args)
        roslaunch._validate_args(parser, options, args)

        # node args doesn't require any roslaunch infrastructure, so process it first
        if any([
                options.node_args, options.node_list, options.find_node,
                options.dump_params, options.file_list, options.ros_args
        ]):
            if options.node_args and not args:
                parser.error("please specify a launch file")

            from roslaunch import node_args
            if options.node_args:
                node_args.print_node_args(options.node_args, args)
            elif options.find_node:
                node_args.print_node_filename(options.find_node, args)
            # Dump parameters, #2685
            elif options.dump_params:
                roslaunch_param_dump.dump_params(args)
            elif options.file_list:
                rlutil.print_file_list(args)
            elif options.ros_args:
                import arg_dump as roslaunch_arg_dump
                roslaunch_arg_dump.dump_args(args)
            else:
                node_args.print_node_list(args)
            return

        # we have to wait for the master here because we don't have the run_id yet
        if options.wait_for_master:
            if options.core:
                parser.error("--wait cannot be used with roscore")
            rlutil._wait_for_master()

        # write the pid to a file
        roslaunch.write_pid_file(options.pid_fn, options.core, options.port)

        # spin up the logging infrastructure. have to wait until we can read options.run_id
        uuid = rlutil.get_or_generate_uuid(options.run_id,
                                           options.wait_for_master)
        roslaunch.configure_logging(uuid)

        # #3088: don't check disk usage on remote machines
        if not options.child_name and not options.skip_log_check:
            # #2761
            rlutil.check_log_disk_usage()

        logger = logging.getLogger('roslaunch')
        logger.info("roslaunch starting with args %s" % str(argv))
        logger.info("roslaunch env is %s" % os.environ)

        if options.child_name:
            logger.info('starting in child mode')

            # This is a roslaunch child, spin up client server.
            # client spins up an XML-RPC server that waits for
            # commands and configuration from the server.
            from roslaunch import child as roslaunch_child
            c = roslaunch_child.ROSLaunchChild(uuid, options.child_name,
                                               options.server_uri)
            c.run()
        else:
            logger.info('starting in server mode')

            # #1491 change terminal name
            if not options.disable_title:
                rlutil.change_terminal_name(args, options.core)

            # Read roslaunch string from stdin when - is passed as launch filename.
            roslaunch_strs = []
            if '-' in args:
                roslaunch_core.printlog(
                    "Passed '-' as file argument, attempting to read roslaunch XML from stdin."
                )
                roslaunch_strs.append(sys.stdin.read())
                roslaunch_core.printlog("... %d bytes read successfully.\n" %
                                        len(roslaunch_strs[-1]))
                args.remove('-')

            # This is a roslaunch parent, spin up parent server and launch processes.
            # args are the roslaunch files to load
            from roslaunch import parent as roslaunch_parent
            try:
                # force a port binding spec if we are running a core
                if options.core:
                    options.port = options.port or DEFAULT_MASTER_PORT
                p = roslaunch_parent.ROSLaunchParent(
                    uuid,
                    args,
                    roslaunch_strs=roslaunch_strs,
                    is_core=options.core,
                    port=options.port,
                    local_only=options.local_only,
                    verbose=options.verbose,
                    force_screen=options.force_screen,
                    num_workers=options.num_workers,
                    timeout=options.timeout)
                p.start()
                if fn_to_call is None:
                    p.spin()
                else:
                    fn_to_call(real_args)

            finally:
                # remove the pid file
                if options.pid_fn:
                    try:
                        os.unlink(options.pid_fn)
                    except os.error:
                        pass

    except RLException as e:
        roslaunch_core.printerrlog(str(e))
        roslaunch_core.printerrlog(
            'The traceback for the exception was written to the log file')
        if logger:
            logger.error(traceback.format_exc())
        sys.exit(1)
    except ValueError as e:
        # TODO: need to trap better than this high-level trap
        roslaunch_core.printerrlog(str(e))
        roslaunch_core.printerrlog(
            'The traceback for the exception was written to the log file')
        if logger:
            logger.error(traceback.format_exc())
        sys.exit(1)
    except Exception as e:
        traceback.print_exc()
        sys.exit(1)