def _start_native_worker(self, wtype, id, options=None, details=None):
    """
    Start a native worker process (router, container or WebSocket testee)
    on this node.

    :param wtype: Worker type, one of ``'router'``, ``'container'`` or
        ``'websocket-testee'``.
    :type wtype: str
    :param id: ID under which to track the new worker (must not be running already).
    :type id: str
    :param options: Worker options (checked via ``checkconfig``); may include
        ``python``, ``title``, ``reactor`` and ``traceback``.
    :type options: dict or None
    :param details: WAMP call details (caller, progressive result callback).

    :returns: Deferred that fires with the worker startup information dict
        once the worker signals it is ready.

    :raises ApplicationError: ``crossbar.error.worker_already_running`` if a
        worker with ``id`` already exists, or
        ``crossbar.error.invalid_configuration`` on bad options.
    """
    assert (wtype in ['router', 'container', 'websocket-testee'])

    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(
            id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running',
                               emsg)

    # check worker options
    #
    options = options or {}
    try:
        if wtype == 'router':
            checkconfig.check_router_options(options)
        elif wtype == 'container':
            checkconfig.check_container_options(options)
        elif wtype == 'websocket-testee':
            checkconfig.check_websocket_testee_options(options)
        else:
            raise Exception("logic error")
    except Exception as e:
        emsg = "Could not start native worker: invalid configuration ({})".format(
            e)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

    # allow override Python executable from options
    #
    if 'python' in options:
        exe = options['python']

        # the executable must be an absolute path, e.g.
        # /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        #
        if not os.path.isabs(exe):
            emsg = "Invalid worker configuration: python executable '{}' must be an absolute path".format(
                exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration',
                                   emsg)

        # of course the path must exist and actually be executable
        #
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "Invalid worker configuration: python executable '{}' does not exist or isn't an executable".format(
                exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration',
                                   emsg)
    else:
        exe = sys.executable

    # all native workers (routers and containers for now) start from the same script
    #
    filename = os.path.abspath(
        os.path.join(crossbar.__file__, "..", "worker", "process.py"))

    # assemble command line for forking the worker
    #
    args = [exe, "-u", filename]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--node", str(self._node_id)])
    args.extend(["--worker", str(id)])
    args.extend(["--realm", self._realm])
    args.extend(["--type", wtype])
    args.extend(["--loglevel", _loglevel])

    # allow override worker process title from options
    #
    if options.get('title', None):
        args.extend(['--title', options['title']])

    # forward explicit reactor selection
    #
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    # FIXME
    # elif self._node.options.reactor:
    #     args.extend(['--reactor', self._node.options.reactor])

    # create worker process environment
    #
    worker_env = create_process_env(options)

    # We need to use the same PYTHONPATH we were started with, so we can
    # find the Crossbar we're working with -- it may not be the same as the
    # one on the default path
    worker_env["PYTHONPATH"] = os.pathsep.join(sys.path)

    # log name of worker
    #
    worker_logname = {
        'router': 'Router',
        'container': 'Container',
        'websocket-testee': 'WebSocketTestee'
    }.get(wtype, 'Worker')

    # topic URIs used (later)
    #
    if wtype == 'router':
        starting_topic = 'crossbar.node.{}.on_router_starting'.format(
            self._node_id)
        started_topic = 'crossbar.node.{}.on_router_started'.format(
            self._node_id)
    elif wtype == 'container':
        starting_topic = 'crossbar.node.{}.on_container_starting'.format(
            self._node_id)
        started_topic = 'crossbar.node.{}.on_container_started'.format(
            self._node_id)
    elif wtype == 'websocket-testee':
        starting_topic = 'crossbar.node.{}.on_websocket_testee_starting'.format(
            self._node_id)
        started_topic = 'crossbar.node.{}.on_websocket_testee_started'.format(
            self._node_id)
    else:
        raise Exception("logic error")

    # add worker tracking instance to the worker map ..
    #
    if wtype == 'router':
        worker = RouterWorkerProcess(self, id, details.caller,
                                     keeplog=options.get('traceback', None))
    elif wtype == 'container':
        worker = ContainerWorkerProcess(self, id, details.caller,
                                        keeplog=options.get('traceback', None))
    elif wtype == 'websocket-testee':
        worker = WebSocketTesteeWorkerProcess(self, id, details.caller,
                                              keeplog=options.get('traceback', None))
    else:
        raise Exception("logic error")

    self._workers[id] = worker

    # create a (custom) process endpoint.
    #
    if platform.isWindows():
        childFDs = None  # Use the default Twisted ones
    else:
        # The communication between controller and container workers is
        # using WAMP running over 2 pipes.
        # For controller->container traffic this runs over FD 0 (`stdin`)
        # and for the container->controller traffic, this runs over FD 3.
        #
        # Note: We use FD 3, not FD 1 (`stdout`) or FD 2 (`stderr`) for
        # container->controller traffic, so that components running in the
        # container which happen to write to `stdout` or `stderr` do not
        # interfere with the container-controller communication.
        childFDs = {0: "w", 1: "r", 2: "r", 3: "r"}

    ep = WorkerProcessEndpoint(self._node._reactor,
                               exe,
                               args,
                               env=worker_env,
                               worker=worker,
                               childFDs=childFDs)

    # ready handling
    #
    def on_ready_success(id):
        # worker signalled "ready" over its WAMP link: mark it started and
        # publish the started event to all but the caller
        self.log.info("{worker} with ID '{id}' and PID {pid} started",
                      worker=worker_logname,
                      id=worker.id,
                      pid=worker.pid)

        # ensure the child process gets cleaned up on node shutdown
        self._node._reactor.addSystemEventTrigger(
            'before',
            'shutdown',
            self._cleanup_worker,
            self._node._reactor,
            worker,
        )

        worker.status = 'started'
        worker.started = datetime.utcnow()

        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }

        # FIXME: make start of stats printer dependent on log level ..
        worker.log_stats(5.)

        self.publish(started_topic,
                     started_info,
                     options=PublishOptions(exclude=[details.caller]))

        return started_info

    def on_ready_error(err):
        # worker failed to become ready: drop it from tracking and propagate
        del self._workers[worker.id]

        emsg = 'Failed to start native worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg,
                               worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(_):
        self.log.info("Node worker {} ended successfully".format(worker.id))
        worker.log_stats(0)
        del self._workers[worker.id]
        return True

    def on_exit_error(err):
        self.log.info("Node worker {} ended with error ({})".format(
            worker.id, err))
        worker.log_stats(0)
        del self._workers[worker.id]
        return False

    def check_for_shutdown(was_successful):
        # evaluate the node's configured shutdown triggers after a worker exit
        shutdown = False

        # automatically shutdown node whenever a worker ended (successfully, or with error)
        #
        if checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info(
                "Node worker ended, and trigger '{}' active".format(
                    checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT))
            shutdown = True

        # automatically shutdown node when worker ended with error
        #
        if not was_successful and checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR in self._node._node_shutdown_triggers:
            self.log.info(
                "Node worker ended with error, and trigger '{}' active".format(
                    checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR))
            shutdown = True

        # automatically shutdown node when no more workers are left
        #
        if len(self._workers) == 0 and checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info(
                "No more node workers running, and trigger '{}' active".format(
                    checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT))
            shutdown = True

        # initiate shutdown (but only if we are not already shutting down)
        #
        if shutdown:
            if not self._shutdown_requested:
                self.log.info("Node shutting down ..")
                self.shutdown()
            else:
                # ignore: shutdown already initiated ..
                self.log.info("Node is already shutting down.")
        else:
            self.log.info(
                "Node will continue to run (node shutdown triggers active: {})".format(
                    self._node._node_shutdown_triggers))

    d_on_exit = worker.exit.addCallbacks(on_exit_success, on_exit_error)
    d_on_exit.addBoth(check_for_shutdown)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_native_worker_client_factory(
        self._node._router_session_factory, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic,
                 starting_info,
                 options=PublishOptions(exclude=[details.caller]))

    # now actually fork the worker ..
    #
    self.log.info("Starting {worker} with ID '{id}'...",
                  worker=worker_logname,
                  id=id)
    self.log.debug("{worker} '{id}' command line is '{cmdline}'",
                   worker=worker_logname,
                   id=id,
                   cmdline=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will errback ..

        pid = proto.transport.pid
        self.log.debug("Native worker process connected with PID {pid}",
                       pid=pid)

        # note the PID of the worker
        worker.pid = pid

        # proto is an instance of NativeWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all ..
        # fixed typo in log message: "Interal" -> "Internal"
        self.log.error(
            "Internal error: connection to forked native worker failed ({err})",
            err=err)

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def start_guest(self, id, config, details=None):
    """
    Start a new guest process on this node.

    :param id: ID under which to track the new guest worker
        (must not be running already).
    :type id: str
    :param config: The guest process configuration (checked via
        ``checkconfig.check_guest``); must contain ``executable`` and may
        contain ``arguments`` and ``options`` (``workdir``, ``env``,
        ``watch``, ``traceback``).
    :type config: dict
    :param details: WAMP call details (caller, progressive result callback).

    :returns: Deferred that fires with the guest startup information dict
        once the guest process is up.

    :raises ApplicationError: ``crossbar.error.worker_already_running`` if a
        worker with ``id`` already exists, or
        ``crossbar.error.invalid_configuration`` on bad configuration or a
        missing executable.
    """
    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(
            id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running',
                               emsg)

    try:
        checkconfig.check_guest(config)
    except Exception as e:
        raise ApplicationError(
            u'crossbar.error.invalid_configuration',
            'invalid guest worker configuration: {}'.format(e))

    options = config.get('options', {})

    # guest process working directory
    #
    workdir = self._node._cbdir
    if 'workdir' in options:
        workdir = os.path.join(workdir, options['workdir'])
    workdir = os.path.abspath(workdir)

    # guest process executable and command line arguments
    #
    # first try to configure the fully qualified path for the guest
    # executable by joining workdir and configured exectuable ..
    exe = os.path.abspath(os.path.join(workdir, config['executable']))

    if check_executable(exe):
        self.log.info(
            "Using guest worker executable '{exe}' (executable path taken from configuration)",
            exe=exe)
    else:
        # try to detect the fully qualified path for the guest
        # executable by doing a "which" on the configured executable name
        exe = shutil.which(config['executable'])
        if exe is not None and check_executable(exe):
            self.log.info(
                "Using guest worker executable '{exe}' (executable path detected from environment)",
                exe=exe)
        else:
            # fixed typo in error message: "find and executable" -> "find an executable"
            emsg = "Could not start worker: could not find an executable for '{}'".format(
                config['executable'])
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration',
                                   emsg)

    # guest process command line arguments
    #
    args = [exe]
    args.extend(config.get('arguments', []))

    # guest process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = 'Guest'

    # topic URIs used (later)
    #
    starting_topic = 'crossbar.node.{}.on_guest_starting'.format(
        self._node_id)
    started_topic = 'crossbar.node.{}.on_guest_started'.format(
        self._node_id)

    # add worker tracking instance to the worker map ..
    #
    worker = GuestWorkerProcess(self, id, details.caller,
                                keeplog=options.get('traceback', None))
    self._workers[id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor,
                               exe,
                               args,
                               path=workdir,
                               env=worker_env,
                               worker=worker)

    # ready handling
    #
    def on_ready_success(proto):
        # guest process is up: record PID/status and publish started event
        worker.pid = proto.transport.pid
        worker.status = 'started'
        worker.started = datetime.utcnow()

        self.log.info("{worker} with ID '{id}' and PID {pid} started",
                      worker=worker_logname,
                      id=worker.id,
                      pid=worker.pid)

        # ensure the child process gets cleaned up on node shutdown
        self._node._reactor.addSystemEventTrigger(
            'before',
            'shutdown',
            self._cleanup_worker,
            self._node._reactor,
            worker,
        )

        # directory watcher
        #
        if 'watch' in options:

            if HAS_FSNOTIFY:

                # assemble list of watched directories
                watched_dirs = []
                for d in options['watch'].get('directories', []):
                    watched_dirs.append(
                        os.path.abspath(os.path.join(self._node._cbdir, d)))

                worker.watch_timeout = options['watch'].get('timeout', 1)

                # create a directory watcher
                worker.watcher = DirWatcher(dirs=watched_dirs,
                                            notify_once=True)

                # make sure to stop the background thread running inside the
                # watcher upon Twisted being shut down
                def on_shutdown():
                    worker.watcher.stop()

                self._node._reactor.addSystemEventTrigger(
                    'before', 'shutdown', on_shutdown)

                # this handler will get fired by the watcher upon detecting an FS event
                def on_fsevent(evt):
                    worker.watcher.stop()
                    proto.signal('TERM')

                    if options['watch'].get('action', None) == 'restart':
                        self.log.info("Restarting guest ..")
                        # Add a timeout large enough (perhaps add a config option later)
                        self._node._reactor.callLater(
                            worker.watch_timeout, self.start_guest, id,
                            config, details)
                        # Shut the worker down, after the restart event is scheduled
                        worker.stop()

                # now run the watcher on a background thread
                deferToThread(worker.watcher.loop, on_fsevent)

            else:
                self.log.warn(
                    "Warning: cannot watch directory for changes - feature DirWatcher unavailable"
                )

        # assemble guest worker startup information
        #
        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }

        self.publish(started_topic,
                     started_info,
                     options=PublishOptions(exclude=[details.caller]))

        return started_info

    def on_ready_error(err):
        # guest failed to come up: drop it from tracking and propagate
        del self._workers[worker.id]

        emsg = 'Failed to start guest worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg,
                               ep.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        self.log.info("Guest {id} exited with success", id=worker.id)
        del self._workers[worker.id]

    def on_exit_error(err):
        self.log.error("Guest {id} exited with error {err.value}",
                       id=worker.id, err=err)
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_guest_worker_client_factory(
        config, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic,
                 starting_info,
                 options=PublishOptions(exclude=[details.caller]))

    # now actually fork the worker ..
    #
    self.log.info("Starting {worker} with ID '{id}'...",
                  worker=worker_logname,
                  id=id)
    self.log.debug("{worker} '{id}' using command line '{cli}'...",
                   worker=worker_logname,
                   id=id,
                   cli=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will
        # errback - probably only if the forking of a new process fails
        # at OS level due to out of memory conditions or such.

        pid = proto.transport.pid
        self.log.debug("Guest worker process connected with PID {pid}",
                       pid=pid)

        worker.pid = pid

        # proto is an instance of GuestWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all .. see above.
        self.log.error(
            "Internal error: connection to forked guest worker failed ({})".format(err))

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_native_worker(self, wtype, id, options=None, details=None):
    """
    Start a native worker process (router or container) on this node.

    :param wtype: Worker type, one of ``'router'`` or ``'container'``.
    :type wtype: str
    :param id: ID under which to track the new worker (must not be running already).
    :type id: str
    :param options: Worker options (checked via ``checkconfig``); may include
        ``python``, ``title``, ``debug``, ``reactor`` and ``traceback``.
    :type options: dict or None
    :param details: WAMP call details (caller, progressive result callback).

    :returns: Deferred that fires with the worker startup information dict
        once the worker signals it is ready.

    :raises ApplicationError: ``crossbar.error.worker_already_running`` if a
        worker with ``id`` already exists, or
        ``crossbar.error.invalid_configuration`` on bad options.
    """
    assert(wtype in ['router', 'container'])

    # prohibit starting a worker twice
    #
    if id in self._workers:
        # fixed stray apostrophe in error message ("ID '{}''" -> "ID '{}'")
        emsg = "ERROR: could not start worker - a worker with ID '{}' is already running (or starting)".format(id)
        log.msg(emsg)
        raise ApplicationError('crossbar.error.worker_already_running', emsg)

    # check worker options
    #
    options = options or {}
    try:
        if wtype == 'router':
            checkconfig.check_router_options(options)
        elif wtype == 'container':
            checkconfig.check_container_options(options)
        else:
            raise Exception("logic error")
    except Exception as e:
        emsg = "ERROR: could not start native worker - invalid configuration ({})".format(e)
        log.msg(emsg)
        raise ApplicationError('crossbar.error.invalid_configuration', emsg)

    # allow override Python executable from options
    #
    if 'python' in options:
        exe = options['python']

        # the executable must be an absolute path, e.g. /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        #
        if not os.path.isabs(exe):
            emsg = "ERROR: python '{}' from worker options must be an absolute path".format(exe)
            log.msg(emsg)
            raise ApplicationError('crossbar.error.invalid_configuration', emsg)

        # of course the path must exist and actually be executable
        #
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "ERROR: python '{}' from worker options does not exist or isn't an executable".format(exe)
            log.msg(emsg)
            raise ApplicationError('crossbar.error.invalid_configuration', emsg)
    else:
        exe = sys.executable

    # all native workers (routers and containers for now) start from the same script
    #
    filename = pkg_resources.resource_filename('crossbar', 'worker/process.py')

    # assemble command line for forking the worker
    #
    args = [exe, "-u", filename]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--node", str(self._node_id)])
    args.extend(["--worker", str(id)])
    args.extend(["--realm", self._realm])
    args.extend(["--type", wtype])

    # allow override worker process title from options
    #
    if options.get('title', None):
        args.extend(['--title', options['title']])

    # allow overriding debug flag from options
    #
    if options.get('debug', self.debug):
        args.append('--debug')

    # forward explicit reactor selection
    #
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    elif self._node.options.reactor:
        args.extend(['--reactor', self._node.options.reactor])

    # create worker process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = {'router': 'Router', 'container': 'Container'}.get(wtype, 'Worker')

    # topic URIs used (later)
    #
    if wtype == 'router':
        starting_topic = 'crossbar.node.{}.on_router_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_router_started'.format(self._node_id)
    elif wtype == 'container':
        starting_topic = 'crossbar.node.{}.on_container_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_container_started'.format(self._node_id)
    else:
        raise Exception("logic error")

    # add worker tracking instance to the worker map ..
    #
    if wtype == 'router':
        worker = RouterWorkerProcess(self, id, details.caller, keeplog=options.get('traceback', None))
    elif wtype == 'container':
        worker = ContainerWorkerProcess(self, id, details.caller, keeplog=options.get('traceback', None))
    else:
        raise Exception("logic error")

    self._workers[id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor, exe, args, env=worker_env, worker=worker)

    # ready handling
    #
    def on_ready_success(id):
        # worker signalled "ready": mark it started and publish started event
        log.msg("{} with ID '{}' and PID {} started".format(worker_logname, worker.id, worker.pid))

        def cleanup_worker():
            # TERM the child on node shutdown; ignore if it already exited
            try:
                worker.proto.transport.signalProcess('TERM')
            except ProcessExitedAlready:
                pass  # ignore; it's already dead

        self._node._reactor.addSystemEventTrigger(
            'before',
            'shutdown',
            cleanup_worker,
        )

        worker.status = 'started'
        worker.started = datetime.utcnow()

        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }

        self.publish(started_topic, started_info, options=PublishOptions(exclude=[details.caller]))

        return started_info

    def on_ready_error(err):
        # worker failed to become ready: drop it from tracking and propagate
        del self._workers[worker.id]
        emsg = 'ERROR: failed to start native worker - {}'.format(err.value)
        log.msg(emsg)
        raise ApplicationError("crossbar.error.cannot_start", emsg, worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        del self._workers[worker.id]

    def on_exit_error(err):
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_native_worker_client_factory(self._node._router_session_factory, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info, options=PublishOptions(exclude=[details.caller]))

    # now actually fork the worker ..
    #
    if self.debug:
        log.msg("Starting {} with ID '{}' using command line '{}' ..".format(worker_logname, id, ' '.join(args)))
    else:
        log.msg("Starting {} with ID '{}' ..".format(worker_logname, id))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will errback ..

        pid = proto.transport.pid
        if self.debug:
            log.msg("Native worker process connected with PID {}".format(pid))

        # note the PID of the worker
        worker.pid = pid

        # proto is an instance of NativeWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all ..
        if self.debug:
            log.msg("ERROR: Connecting forked native worker failed - {}".format(err))

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def start_guest(self, id, config, details=None):
    """
    Start a new guest process on this node.

    :param id: ID under which to track the new guest worker
        (must not be running already).
    :type id: str
    :param config: The guest process configuration (checked via
        ``checkconfig.check_guest``); must contain ``executable`` and may
        contain ``arguments`` and ``options`` (``workdir``, ``env``,
        ``watch``, ``traceback``).
    :type config: dict
    :param details: WAMP call details (caller, progressive result callback).

    :returns: Deferred that fires with the guest startup information dict
        once the guest process is up.

    :raises ApplicationError: ``crossbar.error.worker_already_running`` if a
        worker with ``id`` already exists, or
        ``crossbar.error.invalid_configuration`` on bad configuration or a
        missing executable.
    """
    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "ERROR: could not start worker - a worker with ID '{}' is already running (or starting)".format(id)
        log.msg(emsg)
        raise ApplicationError('crossbar.error.worker_already_running', emsg)

    try:
        checkconfig.check_guest(config)
    except Exception as e:
        raise ApplicationError('crossbar.error.invalid_configuration',
                               'invalid guest worker configuration: {}'.format(e))

    options = config.get('options', {})

    # guest process working directory
    #
    workdir = self._node._cbdir
    if 'workdir' in options:
        workdir = os.path.join(workdir, options['workdir'])
    workdir = os.path.abspath(workdir)

    # guest process executable and command line arguments
    #
    # first try to configure the fully qualified path for the guest
    # executable by joining workdir and configured exectuable ..
    exe = os.path.abspath(os.path.join(workdir, config['executable']))

    if check_executable(exe):
        log.msg("Using guest worker executable '{}' (executable path taken from configuration)".format(exe))
    else:
        # try to detect the fully qualified path for the guest
        # executable by doing a "which" on the configured executable name
        exe = shutil.which(config['executable'])
        if exe is not None and check_executable(exe):
            log.msg("Using guest worker executable '{}' (executable path detected from environment)".format(exe))
        else:
            # fixed typo in error message: "find and executable" -> "find an executable"
            emsg = "ERROR: could not start worker - could not find an executable for '{}'".format(config['executable'])
            log.msg(emsg)
            raise ApplicationError('crossbar.error.invalid_configuration', emsg)

    # guest process command line arguments
    #
    args = [exe]
    args.extend(config.get('arguments', []))

    # guest process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = 'Guest'

    # topic URIs used (later)
    #
    starting_topic = 'crossbar.node.{}.on_guest_starting'.format(self._node_id)
    started_topic = 'crossbar.node.{}.on_guest_started'.format(self._node_id)

    # add worker tracking instance to the worker map ..
    #
    worker = GuestWorkerProcess(self, id, details.caller, keeplog=options.get('traceback', None))

    self._workers[id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor, exe, args, path=workdir, env=worker_env, worker=worker)

    # ready handling
    #
    def on_ready_success(proto):
        # guest process is up: record PID/status and publish started event
        worker.pid = proto.transport.pid
        worker.status = 'started'
        worker.started = datetime.utcnow()

        log.msg("{} with ID '{}' and PID {} started".format(worker_logname, worker.id, worker.pid))

        # directory watcher
        #
        if 'watch' in options:

            if HAS_FSNOTIFY:

                # assemble list of watched directories
                watched_dirs = []
                for d in options['watch'].get('directories', []):
                    watched_dirs.append(os.path.abspath(os.path.join(self._node._cbdir, d)))

                # create a directory watcher
                worker.watcher = DirWatcher(dirs=watched_dirs, notify_once=True)

                # make sure to stop the background thread running inside the
                # watcher upon Twisted being shut down
                def on_shutdown():
                    worker.watcher.stop()

                reactor.addSystemEventTrigger('before', 'shutdown', on_shutdown)

                # this handler will get fired by the watcher upon detecting an FS event
                def on_fsevent(evt):
                    worker.watcher.stop()
                    proto.signal('TERM')

                    if options['watch'].get('action', None) == 'restart':
                        log.msg("Restarting guest ..")
                        reactor.callLater(0.1, self.start_guest, id, config, details)

                # now run the watcher on a background thread
                deferToThread(worker.watcher.loop, on_fsevent)

            else:
                log.msg("Warning: cannot watch directory for changes - feature DirWatcher unavailable")

        # assemble guest worker startup information
        #
        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }

        self.publish(started_topic, started_info, options=PublishOptions(exclude=[details.caller]))

        return started_info

    def on_ready_error(err):
        # guest failed to come up: drop it from tracking and propagate
        del self._workers[worker.id]
        emsg = 'ERROR: failed to start guest worker - {}'.format(err.value)
        log.msg(emsg)
        raise ApplicationError("crossbar.error.cannot_start", emsg, ep.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        # fixed typo in log message: "excited" -> "exited"
        log.msg("Guest exited with success")
        del self._workers[worker.id]

    def on_exit_error(err):
        # fixed typo in log message: "excited" -> "exited"
        log.msg("Guest exited with error", err)
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_guest_worker_client_factory(config, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info, options=PublishOptions(exclude=[details.caller]))

    # now actually fork the worker ..
    #
    if self.debug:
        log.msg("Starting {} with ID '{}' using command line '{}' ..".format(worker_logname, id, ' '.join(args)))
    else:
        log.msg("Starting {} with ID '{}' ..".format(worker_logname, id))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will
        # errback - probably only if the forking of a new process fails
        # at OS level due to out of memory conditions or such.

        pid = proto.transport.pid
        if self.debug:
            log.msg("Guest worker process connected with PID {}".format(pid))

        worker.pid = pid

        # proto is an instance of GuestWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all .. see above.
        if self.debug:
            log.msg("ERROR: Connecting forked guest worker failed - {}".format(err))

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_guest_worker(self, worker_id, worker_config, details=None):
    """
    Start a new guest process on this node.

    :param worker_id: ID under which to track the new guest worker
        (must not be running already).
    :type worker_id: str
    :param worker_config: The guest process configuration (checked via
        ``checkconfig.check_guest``); must contain ``executable`` and may
        contain ``arguments`` and ``options`` (``workdir``, ``env``,
        ``watch``, ``traceback``).
    :type worker_config: dict
    :param details: WAMP call details (caller, progressive result callback).

    :returns: Deferred that fires with the guest startup information dict
        once the guest process is up.

    :raises ApplicationError: ``crossbar.error.worker_already_running`` if a
        worker with ``worker_id`` already exists, or
        ``crossbar.error.invalid_configuration`` on bad configuration or a
        missing executable.
    """
    # prohibit starting a worker twice
    #
    if worker_id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(
            worker_id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running',
                               emsg)

    try:
        checkconfig.check_guest(worker_config)
    except Exception as e:
        raise ApplicationError(
            u'crossbar.error.invalid_configuration',
            'invalid guest worker configuration: {}'.format(e))

    options = worker_config.get('options', {})

    # guest process working directory
    #
    workdir = self._node._cbdir
    if 'workdir' in options:
        workdir = os.path.join(workdir, options['workdir'])
    workdir = os.path.abspath(workdir)

    # guest process executable and command line arguments
    #
    # first try to configure the fully qualified path for the guest
    # executable by joining workdir and configured exectuable ..
    exe = os.path.abspath(
        os.path.join(workdir, worker_config['executable']))

    if check_executable(exe):
        self.log.info(
            "Using guest worker executable '{exe}' (executable path taken from configuration)",
            exe=exe)
    else:
        # try to detect the fully qualified path for the guest
        # executable by doing a "which" on the configured executable name
        exe = which(worker_config['executable'])
        if exe is not None and check_executable(exe):
            self.log.info(
                "Using guest worker executable '{exe}' (executable path detected from environment)",
                exe=exe)
        else:
            # fixed typo in error message: "find and executable" -> "find an executable"
            emsg = "Could not start worker: could not find an executable for '{}'".format(
                worker_config['executable'])
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration',
                                   emsg)

    # guest process command line arguments
    #
    args = [exe]
    args.extend(worker_config.get('arguments', []))

    # guest process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = 'Guest'

    # topic URIs used (later)
    #
    starting_topic = u'{}.on_guest_starting'.format(self._uri_prefix)
    started_topic = u'{}.on_guest_started'.format(self._uri_prefix)

    # add worker tracking instance to the worker map ..
    #
    worker = GuestWorkerProcess(self, worker_id, details.caller,
                                keeplog=options.get('traceback', None))
    self._workers[worker_id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor,
                               exe,
                               args,
                               path=workdir,
                               env=worker_env,
                               worker=worker)

    # ready handling
    #
    def on_ready_success(proto):
        # guest process is up: record the connection and publish started event
        self.log.info('{worker_logname} worker "{worker_id}" started',
                      worker_logname=worker_logname,
                      worker_id=worker.id)

        worker.on_worker_started(proto)

        # ensure the child process gets cleaned up on node shutdown
        self._node._reactor.addSystemEventTrigger(
            'before',
            'shutdown',
            self._cleanup_worker,
            self._node._reactor,
            worker,
        )

        # directory watcher
        #
        if 'watch' in options:

            if HAS_FS_WATCHER:

                # assemble list of watched directories
                watched_dirs = []
                for d in options['watch'].get('directories', []):
                    watched_dirs.append(
                        os.path.abspath(os.path.join(self._node._cbdir, d)))

                worker.watch_timeout = options['watch'].get('timeout', 1)

                # create a filesystem watcher
                worker.watcher = FilesystemWatcher(
                    workdir, watched_dirs=watched_dirs)

                # make sure to stop the watch upon Twisted being shut down
                def on_shutdown():
                    worker.watcher.stop()

                self._node._reactor.addSystemEventTrigger(
                    'before', 'shutdown', on_shutdown)

                # this handler will get fired by the watcher upon detecting an FS event
                def on_filesystem_change(fs_event):
                    worker.watcher.stop()
                    proto.signal('TERM')

                    if options['watch'].get('action', None) == 'restart':
                        self.log.info(
                            "Filesystem watcher detected change {fs_event} - restarting guest in {watch_timeout} seconds ..",
                            fs_event=fs_event,
                            watch_timeout=worker.watch_timeout)
                        # Add a timeout large enough (perhaps add a config option later)
                        self._node._reactor.callLater(
                            worker.watch_timeout, self.start_guest,
                            worker_id, worker_config, details)
                        # Shut the worker down, after the restart event is scheduled
                        # FIXME: all workers should have a stop() method ..
                        # -> 'GuestWorkerProcess' object has no attribute 'stop'
                        # worker.stop()
                    else:
                        self.log.info(
                            "Filesystem watcher detected change {fs_event} - no action taken!",
                            fs_event=fs_event)

                # now start watching ..
                worker.watcher.start(on_filesystem_change)

            else:
                self.log.warn(
                    "Cannot watch directories for changes - feature not available"
                )

        # assemble guest worker startup information
        #
        started_info = {
            u'id': worker.id,
            u'status': worker.status,
            u'started': utcstr(worker.started),
            u'who': worker.who,
        }

        self.publish(started_topic,
                     started_info,
                     options=PublishOptions(exclude=details.caller))

        return started_info

    def on_ready_error(err):
        # guest failed to come up: drop it from tracking and propagate
        del self._workers[worker.id]

        emsg = 'Failed to start guest worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg,
                               ep.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        self.log.info("Guest {worker_id} exited with success",
                      worker_id=worker.id)
        del self._workers[worker.id]

    def on_exit_error(err):
        self.log.error("Guest {worker_id} exited with error {err.value}",
                       worker_id=worker.id,
                       err=err)
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_guest_worker_client_factory(
        worker_config, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[worker_id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        u'id': worker_id,
        u'status': worker.status,
        u'created': utcstr(worker.created),
        u'who': worker.who,
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic,
                 starting_info,
                 options=PublishOptions(exclude=details.caller))

    # now actually fork the worker ..
    #
    self.log.info('{worker_logname} "{worker_id}" process starting ..',
                  worker_logname=worker_logname,
                  worker_id=worker_id)
    self.log.debug(
        '{worker_logname} "{worker_id}" process using command line "{cli}" ..',
        worker_logname=worker_logname,
        worker_id=worker_id,
        cli=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will
        # errback - probably only if the forking of a new process fails
        # at OS level due to out of memory conditions or such.
        self.log.debug('{worker_logname} "{worker_id}" connected',
                       worker_logname=worker_logname,
                       worker_id=worker_id)

        # do not comment this: it will lead to on_worker_started being called
        # _before_ on_worker_connected, and we don't need it!
        # worker.on_worker_connected(proto)

    def on_connect_error(err):
        # not sure when this errback is triggered at all .. see above.
        self.log.failure(
            "Internal error: connection to forked guest worker failed ({log_failure.value})",
        )

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_native_worker(self, wtype, id, options=None, details=None):
    """
    Start a new native worker process (router or container) attached to this node.

    The worker is forked as a child Python process running
    ``crossbar/worker/process.py``, tracked in ``self._workers`` under ``id``,
    and controlled over a WAMP client transport.

    :param wtype: Worker type - one of ``'router'`` or ``'container'``.
    :param id: ID under which to track the new worker; starting a second
        worker under an already-used ID raises an error.
    :param options: Worker options dict (keys used here: ``python``, ``title``,
        ``debug``, ``reactor``, ``traceback``); may be ``None``.
    :param details: WAMP call details (``details.authid`` identifies the
        starter; ``details.progress``/``details.caller`` drive result delivery).
    :returns: Deferred (``worker.ready``) that fires with the started-worker
        info dict once the worker signals readiness.
    :raises ApplicationError: ``crossbar.error.worker_already_running``,
        ``crossbar.error.invalid_configuration`` or
        ``crossbar.error.cannot_start``.
    """
    assert (wtype in ['router', 'container'])

    ## prohibit starting a worker twice
    ##
    if id in self._workers:
        emsg = "ERROR: could not start worker - a worker with ID '{}'' is already running (or starting)".format(id)
        log.msg(emsg)
        raise ApplicationError('crossbar.error.worker_already_running', emsg)

    ## check worker options
    ##
    options = options or {}
    try:
        if wtype == 'router':
            checkconfig.check_router_options(options)
        elif wtype == 'container':
            checkconfig.check_container_options(options)
        else:
            raise Exception("logic error")
    except Exception as e:
        emsg = "ERROR: could not start native worker - invalid configuration ({})".format(e)
        log.msg(emsg)
        raise ApplicationError('crossbar.error.invalid_configuration', emsg)

    ## allow override Python executable from options
    ##
    if 'python' in options:
        exe = options['python']

        ## the executable must be an absolute path, e.g. /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        ##
        if not os.path.isabs(exe):
            emsg = "ERROR: python '{}' from worker options must be an absolute path".format(exe)
            log.msg(emsg)
            raise ApplicationError('crossbar.error.invalid_configuration', emsg)

        ## of course the path must exist and actually be executable
        ##
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "ERROR: python '{}' from worker options does not exist or isn't an executable".format(exe)
            log.msg(emsg)
            raise ApplicationError('crossbar.error.invalid_configuration', emsg)
    else:
        exe = sys.executable

    ## all native workers (routers and containers for now) start from the same script
    ##
    filename = pkg_resources.resource_filename('crossbar', 'worker/process.py')

    ## assemble command line for forking the worker
    ##
    args = [exe, "-u", filename]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--node", str(self._node_id)])
    args.extend(["--worker", str(id)])
    args.extend(["--realm", self._realm])
    args.extend(["--type", wtype])

    ## allow override worker process title from options
    ##
    if options.get('title', None):
        args.extend(['--title', options['title']])

    ## allow overriding debug flag from options
    ##
    if options.get('debug', self.debug):
        args.append('--debug')

    ## forward explicit reactor selection
    ##
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    elif self._node.options.reactor:
        args.extend(['--reactor', self._node.options.reactor])

    ## create worker process environment
    ##
    worker_env = create_process_env(options)

    ## log name of worker
    ##
    worker_logname = {
        'router': 'Router',
        'container': 'Container'
    }.get(wtype, 'Worker')

    ## topic URIs used (later)
    ##
    if wtype == 'router':
        starting_topic = 'crossbar.node.{}.on_router_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_router_started'.format(self._node_id)
    elif wtype == 'container':
        starting_topic = 'crossbar.node.{}.on_container_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_container_started'.format(self._node_id)
    else:
        raise Exception("logic error")

    ## add worker tracking instance to the worker map ..
    ##
    if wtype == 'router':
        worker = RouterWorkerProcess(self, id, details.authid,
                                     keeplog=options.get('traceback', None))
    elif wtype == 'container':
        worker = ContainerWorkerProcess(self, id, details.authid,
                                        keeplog=options.get('traceback', None))
    else:
        raise Exception("logic error")

    self._workers[id] = worker

    ## create a (custom) process endpoint
    ##
    ep = WorkerProcessEndpoint(self._node._reactor, exe, args,
                               env=worker_env, worker=worker)

    ## ready handling
    ##
    def on_ready_success(id):
        # fired (with the worker id) once the worker announces readiness
        log.msg("{} with ID '{}' and PID {} started".format(
            worker_logname, worker.id, worker.pid))
        worker.status = 'started'
        worker.started = datetime.utcnow()

        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }
        self.publish(started_topic, started_info,
                     options=PublishOptions(exclude=[details.caller]))
        return started_info

    def on_ready_error(err):
        # the worker never became ready: drop it from tracking and re-raise
        del self._workers[worker.id]
        emsg = 'ERROR: failed to start native worker - {}'.format(err.value)
        log.msg(emsg)
        raise ApplicationError("crossbar.error.cannot_start", emsg,
                               worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        del self._workers[worker.id]

    def on_exit_error(err):
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    ## create a transport factory for talking WAMP to the native worker
    ##
    transport_factory = create_native_worker_client_factory(
        self._node._router_session_factory, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    ## now (immediately before actually forking) signal the starting of the worker
    ##
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    ## the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    ## .. while all others get an event
    self.publish(starting_topic, starting_info,
                 options=PublishOptions(exclude=[details.caller]))

    ## now actually fork the worker ..
    ##
    if self.debug:
        log.msg("Starting {} with ID '{}' using command line '{}' ..".format(
            worker_logname, id, ' '.join(args)))
    else:
        log.msg("Starting {} with ID '{}' ..".format(worker_logname, id))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        ## this seems to be called immediately when the child process
        ## has been forked. even if it then immediately fails because
        ## e.g. the executable doesn't even exist. in other words,
        ## I'm not sure under what conditions the deferred will errback ..
        pid = proto.transport.pid
        if self.debug:
            log.msg("Native worker process connected with PID {}".format(pid))

        ## note the PID of the worker
        worker.pid = pid

        ## proto is an instance of NativeWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        ## not sure when this errback is triggered at all ..
        if self.debug:
            log.msg("ERROR: Connecting forked native worker failed - {}".format(err))

        ## in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_native_worker(self, worker_type, worker_id, worker_options=None, details=None):
    """
    Start a new native worker process of the given (registry-defined) type.

    Valid worker types, their config checkers, worker classes, log names and
    event topic URIs are looked up in ``self._node._native_workers``. The
    worker is forked via ``python -m crossbar.worker.process``, tracked in
    ``self._workers`` under ``worker_id``, given a dedicated WAMP auth role
    ``crossbar.worker.<worker_id>``, and may trigger node shutdown on exit
    depending on ``self._node._node_shutdown_triggers``.

    :param worker_type: Key into ``self._node._native_workers``.
    :param worker_id: ID under which to track the new worker; must not
        already be running.
    :param worker_options: Worker options dict (keys used here: ``python``,
        ``pythonpath``, ``shutdown``, ``title``, ``reactor``, ``traceback``);
        may be ``None``.
    :param details: WAMP call details (``details.caller`` identifies the
        starter; ``details.progress`` delivers the progressive result).
    :returns: Deferred (``worker.ready``) that fires with the started-worker
        info dict once the worker signals readiness.
    :raises ApplicationError: ``crossbar.error.worker_already_running``,
        ``crossbar.error.invalid_configuration`` or
        ``crossbar.error.cannot_start``.
    """
    # prohibit starting a worker twice
    #
    if worker_id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(worker_id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running', emsg)

    # check worker options
    #
    options = worker_options or {}
    try:
        if worker_type in self._node._native_workers:
            if self._node._native_workers[worker_type]['checkconfig_options']:
                self._node._native_workers[worker_type]['checkconfig_options'](options)
            else:
                raise Exception('No checkconfig_options for worker type "{worker_type}" implemented!'.format(worker_type=worker_type))
        else:
            raise Exception('invalid worker type "{}"'.format(worker_type))
    except Exception as e:
        emsg = "Could not start native worker: invalid configuration ({})".format(e)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

    # the fully qualified worker class as a string
    worker_class = qual(self._node._native_workers[worker_type]['worker_class'])

    # allow override Python executable from options
    #
    if 'python' in options:
        exe = options['python']

        # the executable must be an absolute path, e.g. /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        #
        if not os.path.isabs(exe):
            emsg = "Invalid worker configuration: python executable '{}' must be an absolute path".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

        # of course the path must exist and actually be executable
        #
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "Invalid worker configuration: python executable '{}' does not exist or isn't an executable".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)
    else:
        exe = sys.executable

    # allow override default Python module search paths from options
    #
    if 'pythonpath' in options:
        pythonpaths_to_add = [
            os.path.abspath(os.path.join(self._node._cbdir, p))
            for p in options.get('pythonpath', [])
        ]
    else:
        pythonpaths_to_add = []

    # assemble command line for forking the worker
    #
    # all native workers (routers and containers for now) start
    # from the same script in crossbar/worker/process.py -- we're
    # invoking via "-m" so that .pyc files, __pycache__ etc work
    # properly.
    #
    args = [exe, "-u", "-m", "crossbar.worker.process"]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--worker", str(worker_id)])
    args.extend(["--realm", self._realm])
    args.extend(["--klass", worker_class])
    args.extend(["--loglevel", get_global_log_level()])
    if "shutdown" in options:
        args.extend(["--shutdown", options["shutdown"]])

    # Node-level callback to inject worker arguments
    #
    self._node._extend_worker_args(args, options)

    # allow override worker process title from options
    #
    if options.get('title', None):
        args.extend(['--title', options['title']])

    # forward explicit reactor selection
    #
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    # FIXME
    # elif self._node.options.reactor:
    #     args.extend(['--reactor', self._node.options.reactor])

    # create worker process environment
    #
    worker_env = create_process_env(options)

    # We need to use the same PYTHONPATH we were started with, so we can
    # find the Crossbar we're working with -- it may not be the same as the
    # one on the default path
    worker_env["PYTHONPATH"] = os.pathsep.join(pythonpaths_to_add + sys.path)

    # log name of worker
    #
    worker_logname = self._node._native_workers[worker_type]['logname']

    # each worker is run under its own dedicated WAMP auth role
    #
    worker_auth_role = u'crossbar.worker.{}'.format(worker_id)

    # topic URIs used (later)
    #
    starting_topic = self._node._native_workers[worker_type]['topics']['starting']
    started_topic = self._node._native_workers[worker_type]['topics']['started']

    # add worker tracking instance to the worker map ..
    #
    WORKER = self._node._native_workers[worker_type]['class']
    worker = WORKER(self, worker_id, details.caller,
                    keeplog=options.get('traceback', None))
    self._workers[worker_id] = worker

    # create a (custom) process endpoint.
    #
    if platform.isWindows():
        childFDs = None  # Use the default Twisted ones
    else:
        # The communication between controller and container workers is
        # using WAMP running over 2 pipes.
        # For controller->container traffic this runs over FD 0 (`stdin`)
        # and for the container->controller traffic, this runs over FD 3.
        #
        # Note: We use FD 3, not FD 1 (`stdout`) or FD 2 (`stderr`) for
        # container->controller traffic, so that components running in the
        # container which happen to write to `stdout` or `stderr` do not
        # interfere with the container-controller communication.
        childFDs = {0: "w", 1: "r", 2: "r", 3: "r"}

    ep = WorkerProcessEndpoint(self._node._reactor, exe, args,
                               env=worker_env, worker=worker,
                               childFDs=childFDs)

    # ready handling
    #
    def on_ready_success(worker_id):
        # NOTE(review): this parameter shadows the outer `worker_id`; both
        # presumably hold the same value (the callback fires with the worker
        # id) -- confirm before relying on either inside this closure.
        self.log.info('{worker_type} worker "{worker_id}" process {pid} started',
                      worker_type=worker_logname, worker_id=worker.id, pid=worker.pid)

        self._node._reactor.addSystemEventTrigger(
            'before', 'shutdown',
            self._cleanup_worker, self._node._reactor, worker,
        )

        worker.on_worker_started()

        started_info = {
            u'id': worker.id,
            u'status': worker.status,
            u'started': utcstr(worker.started),
            u'who': worker.who,
        }

        # FIXME: make start of stats printer dependent on log level ..
        if False:
            worker.log_stats(5.)

        self.publish(started_topic, started_info,
                     options=PublishOptions(exclude=details.caller))

        return started_info

    def on_ready_error(err):
        # worker never became ready: untrack it and surface the failure
        del self._workers[worker.id]
        emsg = 'Failed to start native worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg,
                               worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(_):
        self.log.info("Node worker {worker.id} ended successfully",
                      worker=worker)

        # clear worker log
        worker.log_stats(0)

        # remove the dedicated node router authrole we dynamically
        # added for the worker
        self._node._drop_worker_role(worker_auth_role)

        # remove our metadata tracking for the worker
        del self._workers[worker.id]

        # indicate that the worker exited successfully
        return True

    def on_exit_error(err):
        self.log.info("Node worker {worker.id} ended with error ({err})",
                      worker=worker, err=err)

        # clear worker log
        worker.log_stats(0)

        # remove the dedicated node router authrole we dynamically
        # added for the worker
        self._node._drop_worker_role(worker_auth_role)

        # remove our metadata tracking for the worker
        del self._workers[worker.id]

        # indicate that the worker exited with error
        return False

    def check_for_shutdown(was_successful):
        # decide -- based on the configured node shutdown triggers -- whether
        # this worker exit should bring the whole node down
        self.log.info('Checking for node shutdown: worker_exit_success={worker_exit_success}, shutdown_requested={shutdown_requested}, node_shutdown_triggers={node_shutdown_triggers}',
                      worker_exit_success=was_successful,
                      shutdown_requested=self._shutdown_requested,
                      node_shutdown_triggers=self._node._node_shutdown_triggers)

        shutdown = self._shutdown_requested

        # automatically shutdown node whenever a worker ended (successfully, or with error)
        #
        if checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info("Node worker ended, and trigger '{trigger}' active",
                          trigger=checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT)
            shutdown = True

        # automatically shutdown node when worker ended with error
        #
        if not was_successful and checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR in self._node._node_shutdown_triggers:
            self.log.info("Node worker ended with error, and trigger '{trigger}' active",
                          trigger=checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR)
            shutdown = True

        # automatically shutdown node when no more workers are left
        #
        if len(self._workers) == 0 and checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info("No more node workers running, and trigger '{trigger}' active",
                          trigger=checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT)
            shutdown = True

        # initiate shutdown (but only if we are not already shutting down)
        #
        if shutdown:
            self.shutdown()
        else:
            self.log.info('Node will continue to run!')

    d_on_exit = worker.exit.addCallbacks(on_exit_success, on_exit_error)
    d_on_exit.addBoth(check_for_shutdown)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_native_worker_client_factory(
        self._node._router_session_factory, worker_auth_role,
        worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[worker_id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        u'id': worker_id,
        u'status': worker.status,
        u'created': utcstr(worker.created),
        u'who': worker.who,
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info,
                 options=PublishOptions(exclude=details.caller))

    # now actually fork the worker ..
    #
    self.log.info('{worker_logname} worker "{worker_id}" starting ..',
                  worker_logname=worker_logname, worker_id=worker_id)
    self.log.debug('{worker_logname} "{worker_id}" command line is "{cmdline}"',
                   worker_logname=worker_logname, worker_id=worker_id,
                   cmdline=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will errback ..
        self.log.debug('Native worker "{worker_id}" connected',
                       worker_id=worker_id)

        worker.on_worker_connected(proto)

        # dynamically add a dedicated authrole to the router
        # for the worker we've just started
        self._node._add_worker_role(worker_auth_role, options)

    def on_connect_error(err):
        # not sure when this errback is triggered at all ..
        # NOTE(review): "Interal" is a typo in this runtime log message; left
        # unchanged here since this edit only touches comments.
        self.log.error("Interal error: connection to forked native worker failed ({err})", err=err)

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_native_worker(self, wtype, id, options=None, details=None):
    """
    Start a new native worker process (router, container or
    websocket-testee) attached to this node.

    The worker is forked as a child Python process running
    ``crossbar/worker/process.py``, tracked in ``self._workers`` under
    ``id``, and controlled over a WAMP client transport using dedicated
    pipe file descriptors (on non-Windows platforms).

    :param wtype: Worker type - one of ``'router'``, ``'container'`` or
        ``'websocket-testee'``.
    :param id: ID under which to track the new worker; must not already
        be running.
    :param options: Worker options dict (keys used here: ``python``,
        ``title``, ``reactor``, ``traceback``); may be ``None``.
    :param details: WAMP call details (``details.caller`` identifies the
        starter; ``details.progress`` delivers the progressive result).
    :returns: Deferred (``worker.ready``) that fires with the
        started-worker info dict once the worker signals readiness.
    :raises ApplicationError: ``crossbar.error.worker_already_running``,
        ``crossbar.error.invalid_configuration`` or
        ``crossbar.error.cannot_start``.
    """
    assert(wtype in ['router', 'container', 'websocket-testee'])

    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running', emsg)

    # check worker options
    #
    options = options or {}
    try:
        if wtype == 'router':
            checkconfig.check_router_options(options)
        elif wtype == 'container':
            checkconfig.check_container_options(options)
        elif wtype == 'websocket-testee':
            checkconfig.check_websocket_testee_options(options)
        else:
            raise Exception("logic error")
    except Exception as e:
        emsg = "Could not start native worker: invalid configuration ({})".format(e)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

    # allow override Python executable from options
    #
    if 'python' in options:
        exe = options['python']

        # the executable must be an absolute path, e.g. /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        #
        if not os.path.isabs(exe):
            emsg = "Invalid worker configuration: python executable '{}' must be an absolute path".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

        # of course the path must exist and actually be executable
        #
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "Invalid worker configuration: python executable '{}' does not exist or isn't an executable".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)
    else:
        exe = sys.executable

    # all native workers (routers and containers for now) start from the same script
    #
    filename = FilePath(crossbar.__file__).parent().child("worker").child("process.py").path

    # assemble command line for forking the worker
    #
    args = [exe, "-u", filename]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--node", str(self._node_id)])
    args.extend(["--worker", str(id)])
    args.extend(["--realm", self._realm])
    args.extend(["--type", wtype])
    args.extend(["--loglevel", _loglevel])

    # allow override worker process title from options
    #
    if options.get('title', None):
        args.extend(['--title', options['title']])

    # forward explicit reactor selection
    #
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    elif self._node.options.reactor:
        args.extend(['--reactor', self._node.options.reactor])

    # create worker process environment
    #
    worker_env = create_process_env(options)

    # We need to use the same PYTHONPATH we were started with, so we can
    # find the Crossbar we're working with -- it may not be the same as the
    # one on the default path
    worker_env["PYTHONPATH"] = os.pathsep.join(sys.path)

    # log name of worker
    #
    worker_logname = {
        'router': 'Router',
        'container': 'Container',
        'websocket-testee': 'WebSocketTestee'
    }.get(wtype, 'Worker')

    # topic URIs used (later)
    #
    if wtype == 'router':
        starting_topic = 'crossbar.node.{}.on_router_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_router_started'.format(self._node_id)
    elif wtype == 'container':
        starting_topic = 'crossbar.node.{}.on_container_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_container_started'.format(self._node_id)
    elif wtype == 'websocket-testee':
        starting_topic = 'crossbar.node.{}.on_websocket_testee_starting'.format(self._node_id)
        started_topic = 'crossbar.node.{}.on_websocket_testee_started'.format(self._node_id)
    else:
        raise Exception("logic error")

    # add worker tracking instance to the worker map ..
    #
    if wtype == 'router':
        worker = RouterWorkerProcess(self, id, details.caller,
                                     keeplog=options.get('traceback', None))
    elif wtype == 'container':
        worker = ContainerWorkerProcess(self, id, details.caller,
                                        keeplog=options.get('traceback', None))
    elif wtype == 'websocket-testee':
        worker = WebSocketTesteeWorkerProcess(self, id, details.caller,
                                              keeplog=options.get('traceback', None))
    else:
        raise Exception("logic error")

    self._workers[id] = worker

    # create a (custom) process endpoint.
    #
    if platform.isWindows():
        childFDs = None  # Use the default Twisted ones
    else:
        # The communication between controller and container workers is
        # using WAMP running over 2 pipes.
        # For controller->container traffic this runs over FD 0 (`stdin`)
        # and for the container->controller traffic, this runs over FD 3.
        #
        # Note: We use FD 3, not FD 1 (`stdout`) or FD 2 (`stderr`) for
        # container->controller traffic, so that components running in the
        # container which happen to write to `stdout` or `stderr` do not
        # interfere with the container-controller communication.
        childFDs = {0: "w", 1: "r", 2: "r", 3: "r"}

    ep = WorkerProcessEndpoint(
        self._node._reactor, exe, args, env=worker_env, worker=worker,
        childFDs=childFDs)

    # ready handling
    #
    def on_ready_success(id):
        self.log.info("{worker} with ID '{id}' and PID {pid} started",
                      worker=worker_logname, id=worker.id, pid=worker.pid)

        self._node._reactor.addSystemEventTrigger(
            'before', 'shutdown',
            self._cleanup_worker, self._node._reactor, worker,
        )

        worker.status = 'started'
        worker.started = datetime.utcnow()

        started_info = {
            'id': worker.id,
            'status': worker.status,
            'started': utcstr(worker.started),
            'who': worker.who
        }

        # FIXME: make start of stats printer dependent on log level ..
        worker.log_stats(5.)

        self.publish(started_topic, started_info,
                     options=PublishOptions(exclude=[details.caller]))

        return started_info

    def on_ready_error(err):
        # worker never became ready: untrack it and surface the failure
        del self._workers[worker.id]
        emsg = 'Failed to start native worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg,
                               worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        worker.log_stats(0)
        del self._workers[worker.id]
        return worker.id

    def on_exit_error(err):
        worker.log_stats(0)
        del self._workers[worker.id]
        return worker.id

    def check_for_shutdown(worker_id):
        # NOTE(review): `shutdown` starts out True, which makes the
        # `if not self._workers` re-assignment below dead code -- as written,
        # the node always initiates shutdown when any worker ends. Looks
        # suspicious (the first line may have been meant to be False);
        # behavior left unchanged here.
        shutdown = True
        if not self._workers:
            shutdown = True

        self.log.info("Node worker {} ended ({} workers left)".format(
            worker_id, len(self._workers)))

        if shutdown:
            if not self._shutdown_requested:
                self.log.info("Node shutting down ..")
                self._shutdown_requested = True
                self.shutdown()
            else:
                # shutdown already initiated
                pass

    d_on_exit = worker.exit.addCallbacks(on_exit_success, on_exit_error)
    d_on_exit.addBoth(check_for_shutdown)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_native_worker_client_factory(
        self._node._router_session_factory, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        'id': id,
        'status': worker.status,
        'created': utcstr(worker.created),
        'who': worker.who
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info,
                 options=PublishOptions(exclude=[details.caller]))

    # now actually fork the worker ..
    #
    self.log.info("Starting {worker} with ID '{id}'...",
                  worker=worker_logname, id=id)
    self.log.debug("{worker} '{id}' command line is '{cmdline}'",
                   worker=worker_logname, id=id, cmdline=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will errback ..
        pid = proto.transport.pid
        self.log.debug("Native worker process connected with PID {pid}",
                       pid=pid)

        # note the PID of the worker
        worker.pid = pid

        # proto is an instance of NativeWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all ..
        # NOTE(review): "Interal" is a typo in this runtime log message; left
        # unchanged here since this edit only touches comments.
        self.log.error("Interal error: connection to forked native worker failed ({err})", err=err)

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def start_guest(self, id, config, details=None):
    """
    Start a new guest process on this node.

    The guest executable is resolved either relative to the working
    directory or from the environment (``shutil.which``), forked via a
    worker process endpoint, tracked in ``self._workers`` under ``id``,
    and optionally restarted on filesystem changes when a ``watch``
    option is configured.

    :param config: The guest process configuration.
    :type config: obj

    :returns: int -- The PID of the new process.

    .. note:: (review) despite the docstring above, the code actually
       returns the ``worker.ready`` Deferred, which fires with the
       started-worker info dict -- confirm which contract callers expect.
    """
    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running', emsg)

    try:
        checkconfig.check_guest(config)
    except Exception as e:
        raise ApplicationError(u'crossbar.error.invalid_configuration',
                               'invalid guest worker configuration: {}'.format(e))

    options = config.get('options', {})

    # guest process working directory
    #
    workdir = self._node._cbdir
    if 'workdir' in options:
        workdir = os.path.join(workdir, options['workdir'])
    workdir = os.path.abspath(workdir)

    # guest process executable and command line arguments
    #
    # first try to configure the fully qualified path for the guest
    # executable by joining workdir and configured executable ..
    exe = os.path.abspath(os.path.join(workdir, config['executable']))

    if check_executable(exe):
        self.log.info("Using guest worker executable '{exe}' (executable path taken from configuration)",
                      exe=exe)
    else:
        # try to detect the fully qualified path for the guest
        # executable by doing a "which" on the configured executable name
        exe = shutil.which(config['executable'])
        if exe is not None and check_executable(exe):
            self.log.info("Using guest worker executable '{exe}' (executable path detected from environment)",
                          exe=exe)
        else:
            # NOTE(review): "find and executable" is a typo ("an") in this
            # runtime error message; left unchanged since this edit only
            # touches comments.
            emsg = "Could not start worker: could not find and executable for '{}'".format(config['executable'])
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

    # guest process command line arguments
    #
    args = [exe]
    args.extend(config.get('arguments', []))

    # guest process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = 'Guest'

    # topic URIs used (later)
    #
    starting_topic = 'crossbar.node.{}.on_guest_starting'.format(self._node_id)
    started_topic = 'crossbar.node.{}.on_guest_started'.format(self._node_id)

    # add worker tracking instance to the worker map ..
    #
    worker = GuestWorkerProcess(self, id, details.caller,
                                keeplog=options.get('traceback', None))
    self._workers[id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor, exe, args,
                               path=workdir, env=worker_env, worker=worker)

    # ready handling
    #
    def on_ready_success(proto):
        worker.pid = proto.transport.pid
        worker.status = 'started'
        worker.started = datetime.utcnow()

        self.log.info("{worker} with ID '{id}' and PID {pid} started",
                      worker=worker_logname, id=worker.id, pid=worker.pid)

        self._node._reactor.addSystemEventTrigger(
            'before', 'shutdown',
            self._cleanup_worker, self._node._reactor, worker,
        )

        # directory watcher
        #
        if 'watch' in options:

            if HAS_FS_WATCHER:
                # assemble list of watched directories
                watched_dirs = []
                for d in options['watch'].get('directories', []):
                    watched_dirs.append(os.path.abspath(os.path.join(self._node._cbdir, d)))

                worker.watch_timeout = options['watch'].get('timeout', 1)

                # create a filesystem watcher
                worker.watcher = FilesystemWatcher(workdir, watched_dirs=watched_dirs)

                # make sure to stop the watch upon Twisted being shut down
                def on_shutdown():
                    worker.watcher.stop()

                self._node._reactor.addSystemEventTrigger('before', 'shutdown', on_shutdown)

                # this handler will get fired by the watcher upon detecting an FS event
                def on_filesystem_change(fs_event):
                    worker.watcher.stop()
                    proto.signal('TERM')

                    if options['watch'].get('action', None) == 'restart':
                        self.log.info("Filesystem watcher detected change {fs_event} - restarting guest in {watch_timeout} seconds ..",
                                      fs_event=fs_event, watch_timeout=worker.watch_timeout)
                        # Add a timeout large enough (perhaps add a config option later)
                        self._node._reactor.callLater(worker.watch_timeout,
                                                      self.start_guest, id, config, details)
                        # Shut the worker down, after the restart event is scheduled
                        # FIXME: all workers should have a stop() method ..
                        # -> 'GuestWorkerProcess' object has no attribute 'stop'
                        # worker.stop()
                    else:
                        self.log.info("Filesystem watcher detected change {fs_event} - no action taken!",
                                      fs_event=fs_event)

                # now start watching ..
                worker.watcher.start(on_filesystem_change)
            else:
                self.log.warn("Cannot watch directories for changes - feature not available")

        # assemble guest worker startup information
        #
        started_info = {
            u'id': worker.id,
            u'status': worker.status,
            u'started': utcstr(worker.started),
            u'who': worker.who,
        }

        self.publish(started_topic, started_info,
                     options=PublishOptions(exclude=details.caller))

        return started_info

    def on_ready_error(err):
        # guest never became ready: untrack it and surface the failure
        del self._workers[worker.id]

        emsg = 'Failed to start guest worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg, ep.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        self.log.info("Guest {id} exited with success", id=worker.id)
        del self._workers[worker.id]

    def on_exit_error(err):
        self.log.error("Guest {id} exited with error {err.value}",
                       id=worker.id, err=err)
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_guest_worker_client_factory(config, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        u'id': id,
        u'status': worker.status,
        u'created': utcstr(worker.created),
        u'who': worker.who,
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info,
                 options=PublishOptions(exclude=details.caller))

    # now actually fork the worker ..
    #
    self.log.info("Starting {worker} with ID '{id}'...",
                  worker=worker_logname, id=id)
    self.log.debug("{worker} '{id}' using command line '{cli}'...",
                   worker=worker_logname, id=id, cli=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will
        # errback - probably only if the forking of a new process fails
        # at OS level due to out of memory conditions or such.
        pid = proto.transport.pid
        self.log.debug("Guest worker process connected with PID {pid}", pid=pid)

        worker.pid = pid

        # proto is an instance of GuestWorkerClientProtocol
        worker.proto = proto

        worker.status = 'connected'
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all .. see above.
        self.log.failure(
            "Internal error: connection to forked guest worker failed ({log_failure.value})",
        )

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def _start_native_worker(self, worker_type, worker_id, worker_options=None, details=None):
    """
    Start a native worker process (of a type registered in
    ``self._node._native_workers``, e.g. router or container).

    The worker is forked as a child Python process running
    ``crossbar.worker.process``, talks WAMP back to the node controller
    over a dedicated pipe-based transport, and is tracked in
    ``self._workers`` under ``worker_id``.

    :param worker_type: Type key of the native worker to start; must be
        registered in ``self._node._native_workers``.
    :param worker_id: ID to assign to the new worker (must not be running yet).
    :param worker_options: Optional worker configuration options.
    :param details: WAMP call details (caller, progress callback).

    :returns: Deferred that fires with the worker startup information dict
        once the worker has signaled readiness.
    """
    # prohibit starting a worker twice
    #
    if worker_id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(worker_id)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.worker_already_running', emsg)

    # check worker options
    #
    options = worker_options or {}
    try:
        if worker_type in self._node._native_workers:
            if self._node._native_workers[worker_type]['checkconfig_options']:
                self._node._native_workers[worker_type]['checkconfig_options'](options)
            else:
                raise Exception('No checkconfig_options for worker type "{worker_type}" implemented!'.format(worker_type=worker_type))
        else:
            raise Exception('invalid worker type "{}"'.format(worker_type))
    except Exception as e:
        emsg = "Could not start native worker: invalid configuration ({})".format(e)
        self.log.error(emsg)
        raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

    # the fully qualified worker class as a string
    worker_class = qual(self._node._native_workers[worker_type]['worker_class'])

    # allow override Python executable from options
    #
    if 'python' in options:
        exe = options['python']

        # the executable must be an absolute path, e.g. /home/oberstet/pypy-2.2.1-linux64/bin/pypy
        #
        if not os.path.isabs(exe):
            emsg = "Invalid worker configuration: python executable '{}' must be an absolute path".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)

        # of course the path must exist and actually be executable
        #
        if not (os.path.isfile(exe) and os.access(exe, os.X_OK)):
            emsg = "Invalid worker configuration: python executable '{}' does not exist or isn't an executable".format(exe)
            self.log.error(emsg)
            raise ApplicationError(u'crossbar.error.invalid_configuration', emsg)
    else:
        exe = sys.executable

    # allow override default Python module search paths from options
    #
    if 'pythonpath' in options:
        pythonpaths_to_add = [os.path.abspath(os.path.join(self._node._cbdir, p)) for p in options.get('pythonpath', [])]
    else:
        pythonpaths_to_add = []

    # assemble command line for forking the worker
    #
    # all native workers (routers and containers for now) start
    # from the same script in crossbar/worker/process.py -- we're
    # invoking via "-m" so that .pyc files, __pycache__ etc work
    # properly.
    #
    args = [exe, "-u", "-m", "crossbar.worker.process"]
    args.extend(["--cbdir", self._node._cbdir])
    args.extend(["--node", str(self._node._node_id)])
    args.extend(["--worker", str(worker_id)])
    args.extend(["--realm", self._realm])
    args.extend(["--klass", worker_class])
    args.extend(["--loglevel", get_global_log_level()])
    if "shutdown" in options:
        args.extend(["--shutdown", options["shutdown"]])

    # Node-level callback to inject worker arguments
    #
    self._node._extend_worker_args(args, options)

    # allow override worker process title from options
    #
    if options.get('title', None):
        args.extend(['--title', options['title']])

    # forward explicit reactor selection
    #
    if 'reactor' in options and sys.platform in options['reactor']:
        args.extend(['--reactor', options['reactor'][sys.platform]])
    # FIXME
    # elif self._node.options.reactor:
    #     args.extend(['--reactor', self._node.options.reactor])

    # create worker process environment
    #
    worker_env = create_process_env(options)

    # We need to use the same PYTHONPATH we were started with, so we can
    # find the Crossbar we're working with -- it may not be the same as the
    # one on the default path
    worker_env["PYTHONPATH"] = os.pathsep.join(pythonpaths_to_add + sys.path)

    # log name of worker
    #
    worker_logname = self._node._native_workers[worker_type]['logname']

    # each worker is run under its own dedicated WAMP auth role
    #
    worker_auth_role = u'crossbar.worker.{}'.format(worker_id)

    # topic URIs used (later)
    #
    starting_topic = self._node._native_workers[worker_type]['topics']['starting']
    started_topic = self._node._native_workers[worker_type]['topics']['started']

    # add worker tracking instance to the worker map ..
    #
    WORKER = self._node._native_workers[worker_type]['class']
    worker = WORKER(self, worker_id, details.caller, keeplog=options.get('traceback', None))
    self._workers[worker_id] = worker

    # create a (custom) process endpoint.
    #
    if platform.isWindows():
        childFDs = None  # Use the default Twisted ones
    else:
        # The communication between controller and container workers is
        # using WAMP running over 2 pipes.
        # For controller->container traffic this runs over FD 0 (`stdin`)
        # and for the container->controller traffic, this runs over FD 3.
        #
        # Note: We use FD 3, not FD 1 (`stdout`) or FD 2 (`stderr`) for
        # container->controller traffic, so that components running in the
        # container which happen to write to `stdout` or `stderr` do not
        # interfere with the container-controller communication.
        childFDs = {0: "w", 1: "r", 2: "r", 3: "r"}

    ep = WorkerProcessEndpoint(
        self._node._reactor, exe, args, env=worker_env, worker=worker, childFDs=childFDs)

    # ready handling
    #
    def on_ready_success(worker_id):
        # fired when the worker has signaled readiness over WAMP
        self.log.info('{worker_type} worker "{worker_id}" process {pid} started',
                      worker_type=worker_logname, worker_id=worker.id, pid=worker.pid)
        self._node._reactor.addSystemEventTrigger(
            'before', 'shutdown',
            self._cleanup_worker, self._node._reactor, worker,
        )
        worker.on_worker_started()

        started_info = {
            u'id': worker.id,
            u'status': worker.status,
            u'started': utcstr(worker.started),
            u'who': worker.who,
        }

        # FIXME: make start of stats printer dependent on log level ..
        if False:
            worker.log_stats(5.)

        self.publish(started_topic, started_info, options=PublishOptions(exclude=details.caller))

        return started_info

    def on_ready_error(err):
        del self._workers[worker.id]
        emsg = 'Failed to start native worker: {}'.format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg, worker.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(_):
        self.log.info("Node worker {worker.id} ended successfully", worker=worker)

        # clear worker log
        worker.log_stats(0)

        # remove the dedicated node router authrole we dynamically
        # added for the worker
        self._node._drop_worker_role(worker_auth_role)

        # remove our metadata tracking for the worker
        del self._workers[worker.id]

        # indicate that the worker exited successfully
        return True

    def on_exit_error(err):
        self.log.info("Node worker {worker.id} ended with error ({err})", worker=worker, err=err)

        # clear worker log
        worker.log_stats(0)

        # remove the dedicated node router authrole we dynamically
        # added for the worker
        self._node._drop_worker_role(worker_auth_role)

        # remove our metadata tracking for the worker
        del self._workers[worker.id]

        # indicate that the worker exited with error
        return False

    def check_for_shutdown(was_successful):
        # decide whether the whole node should shut down now that a
        # worker has ended, based on the configured shutdown triggers
        self.log.info('Checking for node shutdown: worker_exit_success={worker_exit_success}, shutdown_requested={shutdown_requested}, node_shutdown_triggers={node_shutdown_triggers}',
                      worker_exit_success=was_successful,
                      shutdown_requested=self._shutdown_requested,
                      node_shutdown_triggers=self._node._node_shutdown_triggers)

        shutdown = self._shutdown_requested

        # automatically shutdown node whenever a worker ended (successfully, or with error)
        #
        if checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info("Node worker ended, and trigger '{trigger}' active", trigger=checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT)
            shutdown = True

        # automatically shutdown node when worker ended with error
        #
        if not was_successful and checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR in self._node._node_shutdown_triggers:
            self.log.info("Node worker ended with error, and trigger '{trigger}' active", trigger=checkconfig.NODE_SHUTDOWN_ON_WORKER_EXIT_WITH_ERROR)
            shutdown = True

        # automatically shutdown node when no more workers are left
        #
        if len(self._workers) == 0 and checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT in self._node._node_shutdown_triggers:
            self.log.info("No more node workers running, and trigger '{trigger}' active", trigger=checkconfig.NODE_SHUTDOWN_ON_LAST_WORKER_EXIT)
            shutdown = True

        # initiate shutdown (but only if we are not already shutting down)
        #
        if shutdown:
            self.shutdown()
        else:
            self.log.info('Node will continue to run!')

    d_on_exit = worker.exit.addCallbacks(on_exit_success, on_exit_error)
    d_on_exit.addBoth(check_for_shutdown)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_native_worker_client_factory(self._node._router_session_factory, worker_auth_role, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[worker_id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {
        u'id': worker_id,
        u'status': worker.status,
        u'created': utcstr(worker.created),
        u'who': worker.who,
    }

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info, options=PublishOptions(exclude=details.caller))

    # now actually fork the worker ..
    #
    self.log.info('{worker_logname} worker "{worker_id}" starting ..',
                  worker_logname=worker_logname, worker_id=worker_id)
    self.log.debug('{worker_logname} "{worker_id}" command line is "{cmdline}"',
                   worker_logname=worker_logname, worker_id=worker_id, cmdline=' '.join(args))

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will errback ..
        self.log.debug('Native worker "{worker_id}" connected', worker_id=worker_id)
        worker.on_worker_connected(proto)

        # dynamically add a dedicated authrole to the router
        # for the worker we've just started
        self._node._add_worker_role(worker_auth_role, options)

    def on_connect_error(err):
        # not sure when this errback is triggered at all ..
        # NOTE: fixed typo in log message ("Interal" -> "Internal")
        self.log.error("Internal error: connection to forked native worker failed ({err})", err=err)

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready
def start_guest(self, id, config, details=None):
    """
    Start a new guest process on this node.

    The guest executable is resolved from the configured working
    directory first, then via a PATH lookup (``shutil.which``). The
    process is forked and tracked in ``self._workers`` under ``id``.

    :param id: ID to assign to the new guest worker (must not be running yet).
    :param config: The guest process configuration.
    :type config: obj
    :param details: WAMP call details (caller, progress callback).

    :returns: Deferred that fires with the guest worker startup
        information dict once the guest process has started.
        (Note: the previous docstring claimed an int PID was returned;
        the method actually returns ``worker.ready``.)
    """
    # prohibit starting a worker twice
    #
    if id in self._workers:
        emsg = "Could not start worker: a worker with ID '{}' is already running (or starting)".format(id)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.worker_already_running", emsg)

    try:
        checkconfig.check_guest(config)
    except Exception as e:
        raise ApplicationError(
            u"crossbar.error.invalid_configuration",
            "invalid guest worker configuration: {}".format(e)
        )

    options = config.get("options", {})

    # guest process working directory
    #
    workdir = self._node._cbdir
    if "workdir" in options:
        workdir = os.path.join(workdir, options["workdir"])
    workdir = os.path.abspath(workdir)

    # guest process executable and command line arguments
    #
    # first try to configure the fully qualified path for the guest
    # executable by joining workdir and configured executable ..
    exe = os.path.abspath(os.path.join(workdir, config["executable"]))

    if check_executable(exe):
        self.log.info("Using guest worker executable '{exe}' (executable path taken from configuration)", exe=exe)
    else:
        # try to detect the fully qualified path for the guest
        # executable by doing a "which" on the configured executable name
        exe = shutil.which(config["executable"])
        if exe is not None and check_executable(exe):
            self.log.info("Using guest worker executable '{exe}' (executable path detected from environment)", exe=exe)
        else:
            # NOTE: fixed typo in error message ("find and" -> "find an")
            emsg = "Could not start worker: could not find an executable for '{}'".format(config["executable"])
            self.log.error(emsg)
            raise ApplicationError(u"crossbar.error.invalid_configuration", emsg)

    # guest process command line arguments
    #
    args = [exe]
    args.extend(config.get("arguments", []))

    # guest process environment
    #
    worker_env = create_process_env(options)

    # log name of worker
    #
    worker_logname = "Guest"

    # topic URIs used (later)
    #
    starting_topic = "crossbar.node.{}.on_guest_starting".format(self._node_id)
    started_topic = "crossbar.node.{}.on_guest_started".format(self._node_id)

    # add worker tracking instance to the worker map ..
    #
    worker = GuestWorkerProcess(self, id, details.caller, keeplog=options.get("traceback", None))
    self._workers[id] = worker

    # create a (custom) process endpoint
    #
    ep = WorkerProcessEndpoint(self._node._reactor, exe, args, path=workdir, env=worker_env, worker=worker)

    # ready handling
    #
    def on_ready_success(proto):
        # fired when the guest worker client protocol is ready
        worker.pid = proto.transport.pid
        worker.status = "started"
        worker.started = datetime.utcnow()
        self.log.info(
            "{worker} with ID '{id}' and PID {pid} started",
            worker=worker_logname, id=worker.id, pid=worker.pid
        )
        self._node._reactor.addSystemEventTrigger(
            "before", "shutdown", self._cleanup_worker, self._node._reactor, worker
        )

        # directory watcher
        #
        if "watch" in options:
            if HAS_FSNOTIFY:
                # assemble list of watched directories
                watched_dirs = []
                for d in options["watch"].get("directories", []):
                    watched_dirs.append(os.path.abspath(os.path.join(self._node._cbdir, d)))

                worker.watch_timeout = options["watch"].get("timeout", 1)

                # create a directory watcher
                worker.watcher = DirWatcher(dirs=watched_dirs, notify_once=True)

                # make sure to stop the background thread running inside the
                # watcher upon Twisted being shut down
                def on_shutdown():
                    worker.watcher.stop()

                self._node._reactor.addSystemEventTrigger("before", "shutdown", on_shutdown)

                # this handler will get fired by the watcher upon detecting an FS event
                def on_fsevent(evt):
                    worker.watcher.stop()
                    proto.signal("TERM")

                    if options["watch"].get("action", None) == "restart":
                        self.log.info("Restarting guest ..")
                        # Add a timeout large enough (perhaps add a config option later)
                        self._node._reactor.callLater(worker.watch_timeout, self.start_guest, id, config, details)
                        # Shut the worker down, after the restart event is scheduled
                        worker.stop()

                # now run the watcher on a background thread
                deferToThread(worker.watcher.loop, on_fsevent)
            else:
                self.log.warn("Warning: cannot watch directory for changes - feature DirWatcher unavailable")

        # assemble guest worker startup information
        #
        started_info = {
            u"id": worker.id,
            u"status": worker.status,
            u"started": utcstr(worker.started),
            u"who": worker.who,
        }

        self.publish(started_topic, started_info, options=PublishOptions(exclude=details.caller))

        return started_info

    def on_ready_error(err):
        del self._workers[worker.id]
        emsg = "Failed to start guest worker: {}".format(err.value)
        self.log.error(emsg)
        raise ApplicationError(u"crossbar.error.cannot_start", emsg, ep.getlog())

    worker.ready.addCallbacks(on_ready_success, on_ready_error)

    def on_exit_success(res):
        self.log.info("Guest {id} exited with success", id=worker.id)
        del self._workers[worker.id]

    def on_exit_error(err):
        self.log.error("Guest {id} exited with error {err.value}", id=worker.id, err=err)
        del self._workers[worker.id]

    worker.exit.addCallbacks(on_exit_success, on_exit_error)

    # create a transport factory for talking WAMP to the native worker
    #
    transport_factory = create_guest_worker_client_factory(config, worker.ready, worker.exit)
    transport_factory.noisy = False
    self._workers[id].factory = transport_factory

    # now (immediately before actually forking) signal the starting of the worker
    #
    starting_info = {u"id": id, u"status": worker.status, u"created": utcstr(worker.created), u"who": worker.who}

    # the caller gets a progressive result ..
    if details.progress:
        details.progress(starting_info)

    # .. while all others get an event
    self.publish(starting_topic, starting_info, options=PublishOptions(exclude=details.caller))

    # now actually fork the worker ..
    #
    self.log.info("Starting {worker} with ID '{id}'...", worker=worker_logname, id=id)
    self.log.debug(
        "{worker} '{id}' using command line '{cli}'...", worker=worker_logname, id=id, cli=" ".join(args)
    )

    d = ep.connect(transport_factory)

    def on_connect_success(proto):
        # this seems to be called immediately when the child process
        # has been forked. even if it then immediately fails because
        # e.g. the executable doesn't even exist. in other words,
        # I'm not sure under what conditions the deferred will
        # errback - probably only if the forking of a new process fails
        # at OS level due to out of memory conditions or such.
        pid = proto.transport.pid
        self.log.debug("Guest worker process connected with PID {pid}", pid=pid)

        worker.pid = pid

        # proto is an instance of GuestWorkerClientProtocol
        worker.proto = proto

        worker.status = "connected"
        worker.connected = datetime.utcnow()

    def on_connect_error(err):
        # not sure when this errback is triggered at all .. see above.
        self.log.error("Internal error: connection to forked guest worker failed ({})".format(err))

        # in any case, forward the error ..
        worker.ready.errback(err)

    d.addCallbacks(on_connect_success, on_connect_error)

    return worker.ready