def __init__(self, *args, **kwargs):
    BaseContainerAgent.__init__(self, *args, **kwargs)

    # Coordinates the container start
    self._status = INIT

    self._is_started = False

    # set container id and cc_agent name (as they are set in base class call)
    self.id = get_default_container_id()
    self.name = "cc_agent_%s" % self.id
    self.start_time = get_ion_ts()

    bootstrap.container_instance = self
    Container.instance = self
    self.container = self  # Make self appear as process to service clients
    self.CCAP = CCAP
    self.CFG = CFG

    log.debug("Container (sysname=%s) initializing ..." % bootstrap.get_sys_name())

    # Keep track of the overrides from the command-line, so they can trump app/rel file data
    self.spawn_args = kwargs

    # Greenlet context-local storage
    self.context = LocalContextMixin()

    # Load general capabilities file and augment with specific profile
    self._load_capabilities()

    # Start the capabilities
    start_order = self.cap_profile['start_order']

    for cap in start_order:
        if cap not in self._cap_definitions:
            raise ContainerError("CC capability %s not defined in profile" % cap)
        if cap in self._capabilities or cap in self._cap_instances:
            raise ContainerError("CC capability %s already initialized" % cap)

        try:
            cap_def = self._cap_definitions[cap]
            log.debug("__init__(): Initializing '%s'" % cap)
            cap_obj = named_any(cap_def['class'])(container=self)
            self._cap_instances[cap] = cap_obj

            if 'depends_on' in cap_def and cap_def['depends_on']:
                dep_list = cap_def['depends_on'].split(',')
                for dep in dep_list:
                    dep = dep.strip()
                    if dep not in self._cap_initialized:
                        raise ContainerError("CC capability %s dependent on non-existing capability %s" % (cap, dep))

            if 'field' in cap_def and cap_def['field']:
                setattr(self, cap_def['field'], cap_obj)

            self._cap_initialized.append(cap)
        except Exception as ex:
            log.error("Container Capability %s init error: %s" % (cap, ex))
            raise

    log.debug("Container initialized, OK.")
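# --- Illustrative sketch (not part of the original source) ---
# Shape of a single capability definition as consumed by __init__() above:
# 'class' is loaded via named_any(), 'depends_on' is a comma-separated list of
# capabilities that must already have been initialized, and 'field' names the
# container attribute the instance is assigned to. The capability name and values
# below are hypothetical examples, not entries from the real container_capabilities.yml.
EXAMPLE_CAP_DEFINITION = {
    "EXAMPLE_MANAGER": {
        "class": "pyon.container.example.ExampleManager",   # hypothetical class path
        "depends_on": "DATASTORE_MANAGER, DIRECTORY",        # must appear earlier in start_order
        "field": "example_manager",                          # instance becomes self.example_manager
    }
}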
def ensure_ready(self, proc, errmsg=None, timeout=10):
    """
    Waits until either the process dies or reports it is ready, whichever comes first.

    If the process dies or times out while waiting for it to be ready, a ContainerError is raised.
    You must be sure the process implements get_ready_event properly, otherwise this method
    returns immediately as the base class behavior simply passes.

    @param  proc        The process to wait on.
    @param  errmsg      A custom error message to put in the ContainerError's message. May be blank.
    @param  timeout     Amount of time (in seconds) to wait for the ready, default 10 seconds.

    @throws ContainerError  If the process dies or if we get a timeout before the process signals ready.
    """
    if isinstance(proc, PythonProcess):
        log.warn("ensure_ready does not yet work on PythonProcesses")
        return True

    if not errmsg:
        errmsg = "ensure_ready failed"

    ev = Event()

    def cb(*args, **kwargs):
        ev.set()

    # link either a greenlet failure due to exception OR a success via ready event
    proc.proc.link_exception(cb)
    proc.get_ready_event().rawlink(cb)

    retval = ev.wait(timeout=timeout)

    # unlink the events: ready event is probably harmless but the exception one, we want to install our own later
    proc.get_ready_event().unlink(cb)

    # if the process is stopped while we are waiting, proc.proc is set to None
    if proc.proc is not None:
        proc.proc.unlink(cb)

    # raise an exception if:
    # - we timed out
    # - we caught an exception
    if not retval:
        raise ContainerError("%s (timed out)" % errmsg)
    elif proc.proc is not None and proc.proc.dead and not proc.proc.successful():
        raise ContainerError("%s (failed): %s" % (errmsg, proc.proc.exception))
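# --- Illustrative sketch (not part of the original source) ---
# The "ready OR failed, whichever comes first" pattern that ensure_ready() above relies on,
# reduced to plain gevent primitives. All names here are hypothetical.
from gevent.event import Event


def wait_ready_or_dead(glet, ready_event, timeout=10):
    """Return True if ready_event fired or glet died with an exception within timeout, else False."""
    ev = Event()

    def cb(*args, **kwargs):
        ev.set()

    glet.link_exception(cb)      # fires if the greenlet dies with an exception
    ready_event.rawlink(cb)      # fires when the process signals readiness
    try:
        return ev.wait(timeout=timeout)
    finally:
        # always remove our callbacks so later waiters can install their own
        ready_event.unlink(cb)
        glet.unlink(cb)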
def _stop_capability(self, capability):
    if capability == "CONTAINER_AGENT":
        pass

    elif capability == "APP_MANAGER":
        self.app_manager.stop()

    elif capability == "PROC_MANAGER":
        self.proc_manager.stop()

    elif capability == "EXCHANGE_MANAGER":
        self.ex_manager.stop()

    elif capability == "EVENT_REPOSITORY":
        # close event repository (possible CouchDB connection)
        self.event_repository.close()

    elif capability == "STATE_REPOSITORY":
        # close state repository (possible CouchDB connection)
        self.state_repository.close()

    elif capability == "RESOURCE_REGISTRY":
        # close resource registry (possible CouchDB connection)
        self.resource_registry.close()

    elif capability == "DIRECTORY":
        # Unregister from directory
        self.directory.unregister_safe("/Containers/%s" % self.id, "Processes")
        self.directory.unregister_safe("/Containers", self.id)
        # Close directory (possible CouchDB connection)
        self.directory.close()

    elif capability == "DATASTORE_MANAGER":
        # close any open connections to datastores
        self.datastore_manager.stop()

    elif capability == "EXCHANGE_CONNECTION":
        # destroy AMQP connection
        self.node.client.close()
        self.ioloop.kill()
        self.node.client.ioloop.start()     # loop until connection closes

    elif capability == "GOVERNANCE_CONTROLLER":
        self.governance_controller.stop()

    elif capability == "PID_FILE":
        self._cleanup_pid()

    elif capability == "SFLOW_MANAGER":
        self.sflow_manager.stop()

    else:
        raise ContainerError("Cannot stop capability: %s" % capability)
def _stop_capability(self, capability):
    if capability == "CONTAINER_AGENT":
        pass

    elif capability == "APP_MANAGER":
        self.app_manager.stop()

    elif capability == "PROC_MANAGER":
        self.proc_manager.stop()

    elif capability == "EXCHANGE_MANAGER":
        self.ex_manager.stop()

    elif capability == "LOCAL_ROUTER":
        if self.local_router is not None:
            self.local_router.stop()

    elif capability == "EVENT_REPOSITORY":
        # close event repository (possible CouchDB connection)
        self.event_repository.close()
        self.event_pub.close()

    elif capability == "STATE_REPOSITORY":
        # close state repository (possible CouchDB connection)
        self.state_repository.close()

    elif capability == "RESOURCE_REGISTRY":
        # close resource registry (possible CouchDB connection)
        self.resource_registry.close()

    elif capability == "DIRECTORY":
        # Close directory (possible CouchDB connection)
        self.directory.close()

    elif capability == "DATASTORE_MANAGER":
        # close any open connections to datastores
        self.datastore_manager.stop()

    elif capability == "GOVERNANCE_CONTROLLER":
        self.governance_controller.stop()

    elif capability == "PID_FILE":
        self._cleanup_pid()

    elif capability == "SFLOW_MANAGER":
        self.sflow_manager.stop()

    else:
        raise ContainerError("Cannot stop capability: %s" % capability)
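# --- Illustrative sketch (not part of the original source) ---
# _stop_capability() above handles one capability at a time; presumably the container's
# stop path walks the started capabilities in reverse order of startup, roughly like the
# hypothetical driver below (shown only to make the teardown order explicit).
def _stop_all_capabilities(container):
    while container._capabilities:
        capability = container._capabilities.pop()   # last started, first stopped
        try:
            container._stop_capability(capability)
        except Exception as ex:
            log.exception("Error stopping capability %s: %s" % (capability, ex))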
def start(self):
    # Check if this UNIX process already runs a Container.
    self.container.pidfile = "cc-pid-%d" % os.getpid()
    if os.path.exists(self.container.pidfile):
        raise ContainerError("Container.on_start(): Container is a singleton per UNIX process. Existing pid file found: %s" % self.container.pidfile)

    # write out a PID file containing our agent messaging name
    with open(self.container.pidfile, 'w') as f:
        pid_contents = {'messaging': dict(CFG.server.amqp),
                        'container-agent': self.container.name,
                        'container-xp': bootstrap.get_sys_name()}
        f.write(msgpack.dumps(pid_contents))
        atexit.register(self.container._cleanup_pid)
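# --- Illustrative sketch (not part of the original source) ---
# The PID file written above is a msgpack-encoded dict; a hypothetical helper to read it
# back (e.g. for tooling or debugging) could look like this.
import msgpack


def read_cc_pidfile(path):
    """Return the dict written by the container, with keys such as 'messaging',
    'container-agent' and 'container-xp'."""
    with open(path, 'rb') as f:
        return msgpack.loads(f.read())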
def start(self):
    log.debug("Container starting...")
    if self._is_started:
        raise ContainerError("Container already started")

    start_order = self.cap_profile['start_order']

    for cap in start_order:
        if cap not in self._cap_instances:
            continue

        # First find the default enabled value if no CFG key exists
        enabled_default = self._cap_definitions.get_safe("%s.enabled_default" % cap, True)

        # Then find CFG key where enabled flag is (default or override)
        enabled_config = self._cap_definitions.get_safe("%s.enabled_config" % cap, "container.%s.enabled" % cap)

        # Then determine the enabled value
        enabled = CFG.get_safe(enabled_config, enabled_default)

        if enabled:
            log.debug("start(): Starting '%s'" % cap)
            try:
                cap_obj = self._cap_instances[cap]
                cap_obj.start()
                self._capabilities.append(cap)
            except Exception as ex:
                log.error("Container Capability %s start error: %s" % (cap, ex))
                raise
        else:
            log.debug("start(): Capability '%s' disabled by config '%s'", cap, enabled_config)

    if self.has_capability(CCAP.EVENT_PUBLISHER):
        self.event_pub.publish_event(event_type="ContainerLifecycleEvent",
                                     origin=self.id, origin_type="CapabilityContainer",
                                     sub_type="START",
                                     state=ContainerStateEnum.START)

    self._is_started = True
    self._status = RUNNING

    log.info("Container (%s) started, OK.", self.id)
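# --- Illustrative sketch (not part of the original source) ---
# How the enabled flag for a capability is resolved in start() above, reduced to plain
# dict lookups (the real code uses DotDict.get_safe with dotted keys). The capability
# name and config values are hypothetical.
def resolve_enabled(cap_definitions, cfg, cap):
    """Return whether 'cap' should be started, given its definition and the container CFG."""
    cap_def = cap_definitions.get(cap, {})
    # 1. default used if no CFG key exists (profile-level 'enabled_default', defaulting to True)
    enabled_default = cap_def.get("enabled_default", True)
    # 2. name of the CFG key holding the flag (profile-level 'enabled_config',
    #    defaulting to container.<CAP>.enabled)
    enabled_config = cap_def.get("enabled_config", "container.%s.enabled" % cap)
    # 3. the CFG value wins if present, otherwise the default applies
    return cfg.get(enabled_config, enabled_default)


# Example: a capability disabled via config
# resolve_enabled({"SFLOW_MANAGER": {}},
#                 {"container.SFLOW_MANAGER.enabled": False},
#                 "SFLOW_MANAGER")   # -> False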
def _load_capabilities(self):
    self._cap_initialized = []      # List of capability constants initialized in container
    self._capabilities = []         # List of capability constants active in container
    self._cap_instances = {}        # Dict mapping capability -> manager instance
    self._cap_definitions = Config(["res/config/container_capabilities.yml"]).data['capabilities']

    profile_filename = CFG.get_safe("container.profile", "development")
    if not profile_filename.endswith(".yml"):
        profile_filename = "res/profile/%s.yml" % profile_filename
    log.debug("Loading CC capability profile from file: %s", profile_filename)

    profile_cfg = Config([profile_filename]).data
    if not isinstance(profile_cfg, dict) or "profile" not in profile_cfg or profile_cfg.get("type") != "profile":
        raise ContainerError("Container capability profile invalid: %s" % profile_filename)

    self.cap_profile = profile_cfg['profile']

    if "capabilities" in self.cap_profile and self.cap_profile['capabilities']:
        dict_merge(self._cap_definitions, self.cap_profile['capabilities'], True)

    CCAP.clear()
    cap_list = self._cap_definitions.keys()
    CCAP.update(zip(cap_list, cap_list))

    if "config" in self.cap_profile and self.cap_profile['config']:
        log.info("Container CFG was changed based on profile: %s", profile_filename)
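# --- Illustrative sketch (not part of the original source) ---
# Minimal shape of a capability profile as _load_capabilities() above expects it:
# a 'type: profile' document with a 'profile' section. Capability names, the start
# order and values below are hypothetical, not taken from the shipped res/profile files.
EXAMPLE_PROFILE = {
    "type": "profile",
    "profile": {
        # merged on top of the res/config/container_capabilities.yml definitions
        "capabilities": {
            "EXAMPLE_MANAGER": {
                "class": "pyon.container.example.ExampleManager",  # hypothetical
                "field": "example_manager",
            },
        },
        # order in which __init__() initializes and start() starts the capabilities
        "start_order": ["PID_FILE", "DATASTORE_MANAGER", "DIRECTORY", "EXAMPLE_MANAGER"],
        # optional CFG overrides applied when the profile is loaded
        "config": {},
    },
}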
def start(self):
    log.debug("Container starting...")
    if self._is_started:
        raise ContainerError("Container already started")

    # Check if this UNIX process already runs a Container.
    self.pidfile = "cc-pid-%d" % os.getpid()
    if os.path.exists(self.pidfile):
        raise ContainerError("Container.on_start(): Container is a singleton per UNIX process. Existing pid file found: %s" % self.pidfile)

    # write out a PID file containing our agent messaging name
    with open(self.pidfile, 'w') as f:
        pid_contents = {'messaging': dict(CFG.server.amqp),
                        'container-agent': self.name,
                        'container-xp': bootstrap.get_sys_name()}
        f.write(msgpack.dumps(pid_contents))
        atexit.register(self._cleanup_pid)
        self._capabilities.append("PID_FILE")

    # set up abnormal termination handler for this container
    def handl(signum, frame):
        try:
            self._cleanup_pid()     # cleanup the pidfile first
            self.quit()             # now try to quit - will not error on second cleanup pidfile call
        finally:
            signal.signal(signal.SIGTERM, self._normal_signal)
            os.kill(os.getpid(), signal.SIGTERM)
    self._normal_signal = signal.signal(signal.SIGTERM, handl)

    self.datastore_manager.start()
    self._capabilities.append("DATASTORE_MANAGER")

    # Self-register with Directory
    self.directory.register("/Containers", self.id, cc_agent=self.name)
    self.directory.register("/Containers/%s" % self.id, "Processes")
    self._capabilities.append("DIRECTORY")

    # Event repository
    self.event_repository = EventRepository()
    self.event_pub = EventPublisher()
    self._capabilities.append("EVENT_REPOSITORY")

    # Local resource registry
    self.resource_registry = ResourceRegistry()
    self._capabilities.append("RESOURCE_REGISTRY")

    # Persistent objects
    self.datastore_manager.get_datastore("objects", DataStore.DS_PROFILE.OBJECTS)

    # State repository
    self.state_repository = StateRepository()
    self._capabilities.append("STATE_REPOSITORY")

    # Start ExchangeManager, which starts the node (broker connection)
    self.ex_manager.start()
    self._capabilities.append("EXCHANGE_MANAGER")

    self.proc_manager.start()
    self._capabilities.append("PROC_MANAGER")

    self.app_manager.start()
    self._capabilities.append("APP_MANAGER")

    self.governance_controller.start()
    self._capabilities.append("GOVERNANCE_CONTROLLER")

    if CFG.container.get('sflow', {}).get('enabled', False):
        self.sflow_manager.start()
        self._capabilities.append("SFLOW_MANAGER")

    # Start the CC-Agent API
    rsvc = ProcessRPCServer(node=self.node, from_name=self.name, service=self, process=self)

    # Start an ION process with the right kind of endpoint factory
    proc = self.proc_manager.proc_sup.spawn(name=self.name, listeners=[rsvc], service=self)
    self.proc_manager.proc_sup.ensure_ready(proc)
    self._capabilities.append("CONTAINER_AGENT")

    self.event_pub.publish_event(event_type="ContainerLifecycleEvent",
                                 origin=self.id, origin_type="CapabilityContainer",
                                 sub_type="START",
                                 state=ContainerStateEnum.START)

    self._is_started = True
    self._status = "RUNNING"

    log.info("Container started, OK.")
def _control_flow(self):
    """
    Main process thread of execution method.

    This method is run inside a greenlet and exists for each ION process. Listeners
    attached to the process, either RPC Servers or Subscribers, synchronize their calls
    by placing future calls into the queue by calling _routing_call. This is all done
    automatically for you by the Container's Process Manager.

    This method blocks until there are calls to be made in the synchronized queue, and
    then calls from within this greenlet. Any exception raised is caught and re-raised
    in the greenlet that originally scheduled the call. If successful, the AsyncResult
    created at scheduling time is set with the result of the call.
    """
    if self.name:
        svc_name = "unnamed-service"
        if self.service is not None and hasattr(self.service, 'name'):
            svc_name = self.service.name

        threading.current_thread().name = "%s-%s-ctrl" % (svc_name, self.name)

    self._ready_control.set()

    for calltuple in self._ctrl_queue:
        calling_gl, ar, call, callargs, callkwargs, context = calltuple
        log.debug("control_flow making call: %s %s %s (has context: %s)", call, callargs, callkwargs, context is not None)

        res = None
        start_proc_time = int(get_ion_ts())

        # check context for expiration
        if context is not None and 'reply-by' in context:
            if start_proc_time >= int(context['reply-by']):
                log.info("control_flow: attempting to process message already exceeding reply-by, ignore")

                # raise a timeout in the calling thread to allow endpoints to continue processing
                e = IonTimeout("Reply-by time has already occurred (reply-by: %s, op start time: %s)" % (context['reply-by'], start_proc_time))
                calling_gl.kill(exception=e, block=False)
                continue

        # also check ar if it is set, if it is, that means it is cancelled
        if ar.ready():
            log.info("control_flow: attempting to process message that has been cancelled, ignore")
            continue

        try:
            with self.service.push_context(context):
                with self.service.container.context.push_context(context):
                    self._ctrl_current = ar
                    res = call(*callargs, **callkwargs)
        except OperationInterruptedException:
            # endpoint layer takes care of response as it's the one that caused this
            log.debug("Operation interrupted")
            pass
        except Exception as e:
            # raise the exception in the calling greenlet, and don't
            # wait for it to die - it's likely not going to do so.

            # try decorating the args of the exception with the true traceback
            # this should be reported by ThreadManager._child_failed
            exc = PyonThreadTraceback("IonProcessThread _control_flow caught an exception (call: %s, *args %s, **kwargs %s, context %s)\nTrue traceback captured by IonProcessThread' _control_flow:\n\n%s" % (call, callargs, callkwargs, context, traceback.format_exc()))
            e.args = e.args + (exc,)

            # HACK HACK HACK
            # we know that we only handle TypeError and IonException derived things, so only forward those if appropriate
            if isinstance(e, (TypeError, IonException)):
                calling_gl.kill(exception=e, block=False)
            else:
                # otherwise, swallow/record/report and hopefully we can continue on our way
                self._errors.append((call, callargs, callkwargs, context, e, exc))
                log.warn(exc)
                log.warn("Attempting to continue...")

                # have to raise something friendlier on the client side
                calling_gl.kill(exception=ContainerError(str(exc)), block=False)
        finally:
            proc_time = int(get_ion_ts()) - start_proc_time
            self._proc_time += proc_time

            self._ctrl_current = None

        ar.set(res)
def _control_flow(self):
    """
    Entry point for process control thread of execution.

    This method is run by the control greenlet for each ION process. Listeners attached
    to the process, either RPC Servers or Subscribers, synchronize calls to the process
    by placing call requests into the queue by calling _routing_call.

    This method blocks until there are calls to be made in the synchronized queue, and
    then calls from within this greenlet. Any exception raised is caught and re-raised
    in the greenlet that originally scheduled the call. If successful, the AsyncResult
    created at scheduling time is set with the result of the call.
    """
    svc_name = getattr(self.service, "name", "unnamed-service") if self.service else "unnamed-service"
    proc_id = getattr(self.service, "id", "unknown-pid") if self.service else "unknown-pid"
    if self.name:
        threading.current_thread().name = "%s-%s" % (svc_name, self.name)
    thread_base_name = threading.current_thread().name

    self._ready_control.set()

    for calltuple in self._ctrl_queue:
        calling_gl, ar, call, callargs, callkwargs, context = calltuple
        request_id = (context or {}).get("request-id", None)
        if request_id:
            threading.current_thread().name = thread_base_name + "-" + str(request_id)
        #log.debug("control_flow making call: %s %s %s (has context: %s)", call, callargs, callkwargs, context is not None)

        res = None
        start_proc_time = get_ion_ts_millis()
        self._record_proc_time(start_proc_time)

        # check context for expiration
        if context is not None and 'reply-by' in context:
            if start_proc_time >= int(context['reply-by']):
                log.info("control_flow: attempting to process message already exceeding reply-by, ignore")

                # raise a timeout in the calling thread to allow endpoints to continue processing
                e = IonTimeout("Reply-by time has already occurred (reply-by: %s, op start time: %s)" % (context['reply-by'], start_proc_time))
                calling_gl.kill(exception=e, block=False)
                continue

        # If ar is set, means it is cancelled
        if ar.ready():
            log.info("control_flow: attempting to process message that has been cancelled, ignore")
            continue

        init_db_stats()
        try:
            # ******************************************************************
            # ****** THIS IS WHERE THE RPC OPERATION/SERVICE CALL IS MADE ******

            with self.service.push_context(context), \
                 self.service.container.context.push_context(context):
                self._ctrl_current = ar
                res = call(*callargs, **callkwargs)

            # ****** END CALL, EXCEPTION HANDLING FOLLOWS ******
            # ******************************************************************

        except OperationInterruptedException:
            # endpoint layer takes care of response as it's the one that caused this
            log.debug("Operation interrupted")
            pass

        except Exception as e:
            if self._log_call_exception:
                log.exception("PROCESS exception: %s" % e.message)

            # Raise the exception in the calling greenlet.
            # Try decorating the args of the exception with the true traceback -
            # this should be reported by ThreadManager._child_failed
            exc = PyonThreadTraceback("IonProcessThread _control_flow caught an exception "
                                      "(call: %s, *args %s, **kwargs %s, context %s)\n"
                                      "True traceback captured by IonProcessThread' _control_flow:\n\n%s" % (
                                      call, callargs, callkwargs, context, traceback.format_exc()))
            e.args = e.args + (exc,)

            if isinstance(e, (TypeError, IonException)):
                # Pass through known process exceptions, in particular IonException
                calling_gl.kill(exception=e, block=False)
            else:
                # Otherwise, wrap unknown, forward and hopefully we can continue on our way
                self._errors.append((call, callargs, callkwargs, context, e, exc))
                log.warn(exc)
                log.warn("Attempting to continue...")

                # Note: Too large exception string will crash the container (when passed on as msg header).
                exception_str = str(exc)
                if len(exception_str) > 10000:
                    exception_str = ("Exception string representation too large. "
                                     "Begin and end of the exception:\n" +
                                     exception_str[:2000] + "\n...\n" + exception_str[-2000:])
                calling_gl.kill(exception=ContainerError(exception_str), block=False)

        finally:
            try:
                # Compute statistics
                self._compute_proc_stats(start_proc_time)

                db_stats = get_db_stats()
                if db_stats:
                    if self._warn_call_dbstmt_threshold > 0 and db_stats.get("count.all", 0) >= self._warn_call_dbstmt_threshold:
                        stats_str = ", ".join("{}={}".format(k, db_stats[k]) for k in sorted(db_stats.keys()))
                        log.warn("PROC_OP '%s.%s' EXCEEDED DB THRESHOLD. stats=%s", svc_name, call.__name__, stats_str)
                    elif self._log_call_dbstats:
                        stats_str = ", ".join("{}={}".format(k, db_stats[k]) for k in sorted(db_stats.keys()))
                        log.info("PROC_OP '%s.%s' DB STATS: %s", svc_name, call.__name__, stats_str)

                clear_db_stats()

                if stats_callback:
                    stats_callback(proc_id=proc_id, proc_name=self.name, svc=svc_name, op=call.__name__,
                                   request_id=request_id, context=context,
                                   db_stats=db_stats, proc_stats=self.time_stats, result=res, exc=None)
            except Exception:
                log.exception("Error computing process call stats")

            self._ctrl_current = None
            threading.current_thread().name = thread_base_name

        # Set response in AsyncResult of caller (endpoint greenlet)
        ar.set(res)
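# --- Illustrative sketch (not part of the original source) ---
# The producer side that _control_flow() above consumes: a scheduling helper puts a call
# tuple (calling greenlet, AsyncResult, callable, args, kwargs, context) onto the control
# queue and the caller then waits on the AsyncResult. This is a reduced, hypothetical
# stand-in for the real _routing_call/IonProcessThread machinery, shown to make the
# queue-and-AsyncResult handshake explicit.
import gevent
from gevent.event import AsyncResult
from gevent.queue import Queue

_ctrl_queue = Queue()


def routing_call_sketch(call, context, *callargs, **callkwargs):
    """Enqueue a call for the control greenlet and return the AsyncResult to wait on."""
    ar = AsyncResult()
    _ctrl_queue.put((gevent.getcurrent(), ar, call, callargs, callkwargs, context))
    return ar


def control_flow_sketch():
    """Consume call tuples and set each AsyncResult with the call's return value."""
    for calling_gl, ar, call, callargs, callkwargs, context in _ctrl_queue:
        try:
            ar.set(call(*callargs, **callkwargs))
        except Exception as e:
            # mirror the original behavior: re-raise in the scheduling greenlet
            calling_gl.kill(exception=e, block=False)


# Typical flow (hypothetical): gevent.spawn(control_flow_sketch), then from an endpoint
# greenlet: result = routing_call_sketch(some_op, None, arg1).get(timeout=10)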