def run_child(self):
    """Fork a worker process; the parent records the child's pid.

    The child installs a SIGHUP handler that stops accepting new
    connections while letting in-flight requests finish, then runs the
    server loop and exits.
    """

    def child_hup(*args):
        """Shuts down child processes, existing requests are handled."""
        signal.signal(signal.SIGHUP, signal.SIG_IGN)
        eventlet.wsgi.is_accepting = False
        self.sock.close()

    pid = os.fork()
    if pid == 0:
        signal.signal(signal.SIGHUP, child_hup)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        # ignore the interrupt signal to avoid a race whereby
        # a child worker receives the signal before the parent
        # and is respawned unnecessarily as a result
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        # The child has no need to stash the unwrapped
        # socket, and the reference prevents a clean
        # exit on sighup
        self._sock = None
        self.run_server()
        LOG.info(_LI('Child %d exiting normally'), os.getpid())
        # self.pool.waitall() is now called in wsgi's server so
        # it's safe to exit here
        sys.exit(0)
    else:
        LOG.info(_LI('Started child %s'), pid)
        self.children.add(pid)
def acquire(self, retry=True):
    '''Acquire a lock on the target.

    :param retry: When True, retry if lock was released while stealing.
    :type retry: boolean
    :raises exception.ActionInProgress: when the lock is legitimately
        held by a live engine, or stealing lost the race to another
        engine.
    '''
    lock_engine_id = self.lock_create(self.target.id, self.engine_id)
    if lock_engine_id is None:
        # Pass the substitution dict as a lazy logging argument so the
        # message is only rendered when debug logging is enabled
        # (the original eagerly %-formatted the string).
        LOG.debug("Engine %(engine)s acquired lock on %(target_type)s "
                  "%(target)s",
                  {'engine': self.engine_id,
                   'target_type': self.target_type,
                   'target': self.target.id})
        return

    if (lock_engine_id == self.engine_id or
            self.engine_alive(self.context, lock_engine_id)):
        # The lock is held by us already or by a live engine: report a
        # conflict to the caller.
        LOG.debug("Lock on %(target_type)s %(target)s is owned by engine "
                  "%(engine)s",
                  {'target_type': self.target_type,
                   'target': self.target.id,
                   'engine': lock_engine_id})
        raise exception.ActionInProgress(target_name=self.target.name,
                                         action=self.target.status)
    else:
        # The owning engine is dead: attempt to steal the stale lock.
        LOG.info(_LI("Stale lock detected on %(target_type)s %(target)s. "
                     "Engine %(engine)s will attempt to steal the lock"),
                 {'target_type': self.target_type,
                  'target': self.target.id,
                  'engine': self.engine_id})

        result = self.lock_steal(self.target.id, lock_engine_id,
                                 self.engine_id)

        if result is None:
            LOG.info(_LI("Engine %(engine)s successfully stole the lock "
                         "on %(target_type)s %(target)s"),
                     {'engine': self.engine_id,
                      'target_type': self.target_type,
                      'target': self.target.id})
            return
        elif result is True:
            if retry:
                LOG.info(_LI("The lock on %(target_type)s %(target)s was "
                             "released while engine %(engine)s was "
                             "stealing it. Trying again"),
                         {'target_type': self.target_type,
                          'target': self.target.id,
                          'engine': self.engine_id})
                # Retry at most once: the recursive call passes
                # retry=False so this cannot loop indefinitely.
                return self.acquire(retry=False)
        else:
            # Another engine won the steal race.
            new_lock_engine_id = result
            LOG.info(_LI("Failed to steal lock on %(target_type)s "
                         "%(target)s. Engine %(engine)s stole the "
                         "lock already"),
                     {'target_type': self.target_type,
                      'target': self.target.id,
                      'engine': new_lock_engine_id})
            raise exception.ActionInProgress(
                target_name=self.target.name,
                action=self.target.status)
def stop(self): super(Dispatcher, self).stop() # Wait for all action threads to be finished LOG.info(_LI("Stopping all action threads of engine %s"), self.engine_id) # Stop ThreadGroup gracefully self.TG.stop(True) LOG.info(_LI("All action threads have been finished"))
def _remove_children(self, pid):
    """Forget a reaped child pid, whether tracked as current or stale."""
    if pid in self.children:
        self.children.remove(pid)
        LOG.info(_LI('Removed dead child %s'), pid)
        return
    if pid in self.stale_children:
        self.stale_children.remove(pid)
        LOG.info(_LI('Removed stale child %s'), pid)
        return
    # A pid we never started: log it but otherwise ignore.
    LOG.warning(_LW('Unrecognized child %s'), pid)
def run_child(self):
    """Fork one worker process.

    The child restores default SIGHUP/SIGTERM dispositions, runs the
    server loop and returns when done; the parent records the new pid.
    """
    pid = os.fork()
    if pid == 0:
        signal.signal(signal.SIGHUP, signal.SIG_DFL)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)
        self.run_server()
        # Fix: pass the pid as a lazy logging argument instead of
        # eagerly %-formatting the translated string -- the message is
        # then only rendered if INFO logging is enabled.
        self.LOG.info(_LI('Child %d exiting normally'), os.getpid())
        return
    else:
        self.LOG.info(_LI('Started child %s'), pid)
        self.children.append(pid)
def _single_run(self, application, sock):
    """Start a WSGI server in a new green thread."""
    LOG.info(_LI("Starting single process server"))
    # Serve directly from this process using the shared green pool;
    # request handling concurrency is bounded by self.pool's size.
    eventlet.wsgi.server(sock, application,
                         custom_pool=self.pool,
                         url_length_limit=URL_LENGTH_LIMIT,
                         log=self._logger,
                         debug=cfg.CONF.debug)
def _start_check(self, entry):
    """Routine to call for starting the checking for a cluster.

    @param entry: A dict containing the data associated with the cluster.
    @return: An updated registry entry record, or None when the check
             could not be started (unknown check type or listener
             creation failure).
    """
    if entry['check_type'] == consts.NODE_STATUS_POLLING:
        # Cap the polling interval at the configured maximum.
        interval = min(entry['interval'], cfg.CONF.periodic_interval_max)
        timer = self.TG.add_timer(interval, self._poll_cluster, None,
                                  entry['cluster_id'])
        entry['timer'] = timer
    elif entry['check_type'] == consts.VM_LIFECYCLE_EVENTS:
        LOG.info(_LI("Start listening events for cluster (%s)."),
                 entry['cluster_id'])
        listener = self._add_listener(entry['cluster_id'])
        if listener:
            entry['listener'] = listener
        else:
            return None
    else:
        # Fix: LOG.warn() is deprecated in favor of LOG.warning(); the
        # sibling implementation of this routine already uses warning().
        LOG.warning(_LW("Cluster (%(id)s) check type (%(type)s) is "
                        "invalid."),
                    {'id': entry['cluster_id'],
                     'type': entry['check_type']})
        return None

    return entry
def main():
    """Entry point for the senlin-api service."""
    try:
        # Parse config and initialize logging/objects/messaging before
        # the WSGI app is loaded.
        logging.register_options(cfg.CONF)
        cfg.CONF(project='senlin', prog='senlin-api',
                 version=version.version_info.version_string())
        logging.setup(cfg.CONF, 'senlin-api')
        objects.register_all()
        messaging.setup()

        app = wsgi.load_paste_app()

        host = cfg.CONF.senlin_api.bind_host
        port = cfg.CONF.senlin_api.bind_port
        LOG.info(_LI('Starting Senlin API on %(host)s:%(port)s'),
                 {'host': host, 'port': port})
        profiler.setup('senlin-api', host)
        server = wsgi.Server('senlin-api', cfg.CONF.senlin_api)
        server.start(app, default_port=port)
        # Tell systemd startup completed (no-op outside systemd).
        systemd.notify_once()
        server.wait()
    except RuntimeError as ex:
        sys.exit("ERROR: %s" % six.text_type(ex))
def _register_info(self, name, info):
    """Insert, replace or delete the registry entry for *name*.

    :param name: plugin name used as the registry key.
    :param info: PluginInfo to record; None removes the existing entry.
    """
    registry = self._registry

    if info is None:
        # delete this entry.
        LOG.warning(_LW("Removing %(item)s from registry"), {'item': name})
        registry.pop(name, None)
        return

    if name in registry and isinstance(registry[name], PluginInfo):
        if registry[name] == info:
            # Re-registering the identical plugin is a no-op.
            return
        LOG.warning(_LW('Changing %(name)s from %(old)s to %(new)s'),
                    {'name': name,
                     'old': str(registry[name].plugin),
                     'new': str(info.plugin)})
    else:
        LOG.info(_LI('Registering %(name)s -> %(value)s'),
                 {'name': name, 'value': str(info.plugin)})

    info.user_provided = not self.is_global
    registry[name] = info
def _start_check(self, entry):
    """Start health checking for one cluster.

    @param entry: A dict containing the data associated with the cluster.
    @return: The updated registry entry, or None when the check type is
             invalid or the event listener could not be created.
    """
    check_type = entry['check_type']

    if check_type == consts.NODE_STATUS_POLLING:
        # Polling interval is capped by the configured maximum.
        interval = min(entry['interval'], cfg.CONF.periodic_interval_max)
        entry['timer'] = self.TG.add_timer(interval, self._poll_cluster,
                                           None, entry['cluster_id'])
        return entry

    if check_type == consts.VM_LIFECYCLE_EVENTS:
        LOG.info(_LI("Start listening events for cluster (%s)."),
                 entry['cluster_id'])
        listener = self._add_listener(entry['cluster_id'])
        if not listener:
            return None
        entry['listener'] = listener
        return entry

    LOG.warning(_LW("Cluster (%(id)s) check type (%(type)s) is "
                    "invalid."),
                {'id': entry['cluster_id'], 'type': entry['check_type']})
    return None
def _register_info(self, name, info):
    """Store, replace or remove a PluginInfo in the registry.

    :param name: plugin name serving as the registry key.
    :param info: a PluginInfo structure to register; passing None
                 deregisters the named entry instead.
    """
    registry = self._registry

    # None means "deregister whatever is stored under this name".
    if info is None:
        LOG.warning(_LW('Removing %(item)s from registry'), {'item': name})
        registry.pop(name, None)
        return

    existing = registry.get(name)
    if isinstance(existing, PluginInfo):
        if existing == info:
            # Identical registration: nothing to do.
            return
        LOG.warning(_LW('Changing %(name)s from %(old)s to %(new)s'),
                    {'name': name,
                     'old': str(existing.plugin),
                     'new': str(info.plugin)})
    else:
        LOG.info(_LI('Registering %(name)s -> %(value)s'),
                 {'name': name, 'value': str(info.plugin)})

    info.user_provided = not self.is_global
    registry[name] = info
def ActionProc(context, action_id, worker_id): '''Action process.''' # Step 1: lock the action for execution timestamp = wallclock() result = db_api.action_acquire(context, action_id, worker_id, timestamp) if result is None: LOG.debug(_('Failed locking action "%s" for execution'), action_id) return False # Step 2: materialize the action object action = Action.load(context, action_id=action_id) LOG.info(_LI('Action %(name)s [%(id)s] started'), {'name': six.text_type(action.action), 'id': action.id}) reason = 'Action completed' try: # Step 3: execute the action result, reason = action.execute() # NOTE: The following exception report is not giving useful # information for some reasons. # except Exception as ex: # We catch exception here to make sure the following logics are # executed. # result = action.RES_ERROR # reason = six.text_type(ex) # LOG.error(_('Exception occurred in action execution[%(action)s]: ' # '%(reason)s'), {'action': action.action, # 'reason': reason}) finally: # NOTE: locks on action is eventually released here by status update action.set_status(result, reason)
def node_lock_acquire(context, node_id, action_id, engine=None,
                      forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations;
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the
                   lock, if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the node - if the returned owner_id is the
    # action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Fix: pass node_id as a lazy logging argument instead of
        # eagerly %-formatting the message.
        LOG.debug('Acquire lock for node %s again', node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # if this node lock by dead engine
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The node %(n)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'n': node_id, 'a': owner})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        db_api.node_lock_steal(node_id, action_id)
        return True

    LOG.error(_LE('Node is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': owner, 'new': action_id})
    return False
def _update_nodes(self, profile_id, nodes_obj):
    """Update member nodes to a new profile, batch by batch.

    :param profile_id: ID of the new profile to apply to each node.
    :param nodes_obj: node data supplied by the caller; the batching
        plan actually comes from self.data -- NOTE(review): this
        parameter appears unused here, confirm against callers.
    :returns: a (result, reason) tuple.
    """
    # Get batching policy data if any
    fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
    LOG.info(fmt, {'cluster': self.entity.id, 'profile': profile_id})

    pause_time = 0
    plan = []

    pd = self.data.get('update', None)
    if pd:
        pause_time = pd.get('pause_time')
        plan = pd.get('plan')
    else:
        # No batching policy: update all nodes in one batch, no pause.
        pause_time = 0
        nodes_list = []
        for node in self.entity.nodes:
            nodes_list.append(node.id)
        plan.append(set(nodes_list))

    nodes = []
    for node_set in plan:
        child = []
        nodes = list(node_set)

        # Fan out one NODE_UPDATE action per node in this batch.
        for node in nodes:
            kwargs = {
                'name': 'node_update_%s' % node[:8],
                'cause': base.CAUSE_DERIVED,
                'inputs': {
                    'new_profile_id': profile_id,
                },
            }
            action_id = base.Action.create(self.context, node,
                                           consts.NODE_UPDATE, **kwargs)
            child.append(action_id)

        if child:
            # Register dependencies, mark children READY and notify
            # the dispatcher to start executing them.
            dobj.Dependency.create(self.context, [c for c in child],
                                   self.id)
            for cid in child:
                ao.Action.update(self.context, cid,
                                 {'status': base.Action.READY})
            dispatcher.start_action()

        # clear the action list
        child = []

        # Wait for this batch to finish before starting the next one.
        result, new_reason = self._wait_for_dependents()
        if result != self.RES_OK:
            self.entity.eval_status(self.context, consts.CLUSTER_UPDATE)
            return result, _('Failed in updating nodes.')
        # pause time
        if pause_time != 0:
            self._sleep(pause_time)

    self.entity.profile_id = profile_id
    self.entity.eval_status(self.context, consts.CLUSTER_UPDATE,
                            profile_id=profile_id,
                            updated_at=timeutils.utcnow(True))
    return self.RES_OK, 'Cluster update completed.'
def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
                         scope=CLUSTER_SCOPE, forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of lock, could be cluster wide lock, or node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the
                   lock, if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the cluster - if the returned owner_id is the
    # action id, it was a success
    owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Fix: pass cluster_id as a lazy logging argument instead of
        # eagerly %-formatting the message.
        LOG.debug('Acquire lock for cluster %s again', cluster_id)
        owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in owners:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    # Will reach here only because scope == CLUSTER_SCOPE
    action = db_api.action_get(context, owners[0])
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The cluster %(c)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'c': cluster_id, 'a': owners[0]})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    LOG.error(_LE('Cluster is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': str(owners), 'new': action_id})
    return False
def node_lock_acquire(context, node_id, action_id, engine=None,
                      forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations;
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the
                   lock, if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the node - if the returned owner_id is the
    # action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Fix: lazy logging argument instead of eager %-formatting.
        LOG.debug('Acquire lock for node %s again', node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # if this node lock by dead engine
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The node %(n)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'n': node_id, 'a': owner})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        db_api.node_lock_steal(node_id, action_id)
        return True

    LOG.error(_LE('Node is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': owner, 'new': action_id})
    return False
def cluster_lock_acquire(context, cluster_id, action_id, engine=None, scope=CLUSTER_SCOPE, forced=False): """Try to lock the specified cluster. :param cluster_id: ID of the cluster to be locked. :param action_id: ID of the action which wants to lock the cluster. :param engine: ID of the engine which wants to lock the cluster. :param scope: scope of lock, could be cluster wide lock, or node-wide lock. :param forced: set to True to cancel current action that owns the lock, if any. :returns: True if lock is acquired, or False otherwise. """ # Step 1: try lock the cluster - if the returned owner_id is the # action id, it was a success owners = cl_obj.ClusterLock.acquire(cluster_id, action_id, scope) if action_id in owners: return True # Step 2: Last resort is 'forced locking', only needed when retry failed if forced: owners = cl_obj.ClusterLock.steal(cluster_id, action_id) return action_id in owners # Step 3: check if the owner is a dead engine, if so, steal the lock. # Will reach here only because scope == CLUSTER_SCOPE action = ao.Action.get(context, owners[0]) if (action and action.owner and action.owner != engine and utils.is_engine_dead(context, action.owner)): LOG.info( _LI('The cluster %(c)s is locked by dead action %(a)s, ' 'try to steal the lock.'), { 'c': cluster_id, 'a': owners[0] }) reason = _('Engine died when executing this action.') owners = cl_obj.ClusterLock.steal(cluster_id, action_id) # Mark the old action to failed. ao.Action.mark_failed(context, action.id, time.time(), reason) return action_id in owners LOG.error( _LE('Cluster is already locked by action %(old)s, ' 'action %(new)s failed grabbing the lock'), { 'old': str(owners), 'new': action_id }) return False
def start_wsgi(self):
    """Launch WSGI workers, or serve in-process when workers == 0."""
    if self.conf.workers == 0:
        # Useful for profiling, test, debug etc.
        self.pool = eventlet.GreenPool(size=self.threads)
        self.pool.spawn_n(self._single_run, self.application, self.sock)
        return

    # Fix: pass the worker count as a lazy logging argument instead of
    # eagerly %-formatting the translated string.
    LOG.info(_LI("Starting %d workers"), self.conf.workers)
    signal.signal(signal.SIGTERM, self.kill_children)
    signal.signal(signal.SIGINT, self.kill_children)
    signal.signal(signal.SIGHUP, self.hup)
    while len(self.children) < self.conf.workers:
        self.run_child()
def _verify_and_respawn_children(self, pid, status):
    """Decide whether to respawn a worker after one exited with *status*."""
    if not self.stale_children:
        LOG.debug('No stale children')

    if os.WIFEXITED(status) and os.WEXITSTATUS(status) != 0:
        # A non-zero exit code indicates a failure we cannot recover
        # from by simply restarting the worker.
        LOG.error(_LE('Not respawning child %d, cannot '
                      'recover from termination'), pid)
        if not self.children and not self.stale_children:
            LOG.info(_LI('All workers have terminated. Exiting'))
            self.running = False
    elif len(self.children) < self.conf.workers:
        self.run_child()
def info(context, entity, action, status=None, status_reason=None,
         timestamp=None):
    """Persist an INFO-level event for *entity* and log a summary line."""
    when = timestamp or timeutils.utcnow(True)
    event = Event(when, logging.INFO, entity,
                  action=action, status=status,
                  status_reason=status_reason,
                  user=context.user, project=context.project)
    event.store(context)

    LOG.info(_LI('%(name)s [%(id)s] %(action)s - %(status)s: %(reason)s'),
             {'name': event.oname,
              'id': event.oid and event.oid[:8],
              'action': action,
              'status': status,
              'reason': status_reason})
def load_dispatcher():
    """Load dispatchers."""
    global dispatchers

    LOG.debug("Loading dispatchers")
    dispatchers = named.NamedExtensionManager(
        namespace="senlin.dispatchers",
        names=cfg.CONF.event_dispatchers,
        invoke_on_load=True,
        propagate_map_exceptions=True)

    # An empty manager means no extension matched the configured names.
    if list(dispatchers):
        LOG.info(_LI("Loaded dispatchers: %s"), dispatchers.names())
    else:
        LOG.warning(_LW("No dispatchers configured for "
                        "'senlin.dispatchers'"))
def _verify_and_respawn_children(self, pid, status):
    """After reaping *pid*, either respawn a worker or begin shutdown."""
    if len(self.stale_children) == 0:
        LOG.debug('No stale children')

    fatal_exit = os.WIFEXITED(status) and os.WEXITSTATUS(status) != 0
    if not fatal_exit:
        # Normal termination: keep the worker pool at full strength.
        if len(self.children) < self.conf.workers:
            self.run_child()
        return

    LOG.error(_LE('Not respawning child %d, cannot '
                  'recover from termination'), pid)
    if not self.children and not self.stale_children:
        LOG.info(_LI('All workers have terminated. Exiting'))
        self.running = False
def set_status(self, result, reason=None):
    '''Set action status based on return value from execute.

    Maps the RES_* result code to an action status, records the outcome
    in the DB (which also releases the action lock), and logs it.

    :param result: one of the RES_* result codes from execute().
    :param reason: optional human-readable reason string.
    '''
    timestamp = wallclock()

    if result == self.RES_OK:
        status = self.SUCCEEDED
        msg = _LI('Action %(name)s [%(id)s] completed with SUCCESS.')
        db_api.action_mark_succeeded(self.context, self.id, timestamp)
    elif result == self.RES_ERROR:
        status = self.FAILED
        msg = _LI('Action %(name)s [%(id)s] failed with ERROR.')
        db_api.action_mark_failed(self.context, self.id, timestamp,
                                  reason=reason or 'ERROR')
    elif result == self.RES_TIMEOUT:
        status = self.FAILED
        msg = _LI('Action %(name)s [%(id)s] failed with TIMEOUT.')
        db_api.action_mark_failed(self.context, self.id, timestamp,
                                  reason=reason or 'TIMEOUT')
    elif result == self.RES_CANCEL:
        status = self.CANCELLED
        msg = _LI('Action %(name)s [%(id)s] was cancelled.')
        db_api.action_mark_cancelled(self.context, self.id, timestamp)
    else:  # result == self.RES_RETRY:
        status = self.READY
        # Action failed at the moment, but can be retried
        # We abandon it and then notify other dispatchers to execute it
        db_api.action_abandon(self.context, self.id)
        msg = _LI('Action %(name)s [%(id)s] aborted with RETRY.')

    LOG.info(msg, {'name': self.action, 'id': self.id, 'status': status})
    self.status = status
    self.status_reason = reason
def node_lock_acquire(context, node_id, action_id, engine=None, forced=False): """Try to lock the specified node. :param context: the context used for DB operations; :param node_id: ID of the node to be locked. :param action_id: ID of the action that attempts to lock the node. :param engine: ID of the engine that attempts to lock the node. :param forced: set to True to cancel current action that owns the lock, if any. :returns: True if lock is acquired, or False otherwise. """ # Step 1: try lock the node - if the returned owner_id is the # action id, it was a success owner = nl_obj.NodeLock.acquire(node_id, action_id) if action_id == owner: return True # Step 2: Last resort is 'forced locking', only needed when retry failed if forced: owner = nl_obj.NodeLock.steal(node_id, action_id) return action_id == owner # Step 3: Try to steal a lock if it's owner is a dead engine. # if this node lock by dead engine action = ao.Action.get(context, owner) if (action and action.owner and action.owner != engine and utils.is_engine_dead(context, action.owner)): LOG.info( _LI('The node %(n)s is locked by dead action %(a)s, ' 'try to steal the lock.'), { 'n': node_id, 'a': owner }) reason = _('Engine died when executing this action.') nl_obj.NodeLock.steal(node_id, action_id) ao.Action.mark_failed(context, action.id, time.time(), reason) return True LOG.error( _LE('Node is already locked by action %(old)s, ' 'action %(new)s failed grabbing the lock'), { 'old': owner, 'new': action_id }) return False
def wait_on_children(self):
    """Reap dead children and respawn replacements until stopped."""
    while self.running:
        try:
            pid, status = os.wait()
            if os.WIFEXITED(status) or os.WIFSIGNALED(status):
                # Fix: pass pid as a lazy logging argument instead of
                # eagerly %-formatting the translated string.
                self.LOG.error(_LE('Removing dead child %s'), pid)
                self.children.remove(pid)
                self.run_child()
        except OSError as err:
            # EINTR/ECHILD are expected while waiting; anything else
            # is a real error.
            if err.errno not in (errno.EINTR, errno.ECHILD):
                raise
        except KeyboardInterrupt:
            self.LOG.info(_LI('Caught keyboard interrupt. Exiting.'))
            os.killpg(0, signal.SIGTERM)
            break
    eventlet.greenio.shutdown_safe(self.sock)
    self.sock.close()
    self.LOG.debug('Exited')
def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Notification endpoint for INFO-level messages.

    Filters messages for this listener's cluster and, for recognized
    VM failure events, requests recovery of the affected node via RPC.
    """
    meta = payload['metadata']
    if meta.get('cluster_id') == self.cluster_id:
        if event_type not in self.VM_FAILURE_EVENTS:
            return

        params = {
            'event': self.VM_FAILURE_EVENTS[event_type],
            'state': payload.get('state', 'Unknown'),
            'instance_id': payload.get('instance_id', 'Unknown'),
            'timestamp': metadata['timestamp'],
            'publisher': publisher_id,
        }
        node_id = meta.get('cluster_node_id')
        if node_id:
            LOG.info(_LI("Requesting node recovery: %s"), node_id)
            # Build a service context on behalf of the payload's user.
            ctx_value = context.get_service_context(
                project=self.project_id, user=payload['user_id'])
            ctx = context.RequestContext(**ctx_value)
            self.rpc.node_recover(ctx, node_id, params)
def url_fetch(url, allowed_schemes=('http', 'https')):
    '''Get the data at the specified URL.

    The URL must use the http: or https: schemes.
    The file: scheme is also supported if you override
    the allowed_schemes argument.
    Raise an IOError if getting the data fails.
    '''
    LOG.info(_LI('Fetching data from %s'), url)

    components = urllib.parse.urlparse(url)

    if components.scheme not in allowed_schemes:
        raise URLFetchError(_('Invalid URL scheme %s') % components.scheme)

    if components.scheme == 'file':
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.URLError as uex:
            raise URLFetchError(_('Failed to retrieve data: %s') % uex)

    try:
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        # We cannot use resp.text here because it would download the entire
        # file, and a large enough file would bring down the engine. The
        # 'Content-Length' header could be faked, so it's necessary to
        # download the content in chunks to until max_response_size is
        # reached. The chunk_size we use needs to balance CPU-intensive
        # string concatenation with accuracy (eg. it's possible to fetch
        # 1000 bytes greater than max_response_size with a chunk_size of
        # 1000).
        # NOTE(review): iter_content yields bytes; concatenating onto a
        # str only works on Python 2 -- confirm whether decode_unicode
        # should be used here on Python 3.
        reader = resp.iter_content(chunk_size=1000)
        result = ""
        for chunk in reader:
            result += chunk
            if len(result) > cfg.CONF.max_response_size:
                raise URLFetchError("Data exceeds maximum allowed size (%s"
                                    " bytes)" % cfg.CONF.max_response_size)
        return result

    except exceptions.RequestException as ex:
        raise URLFetchError(_('Failed to retrieve data: %s') % ex)
def _load_runtime_registry(self):
    """Load the initial runtime registry with a DB scan."""
    claimed = objects.HealthRegistry.claim(self.ctx, self.engine_id)

    for registry in claimed:
        LOG.info(_LI("Loading cluster %s for health monitoring"),
                 registry.cluster_id)
        # Start the configured check; a falsy result means the check
        # could not be set up and the entry is skipped.
        entry = self._start_check({
            'cluster_id': registry.cluster_id,
            'check_type': registry.check_type,
            'interval': registry.interval,
            'params': registry.params,
            'enabled': True,
        })
        if entry:
            self.rt['registries'].append(entry)
def url_fetch(url, allowed_schemes=('http', 'https')):
    """Fetch and return the body found at *url*.

    Only http: and https: URLs are accepted unless *allowed_schemes* is
    overridden (that is how file: URLs can be permitted). Raises
    URLFetchError (an IOError) when retrieval fails or the response
    exceeds the configured maximum size.
    """
    LOG.info(_LI('Fetching data from %s'), url)

    components = urllib.parse.urlparse(url)
    if components.scheme not in allowed_schemes:
        raise URLFetchError(_('Invalid URL scheme %s') % components.scheme)

    if components.scheme == 'file':
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.URLError as uex:
            raise URLFetchError(_('Failed to retrieve data: %s') % uex)

    try:
        resp = requests.get(url, stream=True)
        resp.raise_for_status()

        # resp.text would pull the whole body into memory at once, and
        # the 'Content-Length' header cannot be trusted, so accumulate
        # the body chunk by chunk and bail out as soon as it exceeds
        # max_response_size. A chunk_size of 1000 keeps the possible
        # overshoot bounded while avoiding per-byte overhead.
        result = ""
        for chunk in resp.iter_content(chunk_size=1000):
            result += chunk
            if len(result) > cfg.CONF.max_response_size:
                raise URLFetchError("Data exceeds maximum allowed size (%s"
                                    " bytes)" % cfg.CONF.max_response_size)
        return result

    except exceptions.RequestException as ex:
        raise URLFetchError(_('Failed to retrieve data: %s') % ex)
def main():
    """Entry point for the senlin-api service."""
    try:
        # Parse config and initialize logging/messaging before loading
        # the WSGI application.
        logging.register_options(cfg.CONF)
        cfg.CONF(project='senlin', prog='senlin-api',
                 version=version.version_info.version_string())
        logging.setup(cfg.CONF, 'senlin-api')
        messaging.setup()

        app = wsgi.load_paste_app()

        host = cfg.CONF.senlin_api.bind_host
        port = cfg.CONF.senlin_api.bind_port
        LOG.info(_LI('Starting Senlin API on %(host)s:%(port)s'),
                 {'host': host, 'port': port})
        server = wsgi.Server('senlin-api', cfg.CONF.senlin_api)
        server.start(app, default_port=port)
        # Tell systemd startup completed (no-op outside systemd).
        systemd.notify_once()
        server.wait()
    except RuntimeError as ex:
        sys.exit("ERROR: %s" % six.text_type(ex))
def start(self, application, conf, default_port):
    '''Run a WSGI server with the given application.

    :param application: The application to run in the WSGI server
    :param conf: a cfg.ConfigOpts object
    :param default_port: Port to bind to if none is specified in conf
    '''

    def kill_children(*args):
        """Kills the entire process group."""
        self.LOG.error(_LE('SIGTERM received'))
        signal.signal(signal.SIGTERM, signal.SIG_IGN)
        self.running = False
        os.killpg(0, signal.SIGTERM)

    def hup(*args):
        # Shuts down the server(s), but allows running requests to complete
        self.LOG.error(_LE('SIGHUP received'))
        signal.signal(signal.SIGHUP, signal.SIG_IGN)
        os.killpg(0, signal.SIGHUP)

    signal.signal(signal.SIGHUP, hup)
    # Note: may need to make this configurable
    eventlet.wsgi.MAX_HEADER_LINE = 16384
    self.application = application
    self.sock = get_socket(conf, default_port)

    self.LOG = logging.getLogger('eventlet.wsgi.server')

    if conf.workers == 0:
        # Useful for profiling, test, debug etc.
        self.pool = eventlet.GreenPool(size=self.threads)
        self.pool.spawn_n(self._single_run, application, self.sock)
        return

    # Fix: pass the worker count as a lazy logging argument instead of
    # eagerly %-formatting the translated string.
    self.LOG.info(_LI("Starting %d workers"), conf.workers)
    signal.signal(signal.SIGTERM, kill_children)
    signal.signal(signal.SIGHUP, hup)
    while len(self.children) < conf.workers:
        self.run_child()
def wait_on_children(self):
    """Wait on children exit.

    Reaps terminated children, delegating bookkeeping and respawn
    decisions to helper methods, until self.running goes False.
    """
    while self.running:
        try:
            pid, status = os.wait()
            if os.WIFEXITED(status) or os.WIFSIGNALED(status):
                self._remove_children(pid)
                self._verify_and_respawn_children(pid, status)
        except OSError as err:
            # EINTR/ECHILD are expected while waiting; re-raise others.
            if err.errno not in (errno.EINTR, errno.ECHILD):
                raise
        except KeyboardInterrupt:
            LOG.info(_LI('Caught keyboard interrupt. Exiting.'))
            os.killpg(0, signal.SIGTERM)
            break
        except exception.SIGHUPInterrupt:
            # SIGHUP triggers a config reload rather than a shutdown.
            self.reload()
            continue

    eventlet.greenio.shutdown_safe(self.sock)
    self.sock.close()
    LOG.debug('Exited')
def read_global_environment(self):
    """Read and parse every file in the configured environment_dir."""
    cfg.CONF.import_opt('environment_dir', 'senlin.common.config')
    env_dir = cfg.CONF.environment_dir

    try:
        candidates = glob.glob(os.path.join(env_dir, '*'))
    except OSError as ex:
        LOG.error(_LE('Failed to read %s'), env_dir)
        LOG.exception(ex)
        return

    for path in candidates:
        # A failure on one file is logged but does not stop the scan.
        try:
            with open(path) as f:
                LOG.info(_LI('Loading environment from %s'), path)
                self.load(self.parse(f.read()))
        except ValueError as vex:
            LOG.error(_LE('Failed to parse %s'), path)
            LOG.exception(six.text_type(vex))
        except IOError as ioex:
            LOG.error(_LE('Failed to read %s'), path)
            LOG.exception(six.text_type(ioex))
def do_update(self):
    """Handler for CLUSTER_UPDATE action.

    :returns: A tuple consisting the result and the corresponding reason.
    """
    res = self.cluster.do_update(self.context)
    if not res:
        reason = _('Cluster update failed.')
        self.cluster.set_status(self.context, self.cluster.ERROR, reason)
        return self.RES_ERROR, reason

    # Only properties explicitly present in the inputs are updated.
    name = self.inputs.get('name')
    metadata = self.inputs.get('metadata')
    timeout = self.inputs.get('timeout')
    profile_id = self.inputs.get('new_profile_id')

    if name is not None:
        self.cluster.name = name
    if metadata is not None:
        self.cluster.metadata = metadata
    if timeout is not None:
        self.cluster.timeout = timeout
    self.cluster.store(self.context)

    reason = _('Cluster update completed.')
    if profile_id is not None:
        fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
        LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})

        # Fan out one NODE_UPDATE action per member node.
        child_actions = []
        for node in self.cluster.nodes:
            kwargs = {
                'name': 'node_update_%s' % node.id[:8],
                'cause': base.CAUSE_DERIVED,
                'inputs': {
                    'new_profile_id': profile_id,
                },
                'user': self.context.user,
                'project': self.context.project,
                'domain': self.context.domain,
            }
            action = base.Action(node.id, 'NODE_UPDATE', **kwargs)
            action.store(self.context)
            child_actions.append(action)

        if child_actions:
            # Register dependencies, mark the children READY and kick
            # the dispatcher for each of them.
            db_api.dependency_add(self.context,
                                  [c.id for c in child_actions],
                                  self.id)
            for child in child_actions:
                db_api.action_update(self.context, child.id,
                                     {'status': child.READY})
                dispatcher.start_action(action_id=child.id)

            result, new_reason = self._wait_for_dependents()
            if result != self.RES_OK:
                self.cluster.set_status(self.context,
                                        self.cluster.WARNING, new_reason)
                return result, new_reason
            self.cluster.set_status(self.context, self.cluster.ACTIVE,
                                    reason, profile_id=profile_id)
            return self.RES_OK, reason

    # No new profile (or no member nodes): just mark the cluster ACTIVE.
    self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
    return self.RES_OK, reason
def do_update(self):
    """Handler for CLUSTER_UPDATE action.

    :returns: A tuple consisting the result and the corresponding reason.
    """
    res = self.cluster.do_update(self.context)
    if not res:
        reason = _('Cluster update failed.')
        self.cluster.set_status(self.context, self.cluster.ERROR, reason)
        return self.RES_ERROR, reason

    name = self.inputs.get('name')
    metadata = self.inputs.get('metadata')
    timeout = self.inputs.get('timeout')
    profile_id = self.inputs.get('new_profile_id')

    # Apply simple property updates directly on the cluster object.
    if name is not None:
        self.cluster.name = name
    if metadata is not None:
        self.cluster.metadata = metadata
    if timeout is not None:
        self.cluster.timeout = timeout
    self.cluster.store(self.context)

    reason = _('Cluster update completed.')
    # Without a profile change the property updates above are all that
    # is needed.
    if profile_id is None:
        self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
        return self.RES_OK, reason

    # Profile change requested: fan out one NODE_UPDATE action per
    # member node and wait for all of them to complete.
    fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
    LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})
    child = []
    for node in self.cluster.nodes:
        kwargs = {
            'name': 'node_update_%s' % node.id[:8],
            'cause': base.CAUSE_DERIVED,
            'inputs': {
                'new_profile_id': profile_id,
            },
        }
        action_id = base.Action.create(self.context, node.id,
                                       consts.NODE_UPDATE, **kwargs)
        child.append(action_id)

    if child:
        # 'child' is already a list of action IDs; no copy needed.
        db_api.dependency_add(self.context, child, self.id)
        for cid in child:
            db_api.action_update(self.context, cid,
                                 {'status': base.Action.READY})
            dispatcher.start_action(action_id=cid)

        result, new_reason = self._wait_for_dependents()
        if result != self.RES_OK:
            new_reason = _('Failed in updating nodes.')
            self.cluster.set_status(self.context, self.cluster.WARNING,
                                    new_reason)
            return result, new_reason

    self.cluster.set_status(self.context, self.cluster.ACTIVE, reason,
                            profile_id=profile_id)
    return self.RES_OK, reason
def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
                         scope=CLUSTER_SCOPE, forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of lock, could be cluster wide lock, or node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the cluster - if the returned owner_id is the
    # action id, it was a success
    owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Lazy %-style logging args: formatting is skipped when the
        # debug level is disabled.
        LOG.debug('Acquire lock for cluster %s again', cluster_id)
        owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in owners:
            return True
        retries -= 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    # Will reach here only because scope == CLUSTER_SCOPE; owners is
    # expected to hold the single current lock holder here.
    action = db_api.action_get(context, owners[0])
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        # The engine owning the lock is gone: mark its action failed and
        # steal the lock for ourselves.
        LOG.info(
            _LI('The cluster %(c)s is locked by dead action %(a)s, '
                'try to steal the lock.'), {
                'c': cluster_id,
                'a': owners[0]
            })
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    LOG.error(
        _LE('Cluster is already locked by action %(old)s, '
            'action %(new)s failed grabbing the lock'), {
            'old': str(owners),
            'new': action_id
        })

    return False
def do_update(self):
    """Handler for CLUSTER_UPDATE action.

    :returns: A tuple consisting the result and the corresponding reason.
    """
    res = self.cluster.do_update(self.context)
    if not res:
        reason = _('Cluster update failed.')
        self.cluster.set_status(self.context, self.cluster.ERROR, reason)
        return self.RES_ERROR, reason

    name = self.inputs.get('name')
    metadata = self.inputs.get('metadata')
    timeout = self.inputs.get('timeout')
    profile_id = self.inputs.get('new_profile_id')

    # Apply simple property updates directly on the cluster object.
    if name is not None:
        self.cluster.name = name
    if metadata is not None:
        self.cluster.metadata = metadata
    if timeout is not None:
        self.cluster.timeout = timeout
    self.cluster.store(self.context)

    reason = _('Cluster update completed.')
    # Without a profile change the property updates above are all that
    # is needed.
    if profile_id is None:
        self.cluster.set_status(self.context, self.cluster.ACTIVE, reason)
        return self.RES_OK, reason

    # Profile change requested: fan out one NODE_UPDATE action per
    # member node and wait for all of them to complete.
    fmt = _LI("Updating cluster '%(cluster)s': profile='%(profile)s'.")
    LOG.info(fmt, {'cluster': self.cluster.id, 'profile': profile_id})
    child = []
    for node in self.cluster.nodes:
        kwargs = {
            'name': 'node_update_%s' % node.id[:8],
            'cause': base.CAUSE_DERIVED,
            'inputs': {
                'new_profile_id': profile_id,
            },
        }
        action_id = base.Action.create(self.context, node.id,
                                       consts.NODE_UPDATE, **kwargs)
        child.append(action_id)

    if child:
        # 'child' is already a list of action IDs; no copy needed.
        dobj.Dependency.create(self.context, child, self.id)
        for cid in child:
            ao.Action.update(self.context, cid,
                             {'status': base.Action.READY})
            # Pass the action ID so the dispatcher targets this child
            # action, matching the sibling implementation above.
            dispatcher.start_action(action_id=cid)

        result, new_reason = self._wait_for_dependents()
        if result != self.RES_OK:
            new_reason = _('Failed in updating nodes.')
            self.cluster.set_status(self.context, self.cluster.WARNING,
                                    new_reason)
            return result, new_reason

    self.cluster.set_status(self.context, self.cluster.ACTIVE, reason,
                            profile_id=profile_id)
    return self.RES_OK, reason