def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Handle a stack notification and request recovery of the tagged node.

    The notification is ignored unless ``event_type`` is a key of
    ``self.STACK_FAILURE_EVENTS`` and the payload tags contain both a
    ``cluster_id`` tag whose value equals ``self.cluster_id`` and a
    ``cluster_node_id`` tag.

    :param ctxt: Context delivered with the notification (not used here).
    :param publisher_id: ID of the publisher; forwarded in the params.
    :param event_type: Event type string of the notification.
    :param payload: Notification payload; ``tags`` and ``user_identity``
                    are read with ``[]``, other fields with ``get``.
    :param metadata: Notification metadata; ``timestamp`` is read from it.
    :returns: ``None``.
    """
    if event_type not in self.STACK_FAILURE_EVENTS:
        return

    tags = payload['tags']
    if tags is None or tags == []:
        return

    matched_cluster = None
    matched_node = None
    for tag in tags:
        # The value starts one character after the key, i.e. at offset
        # len('cluster_id') + 1 == 11 and len('cluster_node_id') + 1 == 16.
        if matched_cluster is None and tag.startswith('cluster_id'):
            if tag[11:] == self.cluster_id:
                matched_cluster = tag[11:]
        if matched_node is None and tag.startswith('cluster_node_id'):
            matched_node = tag[16:]

    if matched_cluster is None or matched_node is None:
        return

    recover_params = {
        'event': self.STACK_FAILURE_EVENTS[event_type],
        'state': payload.get('state', 'Unknown'),
        'stack_id': payload.get('stack_identity', 'Unknown'),
        'timestamp': metadata['timestamp'],
        'publisher': publisher_id,
    }
    LOG.info("Requesting stack recovery: %s", matched_node)
    svc_ctx = context.get_service_context(project=self.project_id,
                                          user=payload['user_identity'])
    request = objects.NodeRecoverRequest(identity=matched_node,
                                         params=recover_params)
    self.rpc.call(svc_ctx, 'node_recover', request)
def process_request(self, req):
    """Add an auth token to a webhook trigger request.

    Requests that are not ``POST`` or whose URL is not recognised by
    ``self._parse_url`` are left untouched.

    :param req: The incoming request object.
    :returns: ``None``; on success ``req.headers['X-Auth-Token']`` is set.
    """
    # Only POST requests are handled
    if req.method != 'POST':
        return

    # The URL yields the webhook (receiver) ID and the params
    parsed = self._parse_url(req.url)
    if not parsed:
        return
    receiver_id, params = parsed

    admin_ctx = context.RequestContext(is_admin=True)
    engine = rpc.EngineClient()
    receiver = engine.receiver_get(admin_ctx, receiver_id,
                                   project_safe=False)

    service = context.get_service_context()
    credential_keys = ('auth_url', 'username', 'user_domain_name',
                       'password')
    token_args = {name: service[name] for name in credential_keys}
    # The receiver's actor entries are applied on top of the service ones
    token_args.update(receiver['actor'])

    # Obtain a token and place it into the request header
    req.headers['X-Auth-Token'] = self._get_token(**token_args)
def generate_url(self, key):
    """Build the trigger URL of this webhook.

    :param key: Key string to be used for decrypting the credentials; it
                is appended to the URL as the ``key`` query parameter.
    :returns: A tuple ``(url, key)``.
    :raises: ``ResourceNotFound`` when the clustering service or its
             public endpoint cannot be found.
    """
    service_creds = context.get_service_context()
    identity = driver_base.SenlinDriver().identity(service_creds)

    service = identity.service_get('clustering', 'senlin')
    if not service:
        missing = _('service:type=clustering,name=senlin')
        raise exception.ResourceNotFound(resource=missing)

    service_id = service['id']
    region = cfg.CONF.region_name_for_services
    endpoint = identity.endpoint_get(service_id, region, 'public')
    if not endpoint:
        template = _('endpoint: service=%(service)s,region='
                     '%(region)s,visibility=%(interface)s')
        details = {
            'service': service_id,
            'region': region,
            'interface': 'public',
        }
        raise exception.ResourceNotFound(resource=template % details)

    # Substitute the project ID for the tenant placeholder in the endpoint
    base_url = endpoint['url'].replace('$(tenant_id)s', self.project)
    trigger_path = '/webhooks/%s/trigger' % self.id
    query = "?%s" % parse.urlencode({'key': key})
    return base_url + trigger_path + query, key
def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Handle a VM notification and request recovery of the affected node.

    Nothing happens unless the payload metadata names this listener's
    cluster, ``event_type`` is a key of ``self.VM_FAILURE_EVENTS`` and the
    metadata carries a ``cluster_node_id``.

    :param ctxt: Context delivered with the notification (not used here).
    :param publisher_id: ID of the publisher; forwarded in the params.
    :param event_type: Event type string of the notification.
    :param payload: Notification payload; ``metadata`` must be present.
    :param metadata: Notification metadata; ``timestamp`` is read from it.
    :returns: ``None``.
    """
    meta = payload['metadata']
    cluster_id = meta.get('cluster_id')
    if not cluster_id or self.cluster_id != cluster_id:
        return

    if event_type not in self.VM_FAILURE_EVENTS:
        return

    recover_params = {
        'event': self.VM_FAILURE_EVENTS[event_type],
        'state': payload.get('state', 'Unknown'),
        'instance_id': payload.get('instance_id', 'Unknown'),
        'timestamp': metadata['timestamp'],
        'publisher': publisher_id,
        'operation': self.recover_action['operation'],
    }

    node_id = meta.get('cluster_node_id')
    if not node_id:
        return

    LOG.info("Requesting node recovery: %s", node_id)
    svc_ctx = context.get_service_context(project_id=self.project_id,
                                          user_id=payload['user_id'])
    request = objects.NodeRecoverRequest(identity=node_id,
                                         params=recover_params)
    self.rpc.call(svc_ctx, 'node_recover', request)
def do_create(self, obj):
    """Create and start a container using the given profile.

    :param obj: The node object for this container.
    :returns: The first 36 characters of the new container's ID, which is
              also stored in ``self.container_id``.
    :raises: `EResourceCreation` when an ``InternalError`` is raised while
             the container is being created or started.
    """
    container_name = self.properties[self.NAME]
    if container_name is None:
        # No name in the profile: use the node name plus a random suffix
        container_name = '-'.join([obj.name, utils.random_name()])

    create_args = {
        'image': self.properties[self.IMAGE],
        'name': container_name,
        'command': self.properties[self.COMMAND],
    }

    try:
        ctx = context.get_service_context(project=obj.project,
                                          user=obj.user)
        client = self.docker(obj)
        # The node is recorded as a dependent of the host before creation
        db_api.node_add_dependents(ctx, self.host.id, obj.id)
        container = client.container_create(**create_args)
        client.start(container['Id'])
    except exc.InternalError as ex:
        raise exc.EResourceCreation(type='container', message=str(ex))

    self.container_id = container['Id'][:36]
    return self.container_id
def _get_security_group(self, obj):
    """Look up the security group stored in the data of the node's cluster.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :returns: The value stored under ``self.SECURITY_GROUP`` in the cluster
              data, or ``None`` when the node has no cluster or the key is
              absent.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return None

    cluster = cluster_obj.Cluster.get(ctx, obj.cluster_id)
    return cluster.data.get(self.SECURITY_GROUP)
def _get_network(self, obj):
    """Look up the private network stored in the data of the node's cluster.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :returns: The value stored under ``self.PRIVATE_NETWORK`` in the
              cluster data, or ``None`` when the node has no cluster or the
              key is absent.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return None

    cluster = cluster_obj.Cluster.get(ctx, obj.cluster_id)
    return cluster.data.get(self.PRIVATE_NETWORK)
def _create_network(self, obj):
    """Create the network resources and record them in the cluster data.

    A network, a ``10.7.0.0/24`` IPv4 subnet, a router with a gateway on
    the public network, and a floating IP are created. Their identifiers
    are then written into ``obj.data`` and saved via ``Cluster.update``.

    :param obj: The object whose ``id``, ``user``, ``project`` and ``data``
                are used; it is passed to ``Cluster.update`` as a cluster.
    :returns: ID of the created network.
    :raises: `EResourceCreation` when an ``InternalError`` is raised while
             creating any of the resources.
    """
    client = self.network(obj)

    try:
        net = client.network_create()
        subnet = client.subnet_create(network_id=net.id,
                                      cidr='10.7.0.0/24',
                                      ip_version=4)
    except exc.InternalError as ex:
        raise exc.EResourceCreation(type='kubernetes', message=str(ex),
                                    resource_id=obj.id)

    pub_net = client.network_get(self.properties[self.PUBLIC_NETWORK])
    try:
        gateway = {"network_id": pub_net.id}
        router = client.router_create(external_gateway_info=gateway)
        client.add_interface_to_router(router, subnet_id=subnet.id)
        fip = client.floatingip_create(floating_network_id=pub_net.id)
    except exc.InternalError as ex:
        raise exc.EResourceCreation(type='kubernetes', message=str(ex),
                                    resource_id=obj.id)

    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    data = obj.data
    created = (
        (self.PRIVATE_NETWORK, net.id),
        (self.PRIVATE_SUBNET, subnet.id),
        (self.PRIVATE_ROUTER, router.id),
        (self.KUBE_MASTER_FLOATINGIP, fip.floating_ip_address),
        (self.KUBE_MASTER_FLOATINGIP_ID, fip.id),
    )
    for data_key, value in created:
        data[data_key] = value
    cluster_obj.Cluster.update(ctx, obj.id, {'data': data})

    return net.id
def _get_kubeadm_token(self, obj):
    """Look up the kubeadm token stored in the data of the node's cluster.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :returns: The value stored under ``self.KUBEADM_TOKEN`` in the cluster
              data, or ``None`` when the node has no cluster or the key is
              absent.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return None

    cluster = cluster_obj.Cluster.get(ctx, obj.cluster_id)
    return cluster.data.get(self.KUBEADM_TOKEN)
def generate_url(self, key):
    """Generate the webhook URL in its proper format.

    :param key: Key string to be used for decrypting the credentials; it
                ends up as the ``key`` query parameter of the URL.
    :returns: A tuple ``(url, key)``.
    :raises: ``ResourceNotFound`` when either the clustering service or
             its public endpoint is not found.
    """
    creds = context.get_service_context()
    keystone = driver_base.SenlinDriver().identity(creds)

    service = keystone.service_get('clustering', 'senlin')
    if not service:
        raise exception.ResourceNotFound(
            resource=_('service:type=clustering,name=senlin'))

    service_id = service['id']
    region = cfg.CONF.region_name_for_services
    endpoint = keystone.endpoint_get(service_id, region, 'public')
    if not endpoint:
        description = _('endpoint: service=%(service)s,region='
                        '%(region)s,visibility=%(interface)s') % {
            'service': service_id,
            'region': region,
            'interface': 'public',
        }
        raise exception.ResourceNotFound(resource=description)

    # The tenant placeholder in the endpoint is replaced by the project ID
    url = endpoint['url'].replace('$(tenant_id)s', self.project)
    url = url + '/webhooks/%s/trigger' % self.id
    url = url + "?%s" % parse.urlencode({'key': key})
    return url, key
def _update_master_ip(self, obj, ip):
    """Store the given IP as ``kube_master_ip`` in the node's cluster data.

    Nothing is written when the node does not belong to a cluster.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :param ip: The value to store under ``kube_master_ip``.
    :returns: ``None``.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return

    cluster = cluster_obj.Cluster.get(ctx, obj.cluster_id)
    cluster.data['kube_master_ip'] = ip
    cluster.update(ctx, obj.cluster_id, {'data': cluster.data})
def process_request(self, req):
    """Add an auth token to a webhook trigger request.

    The request is left untouched unless it is a ``POST``, its URL is
    recognised by ``self._parse_url`` and a credential is found for it.

    :param req: The incoming request object.
    :returns: ``None``; on success ``req.headers['X-Auth-Token']`` is set.
    """
    # Only POST requests are handled
    if req.method != 'POST':
        return

    # The URL yields the project, the webhook ID and the key
    parsed = self._parse_url(req.url)
    if not parsed:
        return
    project, webhook_id, key = parsed

    credential = self._get_credential(project, webhook_id, key)
    if not credential:
        return

    service = context.get_service_context()
    credential_keys = ('auth_url', 'username', 'user_domain_name',
                       'password')
    token_args = {name: service[name] for name in credential_keys}
    # The webhook credential entries are applied on top of the service ones
    token_args.update(credential)

    # Obtain a token and place it into the request header
    req.headers['X-Auth-Token'] = self._get_token(**token_args)
def _get_cluster_data(self, obj):
    """Return the data of the cluster the node belongs to.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :returns: ``cluster.data`` of the node's cluster, or an empty dict
              when the node has no cluster.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return {}

    return cluster_obj.Cluster.get(ctx, obj.cluster_id).data
def _generate_kubeadm_token(self, obj):
    """Generate a kubeadm token and save it in the cluster data.

    :param obj: The object whose ``id``, ``user``, ``project`` and ``data``
                are used; it is passed to ``Cluster.update`` as a cluster.
    :returns: The token produced by ``GenKubeToken()``.
    """
    new_token = GenKubeToken()

    # Persist the generated token under the KUBEADM_TOKEN key
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    cluster_data = obj.data
    cluster_data[self.KUBEADM_TOKEN] = new_token
    cluster_obj.Cluster.update(ctx, obj.id, {'data': cluster_data})

    return new_token
def _init_context(self):
    """Build the context dict from service credentials and spec overrides.

    The value stored under ``self.CONTEXT`` in ``self.spec_data`` (if any)
    is passed as keyword arguments to ``context.get_service_context``. The
    ``project_name`` and ``project_domain_name`` entries are then removed
    from the result.

    :returns: The resulting context dict.
    """
    overrides = {}
    if self.CONTEXT in self.spec_data:
        # A context explicitly set to None is treated as "no overrides"
        overrides = self.spec_data[self.CONTEXT] or {}

    ctx_dict = context.get_service_context(**overrides)

    # A default is supplied so that an absent key does not raise KeyError
    ctx_dict.pop('project_name', None)
    ctx_dict.pop('project_domain_name', None)
    return ctx_dict
def _init_context(self):
    """Build the context dict from service credentials and profile overrides.

    The value stored under ``self.CONTEXT`` in ``self.properties`` (if any)
    is passed as keyword arguments to ``context.get_service_context``. The
    ``project_name`` and ``project_domain_name`` entries are dropped from
    the result when present.

    :returns: The resulting context dict.
    """
    overrides = None
    if self.CONTEXT in self.properties:
        overrides = self.properties[self.CONTEXT]

    # A missing or empty context means "no overrides"
    ctx_dict = context.get_service_context(**(overrides or {}))
    for unwanted in ('project_name', 'project_domain_name'):
        ctx_dict.pop(unwanted, None)
    return ctx_dict
def _get_base_url(self):
    """Return the Senlin endpoint obtained from the identity driver.

    :returns: The value of ``get_senlin_endpoint()``, or ``None`` when that
              call raises ``InternalError`` (a warning is logged then).
    """
    creds = senlin_context.get_service_context()
    identity = driver_base.SenlinDriver().identity(creds)

    try:
        return identity.get_senlin_endpoint()
    except exception.InternalError as ex:
        LOG.warning(_('Senlin endpoint can not be found: %s.') % ex.message)

    return None
def _disassociate_floatingip(self, obj, server):
    """Detach the cluster's master floating IP from the given server.

    Nothing is done when the node has no cluster or when the cluster data
    holds no value under ``self.KUBE_MASTER_FLOATINGIP``.

    :param obj: The node object; ``obj.cluster_id`` selects the cluster.
    :param server: The server passed on to the compute driver.
    :returns: ``None``.
    :raises: `EResourceOperation` when the compute driver raises an
             ``InternalError``.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    if not obj.cluster_id:
        return

    cluster = cluster_obj.Cluster.get(ctx, obj.cluster_id)
    floating_ip = cluster.data.get(self.KUBE_MASTER_FLOATINGIP)
    if not floating_ip:
        return

    try:
        self.compute(obj).server_floatingip_disassociate(server,
                                                         floating_ip)
    except exc.InternalError as ex:
        raise exc.EResourceOperation(op='floatingip', type='kubernetes',
                                     id=floating_ip, message=str(ex))
def _get_trust(self, ctx):
    """Return the ID of a trust with the current user as the trustor.

    The credential table is consulted first as a cache. On a miss, an
    existing trust is looked up through the identity driver and a new one
    is created if none is found. The credential record is then updated
    (if one existed but held no usable trust) or created.

    :param ctx: The requesting context; ``user``, ``project``,
                ``auth_url``, ``auth_token`` and ``roles`` are read.
    :returns: ID of the trust.
    :raises: ``InternalError`` from the trust lookup unless its ``code``
             is 400.
    """
    # DB table is used as a cache for the trusts.
    cred_exists = False
    res = db_api.cred_get(ctx, ctx.user, ctx.project)
    if res is not None:
        try:
            return res.cred['openstack']['trust']
        except KeyError:
            # Garbage in the store: remember that a record exists so it
            # gets updated below instead of being created again.
            cred_exists = True

    params = {
        'auth_url': ctx.auth_url,
        'token': ctx.auth_token,
        'project_id': ctx.project,
        'user_id': ctx.user,
    }
    kc = driver_base.SenlinDriver().identity(params)
    service_cred = context.get_service_context()
    admin_id = kc.get_user_id(**service_cred)

    try:
        trust = kc.trust_get_by_trustor(ctx.user, admin_id, ctx.project)
    except exception.InternalError as ex:
        # A 400 from the lookup is treated as "no trust found"
        if ex.code != 400:
            raise
        trust = None

    if not trust:
        # Create a trust if no existing one found
        trust = kc.trust_create(ctx.user, admin_id, ctx.project, ctx.roles)

    # update cache
    cred = {'openstack': {'trust': trust.id}}
    if cred_exists:
        # The context goes first, as in the cred_get/cred_create calls
        db_api.cred_update(ctx, ctx.user, ctx.project, {'cred': cred})
    else:
        values = {
            'user': ctx.user,
            'project': ctx.project,
            'cred': cred,
        }
        db_api.cred_create(ctx, values)

    return trust.id
def _set_cluster_dependents(self, obj):
    """Record ``obj`` as the ``kube-node`` dependent of the master cluster.

    :param obj: The object whose ``id`` is stored in the master cluster's
                dependents.
    :returns: ``None``.
    :raises: ``BadRequest`` when the cluster named by the
             ``MASTER_CLUSTER`` property cannot be found.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    master = self.properties[self.MASTER_CLUSTER]

    try:
        master_cluster = cluster_obj.Cluster.find(ctx, master)
    except exc.ResourceNotFound:
        raise exc.BadRequest(
            msg=_("Cannot find the given cluster: %s") % master)

    if not master_cluster:
        return

    # The kube master keeps the UUID of the kube node cluster in its
    # dependents
    dependents = master_cluster.dependents
    dependents['kube-node'] = obj.id
    cluster_obj.Cluster.update(ctx, master_cluster.id,
                               {'dependents': dependents})
def _get_master_cluster_info(self, obj):
    """Return the data of the master cluster after checking required keys.

    :param obj: The object whose ``user`` and ``project`` are used to
                build the service context.
    :returns: ``data`` of the cluster named by the ``MASTER_CLUSTER``
              property.
    :raises: `EResourceCreation` when finding the cluster raises any
             exception, or when one of ``self.MASTER_CLUSTER_KEYS`` is
             missing from the cluster data.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    master = self.properties[self.MASTER_CLUSTER]

    try:
        master_cluster = cluster_obj.Cluster.find(ctx, master)
    except Exception as ex:
        raise exc.EResourceCreation(type='kubernetes.worker',
                                    message=six.text_type(ex))

    for required in self.MASTER_CLUSTER_KEYS:
        if required in master_cluster.data:
            continue
        # The first missing key aborts the lookup
        raise exc.EResourceCreation(
            type='kubernetes.worker',
            message="Can't find %s in cluster %s" % (required, master))

    return master_cluster.data
def execute_health_check(self):
    """Run all health checks on every node of the cluster.

    For each node ``self._check_node_health`` is called; every truthy
    result is treated as a recovery action that is then waited for, using
    ``self.node_update_timeout``. Errors derived from ``Exception`` are
    logged and suppressed so that the caller always gets a value back.

    :returns: The value of ``_chase_up(start_time, self.interval)``.
    """
    start_time = timeutils.utcnow(True)

    try:
        if not self.health_check_types:
            LOG.error("No health check types found for cluster: %s",
                      self.cluster_id)
            return _chase_up(start_time, self.interval)

        cluster = objects.Cluster.get(self.ctx, self.cluster_id,
                                      project_safe=False)
        if not cluster:
            LOG.warning("Cluster (%s) is not found.", self.cluster_id)
            return _chase_up(start_time, self.interval)

        ctx = context.get_service_context(user_id=cluster.user,
                                          project_id=cluster.project)

        actions = []

        # loop through nodes and run all health checks on each node
        nodes = objects.Node.get_all_by_cluster(ctx, self.cluster_id)
        for node in nodes:
            action = self._check_node_health(ctx, node, cluster)
            if action:
                actions.append(action)

        for a in actions:
            # wait for action to complete
            res, reason = self._wait_for_action(
                ctx, a['action'], self.node_update_timeout)
            if not res:
                LOG.warning(
                    "Node recovery action %s did not complete "
                    "within specified timeout: %s", a['action'], reason)

        if not actions:
            LOG.info("Health check passed for all nodes in cluster %s.",
                     self.cluster_id)
    except Exception as ex:
        LOG.warning("Error while performing health check: %s", ex)

    # Returning here rather than from a ``finally`` clause lets
    # non-Exception errors (e.g. KeyboardInterrupt) propagate instead of
    # being silently discarded.
    return _chase_up(start_time, self.interval)
def _poll_cluster(self, cluster_id, timeout, recover_action):
    """Check a cluster and request recovery of its non-active nodes.

    A ``cluster_check`` RPC is issued and waited for; afterwards a
    ``node_recover`` RPC is sent for each node whose status is not
    ``consts.NS_ACTIVE``.

    :param cluster_id: The UUID of the cluster to be checked.
    :param timeout: The maximum number of seconds to wait.
    :param recover_action: The health policy action, passed as the params
                           of each node recover request.
    :returns: The value of ``_chase_up(start_time, timeout)``.
    """
    start_time = timeutils.utcnow(True)

    cluster = objects.Cluster.get(self.ctx, cluster_id, project_safe=False)
    if not cluster:
        LOG.warning("Cluster (%s) is not found.", cluster_id)
        return _chase_up(start_time, timeout)

    ctx = context.get_service_context(user_id=cluster.user,
                                      project_id=cluster.project)
    check_params = {'delete_check_action': True}
    try:
        check_req = objects.ClusterCheckRequest(identity=cluster_id,
                                                params=check_params)
        action = self.rpc_client.call(ctx, 'cluster_check', check_req)
    except Exception as ex:
        failure = {'c': cluster_id, 'r': six.text_type(ex)}
        LOG.warning("Failed in triggering 'cluster_check' RPC for "
                    "'%(c)s': %(r)s", failure)
        return _chase_up(start_time, timeout)

    # wait for the check action to complete
    succeeded, reason = self._wait_for_action(ctx, action['action'],
                                              timeout)
    if not succeeded:
        LOG.warning("%s", reason)
        return _chase_up(start_time, timeout)

    # trigger recovery for every node that is not active
    for node in objects.Node.get_all_by_cluster(ctx, cluster_id):
        if node.status == consts.NS_ACTIVE:
            continue
        LOG.info("Requesting node recovery: %s", node.id)
        recover_req = objects.NodeRecoverRequest(identity=node.id,
                                                 params=recover_action)
        self.rpc_client.call(ctx, 'node_recover', recover_req)

    return _chase_up(start_time, timeout)
def _del_cluster_dependents(self, obj):
    """Remove the ``kube-node`` dependent from the master cluster.

    The master cluster is only updated when its dependents actually
    contain a ``kube-node`` entry.

    :param obj: The object whose ``user`` and ``project`` are used to
                build the service context.
    :returns: ``None``.
    :raises: ``BadRequest`` when the cluster named by the
             ``MASTER_CLUSTER`` property cannot be found.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)
    master = self.properties[self.MASTER_CLUSTER]

    try:
        master_cluster = cluster_obj.Cluster.find(ctx, master)
    except exc.ResourceNotFound:
        raise exc.BadRequest(
            msg=_("Cannot find the given cluster: %s") % master)

    if not master_cluster:
        return

    # Drop the kube node record kept by the kube master
    dependents = master_cluster.dependents
    if not dependents or 'kube-node' not in dependents:
        return

    dependents.pop('kube-node')
    cluster_obj.Cluster.update(ctx, master_cluster.id,
                               {'dependents': dependents})
def _create_security_group(self, obj):
    """Return the cluster's security group, creating it when necessary.

    When ``obj.data`` already holds a value under ``self.SECURITY_GROUP``
    that value is returned. Otherwise a group named ``self.name`` is
    created, saved into the cluster data and given its rules via
    ``self._set_security_group_rules``.

    :param obj: The object whose ``id``, ``user``, ``project`` and ``data``
                are used; it is passed to ``Cluster.update`` as a cluster.
    :returns: ID of the security group.
    :raises: `EResourceCreation` when creating the group raises any
             exception.
    """
    ctx = context.get_service_context(user_id=obj.user,
                                      project_id=obj.project)

    existing_id = obj.data.get(self.SECURITY_GROUP, None)
    if existing_id:
        return existing_id

    net_client = self.network(obj)
    try:
        group = net_client.security_group_create(name=self.name)
    except Exception as ex:
        raise exc.EResourceCreation(type='kubernetes', message=str(ex))

    cluster_data = obj.data
    cluster_data[self.SECURITY_GROUP] = group.id
    cluster_obj.Cluster.update(ctx, obj.id, {'data': cluster_data})

    self._set_security_group_rules(obj, group.id)
    return group.id
def _get_trust(self, req):
    """Return the ID of a trust with the current user as the trustor.

    The stored credential is checked first through the engine. When it
    yields no trust, an existing trust is looked up through the identity
    driver (or a new one created) and then saved via ``credential_create``.

    :param req: The WSGI request object; ``req.context`` is used.
    :return: ID of the trust or exception of InternalError.
    """
    engine = rpc.EngineClient()
    ctx = req.context

    get_params = {'user': ctx.user, 'project': ctx.project}
    get_req = util.parse_request('CredentialGetRequest', req, get_params)
    stored = engine.call2(ctx, 'credential_get', get_req)
    if stored:
        cached_trust = stored.get('trust', None)
        if cached_trust:
            return cached_trust

    conn_params = {
        'auth_url': ctx.auth_url,
        'token': ctx.auth_token,
        'project_id': ctx.project,
        'user_id': ctx.user,
    }
    keystone = driver_base.SenlinDriver().identity(conn_params)
    service_cred = context.get_service_context()
    admin_id = keystone.get_user_id(**service_cred)

    try:
        trust = keystone.trust_get_by_trustor(ctx.user, admin_id,
                                              ctx.project)
    except exception.InternalError as ex:
        # A 400 from the lookup is treated as "no trust found"
        if ex.code != 400:
            raise
        trust = None

    if not trust:
        # No existing trust found, so one is created
        trust = keystone.trust_create(ctx.user, admin_id, ctx.project,
                                      ctx.roles)

    # If credential not exists, create it, otherwise update it.
    create_params = {'cred': {'openstack': {'trust': trust.id}}}
    create_req = util.parse_request('CredentialCreateRequest', req,
                                    create_params)
    engine.call2(ctx, 'credential_create', create_req)

    return trust.id
def _get_trust(self, ctx):
    """Return the ID of a trust with the current user as the trustor.

    The stored credential is checked first through the engine. When it
    yields no trust, an existing trust is looked up through the identity
    driver (or a new one created); the credential is then updated if a
    record was found earlier, or created otherwise.

    :param ctx: The requesting context.
    :return: ID of the trust or exception of InternalError.
    """
    engine = rpc.EngineClient()

    has_record = False
    stored = engine.credential_get(ctx)
    if stored:
        cached_trust = stored.get('trust', None)
        if cached_trust:
            return cached_trust
        # A record exists but carries no usable trust
        has_record = True

    conn_params = {
        'auth_url': ctx.auth_url,
        'token': ctx.auth_token,
        'project_id': ctx.project,
        'user_id': ctx.user,
    }
    keystone = driver_base.SenlinDriver().identity(conn_params)
    service_cred = context.get_service_context()
    admin_id = keystone.get_user_id(**service_cred)

    try:
        trust = keystone.trust_get_by_trustor(ctx.user, admin_id,
                                              ctx.project)
    except exception.InternalError as ex:
        # A 400 from the lookup is treated as "no trust found"
        if ex.code != 400:
            raise ex
        trust = None

    if not trust:
        # No existing trust found, so one is created
        trust = keystone.trust_create(ctx.user, admin_id, ctx.project,
                                      ctx.roles)

    # refresh the cached credential
    save = engine.credential_update if has_record else \
        engine.credential_create
    save(ctx, trust.id)

    return trust.id
def info(self, ctxt, publisher_id, event_type, payload, metadata):
    """Handle a VM notification and request recovery of the affected node.

    Nothing happens unless the ``cluster_id`` in the payload metadata
    equals ``self.cluster_id``, ``event_type`` is a key of
    ``self.VM_FAILURE_EVENTS`` and the metadata carries a
    ``cluster_node_id``.

    :param ctxt: Context delivered with the notification (not used here).
    :param publisher_id: ID of the publisher; forwarded in the params.
    :param event_type: Event type string of the notification.
    :param payload: Notification payload; ``metadata`` must be present.
    :param metadata: Notification metadata; ``timestamp`` is read from it.
    :returns: ``None``.
    """
    meta = payload['metadata']
    if not (meta.get('cluster_id') == self.cluster_id):
        return

    if event_type not in self.VM_FAILURE_EVENTS:
        return

    recover_params = {
        'event': self.VM_FAILURE_EVENTS[event_type],
        'state': payload.get('state', 'Unknown'),
        'instance_id': payload.get('instance_id', 'Unknown'),
        'timestamp': metadata['timestamp'],
        'publisher': publisher_id,
    }

    node_id = meta.get('cluster_node_id')
    if not node_id:
        return

    LOG.info(_LI("Requesting node recovery: %s"), node_id)
    service_values = context.get_service_context(project=self.project_id,
                                                 user=payload['user_id'])
    request_ctx = context.RequestContext(**service_values)
    self.rpc.node_recover(request_ctx, node_id, recover_params)
def _build_conn_params(self, cluster):
    """Build trust-based connection parameters.

    :param cluster: The cluster whose ``user`` and ``project`` are used to
                    look up the stored credential.
    :returns: A dict holding the service ``username``, ``password``,
              ``auth_url`` and ``user_domain_name`` plus the ``trust_id``
              taken from the stored credential.
    :raises: ``TrustNotFound`` when no credential is stored for the
             cluster's user and project.
    """
    service_creds = senlin_context.get_service_context()
    copied = ('username', 'password', 'auth_url', 'user_domain_name')
    conn_params = {name: service_creds.get(name) for name in copied}

    stored = db_api.cred_get(oslo_context.get_current(),
                             cluster.user, cluster.project)
    if stored is None:
        raise exception.TrustNotFound(trustor=cluster.user)

    conn_params['trust_id'] = stored.cred['openstack']['trust']
    return conn_params
def _build_conn_params(self, cluster):
    """Build trust-based connection parameters.

    :param cluster: The cluster whose ``user`` and ``project`` are used to
                    look up the stored credential.
    :returns: A dict holding the service ``username``, ``password``,
              ``auth_url`` and ``user_domain_name`` plus ``trust_id``, a
              one-element list with the trust from the stored credential.
    :raises: ``TrustNotFound`` when no credential is stored for the
             cluster's user and project.
    """
    service_creds = senlin_context.get_service_context()
    copied = ('username', 'password', 'auth_url', 'user_domain_name')
    conn_params = {name: service_creds.get(name) for name in copied}

    stored = db_api.cred_get(oslo_context.get_current(),
                             cluster.user, cluster.project)
    if stored is None:
        raise exception.TrustNotFound(trustor=cluster.user)

    # The trust ID is wrapped in a list here
    conn_params['trust_id'] = [stored.cred['openstack']['trust']]
    return conn_params
def _poll_url(self, cluster_id, timeout, recover_action, params):
    """Poll node status from a URL and wait for triggered recoveries.

    ``self._check_url_and_recover_node`` is called for every node of the
    cluster; each truthy result is treated as an action to wait for.

    :param cluster_id: The UUID of the cluster to be checked.
    :param timeout: The maximum number of seconds to wait for a recovery
                    action.
    :param recover_action: The health policy action name.
    :param params: Parameters specific to poll url or recovery action.
    :returns: The value of ``_chase_up(start_time, timeout)``.
    """
    start_time = timeutils.utcnow(True)

    cluster = objects.Cluster.get(self.ctx, cluster_id, project_safe=False)
    if not cluster:
        LOG.warning("Cluster (%s) is not found.", cluster_id)
        return _chase_up(start_time, timeout)

    ctx = context.get_service_context(user_id=cluster.user,
                                      project_id=cluster.project)

    # poll the url for each node, keeping only the truthy results
    nodes = objects.Node.get_all_by_cluster(ctx, cluster_id)
    outcomes = (
        self._check_url_and_recover_node(ctx, node, recover_action, params)
        for node in nodes
    )
    pending = [outcome for outcome in outcomes if outcome]

    for entry in pending:
        # wait for the action to complete
        done, reason = self._wait_for_action(ctx, entry['action'], timeout)
        if done:
            continue
        LOG.warning(
            "Node recovery action %s did not complete "
            "within specified timeout: %s", entry['action'], reason)

    return _chase_up(start_time, timeout)
def _get_trust(self, ctx):
    """Return the ID of a trust with the current user as the trustor.

    The stored credential is checked first through the engine. When it
    yields no trust, an existing trust is looked up through the identity
    driver (or a new one created) and saved via ``credential_create``.

    :param ctx: The requesting context.
    :return: ID of the trust or exception of InternalError.
    """
    engine = rpc.EngineClient()

    stored = engine.credential_get(ctx)
    if stored:
        cached_trust = stored.get('trust', None)
        if cached_trust:
            return cached_trust

    conn_params = {
        'auth_url': ctx.auth_url,
        'token': ctx.auth_token,
        'project_id': ctx.project,
        'user_id': ctx.user,
    }
    keystone = driver_base.SenlinDriver().identity(conn_params)
    service_cred = context.get_service_context()
    admin_id = keystone.get_user_id(**service_cred)

    try:
        trust = keystone.trust_get_by_trustor(ctx.user, admin_id,
                                              ctx.project)
    except exception.InternalError as ex:
        # A 400 from the lookup is treated as "no trust found"
        if ex.code != 400:
            raise ex
        trust = None

    if not trust:
        # No existing trust found, so one is created
        trust = keystone.trust_create(ctx.user, admin_id, ctx.project,
                                      ctx.roles)

    # If credential not exists, create it, otherwise update it.
    engine.credential_create(ctx, trust.id)

    return trust.id
def process_request(self, req):
    """Add an auth token to a webhook trigger request.

    Requests that are not ``POST`` or whose URL is not recognised by
    ``self._parse_url`` are left untouched.

    :param req: The incoming request object.
    :returns: ``None``; on success ``req.headers['X-Auth-Token']`` is set.
    :raises: ``HTTPBadRequest`` when building or validating the receiver
             get request raises ``ValueError`` or a jsonschema
             ``ValidationError``.
    """
    # Only POST requests are handled
    if req.method != 'POST':
        return

    # The URL yields the webhook (receiver) ID and the params
    parsed = self._parse_url(req.url)
    if not parsed:
        return
    receiver_id, params = parsed

    admin_ctx = context.RequestContext(is_admin=True)
    engine = rpc.EngineClient()

    try:
        primitive = obj_base.SenlinObject.normalize_req(
            'ReceiverGetRequest', {'identity': receiver_id})
        get_req = vorr.ReceiverGetRequest.obj_from_primitive(primitive)
        jsonschema.validate(primitive, get_req.to_json_schema())
    except (ValueError) as ex:
        raise exc.HTTPBadRequest(six.text_type(ex))
    except jsonschema.exceptions.ValidationError as ex:
        raise exc.HTTPBadRequest(six.text_type(ex.message))

    receiver = engine.call2(admin_ctx, 'receiver_get', get_req)

    service = context.get_service_context()
    credential_keys = ('auth_url', 'username', 'user_domain_name',
                       'password')
    token_args = {name: service[name] for name in credential_keys}
    # The receiver's actor entries are applied on top of the service ones
    token_args.update(receiver['actor'])

    # Obtain a token and place it into the request header
    req.headers['X-Auth-Token'] = self._get_token(**token_args)
def _build_conn_params(self, user, project):
    """Build connection params for a specific user and project.

    :param user: The ID of the user for which a trust will be used.
    :param project: The ID of the project for which a trust will be used.
    :returns: A dict holding the service ``username``, ``password``,
              ``auth_url`` and ``user_domain_name`` plus the ``trust_id``
              taken from the stored credential.
    :raises: ``TrustNotFound`` when no credential is stored for the given
             user and project.
    """
    service_creds = senlin_context.get_service_context()
    copied = ('username', 'password', 'auth_url', 'user_domain_name')
    conn_params = {name: service_creds.get(name) for name in copied}

    stored = co.Credential.get(oslo_context.get_current(), user, project)
    if stored is None:
        raise exception.TrustNotFound(trustor=user)

    conn_params['trust_id'] = stored.cred['openstack']['trust']
    return conn_params
def _execute_health_check(self, interval, cluster_id, recover_action,
                          recovery_cond, node_update_timeout):
    """Run the registered health checks on every node of a cluster.

    A node found unhealthy is recovered via ``self._recover_node`` and the
    resulting actions are waited for. Errors derived from ``Exception``
    are logged and suppressed.

    :param interval: Passed to ``_chase_up`` together with the start time.
    :param cluster_id: The UUID of the cluster to be checked; also the key
                       into ``self.health_check_types``.
    :param recover_action: Passed on to ``self._recover_node``.
    :param recovery_cond: ``consts.ANY_FAILED`` or ``consts.ALL_FAILED``;
                          any other value raises an exception that is
                          caught and logged here.
    :param node_update_timeout: Timeout used when waiting for each
                                recovery action.
    :returns: The value of ``_chase_up(start_time, interval)``.
    """
    start_time = timeutils.utcnow(True)

    try:
        # The ``cluster_id`` argument is what gets logged: it is the
        # cluster actually being checked by this call.
        if cluster_id not in self.health_check_types:
            LOG.error("Cluster (%s) is not found in health_check_types.",
                      cluster_id)
            return _chase_up(start_time, interval)

        if len(self.health_check_types[cluster_id]) == 0:
            LOG.error("No health check types found for Cluster (%s).",
                      cluster_id)
            return _chase_up(start_time, interval)

        cluster = objects.Cluster.get(self.ctx, cluster_id,
                                      project_safe=False)
        if not cluster:
            LOG.warning("Cluster (%s) is not found.", cluster_id)
            return _chase_up(start_time, interval)

        ctx = context.get_service_context(user_id=cluster.user,
                                          project_id=cluster.project)

        actions = []

        # loop through nodes and run all health checks on each node
        nodes = objects.Node.get_all_by_cluster(ctx, cluster_id)
        for node in nodes:
            if recovery_cond == consts.ANY_FAILED:
                # recovery happens if any detection mode fails
                # i.e. the inverse logic is that node is considered
                # healthy if all detection modes pass
                node_is_healthy = all(
                    hc.run_health_check(ctx, node)
                    for hc in self.health_check_types[cluster_id])
            elif recovery_cond == consts.ALL_FAILED:
                # recovery happens if all detection modes fail
                # i.e. the inverse logic is that node is considered
                # healthy if any detection mode passes
                node_is_healthy = any(
                    hc.run_health_check(ctx, node)
                    for hc in self.health_check_types[cluster_id])
            else:
                raise Exception(
                    '{} is an invalid recovery conditional'.format(
                        recovery_cond))

            if not node_is_healthy:
                action = self._recover_node(node.id, ctx, recover_action)
                actions.append(action)

        for a in actions:
            # wait for action to complete
            res, reason = self._wait_for_action(ctx, a['action'],
                                                node_update_timeout)
            if not res:
                LOG.warning(
                    "Node recovery action %s did not complete "
                    "within specified timeout: %s", a['action'], reason)

        # Only a run that triggered no recovery counts as "passed"
        if not actions:
            LOG.info('Health check passed for all nodes in cluster %s.',
                     cluster_id)
    except Exception as ex:
        LOG.warning('Error while performing health check: %s', ex)

    return _chase_up(start_time, interval)