def _clear_fault(self, alarm_id, entity_instance_id=None):
    # Only clear alarm if there is one already raised
    if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
            self.current_health_alarm):
        LOG.info(_LI("Clearing health alarm"))
        self.service.fm_api.clear_fault(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH,
            self.service.entity_instance_id)
    elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
            entity_instance_id in self.current_quota_alarms):
        LOG.info(_LI("Clearing quota alarm with entity_instance_id %s") %
                 entity_instance_id)
        self.service.fm_api.clear_fault(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
            entity_instance_id)

def get_tiers_size(self, _):
    """Get the ceph cluster tier sizes.

    returns: a dict of sizes (in GB) by tier name
    """
    tiers_size = self.service.monitor.tiers_size
    LOG.debug(_LI("Ceph cluster tiers (size in GB): %s") %
              str(tiers_size))
    return tiers_size

def osd_pool_set_quota(ceph_api, pool_name, max_bytes=0, max_objects=0):
    """Set the quota for an OSD pool.

    Setting max_bytes or max_objects to 0 will disable that quota param.
    :param pool_name: OSD pool name
    :param max_bytes: maximum bytes for the OSD pool
    :param max_objects: maximum objects for the OSD pool
    """
    # Update quota only if it differs from the currently applied values
    prev_quota = osd_pool_get_quota(ceph_api, pool_name)
    if prev_quota["max_bytes"] != max_bytes:
        resp, b = ceph_api.osd_set_pool_quota(pool_name, 'max_bytes',
                                              max_bytes, body='json')
        if resp.ok:
            LOG.info(_LI("Set OSD pool quota: "
                         "pool_name={}, max_bytes={}").format(
                             pool_name, max_bytes))
        else:
            e = exception.CephPoolSetQuotaFailure(
                pool=pool_name, name='max_bytes',
                value=max_bytes, reason=resp.reason)
            LOG.error(e)
            raise e
    if prev_quota["max_objects"] != max_objects:
        resp, b = ceph_api.osd_set_pool_quota(pool_name, 'max_objects',
                                              max_objects, body='json')
        if resp.ok:
            LOG.info(_LI("Set OSD pool quota: "
                         "pool_name={}, max_objects={}").format(
                             pool_name, max_objects))
        else:
            e = exception.CephPoolSetQuotaFailure(
                pool=pool_name, name='max_objects',
                value=max_objects, reason=resp.reason)
            LOG.error(e)
            raise e

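# Illustrative usage of osd_pool_set_quota() (not part of the module): the
# pool name below is hypothetical, and `ceph_api` is assumed to be the same
# wrapped Ceph REST client passed to the other helpers in this file.
#
#     # Cap the pool at 100 GiB; leave the object quota disabled (0).
#     osd_pool_set_quota(ceph_api, 'cinder-volumes',
#                        max_bytes=100 * 1024 ** 3, max_objects=0)
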
def _set_upgrade(self, upgrade):
    state = upgrade.get('state')
    from_version = upgrade.get('from_version')
    if (state and state != constants.UPGRADE_COMPLETED and
            from_version == constants.TITANIUM_SERVER_VERSION_18_03):
        LOG.info(_LI("Wait for ceph upgrade to complete "
                     "before monitoring cluster."))
        self.wait_for_upgrade_complete = True

def osd_pool_create(ceph_api, pool_name, pg_num, pgp_num):
    # ruleset 0 is the default ruleset if no crushmap is loaded, or the
    # ruleset for the backing tier if one is loaded.
    # Name: storage_tier_ruleset
    ruleset = 0
    response, body = ceph_api.osd_pool_create(
        pool_name, pg_num, pgp_num, pool_type="replicated",
        ruleset=ruleset, body='json')
    if response.ok:
        LOG.info(_LI("Created OSD pool: "
                     "pool_name={}, pg_num={}, pgp_num={}, "
                     "pool_type=replicated, ruleset={}").format(
                         pool_name, pg_num, pgp_num, ruleset))
    else:
        e = exception.CephPoolCreateFailure(
            name=pool_name, reason=response.reason)
        LOG.error(e)
        raise e

    # Explicitly assign the ruleset to the pool on creation since it is
    # ignored in the create call
    response, body = ceph_api.osd_set_pool_param(
        pool_name, "crush_ruleset", ruleset, body='json')
    if response.ok:
        LOG.info(_LI("Assigned crush ruleset to OSD pool: "
                     "pool_name={}, ruleset={}").format(
                         pool_name, ruleset))
    else:
        e = exception.CephPoolRulesetFailure(
            name=pool_name, reason=response.reason)
        LOG.error(e)
        # Roll back the partially configured pool before raising
        ceph_api.osd_pool_delete(
            pool_name, pool_name,
            sure='--yes-i-really-really-mean-it', body='json')
        raise e

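# Illustrative usage of osd_pool_create() (not part of the module): the pool
# name and PG counts are hypothetical; `ceph_api` is assumed to be the wrapped
# Ceph REST client used throughout this file.
#
#     # Create a replicated pool with 64 placement groups on ruleset 0,
#     # then give it a byte quota so the quota polling has a value to check.
#     osd_pool_create(ceph_api, 'kube-rbd', pg_num=64, pgp_num=64)
#     osd_pool_set_quota(ceph_api, 'kube-rbd', max_bytes=10 * 1024 ** 3)
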
def ceph_poll_status(self):
    # get previous data every time in case:
    # * daemon restarted
    # * alarm was cleared manually but stored as raised in daemon
    self._get_current_alarms()
    if self.current_health_alarm:
        LOG.info(_LI("Current alarm: %s") %
                 str(self.current_health_alarm.__dict__))

    # get ceph health
    health = self._get_health()
    LOG.info(_LI("Current Ceph health: "
                 "%(health)s detail: %(detail)s") % health)

    health = self.filter_health_status(health)
    if health['health'] != constants.CEPH_HEALTH_OK:
        self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
        self._report_alarm_osds_health()
    else:
        self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
        self.clear_all_major_critical()

def get_primary_tier_size(self, _):
    """Get the ceph size for the primary tier.

    returns: an int for the size (in GB) of the tier
    """
    tiers_size = self.service.monitor.tiers_size
    primary_tier_size = tiers_size.get(
        self.service.monitor.primary_tier_name, 0)
    LOG.debug(_LI("Ceph cluster primary tier size: %s GB") %
              str(primary_tier_size))
    return primary_tier_size

def set_flag_require_jewel_osds(self):
    try:
        response, body = self.service.ceph_api.osd_set_key(
            constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
            body='json')
        LOG.info(_LI("Set require_jewel_osds flag"))
    except IOError as e:
        raise exception.CephApiFailure(
            call="osd_set_key",
            reason=str(e))
    else:
        if not response.ok:
            raise exception.CephSetKeyFailure(
                flag=constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
                extra=_("needed to complete upgrade to Jewel"),
                response_status_code=response.status_code,
                response_reason=response.reason,
                status=body.get('status'),
                output=body.get('output'))

def osd_pool_delete(ceph_api, pool_name):
    """Delete an osd pool

    :param pool_name: pool name
    """
    response, body = ceph_api.osd_pool_delete(
        pool_name, pool_name,
        sure='--yes-i-really-really-mean-it',
        body='json')
    if response.ok:
        LOG.info(_LI("Deleted OSD pool {}").format(pool_name))
    else:
        e = exception.CephPoolDeleteFailure(
            name=pool_name, reason=response.reason)
        LOG.warn(e)
        raise e

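# Illustrative usage of osd_pool_delete() (not part of the module): the pool
# name is hypothetical. The underlying Ceph call requires the pool name twice
# plus the '--yes-i-really-really-mean-it' confirmation; this helper supplies
# both on the caller's behalf.
#
#     osd_pool_delete(ceph_api, 'kube-rbd')
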
def ceph_get_fsid(self):
    # Check whether an alarm has already been raised
    self._get_current_alarms()
    if self.current_health_alarm:
        LOG.info(_LI("Current alarm: %s") %
                 str(self.current_health_alarm.__dict__))

    fsid = self._get_fsid()
    if not fsid:
        # Raise alarm - it will not have an entity_instance_id
        self._report_fault({'health': constants.CEPH_HEALTH_DOWN,
                            'detail': 'Ceph cluster is down.'},
                           fm_constants.FM_ALARM_ID_STORAGE_CEPH)
    else:
        # Clear alarm with no entity_instance_id
        self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
        self.service.entity_instance_id = 'cluster=%s' % fsid

def auto_heal(self, health):
    if (health['health'] == constants.CEPH_HEALTH_WARN and
            (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
             in health['detail'])):
        try:
            upgrade = self.service.get_software_upgrade_status()
        except Exception as ex:
            LOG.warn(_LW(
                "Getting software upgrade status failed "
                "with: %s. Skip auto-heal attempt "
                "(will retry on next ceph status poll).") % str(ex))
            return health
        state = upgrade.get('state')
        # suppress require_jewel_osds in case upgrade is
        # in progress but not completed or aborting
        if (not self.wait_for_upgrade_complete and
                (upgrade.get('from_version') ==
                 constants.TITANIUM_SERVER_VERSION_18_03) and
                state not in [
                    None,
                    constants.UPGRADE_COMPLETED,
                    constants.UPGRADE_ABORTING,
                    constants.UPGRADE_ABORT_COMPLETING,
                    constants.UPGRADE_ABORTING_ROLLBACK]):
            self.wait_for_upgrade_complete = True
        # set require_jewel_osds in case upgrade is
        # not in progress or is completed
        if state in [None, constants.UPGRADE_COMPLETED]:
            LOG.warn(_LW(
                "No upgrade in progress or upgrade is completed "
                "and require_jewel_osds health warning raised. "
                "Set require_jewel_osds flag."))
            self.set_flag_require_jewel_osds()
            health = self._remove_require_jewel_osds_warning(health)
            LOG.info(_LI("Unsuppress require_jewel_osds health warning"))
            self.wait_for_upgrade_complete = False
        # unsuppress require_jewel_osds in case upgrade
        # is aborting
        if state in [
                constants.UPGRADE_ABORTING,
                constants.UPGRADE_ABORT_COMPLETING,
                constants.UPGRADE_ABORTING_ROLLBACK]:
            self.wait_for_upgrade_complete = False
    return health

def _report_fault(self, health, alarm_id):
    if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
        new_severity = constants.SEVERITY[health['health']]
        new_reason_text = self._parse_reason(health)
        new_service_affecting = \
            constants.SERVICE_AFFECTING[health['health']]

        # Raise or update alarm if necessary
        if ((not self.current_health_alarm) or
                (self.current_health_alarm.__dict__['severity'] !=
                 new_severity) or
                (self.current_health_alarm.__dict__['reason_text'] !=
                 new_reason_text) or
                (self.current_health_alarm.__dict__['service_affecting'] !=
                 str(new_service_affecting))):

            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=self.service.entity_instance_id,
                severity=new_severity,
                reason_text=new_reason_text,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=constants.REPAIR_ACTION,
                service_affecting=new_service_affecting)

            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(_LI(
                    "Created storage alarm %(alarm_uuid)s - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "alarm_uuid": alarm_uuid,
                    "severity": new_severity,
                    "reason": new_reason_text,
                    "service_affecting": new_service_affecting})
            else:
                LOG.error(_LE(
                    "Failed to create storage alarm - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "severity": new_severity,
                    "reason": new_reason_text,
                    "service_affecting": new_service_affecting})

        # Log detailed reason for later analysis
        if (self.current_ceph_health != health['health'] or
                self.detailed_health_reason != health['detail']):
            LOG.info(_LI("Ceph status changed: %(health)s "
                         "detailed reason: %(detail)s") % health)
            self.current_ceph_health = health['health']
            self.detailed_health_reason = health['detail']

    elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
          health['tier_eid'] not in self.current_quota_alarms):

        quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
                             "sum of Ceph pool quotas does not match the "
                             "tier size." % health['tier_name'])
        fault = fm_api.Fault(
            alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
            entity_instance_id=health['tier_eid'],
            severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
            reason_text=quota_reason_text,
            alarm_type=fm_constants.FM_ALARM_TYPE_7,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
            proposed_repair_action=(
                "Update ceph storage pool quotas to use all available "
                "cluster space for the %s tier." % health['tier_name']),
            service_affecting=False)

        alarm_uuid = self.service.fm_api.set_fault(fault)
        if alarm_uuid:
            LOG.info(_LI(
                "Created storage quota alarm %(alarm_uuid)s. "
                "Reason: %(reason)s") % {
                "alarm_uuid": alarm_uuid,
                "reason": quota_reason_text})
        else:
            LOG.error(_LE("Failed to create quota "
                          "storage alarm. Reason: %s") % quota_reason_text)

def _report_alarm_osds_health(self):
    response, osd_tree = self.service.ceph_api.osd_tree(body='json')
    if not response.ok:
        LOG.error(_LE(
            "Failed to retrieve Ceph OSD tree: "
            "status_code: %(status_code)s, reason: %(reason)s") % {
            "status_code": response.status_code,
            "reason": response.reason})
        return
    osd_tree = dict([(n['id'], n) for n in osd_tree['output']['nodes']])
    alarms = []

    self._check_storage_tier(osd_tree, "storage-tier",
                             lambda *args: alarms.append(args))

    old_alarms = {}
    for alarm_id in [
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL]:
        alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
        if not alarm_list:
            continue
        for alarm in alarm_list:
            if alarm.entity_instance_id not in old_alarms:
                old_alarms[alarm.entity_instance_id] = []
            old_alarms[alarm.entity_instance_id].append(
                (alarm.alarm_id, alarm.reason_text))

    for peer_group, reason, severity in alarms:
        if self._current_health_alarm_equals(reason, severity):
            continue
        alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
        if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
            alarm_critical_major = (
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        entity_instance_id = (
            self.service.entity_instance_id + '.peergroup=' + peer_group)
        alarm_already_exists = False
        if entity_instance_id in old_alarms:
            for alarm_id, old_reason in old_alarms[entity_instance_id]:
                if (reason == old_reason and
                        alarm_id == alarm_critical_major):
                    # if the alarm is exactly the same, we don't need
                    # to recreate it
                    old_alarms[entity_instance_id].remove(
                        (alarm_id, old_reason))
                    alarm_already_exists = True
                elif alarm_id == alarm_critical_major:
                    # if we change just the reason, then we just remove the
                    # alarm from the list so we don't remove it at the
                    # end of the function
                    old_alarms[entity_instance_id].remove(
                        (alarm_id, old_reason))

            if len(old_alarms[entity_instance_id]) == 0:
                del old_alarms[entity_instance_id]

        # in case the alarm is exactly the same, we skip the alarm set
        if alarm_already_exists is True:
            continue
        major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
        fault = fm_api.Fault(
            alarm_id=alarm_critical_major,
            alarm_type=fm_constants.FM_ALARM_TYPE_4,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
            entity_instance_id=entity_instance_id,
            severity=severity,
            reason_text=reason,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
            proposed_repair_action=major_repair_action,
            service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
        alarm_uuid = self.service.fm_api.set_fault(fault)
        if alarm_uuid:
            LOG.info(_LI(
                "Created storage alarm %(alarm_uuid)s - "
                "severity: %(severity)s, reason: %(reason)s, "
                "service_affecting: %(service_affecting)s") % {
                "alarm_uuid": str(alarm_uuid),
                "severity": str(severity),
                "reason": reason,
                "service_affecting": str(
                    constants.SERVICE_AFFECTING['HEALTH_WARN'])})
        else:
            LOG.error(_LE(
                "Failed to create storage alarm - "
                "severity: %(severity)s, reason: %(reason)s, "
                "service_affecting: %(service_affecting)s") % {
                "severity": str(severity),
                "reason": reason,
                "service_affecting": str(
                    constants.SERVICE_AFFECTING['HEALTH_WARN'])})

    # Clear any remaining stale alarms that no longer apply
    for entity_instance_id in old_alarms:
        for alarm_id, old_reason in old_alarms[entity_instance_id]:
            self.service.fm_api.clear_fault(alarm_id, entity_instance_id)

def ceph_poll_quotas(self):
    self._get_current_alarms()
    if self.current_quota_alarms:
        LOG.info(_LI("Current quota alarms %s") %
                 self.current_quota_alarms)

    # Get the current size of each tier
    previous_tiers_size = self.tiers_size
    self.tiers_size = self._get_tiers_size()

    # Make sure any removed tiers have the alarms cleared
    for t in (set(previous_tiers_size) - set(self.tiers_size)):
        self._clear_fault(
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
            "{0}.tier={1}".format(
                self.service.entity_instance_id,
                t[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]))

    # Check the quotas on each tier
    for tier in self.tiers_size:
        # Extract the tier name from the crush equivalent
        tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]

        if self.tiers_size[tier] == 0:
            LOG.info(_LI("'%s' tier cluster size not yet available") %
                     tier_name)
            continue

        pools_quota_sum = 0
        if tier == self.primary_tier_name:
            for pool in constants.CEPH_POOLS:
                if (pool['pool_name'] ==
                        constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL or
                        pool['pool_name'] ==
                        constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                    object_pool_name = self._get_object_pool_name()
                    if object_pool_name is None:
                        LOG.error("Rados gateway object data pool does "
                                  "not exist.")
                    else:
                        pools_quota_sum += \
                            self._get_osd_pool_quota(object_pool_name)
                else:
                    pools_quota_sum += self._get_osd_pool_quota(
                        pool['pool_name'])
        else:
            for pool in constants.SB_TIER_CEPH_POOLS:
                pool_name = "{0}-{1}".format(pool['pool_name'], tier_name)
                pools_quota_sum += self._get_osd_pool_quota(pool_name)

        # Currently, there is only one pool on the additional tier(s),
        # therefore allow a quota of 0
        if (pools_quota_sum != self.tiers_size[tier] and
                pools_quota_sum != 0):
            self._report_fault(
                {'tier_name': tier_name,
                 'tier_eid': "{0}.tier={1}".format(
                     self.service.entity_instance_id, tier_name)},
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
        else:
            self._clear_fault(
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                "{0}.tier={1}".format(
                    self.service.entity_instance_id, tier_name))

def get_software_upgrade_status(self):
    LOG.info(_LI("Getting software upgrade status from sysinv"))
    cctxt = self.sysinv_conductor.prepare(timeout=2)
    upgrade = cctxt.call({}, 'get_software_upgrade_status')
    LOG.info(_LI("Software upgrade status: %s") % str(upgrade))
    return upgrade