def _get_tiers_size(self): try: resp, body = self.service.ceph_api.osd_df(body='json', output_method='tree') except IOError: return 0 if not resp.ok: LOG.error( _LE("Getting the cluster usage " "information failed: %(reason)s - " "%(body)s") % { "reason": resp.reason, "body": body }) return {} # A node is a crushmap element: root, chassis, host, osd. Create a # dictionary for the nodes with the key as the id used for efficient # searching through nodes. # # For example: storage-0's node has one child node => OSD 0 # { # "id": -4, # "name": "storage-0", # "type": "host", # "type_id": 1, # "reweight": -1.000000, # "kb": 51354096, # "kb_used": 1510348, # "kb_avail": 49843748, # "utilization": 2.941047, # "var": 1.480470, # "pgs": 0, # "children": [ # 0 # ] # }, search_tree = {} for node in body['output']['nodes']: search_tree[node['id']] = node # Extract the tiers as we will return a dict for the size of each tier tiers = {k: v for k, v in search_tree.items() if v['type'] == 'root'} # For each tier, traverse the heirarchy from the root->chassis->host. # Sum the host sizes to determine the overall size of the tier tier_sizes = {} for tier in tiers.values(): tier_size = 0 for chassis_id in tier['children']: chassis_size = 0 chassis = search_tree[chassis_id] for host_id in chassis['children']: host = search_tree[host_id] if (chassis_size == 0 or chassis_size > host['kb']): chassis_size = host['kb'] tier_size += chassis_size / (1024**2) tier_sizes[tier['name']] = tier_size return tier_sizes
def _report_fault(self, health, alarm_id):
    """Raise or refresh the Ceph cluster health alarm via the FM API.

    Only acts on the cluster-health alarm id; logs a detailed status
    line whenever the reported health or its detail text changes.
    """
    if alarm_id != fm_constants.FM_ALARM_ID_STORAGE_CEPH:
        return

    severity = constants.SEVERITY[health['health']]
    reason_text = self._parse_reason(health)
    service_affecting = constants.SERVICE_AFFECTING[health['health']]

    # The alarm is already current only if one exists and every field
    # matches what we would raise now.
    existing = self.current_health_alarm
    alarm_current = (
        bool(existing) and
        existing.__dict__['severity'] == severity and
        existing.__dict__['reason_text'] == reason_text and
        existing.__dict__['service_affecting'] == str(service_affecting))

    if not alarm_current:
        fault = fm_api.Fault(
            alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
            alarm_type=fm_constants.FM_ALARM_TYPE_4,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
            entity_instance_id=self.service.entity_instance_id,
            severity=severity,
            reason_text=reason_text,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
            proposed_repair_action=constants.REPAIR_ACTION,
            service_affecting=service_affecting)

        alarm_uuid = self.service.fm_api.set_fault(fault)
        details = {"alarm_uuid": alarm_uuid,
                   "severity": severity,
                   "reason": reason_text,
                   "service_affecting": service_affecting}
        if alarm_uuid:
            LOG.info(_LI(
                "Created storage alarm %(alarm_uuid)s - "
                "severity: %(severity)s, reason: %(reason)s, "
                "service_affecting: %(service_affecting)s") % details)
        else:
            LOG.error(_LE(
                "Failed to create storage alarm - "
                "severity: %(severity)s, reason: %(reason)s "
                "service_affecting: %(service_affecting)s") % details)

    # Record and log any change in the detailed health for later analysis.
    if (health['health'] != self.current_ceph_health or
            health['detail'] != self.detailed_health_reason):
        LOG.info(_LI("Ceph status changed: %(health)s "
                     "detailed reason: %(detail)s") % health)
        self.current_ceph_health = health['health']
        self.detailed_health_reason = health['detail']
def _get_osd_pool_quota(self, pool_name): try: resp, quota = self.service.ceph_api.osd_get_pool_quota( pool_name, body='json') except IOError: return 0 if not resp.ok: LOG.error(_LE("Getting the quota for " "%(name)s pool failed:%(reason)s)") % {"name": pool_name, "reason": resp.reason}) return 0 else: try: quota_gib = int(quota["output"]["quota_max_bytes"]) / (1024**3) return quota_gib except IOError: return 0
def _report_fault(self, health, alarm_id):
    """Raise or refresh a Ceph alarm through the FM API.

    Handles two alarm ids: the overall cluster-health alarm, and the
    free-space/quota mismatch alarm for a storage tier.

    :param health: dict describing Ceph status; this method reads
        'health' and 'detail', and for the free-space alarm also
        'tier_eid' and 'tier_name'
    :param alarm_id: fm_constants alarm id selecting which alarm to raise
    """
    if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
        # Map the Ceph health string onto FM severity / service impact.
        new_severity = constants.SEVERITY[health['health']]
        new_reason_text = self._parse_reason(health)
        new_service_affecting = \
            constants.SERVICE_AFFECTING[health['health']]

        # Raise or update alarm if necessary, i.e. only when some field
        # of the currently-raised alarm differs from what we would set.
        # NOTE(review): service_affecting is compared via str() while
        # the other fields are compared directly - presumably the FM API
        # stores it as a string; confirm against fm_api.
        if ((not self.current_health_alarm) or
                (self.current_health_alarm.__dict__['severity'] !=
                    new_severity) or
                (self.current_health_alarm.__dict__['reason_text'] !=
                    new_reason_text) or
                (self.current_health_alarm.__dict__['service_affecting'] !=
                    str(new_service_affecting))):

            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=self.service.entity_instance_id,
                severity=new_severity,
                reason_text=new_reason_text,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=constants.REPAIR_ACTION,
                service_affecting=new_service_affecting)

            # set_fault returns the alarm uuid on success, falsy on failure.
            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(
                    _LI("Created storage alarm %(alarm_uuid)s - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                        "alarm_uuid": alarm_uuid,
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting
                    })
            else:
                LOG.error(
                    _LE("Failed to create storage alarm - "
                        "severity: %(severity)s, reason: %(reason)s "
                        "service_affecting: %(service_affecting)s") % {
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting
                    })

        # Log detailed reason for later analysis, but only on change,
        # and remember the new state so we don't log it again.
        if (self.current_ceph_health != health['health'] or
                self.detailed_health_reason != health['detail']):
            LOG.info(
                _LI("Ceph status changed: %(health)s "
                    "detailed reason: %(detail)s") % health)
            self.current_ceph_health = health['health']
            self.detailed_health_reason = health['detail']

    elif (alarm_id ==
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
            not health['tier_eid'] in self.current_quota_alarms):
        # Quota/space mismatch alarm: raised at most once per tier
        # (skipped while the tier's entity id is already tracked in
        # current_quota_alarms).
        quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
                             "sum of Ceph pool quotas does not match the "
                             "tier size." % health['tier_name'])
        fault = fm_api.Fault(
            alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
            entity_instance_id=health['tier_eid'],
            severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
            reason_text=quota_reason_text,
            alarm_type=fm_constants.FM_ALARM_TYPE_7,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
            proposed_repair_action=(
                "Update ceph storage pool quotas to use all available "
                "cluster space for the %s tier." % health['tier_name']),
            service_affecting=False)

        alarm_uuid = self.service.fm_api.set_fault(fault)
        if alarm_uuid:
            LOG.info(
                _LI("Created storage quota storage alarm %(alarm_uuid)s. "
                    "Reason: %(reason)s") % {
                    "alarm_uuid": alarm_uuid,
                    "reason": quota_reason_text
                })
        else:
            LOG.error(
                _LE("Failed to create quota "
                    "storage alarm. Reason: %s") % quota_reason_text)
def _report_alarm_osds_health(self):
    """Raise/clear per-peergroup OSD health alarms from the OSD tree.

    Compares the problems reported by ``_check_storage_tier`` against
    the alarms currently held by the FM API: identical alarms are kept,
    changed ones are re-raised, and stale ones are cleared at the end.
    """
    response, osd_tree = self.service.ceph_api.osd_tree(body='json')
    if not response.ok:
        LOG.error(
            _LE("Failed to retrieve Ceph OSD tree: "
                "status_code: %(status_code)s, reason: %(reason)s") % {
                "status_code": response.status_code,
                "reason": response.reason
            })
        return
    # Index crushmap nodes by id for fast lookup.
    osd_tree = {n['id']: n for n in osd_tree['output']['nodes']}
    alarms = []

    # _check_storage_tier reports (peer_group, reason, severity) tuples.
    self._check_storage_tier(osd_tree, "storage-tier",
                             lambda *args: alarms.append(args))

    # Collect currently-raised major/critical alarms, grouped by entity.
    old_alarms = {}
    for alarm_id in [
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
            fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL]:
        alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
        if not alarm_list:
            continue
        for alarm in alarm_list:
            if alarm.entity_instance_id not in old_alarms:
                old_alarms[alarm.entity_instance_id] = []
            old_alarms[alarm.entity_instance_id].append(
                (alarm.alarm_id, alarm.reason_text))

    for peer_group, reason, severity in alarms:
        if self._current_health_alarm_equals(reason, severity):
            continue
        alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
        if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
            alarm_critical_major = (
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
        entity_instance_id = (
            self.service.entity_instance_id + '.peergroup=' + peer_group)
        alarm_already_exists = False
        if entity_instance_id in old_alarms:
            # Iterate over a snapshot: the previous code iterated the
            # live list while calling remove() on it, which skips
            # elements when several matching alarms exist for one
            # entity, leaving them to be wrongly cleared below.
            for alarm_id, old_reason in list(old_alarms[entity_instance_id]):
                if (reason == old_reason and
                        alarm_id == alarm_critical_major):
                    # if the alarm is exactly the same, we don't need
                    # to recreate it
                    old_alarms[entity_instance_id].remove(
                        (alarm_id, old_reason))
                    alarm_already_exists = True
                elif (alarm_id == alarm_critical_major):
                    # if we change just the reason, then we just remove the
                    # alarm from the list so we don't remove it at the
                    # end of the function
                    old_alarms[entity_instance_id].remove(
                        (alarm_id, old_reason))
            if (len(old_alarms[entity_instance_id]) == 0):
                del old_alarms[entity_instance_id]
        # in case the alarm is exactly the same, we skip the alarm set
        if alarm_already_exists is True:
            continue
        major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
        fault = fm_api.Fault(
            alarm_id=alarm_critical_major,
            alarm_type=fm_constants.FM_ALARM_TYPE_4,
            alarm_state=fm_constants.FM_ALARM_STATE_SET,
            entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
            entity_instance_id=entity_instance_id,
            severity=severity,
            reason_text=reason,
            probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
            proposed_repair_action=major_repair_action,
            service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
        alarm_uuid = self.service.fm_api.set_fault(fault)
        if alarm_uuid:
            LOG.info(
                _LI("Created storage alarm %(alarm_uuid)s - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "alarm_uuid": str(alarm_uuid),
                    "severity": str(severity),
                    "reason": reason,
                    "service_affecting": str(
                        constants.SERVICE_AFFECTING['HEALTH_WARN'])
                })
        else:
            LOG.error(
                _LE("Failed to create storage alarm - "
                    "severity: %(severity)s, reason: %(reason)s, "
                    "service_affecting: %(service_affecting)s") % {
                    "severity": str(severity),
                    "reason": reason,
                    "service_affecting": str(
                        constants.SERVICE_AFFECTING['HEALTH_WARN'])
                })

    # Anything still listed in old_alarms was not matched by a current
    # problem report, so it is stale: clear it.
    for entity_instance_id in old_alarms:
        for alarm_id, old_reason in old_alarms[entity_instance_id]:
            self.service.fm_api.clear_fault(alarm_id, entity_instance_id)