Example #1
    def _report_fault(self, health, alarm_id):
        if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
            new_severity = constants.SEVERITY[health['health']]
            new_reason_text = self._parse_reason(health)
            new_service_affecting = \
                constants.SERVICE_AFFECTING[health['health']]

            # Raise or update alarm if necessary
            if ((not self.current_health_alarm) or
                (self.current_health_alarm.__dict__['severity'] !=
                 new_severity) or
                (self.current_health_alarm.__dict__['reason_text'] !=
                 new_reason_text) or
                (self.current_health_alarm.__dict__['service_affecting'] !=
                 str(new_service_affecting))):

                fault = fm_api.Fault(
                    alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                    alarm_type=fm_constants.FM_ALARM_TYPE_4,
                    alarm_state=fm_constants.FM_ALARM_STATE_SET,
                    entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                    entity_instance_id=self.service.entity_instance_id,
                    severity=new_severity,
                    reason_text=new_reason_text,
                    probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                    proposed_repair_action=constants.REPAIR_ACTION,
                    service_affecting=new_service_affecting)

                alarm_uuid = self.service.fm_api.set_fault(fault)
                if alarm_uuid:
                    LOG.info(_LI(
                        "Created storage alarm %(alarm_uuid)s - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                        "alarm_uuid": alarm_uuid,
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting})
                else:
                    LOG.error(_LE(
                        "Failed to create storage alarm - "
                        "severity: %(severity)s, reason: %(reason)s "
                        "service_affecting: %(service_affecting)s") % {
                        "severity": new_severity,
                        "reason": new_reason_text,
                        "service_affecting": new_service_affecting})

            # Log detailed reason for later analysis
            if (self.current_ceph_health != health['health'] or
                    self.detailed_health_reason != health['detail']):
                LOG.info(_LI("Ceph status changed: %(health)s "
                             "detailed reason: %(detail)s") % health)
                self.current_ceph_health = health['health']
                self.detailed_health_reason = health['detail']
Example #2
 def _clear_fault(self, alarm_id, entity_instance_id=None):
     # Only clear alarm if there is one already raised
     if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
             self.current_health_alarm):
         LOG.info(_LI("Clearing health alarm"))
         self.service.fm_api.clear_fault(
             fm_constants.FM_ALARM_ID_STORAGE_CEPH,
             self.service.entity_instance_id)
     elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE and
           entity_instance_id in self.current_quota_alarms):
         LOG.info(_LI("Clearing quota alarm with entity_instance_id %s")
                  % entity_instance_id)
         self.service.fm_api.clear_fault(
             fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
             entity_instance_id)
Example #3
 def _clear_fault(self, alarm_id, entity_instance_id=None):
     # Only clear alarm if there is one already raised
     if (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH and
             self.current_health_alarm):
         LOG.info(_LI("Clearing health alarm"))
         self.service.fm_api.clear_fault(
             fm_constants.FM_ALARM_ID_STORAGE_CEPH,
             self.service.entity_instance_id)
Example #4
    def get_tiers_size(self, _):
        """Get the ceph cluster tier sizes.

        returns: a dict of sizes (in GB) by tier name
        """

        tiers_size = self.service.monitor.tiers_size
        LOG.debug(_LI("Ceph cluster tiers (size in GB): %s") % str(tiers_size))
        return tiers_size
Example #5
def osd_pool_set_quota(ceph_api, pool_name, max_bytes=0, max_objects=0):
    """Set the quota for an OSD pool_name
    Setting max_bytes or max_objects to 0 will disable that quota param
    :param pool_name:         OSD pool_name
    :param max_bytes:    maximum bytes for OSD pool_name
    :param max_objects:  maximum objects for OSD pool_name
    """

    # Update quota if needed
    prev_quota = osd_pool_get_quota(ceph_api, pool_name)
    if prev_quota["max_bytes"] != max_bytes:
        resp, b = ceph_api.osd_set_pool_quota(pool_name,
                                              'max_bytes',
                                              max_bytes,
                                              body='json')
        if resp.ok:
            LOG.info(
                _LI("Set OSD pool_name quota: "
                    "pool_name={}, max_bytes={}").format(pool_name, max_bytes))
        else:
            e = exception.CephPoolSetQuotaFailure(pool=pool_name,
                                                  name='max_bytes',
                                                  value=max_bytes,
                                                  reason=resp.reason)
            LOG.error(e)
            raise e
    if prev_quota["max_objects"] != max_objects:
        resp, b = ceph_api.osd_set_pool_quota(pool_name,
                                              'max_objects',
                                              max_objects,
                                              body='json')
        if resp.ok:
            LOG.info(
                _LI("Set OSD pool_name quota: "
                    "pool_name={}, max_objects={}").format(
                        pool_name, max_objects))
        else:
            e = exception.CephPoolSetQuotaFailure(pool=pool_name,
                                                  name='max_objects',
                                                  value=max_objects,
                                                  reason=resp.reason)
            LOG.error(e)
            raise e
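A minimal usage sketch for the helper above (not part of the original example), assuming an already-constructed ceph_api wrapper like the one these snippets pass around; the pool name and size below are illustrative placeholders.

    # Hypothetical usage: cap a pool at 10 GiB, then disable the quota again.
    # `ceph_api` is assumed to be a client wrapper exposing
    # osd_set_pool_quota(), as used above; 'kube-rbd' is just an example name.
    TEN_GIB = 10 * 1024 ** 3
    try:
        osd_pool_set_quota(ceph_api, 'kube-rbd', max_bytes=TEN_GIB)
        osd_pool_set_quota(ceph_api, 'kube-rbd')  # defaults of 0 disable quotas
    except exception.CephPoolSetQuotaFailure as e:
        LOG.error("Could not update pool quota: %s" % e)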
Example #6
    def _set_upgrade(self, upgrade):
        state = upgrade.get('state')
        from_version = upgrade.get('from_version')
        if (state and state != constants.UPGRADE_COMPLETED
                and from_version == constants.TITANIUM_SERVER_VERSION_18_03):

            LOG.info(
                _LI("Waiting for ceph upgrade to complete "
                    "before monitoring cluster."))
            self.wait_for_upgrade_complete = True
Example #7
def osd_pool_create(ceph_api, pool_name, pg_num, pgp_num):
    # ruleset 0: is the default ruleset if no crushmap is loaded or
    # the ruleset for the backing tier if loaded:
    # Name: storage_tier_ruleset
    ruleset = 0
    response, body = ceph_api.osd_pool_create(pool_name,
                                              pg_num,
                                              pgp_num,
                                              pool_type="replicated",
                                              ruleset=ruleset,
                                              body='json')
    if response.ok:
        LOG.info(
            _LI("Created OSD pool: "
                "pool_name={}, pg_num={}, pgp_num={}, "
                "pool_type=replicated, ruleset={}").format(
                    pool_name, pg_num, pgp_num, ruleset))
    else:
        e = exception.CephPoolCreateFailure(name=pool_name,
                                            reason=response.reason)
        LOG.error(e)
        raise e

    # Explicitly assign the ruleset to the pool on creation since it is
    # ignored in the create call
    response, body = ceph_api.osd_set_pool_param(pool_name,
                                                 "crush_ruleset",
                                                 ruleset,
                                                 body='json')
    if response.ok:
        LOG.info(
            _LI("Assigned crush ruleset to OS pool: "
                "pool_name={}, ruleset={}").format(pool_name, ruleset))
    else:
        e = exception.CephPoolRulesetFailure(name=pool_name,
                                             reason=response.reason)
        LOG.error(e)
        ceph_api.osd_pool_delete(pool_name,
                                 pool_name,
                                 sure='--yes-i-really-really-mean-it',
                                 body='json')
        raise e
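A short sketch of how osd_pool_create() above might be called, under the same assumption of a ceph_api wrapper; the pool name and placement-group counts are illustrative only.

    # Hypothetical usage: create a small replicated pool. On a ruleset
    # assignment failure the helper above deletes the partly created pool
    # itself before raising, so only the exceptions need handling here.
    try:
        osd_pool_create(ceph_api, 'images', pg_num=64, pgp_num=64)
    except (exception.CephPoolCreateFailure,
            exception.CephPoolRulesetFailure) as e:
        LOG.error("Pool creation failed: %s" % e)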
Example #8
    def ceph_poll_status(self):
        # get previous data every time in case:
        # * daemon restarted
        # * alarm was cleared manually but stored as raised in daemon
        self._get_current_alarms()
        if self.current_health_alarm:
            LOG.info(_LI("Current alarm: %s") %
                     str(self.current_health_alarm.__dict__))

        # get ceph health
        health = self._get_health()
        LOG.info(_LI("Current Ceph health: "
                     "%(health)s detail: %(detail)s") % health)

        health = self.filter_health_status(health)
        if health['health'] != constants.CEPH_HEALTH_OK:
            self._report_fault(health, fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self._report_alarm_osds_health()
        else:
            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self.clear_all_major_critical()
Example #9
    def get_primary_tier_size(self, _):
        """Get the ceph size for the primary tier.

        returns: an int for the size (in GB) of the tier
        """

        tiers_size = self.service.monitor.tiers_size
        primary_tier_size = tiers_size.get(
            self.service.monitor.primary_tier_name, 0)
        LOG.debug(_LI("Ceph cluster primary tier size: %s GB") %
                  str(primary_tier_size))
        return primary_tier_size
Example #10
 def set_flag_require_jewel_osds(self):
     try:
         response, body = self.service.ceph_api.osd_set_key(
             constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS, body='json')
         LOG.info(_LI("Set require_jewel_osds flag"))
     except IOError as e:
         raise exception.CephApiFailure(call="osd_set_key", reason=str(e))
     else:
         if not response.ok:
             raise exception.CephSetKeyFailure(
                 flag=constants.CEPH_FLAG_REQUIRE_JEWEL_OSDS,
                 extra=_("needed to complete upgrade to Jewel"),
                 response_status_code=response.status_code,
                 response_reason=response.reason,
                 status=body.get('status'),
                 output=body.get('output'))
Example #11
def osd_pool_delete(ceph_api, pool_name):
    """Delete an osd pool

    :param pool_name:  pool name
    """
    response, body = ceph_api.osd_pool_delete(
        pool_name, pool_name,
        sure='--yes-i-really-really-mean-it',
        body='json')
    if response.ok:
        LOG.info(_LI("Deleted OSD pool {}").format(pool_name))
    else:
        e = exception.CephPoolDeleteFailure(
            name=pool_name, reason=response.reason)
        LOG.warn(e)
        raise e
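A matching sketch for osd_pool_delete() above, again with an assumed ceph_api wrapper and a placeholder pool name.

    # Hypothetical usage: delete an example pool; CephPoolDeleteFailure is
    # logged as a warning and raised by the helper when the call fails.
    try:
        osd_pool_delete(ceph_api, 'images')
    except exception.CephPoolDeleteFailure as e:
        LOG.warn("Pool deletion failed: %s" % e)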
Example #12
    def ceph_get_fsid(self):
        # Check whether an alarm has already been raised
        self._get_current_alarms()
        if self.current_health_alarm:
            LOG.info(_LI("Current alarm: %s") %
                     str(self.current_health_alarm.__dict__))

        fsid = self._get_fsid()
        if not fsid:
            # Raise alarm - it will not have an entity_instance_id
            self._report_fault({'health': constants.CEPH_HEALTH_DOWN,
                                'detail': 'Ceph cluster is down.'},
                               fm_constants.FM_ALARM_ID_STORAGE_CEPH)
        else:
            # Clear alarm with no entity_instance_id
            self._clear_fault(fm_constants.FM_ALARM_ID_STORAGE_CEPH)
            self.service.entity_instance_id = 'cluster=%s' % fsid
Example #13
 def auto_heal(self, health):
     if (health['health'] == constants.CEPH_HEALTH_WARN
             and (constants.CEPH_HEALTH_WARN_REQUIRE_JEWEL_OSDS_NOT_SET
                  in health['detail'])):
         try:
             upgrade = self.service.get_software_upgrade_status()
         except Exception as ex:
             LOG.warn(
                 _LW("Getting software upgrade status failed "
                     "with: %s. Skip auto-heal attempt "
                     "(will retry on next ceph status poll).") % str(ex))
             return health
         state = upgrade.get('state')
          # suppress require_jewel_osds in case upgrade is
         # in progress but not completed or aborting
         if (not self.wait_for_upgrade_complete
                 and (upgrade.get('from_version')
                      == constants.TITANIUM_SERVER_VERSION_18_03)
                 and state not in [
                     None, constants.UPGRADE_COMPLETED,
                     constants.UPGRADE_ABORTING,
                     constants.UPGRADE_ABORT_COMPLETING,
                     constants.UPGRADE_ABORTING_ROLLBACK
                 ]):
             self.wait_for_upgrade_complete = True
         # set require_jewel_osds in case upgrade is
         # not in progress or completed
         if (state in [None, constants.UPGRADE_COMPLETED]):
             LOG.warn(
                 _LW("No upgrade in progress or update completed "
                     "and require_jewel_osds health warning raised. "
                     "Set require_jewel_osds flag."))
             self.set_flag_require_jewel_osds()
             health = self._remove_require_jewel_osds_warning(health)
             LOG.info(_LI("Unsurpress require_jewel_osds health warning"))
             self.wait_for_upgrade_complete = False
          # unsuppress require_jewel_osds in case upgrade
         # is aborting
         if (state in [
                 constants.UPGRADE_ABORTING,
                 constants.UPGRADE_ABORT_COMPLETING,
                 constants.UPGRADE_ABORTING_ROLLBACK
         ]):
             self.wait_for_upgrade_complete = False
     return health
Example #14
    def _report_fault(self, health, alarm_id):
        if alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH:
            new_severity = constants.SEVERITY[health['health']]
            new_reason_text = self._parse_reason(health)
            new_service_affecting = \
                constants.SERVICE_AFFECTING[health['health']]

            # Raise or update alarm if necessary
            if ((not self.current_health_alarm)
                    or (self.current_health_alarm.__dict__['severity'] !=
                        new_severity)
                    or (self.current_health_alarm.__dict__['reason_text'] !=
                        new_reason_text)
                    or (self.current_health_alarm.__dict__['service_affecting']
                        != str(new_service_affecting))):

                fault = fm_api.Fault(
                    alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH,
                    alarm_type=fm_constants.FM_ALARM_TYPE_4,
                    alarm_state=fm_constants.FM_ALARM_STATE_SET,
                    entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                    entity_instance_id=self.service.entity_instance_id,
                    severity=new_severity,
                    reason_text=new_reason_text,
                    probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                    proposed_repair_action=constants.REPAIR_ACTION,
                    service_affecting=new_service_affecting)

                alarm_uuid = self.service.fm_api.set_fault(fault)
                if alarm_uuid:
                    LOG.info(
                        _LI("Created storage alarm %(alarm_uuid)s - "
                            "severity: %(severity)s, reason: %(reason)s, "
                            "service_affecting: %(service_affecting)s") % {
                                "alarm_uuid": alarm_uuid,
                                "severity": new_severity,
                                "reason": new_reason_text,
                                "service_affecting": new_service_affecting
                            })
                else:
                    LOG.error(
                        _LE("Failed to create storage alarm - "
                            "severity: %(severity)s, reason: %(reason)s "
                            "service_affecting: %(service_affecting)s") % {
                                "severity": new_severity,
                                "reason": new_reason_text,
                                "service_affecting": new_service_affecting
                            })

            # Log detailed reason for later analysis
            if (self.current_ceph_health != health['health']
                    or self.detailed_health_reason != health['detail']):
                LOG.info(
                    _LI("Ceph status changed: %(health)s "
                        "detailed reason: %(detail)s") % health)
                self.current_ceph_health = health['health']
                self.detailed_health_reason = health['detail']

        elif (alarm_id == fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE
              and health['tier_eid'] not in self.current_quota_alarms):

            quota_reason_text = ("Quota/Space mismatch for the %s tier. The "
                                 "sum of Ceph pool quotas does not match the "
                                 "tier size." % health['tier_name'])
            fault = fm_api.Fault(
                alarm_id=fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=health['tier_eid'],
                severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
                reason_text=quota_reason_text,
                alarm_type=fm_constants.FM_ALARM_TYPE_7,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_75,
                proposed_repair_action=(
                    "Update ceph storage pool quotas to use all available "
                    "cluster space for the %s tier." % health['tier_name']),
                service_affecting=False)

            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(
                    _LI("Created storage quota storage alarm %(alarm_uuid)s. "
                        "Reason: %(reason)s") % {
                            "alarm_uuid": alarm_uuid,
                            "reason": quota_reason_text
                        })
            else:
                LOG.error(
                    _LE("Failed to create quota "
                        "storage alarm. Reason: %s") % quota_reason_text)
Example #15
    def _report_alarm_osds_health(self):
        response, osd_tree = self.service.ceph_api.osd_tree(body='json')
        if not response.ok:
            LOG.error(
                _LE("Failed to retrieve Ceph OSD tree: "
                    "status_code: %(status_code)s, reason: %(reason)s") % {
                        "status_code": response.status_code,
                        "reason": response.reason
                    })
            return
        osd_tree = dict([(n['id'], n) for n in osd_tree['output']['nodes']])
        alarms = []

        self._check_storage_tier(osd_tree, "storage-tier",
                                 lambda *args: alarms.append(args))

        old_alarms = {}
        for alarm_id in [
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR,
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL
        ]:
            alarm_list = self.service.fm_api.get_faults_by_id(alarm_id)
            if not alarm_list:
                continue
            for alarm in alarm_list:
                if alarm.entity_instance_id not in old_alarms:
                    old_alarms[alarm.entity_instance_id] = []
                old_alarms[alarm.entity_instance_id].append(
                    (alarm.alarm_id, alarm.reason_text))

        for peer_group, reason, severity in alarms:
            if self._current_health_alarm_equals(reason, severity):
                continue
            alarm_critical_major = fm_constants.FM_ALARM_ID_STORAGE_CEPH_MAJOR
            if severity == fm_constants.FM_ALARM_SEVERITY_CRITICAL:
                alarm_critical_major = (
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_CRITICAL)
            entity_instance_id = (self.service.entity_instance_id +
                                  '.peergroup=' + peer_group)
            alarm_already_exists = False
            if entity_instance_id in old_alarms:
                for alarm_id, old_reason in old_alarms[entity_instance_id]:
                    if (reason == old_reason
                            and alarm_id == alarm_critical_major):
                        # if the alarm is exactly the same, we don't need
                        # to recreate it
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))
                        alarm_already_exists = True
                    elif (alarm_id == alarm_critical_major):
                        # if we change just the reason, then we just remove the
                        # alarm from the list so we don't remove it at the
                        # end of the function
                        old_alarms[entity_instance_id].remove(
                            (alarm_id, old_reason))

                if (len(old_alarms[entity_instance_id]) == 0):
                    del old_alarms[entity_instance_id]

                # in case the alarm is exactly the same, we skip the alarm set
                if alarm_already_exists is True:
                    continue
            major_repair_action = constants.REPAIR_ACTION_MAJOR_CRITICAL_ALARM
            fault = fm_api.Fault(
                alarm_id=alarm_critical_major,
                alarm_type=fm_constants.FM_ALARM_TYPE_4,
                alarm_state=fm_constants.FM_ALARM_STATE_SET,
                entity_type_id=fm_constants.FM_ENTITY_TYPE_CLUSTER,
                entity_instance_id=entity_instance_id,
                severity=severity,
                reason_text=reason,
                probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_15,
                proposed_repair_action=major_repair_action,
                service_affecting=constants.SERVICE_AFFECTING['HEALTH_WARN'])
            alarm_uuid = self.service.fm_api.set_fault(fault)
            if alarm_uuid:
                LOG.info(
                    _LI("Created storage alarm %(alarm_uuid)s - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                            "alarm_uuid":
                            str(alarm_uuid),
                            "severity":
                            str(severity),
                            "reason":
                            reason,
                            "service_affecting":
                            str(constants.SERVICE_AFFECTING['HEALTH_WARN'])
                        })
            else:
                LOG.error(
                    _LE("Failed to create storage alarm - "
                        "severity: %(severity)s, reason: %(reason)s, "
                        "service_affecting: %(service_affecting)s") % {
                            "severity":
                            str(severity),
                            "reason":
                            reason,
                            "service_affecting":
                            str(constants.SERVICE_AFFECTING['HEALTH_WARN'])
                        })

        for entity_instance_id in old_alarms:
            for alarm_id, old_reason in old_alarms[entity_instance_id]:
                self.service.fm_api.clear_fault(alarm_id, entity_instance_id)
Example #16
    def ceph_poll_quotas(self):
        self._get_current_alarms()
        if self.current_quota_alarms:
            LOG.info(
                _LI("Current quota alarms %s") % self.current_quota_alarms)

        # Get the current size of each tier
        previous_tiers_size = self.tiers_size
        self.tiers_size = self._get_tiers_size()

        # Make sure any removed tiers have the alarms cleared
        for t in (set(previous_tiers_size) - set(self.tiers_size)):
            self._clear_fault(
                fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                "{0}.tier={1}".format(
                    self.service.entity_instance_id,
                    t[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]))

        # Check the quotas on each tier
        for tier in self.tiers_size:
            # Extract the tier name from the crush equivalent
            tier_name = tier[:-len(constants.CEPH_CRUSH_TIER_SUFFIX)]

            if self.tiers_size[tier] == 0:
                LOG.info(
                    _LI("'%s' tier cluster size not yet available") %
                    tier_name)
                continue

            pools_quota_sum = 0
            if tier == self.primary_tier_name:
                for pool in constants.CEPH_POOLS:
                    if (pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_JEWEL
                            or pool['pool_name']
                            == constants.CEPH_POOL_OBJECT_GATEWAY_NAME_HAMMER):
                        object_pool_name = self._get_object_pool_name()
                        if object_pool_name is None:
                            LOG.error("Rados gateway object data pool does "
                                      "not exist.")
                        else:
                            pools_quota_sum += \
                                self._get_osd_pool_quota(object_pool_name)
                    else:
                        pools_quota_sum += self._get_osd_pool_quota(
                            pool['pool_name'])
            else:
                for pool in constants.SB_TIER_CEPH_POOLS:
                    pool_name = "{0}-{1}".format(pool['pool_name'], tier_name)
                    pools_quota_sum += self._get_osd_pool_quota(pool_name)

            # Currently, there is only one pool on the additional tier(s),
            # therefore allow a quota of 0
            if (pools_quota_sum != self.tiers_size[tier]
                    and pools_quota_sum != 0):
                self._report_fault(
                    {
                        'tier_name': tier_name,
                        'tier_eid': "{0}.tier={1}".format(
                            self.service.entity_instance_id, tier_name)
                    }, fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE)
            else:
                self._clear_fault(
                    fm_constants.FM_ALARM_ID_STORAGE_CEPH_FREE_SPACE,
                    "{0}.tier={1}".format(self.service.entity_instance_id,
                                          tier_name))
Example #17
 def get_software_upgrade_status(self):
     LOG.info(_LI("Getting software upgrade status from sysinv"))
     cctxt = self.sysinv_conductor.prepare(timeout=2)
     upgrade = cctxt.call({}, 'get_software_upgrade_status')
     LOG.info(_LI("Software upgrade status: %s") % str(upgrade))
     return upgrade