Example #1
    def _create_addn_hosts_dc(self, context):
        """Generate the addn_hosts_dc file for hostname/ip translation"""

        addn_hosts_dc = os.path.join(CONFIG_PATH, ADDN_HOSTS_DC)
        addn_hosts_dc_temp = addn_hosts_dc + '.temp'

        subclouds = db_api.subcloud_get_all(context)
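        # Write to a temp file first so the live file can be swapped in
        # atomically once the contents are complete.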
        with open(addn_hosts_dc_temp, 'w') as f_out_addn_dc_temp:
            for subcloud in subclouds:
                addn_dc_line = subcloud.management_start_ip + ' ' + \
                    subcloud.name + '\n'
                f_out_addn_dc_temp.write(addn_dc_line)

            # If there are no subclouds, write a single space so the file
            # is not zero-length and dnsmasq does not emit an error log.
            if not subclouds:
                f_out_addn_dc_temp.write(' ')

        # Guard against the live file not existing yet on the first run.
        if not os.path.isfile(addn_hosts_dc) or \
                not filecmp.cmp(addn_hosts_dc_temp, addn_hosts_dc):
            os.rename(addn_hosts_dc_temp, addn_hosts_dc)
            # restart dnsmasq so it can re-read our addn_hosts file.
            os.system("pkill -HUP dnsmasq")
        else:
            os.remove(addn_hosts_dc_temp)
Example #2
    def _periodic_patch_audit_loop(self):
        """Audit patch status of subclouds loop."""

        # We are running in our own green thread here.
        LOG.info('Triggered patch audit.')

        try:
            ks_client = KeystoneClient()
        except Exception:
            LOG.warn('Failure initializing KeystoneClient, exiting audit.')
            return

        # First query RegionOne to determine what patches should be applied
        # to the system.
        patching_client = PatchingClient(consts.DEFAULT_REGION_NAME,
                                         ks_client.session)
        regionone_patches = patching_client.query()
        LOG.debug("regionone_patches: %s" % regionone_patches)

        # Build lists of patches that should be applied or committed in all
        # subclouds, based on their state in RegionOne. Check repostate
        # (not patchstate) as we only care if the patch has been applied to
        # the repo (not whether it is installed on the hosts).
        applied_patch_ids = list()
        committed_patch_ids = list()
        for patch_id, patch in regionone_patches.items():
            if patch['repostate'] == patching_v1.PATCH_STATE_APPLIED:
                applied_patch_ids.append(patch_id)
            elif patch['repostate'] == patching_v1.PATCH_STATE_COMMITTED:
                committed_patch_ids.append(patch_id)
        LOG.debug("RegionOne applied_patch_ids: %s" % applied_patch_ids)
        LOG.debug("RegionOne committed_patch_ids: %s" % committed_patch_ids)

        # For each subcloud, check whether the patches match the target.
        for subcloud in db_api.subcloud_get_all(self.context):
            # Only audit patching on subclouds that are managed and online
            if (subcloud.management_state != consts.MANAGEMENT_MANAGED
                    or subcloud.availability_status !=
                    consts.AVAILABILITY_ONLINE):
                continue

            try:
                patching_client = PatchingClient(subcloud.name,
                                                 ks_client.session)
            except keystone_exceptions.EndpointNotFound:
                LOG.warn(
                    "Patching endpoint for online subcloud %s not found." %
                    subcloud.name)
                continue

            try:
                sysinv_client = SysinvClient(subcloud.name, ks_client.session)
            except keystone_exceptions.EndpointNotFound:
                LOG.warn("Sysinv endpoint for online subcloud %s not found." %
                         subcloud.name)
                continue

            # Retrieve all the patches that are present in this subcloud.
            try:
                subcloud_patches = patching_client.query()
                LOG.debug("Patches for subcloud %s: %s" %
                          (subcloud.name, subcloud_patches))
            except Exception:
                LOG.warn('Cannot retrieve patches for subcloud: %s' %
                         subcloud.name)
                continue

            # Determine which loads are present in this subcloud. During an
            # upgrade, there will be more than one load installed.
            try:
                loads = sysinv_client.get_loads()
            except Exception:
                LOG.warn('Cannot retrieve loads for subcloud: %s' %
                         subcloud.name)
                continue
            installed_loads = [load.software_version for load in loads]

            out_of_sync = False

            # Check that all patches in this subcloud are in the correct
            # state, based on the state of the patch in RegionOne. For the
            # subcloud, we use the patchstate because we care whether the
            # patch is installed on the hosts.
            for patch_id, patch in subcloud_patches.items():
                if patch['patchstate'] == patching_v1.PATCH_STATE_APPLIED:
                    if patch_id not in applied_patch_ids:
                        if patch_id not in committed_patch_ids:
                            LOG.debug("Patch %s should not be applied in %s" %
                                      (patch_id, subcloud.name))
                        else:
                            LOG.debug("Patch %s should be committed in %s" %
                                      (patch_id, subcloud.name))
                        out_of_sync = True
                elif patch['patchstate'] == \
                        patching_v1.PATCH_STATE_COMMITTED:
                    if patch_id not in committed_patch_ids:
                        LOG.warn("Patch %s should not be committed in %s" %
                                 (patch_id, subcloud.name))
                        out_of_sync = True
                else:
                    # In steady state, all patches should be either applied
                    # or committed in each subcloud. Patches in other
                    # states mean a sync is required.
                    out_of_sync = True

            # Check that all applied or committed patches in RegionOne are
            # present in the subcloud.
            for patch_id in applied_patch_ids + committed_patch_ids:
                if regionone_patches[patch_id]['sw_version'] in \
                        installed_loads and patch_id not in subcloud_patches:
                    LOG.debug("Patch %s missing from %s" %
                              (patch_id, subcloud.name))
                    out_of_sync = True

            if out_of_sync:
                LOG.debug("Subcloud %s is out-of-sync for patching" %
                          subcloud.name)
                self.subcloud_manager.update_subcloud_endpoint_status(
                    self.context,
                    subcloud_name=subcloud.name,
                    endpoint_type=dcorch_consts.ENDPOINT_TYPE_PATCHING,
                    sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)
            else:
                LOG.debug("Subcloud %s is in-sync for patching" %
                          subcloud.name)
                self.subcloud_manager.update_subcloud_endpoint_status(
                    self.context,
                    subcloud_name=subcloud.name,
                    endpoint_type=dcorch_consts.ENDPOINT_TYPE_PATCHING,
                    sync_status=consts.SYNC_STATUS_IN_SYNC)
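The repostate bucketing at the top of the audit is a pure transformation; a sketch of it with assumed stand-in values for the patching_v1 state constants and made-up patch data:

PATCH_STATE_APPLIED = 'Applied'      # assumed values for the
PATCH_STATE_COMMITTED = 'Committed'  # patching_v1.PATCH_STATE_* constants


def classify_patches(patches):
    """Bucket patch ids by repository state (repostate)."""
    applied, committed = [], []
    for patch_id, meta in patches.items():
        if meta['repostate'] == PATCH_STATE_APPLIED:
            applied.append(patch_id)
        elif meta['repostate'] == PATCH_STATE_COMMITTED:
            committed.append(patch_id)
    return applied, committed


applied, committed = classify_patches({
    'PATCH.1': {'repostate': 'Applied', 'sw_version': '18.03'},
    'PATCH.2': {'repostate': 'Committed', 'sw_version': '18.03'},
})
assert applied == ['PATCH.1']
assert committed == ['PATCH.2']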
Example #3
    def _validate_subcloud_config(self, context, name, management_subnet_str,
                                  management_start_ip_str,
                                  management_end_ip_str,
                                  management_gateway_ip_str,
                                  systemcontroller_gateway_ip_str):
        """Check whether subcloud config is valid."""

        # Validate the name
        if name.isdigit():
            pecan.abort(400, _("name must contain alphabetic characters"))

        if name in [consts.DEFAULT_REGION_NAME, consts.SYSTEM_CONTROLLER_NAME]:
            pecan.abort(
                400,
                _("name cannot be %(bad_name1)s or %(bad_name2)s") % {
                    'bad_name1': consts.DEFAULT_REGION_NAME,
                    'bad_name2': consts.SYSTEM_CONTROLLER_NAME
                })

        # Parse/validate the management subnet
        subcloud_subnets = []
        subclouds = db_api.subcloud_get_all(context)
        for subcloud in subclouds:
            subcloud_subnets.append(IPNetwork(subcloud.management_subnet))

        MIN_MANAGEMENT_ADDRESSES = 8

        management_subnet = None
        try:
            management_subnet = validate_network_str(
                management_subnet_str,
                minimum_size=MIN_MANAGEMENT_ADDRESSES,
                existing_networks=subcloud_subnets)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-subnet invalid: %s") % e)

        # Parse/validate the start/end addresses
        management_start_ip = None
        try:
            management_start_ip = validate_address_str(management_start_ip_str,
                                                       management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-start-ip invalid: %s") % e)

        management_end_ip = None
        try:
            management_end_ip = validate_address_str(management_end_ip_str,
                                                     management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-end-ip invalid: %s") % e)

        if management_start_ip >= management_end_ip:
            pecan.abort(
                400, _("management-start-ip must be less than "
                       "management-end-ip"))

        if len(IPRange(management_start_ip, management_end_ip)) < \
                MIN_MANAGEMENT_ADDRESSES:
            pecan.abort(
                400,
                _("management address range must contain at least %d "
                  "addresses") % MIN_MANAGEMENT_ADDRESSES)

        # Parse/validate the gateway
        try:
            validate_address_str(management_gateway_ip_str, management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-gateway-ip invalid: %s") % e)

        # Ensure systemcontroller gateway is in the management subnet
        # for the systemcontroller region.
        management_address_pool = self._get_management_address_pool(context)
        systemcontroller_subnet_str = "%s/%d" % (
            management_address_pool.network, management_address_pool.prefix)
        systemcontroller_subnet = IPNetwork(systemcontroller_subnet_str)
        try:
            validate_address_str(systemcontroller_gateway_ip_str,
                                 systemcontroller_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("systemcontroller-gateway-ip invalid: %s") % e)
Example #4
    def _validate_subcloud_config(self, context, name, management_subnet_str,
                                  management_start_ip_str,
                                  management_end_ip_str,
                                  management_gateway_ip_str,
                                  systemcontroller_gateway_ip_str):
        """Check whether subcloud config is valid."""

        # Validate the name
        if name.isdigit():
            pecan.abort(400, _("name must contain alphabetic characters"))

        if name in [consts.DEFAULT_REGION_NAME, consts.SYSTEM_CONTROLLER_NAME]:
            pecan.abort(
                400,
                _("name cannot be %(bad_name1)s or %(bad_name2)s") % {
                    'bad_name1': consts.DEFAULT_REGION_NAME,
                    'bad_name2': consts.SYSTEM_CONTROLLER_NAME
                })

        # Parse/validate the management subnet
        subcloud_subnets = []
        subclouds = db_api.subcloud_get_all(context)
        for subcloud in subclouds:
            subcloud_subnets.append(IPNetwork(subcloud.management_subnet))

        MIN_MANAGEMENT_SUBNET_SIZE = 8
        # subtract 3 for network, gateway and broadcast addresses.
        MIN_MANAGEMENT_ADDRESSES = MIN_MANAGEMENT_SUBNET_SIZE - 3

        management_subnet = None
        try:
            management_subnet = validate_network_str(
                management_subnet_str,
                minimum_size=MIN_MANAGEMENT_SUBNET_SIZE,
                existing_networks=subcloud_subnets)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-subnet invalid: %s") % e)

        # Parse/validate the start/end addresses
        management_start_ip = None
        try:
            management_start_ip = validate_address_str(management_start_ip_str,
                                                       management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-start-ip invalid: %s") % e)

        management_end_ip = None
        try:
            management_end_ip = validate_address_str(management_end_ip_str,
                                                     management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-end-ip invalid: %s") % e)

        if management_start_ip >= management_end_ip:
            pecan.abort(
                400, _("management-start-ip must be less than "
                       "management-end-ip"))

        if len(IPRange(management_start_ip, management_end_ip)) < \
                MIN_MANAGEMENT_ADDRESSES:
            pecan.abort(
                400,
                _("management address range must contain at least %d "
                  "addresses") % MIN_MANAGEMENT_ADDRESSES)

        # Parse/validate the gateway
        try:
            validate_address_str(management_gateway_ip_str, management_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("management-gateway-ip invalid: %s") % e)

        # Ensure subcloud management gateway is not within the actual subcloud
        # management subnet address pool for consistency with the
        # systemcontroller gateway restriction below. Address collision
        # is not a concern as the address is added to sysinv.
        subcloud_mgmt_address_start = IPAddress(management_start_ip_str)
        subcloud_mgmt_address_end = IPAddress(management_end_ip_str)
        subcloud_mgmt_gw_ip = IPAddress(management_gateway_ip_str)
        if ((subcloud_mgmt_gw_ip >= subcloud_mgmt_address_start)
                and (subcloud_mgmt_gw_ip <= subcloud_mgmt_address_end)):
            pecan.abort(
                400,
                _("management-gateway-ip invalid, "
                  "is within management pool: %(start)s - "
                  "%(end)s") % {
                      'start': subcloud_mgmt_address_start,
                      'end': subcloud_mgmt_address_end
                  })

        # Ensure systemcontroller gateway is in the management subnet
        # for the systemcontroller region.
        management_address_pool = self._get_management_address_pool(context)
        systemcontroller_subnet_str = "%s/%d" % (
            management_address_pool.network, management_address_pool.prefix)
        systemcontroller_subnet = IPNetwork(systemcontroller_subnet_str)
        try:
            validate_address_str(systemcontroller_gateway_ip_str,
                                 systemcontroller_subnet)
        except ValidateFail as e:
            LOG.exception(e)
            pecan.abort(400, _("systemcontroller-gateway-ip invalid: %s") % e)
        # Ensure systemcontroller gateway is not within the actual
        # management subnet address pool to prevent address collision.
        mgmt_address_start = IPAddress(management_address_pool.ranges[0][0])
        mgmt_address_end = IPAddress(management_address_pool.ranges[0][1])
        systemcontroller_gw_ip = IPAddress(systemcontroller_gateway_ip_str)
        if ((systemcontroller_gw_ip >= mgmt_address_start)
                and (systemcontroller_gw_ip <= mgmt_address_end)):
            pecan.abort(
                400,
                _("systemcontroller-gateway-ip invalid, "
                  "is within management pool: %(start)s - "
                  "%(end)s") % {
                      'start': mgmt_address_start,
                      'end': mgmt_address_end
                  })
Example #5
    def _periodic_subcloud_audit_loop(self):
        """Audit availability of subclouds loop."""
        # We will be running in our own green thread here.
        LOG.info('Triggered subcloud audit.')

        # For each subcloud, declare it online when every service group in
        # servicegroup-list has at least one active instance.

        for subcloud in db_api.subcloud_get_all(self.context):
            subcloud_name = subcloud.name
            subcloud_id = subcloud.id
            management_state = subcloud.management_state
            avail_status_current = subcloud.availability_status
            audit_fail_count = subcloud.audit_fail_count

            # Default the clients to None and availability to offline, so
            # an error along the way still results in offline status.

            sysinv_client = None
            svc_groups = None
            avail_to_set = consts.AVAILABILITY_OFFLINE

            try:
                ks_client = KeystoneClient(subcloud_name)
                sysinv_client = SysinvClient(subcloud_name, ks_client.session)
            except (keystone_exceptions.EndpointNotFound,
                    keystone_exceptions.ConnectFailure, IndexError):
                if avail_status_current == consts.AVAILABILITY_OFFLINE:
                    LOG.info("Identity or Platform endpoint for %s not "
                             "found, ignoring for offline "
                             "subcloud." % subcloud_name)
                    continue
                else:
                    LOG.error("Identity or Platform endpoint for online "
                              "subcloud: %s not found." % subcloud_name)

            except Exception as e:
                LOG.exception(e)

            if sysinv_client:
                try:
                    svc_groups = sysinv_client.get_service_groups()
                except Exception as e:
                    svc_groups = None
                    LOG.warn('Cannot retrieve service groups for '
                             'subcloud:%s, %s' % (subcloud_name, e))

            if svc_groups:
                active_sgs = []
                inactive_sgs = []

                # Build two lists: one of active service groups,
                # one of non-active.
                for sg in svc_groups:
                    if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
                        inactive_sgs.append(sg.service_group_name)
                    else:
                        active_sgs.append(sg.service_group_name)

                # Collect the service group names that appear only in
                # the non-active list.
                inactive_only = [
                    sg for sg in inactive_sgs if sg not in active_sgs
                ]

                # An empty inactive only list and a non-empty active list
                # means we're good to go.
                if not inactive_only and active_sgs:
                    avail_to_set = consts.AVAILABILITY_ONLINE
                else:
                    LOG.info("Subcloud:%s has non-active "
                             "service groups: %s" %
                             (subcloud_name, inactive_only))

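            # Debounce: an online subcloud is not marked offline until the
            # audit has failed consts.AVAIL_FAIL_COUNT_TO_ALARM times in a
            # row, avoiding flapping on a single missed audit.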
            if avail_to_set == consts.AVAILABILITY_OFFLINE:
                if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                    audit_fail_count += 1

                if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
                        (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                    # Do not set offline until we have failed audit
                    # the requisite number of times
                    avail_to_set = consts.AVAILABILITY_ONLINE
            else:
                # In the case of a one-off blip, we may need to reset the
                # fail count back to 0.
                audit_fail_count = 0

            if avail_to_set != avail_status_current:

                if avail_to_set == consts.AVAILABILITY_ONLINE:
                    audit_fail_count = 0

                LOG.info('Setting new availability status: %s '
                         'on subcloud: %s' % (avail_to_set, subcloud_name))

                entity_instance_id = "subcloud=%s" % subcloud_name
                fault = self.fm_api.get_fault(
                    fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                    entity_instance_id)

                if fault and (avail_to_set == consts.AVAILABILITY_ONLINE):
                    try:
                        self.fm_api.clear_fault(
                            fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                            entity_instance_id)
                    except Exception as e:
                        LOG.exception(e)

                elif not fault and \
                        (avail_to_set == consts.AVAILABILITY_OFFLINE):
                    try:
                        fault = fm_api.Fault(
                            alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                            alarm_state=fm_const.FM_ALARM_STATE_SET,
                            entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
                            entity_instance_id=entity_instance_id,
                            severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
                            reason_text=('%s is offline' % subcloud_name),
                            alarm_type=fm_const.FM_ALARM_TYPE_0,
                            probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
                            proposed_repair_action="Wait for subcloud to "
                            "become online; if "
                            "problem persists contact "
                            "next level of support.",
                            service_affecting=True)

                        self.fm_api.set_fault(fault)
                    except Exception as e:
                        LOG.exception(e)

                try:
                    db_api.subcloud_update(self.context,
                                           subcloud_id,
                                           management_state=None,
                                           availability_status=avail_to_set,
                                           software_version=None,
                                           description=None,
                                           location=None,
                                           audit_fail_count=audit_fail_count)
                except exceptions.SubcloudNotFound:
                    # slim possibility subcloud could have been deleted since
                    # we found it in db, ignore this benign error.
                    LOG.info('Ignoring SubcloudNotFound when attempting state'
                             ' update: %s' % subcloud_name)
                    continue

                try:
                    self.dcorch_rpc_client.update_subcloud_states(
                        self.context, subcloud_name, management_state,
                        avail_to_set)

                    LOG.info('Notifying dcorch, subcloud:%s management: %s, '
                             'availability:%s' %
                             (subcloud_name, management_state, avail_to_set))
                except Exception as e:
                    LOG.exception(e)
                    LOG.warn('Problem informing dcorch of subcloud '
                             'state change, subcloud: %s' % subcloud_name)

                if avail_to_set == consts.AVAILABILITY_OFFLINE:
                    # Subcloud is going offline, set all endpoint statuses to
                    # unknown.
                    try:
                        self.subcloud_manager.update_subcloud_endpoint_status(
                            self.context,
                            subcloud_name=subcloud_name,
                            endpoint_type=None,
                            sync_status=consts.SYNC_STATUS_UNKNOWN)
                    except exceptions.SubcloudNotFound:
                        LOG.info('Ignoring SubcloudNotFound when attempting '
                                 'sync_status update: %s' % subcloud_name)
                        continue

            elif audit_fail_count != subcloud.audit_fail_count:

                try:
                    db_api.subcloud_update(self.context,
                                           subcloud_id,
                                           management_state=None,
                                           availability_status=None,
                                           software_version=None,
                                           description=None,
                                           location=None,
                                           audit_fail_count=audit_fail_count)
                except exceptions.SubcloudNotFound:
                    # slim possibility subcloud could have been deleted since
                    # we found it in db, ignore this benign error.
                    LOG.info('Ignoring SubcloudNotFound when attempting '
                             'audit_fail_count update: %s' % subcloud_name)
                    continue
Example #6
    def update_subcloud_endpoint_status(
            self,
            context,
            subcloud_name=None,
            endpoint_type=None,
            sync_status=consts.SYNC_STATUS_OUT_OF_SYNC,
            alarmable=True):
        """Update subcloud endpoint status

        :param context: request context object
        :param subcloud_name: name of subcloud to update
        :param endpoint_type: endpoint type to update
        :param sync_status: sync status to set
        :param alarmable: whether the status change may raise an alarm
        """

        subcloud = None

        if subcloud_name:
            try:
                subcloud = db_api.subcloud_get_by_name(context, subcloud_name)
            except Exception as e:
                LOG.exception(e)
                raise

            # Only allow updating the sync status if managed and online.
            # This means if a subcloud is going offline or unmanaged, then
            # the sync status update must be done first.
            if (((subcloud.availability_status == consts.AVAILABILITY_ONLINE)
                 and (subcloud.management_state == consts.MANAGEMENT_MANAGED))
                    or (sync_status != consts.SYNC_STATUS_IN_SYNC)):

                # update a single subcloud
                try:
                    self._update_endpoint_status_for_subcloud(
                        context, subcloud.id, endpoint_type, sync_status,
                        alarmable)
                except Exception as e:
                    LOG.exception(e)
                    raise
            else:
                LOG.info("Ignoring unmanaged/offline subcloud sync_status "
                         "update for subcloud:%s endpoint:%s sync:%s" %
                         (subcloud_name, endpoint_type, sync_status))

        else:
            # update all subclouds
            for subcloud in db_api.subcloud_get_all(context):
                if (((subcloud.availability_status
                      == consts.AVAILABILITY_ONLINE) and
                     (subcloud.management_state == consts.MANAGEMENT_MANAGED))
                        or (sync_status != consts.SYNC_STATUS_IN_SYNC)):

                    try:
                        self._update_endpoint_status_for_subcloud(
                            context, subcloud.id, endpoint_type, sync_status,
                            alarmable)
                    except Exception as e:
                        LOG.exception(e)
                        raise
                else:
                    LOG.info("Ignoring unmanaged/offline subcloud sync_status "
                             "update for subcloud:%s endpoint:%s sync:%s" %
                             (subcloud.name, endpoint_type, sync_status))
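Both branches share one gating rule; a small predicate capturing it, with illustrative assumed values standing in for the real consts definitions:

AVAILABILITY_ONLINE = 'online'     # assumed values for the
MANAGEMENT_MANAGED = 'managed'     # consts.* constants used above
SYNC_STATUS_IN_SYNC = 'in-sync'


def may_update_sync_status(availability, management, sync_status):
    """in-sync may only be recorded for managed+online subclouds;
    out-of-sync or unknown may always be recorded."""
    return ((availability == AVAILABILITY_ONLINE and
             management == MANAGEMENT_MANAGED)
            or sync_status != SYNC_STATUS_IN_SYNC)


assert may_update_sync_status('online', 'managed', 'in-sync')
assert may_update_sync_status('offline', 'managed', 'out-of-sync')
assert not may_update_sync_status('offline', 'managed', 'in-sync')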