def _create_addn_hosts_dc(self, context):
    """Generate the addn_hosts_dc file for hostname/ip translation"""

    addn_hosts_dc = os.path.join(CONFIG_PATH, ADDN_HOSTS_DC)
    addn_hosts_dc_temp = addn_hosts_dc + '.temp'

    subclouds = db_api.subcloud_get_all(context)
    with open(addn_hosts_dc_temp, 'w') as f_out_addn_dc_temp:
        for subcloud in subclouds:
            addn_dc_line = subcloud.management_start_ip + ' ' + \
                subcloud.name + '\n'
            f_out_addn_dc_temp.write(addn_dc_line)

        # if there are no subclouds, create an empty file so dnsmasq
        # does not emit an error log.
        if not subclouds:
            f_out_addn_dc_temp.write(' ')

    if not filecmp.cmp(addn_hosts_dc_temp, addn_hosts_dc):
        os.rename(addn_hosts_dc_temp, addn_hosts_dc)
        # restart dnsmasq so it can re-read our addn_hosts file.
        os.system("pkill -HUP dnsmasq")
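
# A minimal standalone sketch (not part of the original module) of the same
# write-temp / compare / rename / SIGHUP pattern used above. The helper name,
# paths, and the os.path.isfile() guard for the first run (when the
# destination file may not exist yet) are assumptions for illustration only.
import filecmp
import os


def update_hosts_file(path, entries):
    """Rewrite a dnsmasq addn-hosts style file only when its content changes."""
    temp_path = path + '.temp'
    with open(temp_path, 'w') as f:
        for ip, hostname in entries:
            f.write('%s %s\n' % (ip, hostname))
        if not entries:
            # keep the file non-empty so dnsmasq does not log an error
            f.write(' ')

    if not os.path.isfile(path) or not filecmp.cmp(temp_path, path):
        os.rename(temp_path, path)
        # ask dnsmasq to re-read its addn-hosts file
        os.system("pkill -HUP dnsmasq")
    else:
        # nothing changed; discard the temp file
        os.remove(temp_path)


# Example usage:
#   update_hosts_file('/tmp/addn_hosts_dc', [('192.168.101.2', 'subcloud1')])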
def _periodic_patch_audit_loop(self):
    """Audit patch status of subclouds loop."""

    # We are running in our own green thread here.
    LOG.info('Triggered patch audit.')

    try:
        ks_client = KeystoneClient()
    except Exception:
        LOG.warn('Failure initializing KeystoneClient, exiting audit.')
        return

    # First query RegionOne to determine what patches should be applied
    # to the system.
    patching_client = PatchingClient(consts.DEFAULT_REGION_NAME,
                                     ks_client.session)
    regionone_patches = patching_client.query()
    LOG.debug("regionone_patches: %s" % regionone_patches)

    # Build lists of patches that should be applied or committed in all
    # subclouds, based on their state in RegionOne. Check repostate
    # (not patchstate) as we only care if the patch has been applied to
    # the repo (not whether it is installed on the hosts).
    applied_patch_ids = list()
    committed_patch_ids = list()
    for patch_id in regionone_patches.keys():
        if regionone_patches[patch_id]['repostate'] == \
                patching_v1.PATCH_STATE_APPLIED:
            applied_patch_ids.append(patch_id)
        elif regionone_patches[patch_id]['repostate'] == \
                patching_v1.PATCH_STATE_COMMITTED:
            committed_patch_ids.append(patch_id)
    LOG.debug("RegionOne applied_patch_ids: %s" % applied_patch_ids)
    LOG.debug("RegionOne committed_patch_ids: %s" % committed_patch_ids)

    # For each subcloud, check whether the patches match the target.
    for subcloud in db_api.subcloud_get_all(self.context):
        # Only audit patching on subclouds that are managed and online
        if (subcloud.management_state != consts.MANAGEMENT_MANAGED or
                subcloud.availability_status !=
                consts.AVAILABILITY_ONLINE):
            continue

        try:
            patching_client = PatchingClient(subcloud.name,
                                             ks_client.session)
        except keystone_exceptions.EndpointNotFound:
            LOG.warn("Patching endpoint for online subcloud %s not found."
                     % subcloud.name)
            continue

        try:
            sysinv_client = SysinvClient(subcloud.name, ks_client.session)
        except keystone_exceptions.EndpointNotFound:
            LOG.warn("Sysinv endpoint for online subcloud %s not found."
                     % subcloud.name)
            continue

        # Retrieve all the patches that are present in this subcloud.
        try:
            subcloud_patches = patching_client.query()
            LOG.debug("Patches for subcloud %s: %s"
                      % (subcloud.name, subcloud_patches))
        except Exception:
            LOG.warn('Cannot retrieve patches for subcloud: %s'
                     % subcloud.name)
            continue

        # Determine which loads are present in this subcloud. During an
        # upgrade, there will be more than one load installed.
        installed_loads = list()
        try:
            loads = sysinv_client.get_loads()
        except Exception:
            LOG.warn('Cannot retrieve loads for subcloud: %s'
                     % subcloud.name)
            continue
        for load in loads:
            installed_loads.append(load.software_version)

        out_of_sync = False

        # Check that all patches in this subcloud are in the correct
        # state, based on the state of the patch in RegionOne. For the
        # subcloud, we use the patchstate because we care whether the
        # patch is installed on the hosts.
        for patch_id in subcloud_patches.keys():
            if subcloud_patches[patch_id]['patchstate'] == \
                    patching_v1.PATCH_STATE_APPLIED:
                if patch_id not in applied_patch_ids:
                    if patch_id not in committed_patch_ids:
                        LOG.debug("Patch %s should not be applied in %s"
                                  % (patch_id, subcloud.name))
                    else:
                        LOG.debug("Patch %s should be committed in %s"
                                  % (patch_id, subcloud.name))
                    out_of_sync = True
            elif subcloud_patches[patch_id]['patchstate'] == \
                    patching_v1.PATCH_STATE_COMMITTED:
                if patch_id not in committed_patch_ids:
                    LOG.warn("Patch %s should not be committed in %s"
                             % (patch_id, subcloud.name))
                    out_of_sync = True
            else:
                # In steady state, all patches should either be applied
                # or committed in each subcloud. Patches in other
                # states mean a sync is required.
                out_of_sync = True

        # Check that all applied or committed patches in RegionOne are
        # present in the subcloud.
        for patch_id in applied_patch_ids:
            if regionone_patches[patch_id]['sw_version'] in \
                    installed_loads and patch_id not in subcloud_patches:
                LOG.debug("Patch %s missing from %s"
                          % (patch_id, subcloud.name))
                out_of_sync = True
        for patch_id in committed_patch_ids:
            if regionone_patches[patch_id]['sw_version'] in \
                    installed_loads and patch_id not in subcloud_patches:
                LOG.debug("Patch %s missing from %s"
                          % (patch_id, subcloud.name))
                out_of_sync = True

        if out_of_sync:
            LOG.debug("Subcloud %s is out-of-sync for patching"
                      % subcloud.name)
            self.subcloud_manager.update_subcloud_endpoint_status(
                self.context,
                subcloud_name=subcloud.name,
                endpoint_type=dcorch_consts.ENDPOINT_TYPE_PATCHING,
                sync_status=consts.SYNC_STATUS_OUT_OF_SYNC)
        else:
            LOG.debug("Subcloud %s is in-sync for patching"
                      % subcloud.name)
            self.subcloud_manager.update_subcloud_endpoint_status(
                self.context,
                subcloud_name=subcloud.name,
                endpoint_type=dcorch_consts.ENDPOINT_TYPE_PATCHING,
                sync_status=consts.SYNC_STATUS_IN_SYNC)
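
# A standalone sketch (hypothetical helper, not part of the original module)
# of the in-sync / out-of-sync decision made by the patch audit above. It
# assumes the patching query results are dicts keyed by patch id with
# 'repostate', 'patchstate' and 'sw_version' fields, as the audit relies on;
# the APPLIED / COMMITTED literals stand in for the patching_v1 constants.
APPLIED = 'Applied'
COMMITTED = 'Committed'


def patching_in_sync(regionone_patches, subcloud_patches, installed_loads):
    """Return True if the subcloud patch state matches RegionOne."""
    applied = [p for p, d in regionone_patches.items()
               if d['repostate'] == APPLIED]
    committed = [p for p, d in regionone_patches.items()
                 if d['repostate'] == COMMITTED]

    # Every subcloud patch must be applied or committed, and must match
    # the RegionOne repo state.
    for patch_id, data in subcloud_patches.items():
        if data['patchstate'] == APPLIED:
            if patch_id not in applied:
                # should not be applied at all, or should be committed
                return False
        elif data['patchstate'] == COMMITTED:
            if patch_id not in committed:
                return False
        else:
            # any other state (e.g. partially applied) requires a sync
            return False

    # Every RegionOne patch for an installed load must be present.
    for patch_id in applied + committed:
        if (regionone_patches[patch_id]['sw_version'] in installed_loads and
                patch_id not in subcloud_patches):
            return False
    return True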
def _validate_subcloud_config(self, context, name, management_subnet_str,
                              management_start_ip_str,
                              management_end_ip_str,
                              management_gateway_ip_str,
                              systemcontroller_gateway_ip_str):
    """Check whether subcloud config is valid."""

    # Validate the name
    if name.isdigit():
        pecan.abort(400, _("name must contain alphabetic characters"))

    if name in [consts.DEFAULT_REGION_NAME,
                consts.SYSTEM_CONTROLLER_NAME]:
        pecan.abort(400, _("name cannot be %(bad_name1)s or %(bad_name2)s")
                    % {'bad_name1': consts.DEFAULT_REGION_NAME,
                       'bad_name2': consts.SYSTEM_CONTROLLER_NAME})

    # Parse/validate the management subnet
    subcloud_subnets = []
    subclouds = db_api.subcloud_get_all(context)
    for subcloud in subclouds:
        subcloud_subnets.append(IPNetwork(subcloud.management_subnet))

    MIN_MANAGEMENT_ADDRESSES = 8

    management_subnet = None
    try:
        management_subnet = validate_network_str(
            management_subnet_str,
            minimum_size=MIN_MANAGEMENT_ADDRESSES,
            existing_networks=subcloud_subnets)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-subnet invalid: %s") % e)

    # Parse/validate the start/end addresses
    management_start_ip = None
    try:
        management_start_ip = validate_address_str(
            management_start_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-start-ip invalid: %s") % e)

    management_end_ip = None
    try:
        management_end_ip = validate_address_str(
            management_end_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-end-ip invalid: %s") % e)

    if not management_start_ip < management_end_ip:
        pecan.abort(
            400,
            _("management-start-ip not less than management-end-ip"))

    if not len(IPRange(management_start_ip, management_end_ip)) >= \
            MIN_MANAGEMENT_ADDRESSES:
        pecan.abort(
            400,
            _("management address range must contain at least %d "
              "addresses") % MIN_MANAGEMENT_ADDRESSES)

    # Parse/validate the gateway
    try:
        validate_address_str(management_gateway_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-gateway-ip invalid: %s") % e)

    # Ensure systemcontroller gateway is in the management subnet
    # for the systemcontroller region.
    management_address_pool = self._get_management_address_pool(context)
    systemcontroller_subnet_str = "%s/%d" % (
        management_address_pool.network,
        management_address_pool.prefix)
    systemcontroller_subnet = IPNetwork(systemcontroller_subnet_str)
    try:
        validate_address_str(systemcontroller_gateway_ip_str,
                             systemcontroller_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400,
                    _("systemcontroller-gateway-ip invalid: %s") % e)
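
# A small standalone illustration (not part of the original module) of the
# address-range size check above, assuming the netaddr package that provides
# the IPNetwork/IPAddress/IPRange types this code already uses. The helper
# name and example addresses are hypothetical.
from netaddr import IPAddress, IPRange

MIN_MANAGEMENT_ADDRESSES = 8


def management_range_ok(start_ip_str, end_ip_str):
    """Return True if the start/end pair defines a large enough range."""
    start = IPAddress(start_ip_str)
    end = IPAddress(end_ip_str)
    if not start < end:
        return False
    # IPRange counts addresses inclusively, e.g. .2 - .50 is 49 addresses.
    return len(IPRange(start, end)) >= MIN_MANAGEMENT_ADDRESSES


# Examples:
#   management_range_ok('192.168.101.2', '192.168.101.50')  -> True
#   management_range_ok('192.168.101.2', '192.168.101.5')   -> False (4 addresses)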
def _validate_subcloud_config(self, context, name, management_subnet_str,
                              management_start_ip_str,
                              management_end_ip_str,
                              management_gateway_ip_str,
                              systemcontroller_gateway_ip_str):
    """Check whether subcloud config is valid."""

    # Validate the name
    if name.isdigit():
        pecan.abort(400, _("name must contain alphabetic characters"))

    if name in [consts.DEFAULT_REGION_NAME,
                consts.SYSTEM_CONTROLLER_NAME]:
        pecan.abort(400, _("name cannot be %(bad_name1)s or %(bad_name2)s")
                    % {'bad_name1': consts.DEFAULT_REGION_NAME,
                       'bad_name2': consts.SYSTEM_CONTROLLER_NAME})

    # Parse/validate the management subnet
    subcloud_subnets = []
    subclouds = db_api.subcloud_get_all(context)
    for subcloud in subclouds:
        subcloud_subnets.append(IPNetwork(subcloud.management_subnet))

    MIN_MANAGEMENT_SUBNET_SIZE = 8
    # subtract 3 for network, gateway and broadcast addresses.
    MIN_MANAGEMENT_ADDRESSES = MIN_MANAGEMENT_SUBNET_SIZE - 3

    management_subnet = None
    try:
        management_subnet = validate_network_str(
            management_subnet_str,
            minimum_size=MIN_MANAGEMENT_SUBNET_SIZE,
            existing_networks=subcloud_subnets)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-subnet invalid: %s") % e)

    # Parse/validate the start/end addresses
    management_start_ip = None
    try:
        management_start_ip = validate_address_str(
            management_start_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-start-ip invalid: %s") % e)

    management_end_ip = None
    try:
        management_end_ip = validate_address_str(
            management_end_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-end-ip invalid: %s") % e)

    if not management_start_ip < management_end_ip:
        pecan.abort(
            400,
            _("management-start-ip not less than management-end-ip"))

    if not len(IPRange(management_start_ip, management_end_ip)) >= \
            MIN_MANAGEMENT_ADDRESSES:
        pecan.abort(
            400,
            _("management address range must contain at least %d "
              "addresses") % MIN_MANAGEMENT_ADDRESSES)

    # Parse/validate the gateway
    try:
        validate_address_str(management_gateway_ip_str, management_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400, _("management-gateway-ip invalid: %s") % e)

    # Ensure subcloud management gateway is not within the actual subcloud
    # management subnet address pool for consistency with the
    # systemcontroller gateway restriction below. Address collision
    # is not a concern as the address is added to sysinv.
    subcloud_mgmt_address_start = IPAddress(management_start_ip_str)
    subcloud_mgmt_address_end = IPAddress(management_end_ip_str)
    subcloud_mgmt_gw_ip = IPAddress(management_gateway_ip_str)
    if ((subcloud_mgmt_gw_ip >= subcloud_mgmt_address_start) and
            (subcloud_mgmt_gw_ip <= subcloud_mgmt_address_end)):
        pecan.abort(400, _("management-gateway-ip invalid, "
                           "is within management pool: %(start)s - "
                           "%(end)s") %
                    {'start': subcloud_mgmt_address_start,
                     'end': subcloud_mgmt_address_end})

    # Ensure systemcontroller gateway is in the management subnet
    # for the systemcontroller region.
    management_address_pool = self._get_management_address_pool(context)
    systemcontroller_subnet_str = "%s/%d" % (
        management_address_pool.network,
        management_address_pool.prefix)
    systemcontroller_subnet = IPNetwork(systemcontroller_subnet_str)
    try:
        validate_address_str(systemcontroller_gateway_ip_str,
                             systemcontroller_subnet)
    except ValidateFail as e:
        LOG.exception(e)
        pecan.abort(400,
                    _("systemcontroller-gateway-ip invalid: %s") % e)

    # Ensure systemcontroller gateway is not within the actual
    # management subnet address pool to prevent address collision.
    mgmt_address_start = IPAddress(management_address_pool.ranges[0][0])
    mgmt_address_end = IPAddress(management_address_pool.ranges[0][1])
    systemcontroller_gw_ip = IPAddress(systemcontroller_gateway_ip_str)
    if ((systemcontroller_gw_ip >= mgmt_address_start) and
            (systemcontroller_gw_ip <= mgmt_address_end)):
        pecan.abort(400, _("systemcontroller-gateway-ip invalid, "
                           "is within management pool: %(start)s - "
                           "%(end)s") %
                    {'start': mgmt_address_start,
                     'end': mgmt_address_end})
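
# A minimal standalone sketch (hypothetical helper, not part of the original
# module) of the gateway-in-pool check applied above to both the subcloud and
# systemcontroller gateways, using the netaddr types this code already imports.
from netaddr import IPAddress, IPRange


def gateway_outside_pool(gateway_ip_str, pool_start_str, pool_end_str):
    """Return True if the gateway does not collide with the address pool."""
    gateway = IPAddress(gateway_ip_str)
    pool = IPRange(pool_start_str, pool_end_str)
    return gateway not in pool


# Examples:
#   gateway_outside_pool('192.168.204.1', '192.168.204.2', '192.168.204.100')  -> True
#   gateway_outside_pool('192.168.204.50', '192.168.204.2', '192.168.204.100') -> False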
def _periodic_subcloud_audit_loop(self):
    """Audit availability of subclouds loop."""

    # We will be running in our own green thread here.
    LOG.info('Triggered subcloud audit.')

    # For each subcloud, if at least one service is active in each
    # service of servicegroup-list then declare the subcloud online.
    for subcloud in db_api.subcloud_get_all(self.context):
        subcloud_name = subcloud.name
        subcloud_id = subcloud.id
        management_state = subcloud.management_state
        avail_status_current = subcloud.availability_status
        audit_fail_count = subcloud.audit_fail_count

        # Set defaults to None and disabled so we will still set disabled
        # status if we encounter an error.
        sysinv_client = None
        svc_groups = None
        avail_to_set = consts.AVAILABILITY_OFFLINE

        try:
            ks_client = KeystoneClient(subcloud_name)
            sysinv_client = SysinvClient(subcloud_name, ks_client.session)
        except (keystone_exceptions.EndpointNotFound,
                keystone_exceptions.ConnectFailure,
                IndexError) as e:
            if avail_status_current == consts.AVAILABILITY_OFFLINE:
                LOG.info("Identity or Platform endpoint for %s not "
                         "found, ignoring for offline "
                         "subcloud." % subcloud_name)
                continue
            else:
                LOG.error("Identity or Platform endpoint for online "
                          "subcloud: %s not found." % subcloud_name)
        except Exception as e:
            LOG.exception(e)

        if sysinv_client:
            try:
                svc_groups = sysinv_client.get_service_groups()
            except Exception as e:
                svc_groups = None
                LOG.warn('Cannot retrieve service groups for '
                         'subcloud:%s, %s' % (subcloud_name, e))

        if svc_groups:
            active_sgs = []
            inactive_sgs = []

            # Build 2 lists, 1 of active service groups,
            # one with non-active.
            for sg in svc_groups:
                if sg.state != consts.SERVICE_GROUP_STATUS_ACTIVE:
                    inactive_sgs.append(sg.service_group_name)
                else:
                    active_sgs.append(sg.service_group_name)

            # Create a list of service groups that are only present
            # in the non-active list
            inactive_only = [sg for sg in inactive_sgs
                             if sg not in active_sgs]

            # An empty inactive-only list and a non-empty active list
            # means we're good to go.
            if not inactive_only and active_sgs:
                avail_to_set = consts.AVAILABILITY_ONLINE
            else:
                LOG.info("Subcloud:%s has non-active "
                         "service groups: %s" %
                         (subcloud_name, inactive_only))

        if avail_to_set == consts.AVAILABILITY_OFFLINE:
            if audit_fail_count < consts.AVAIL_FAIL_COUNT_MAX:
                audit_fail_count = audit_fail_count + 1

            if (avail_status_current == consts.AVAILABILITY_ONLINE) and \
                    (audit_fail_count < consts.AVAIL_FAIL_COUNT_TO_ALARM):
                # Do not set offline until we have failed the audit
                # the requisite number of times
                avail_to_set = consts.AVAILABILITY_ONLINE
        else:
            # In the case of a one-off blip, we may need to set the
            # fail count back to 0
            audit_fail_count = 0

        if avail_to_set != avail_status_current:

            if avail_to_set == consts.AVAILABILITY_ONLINE:
                audit_fail_count = 0

            LOG.info('Setting new availability status: %s '
                     'on subcloud: %s' %
                     (avail_to_set, subcloud_name))

            entity_instance_id = "subcloud=%s" % subcloud_name
            fault = self.fm_api.get_fault(
                fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                entity_instance_id)

            if fault and (avail_to_set == consts.AVAILABILITY_ONLINE):
                try:
                    self.fm_api.clear_fault(
                        fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                        entity_instance_id)
                except Exception as e:
                    LOG.exception(e)

            elif not fault and \
                    (avail_to_set == consts.AVAILABILITY_OFFLINE):
                try:
                    fault = fm_api.Fault(
                        alarm_id=fm_const.FM_ALARM_ID_DC_SUBCLOUD_OFFLINE,
                        alarm_state=fm_const.FM_ALARM_STATE_SET,
                        entity_type_id=fm_const.FM_ENTITY_TYPE_SUBCLOUD,
                        entity_instance_id=entity_instance_id,
                        severity=fm_const.FM_ALARM_SEVERITY_CRITICAL,
                        reason_text=('%s is offline' % subcloud_name),
                        alarm_type=fm_const.FM_ALARM_TYPE_0,
                        probable_cause=fm_const.ALARM_PROBABLE_CAUSE_29,
                        proposed_repair_action="Wait for subcloud to "
                                               "become online; if "
                                               "problem persists contact "
                                               "next level of support.",
                        service_affecting=True)

                    self.fm_api.set_fault(fault)
                except Exception as e:
                    LOG.exception(e)

            try:
                db_api.subcloud_update(self.context, subcloud_id,
                                       management_state=None,
                                       availability_status=avail_to_set,
                                       software_version=None,
                                       description=None, location=None,
                                       audit_fail_count=audit_fail_count)
            except exceptions.SubcloudNotFound:
                # slim possibility subcloud could have been deleted since
                # we found it in the db, ignore this benign error.
                LOG.info('Ignoring SubcloudNotFound when attempting state'
                         ' update: %s' % subcloud_name)
                continue

            try:
                self.dcorch_rpc_client.update_subcloud_states(
                    self.context, subcloud_name, management_state,
                    avail_to_set)
                LOG.info('Notifying dcorch, subcloud:%s management: %s, '
                         'availability:%s' %
                         (subcloud_name, management_state, avail_to_set))
            except Exception as e:
                LOG.exception(e)
                LOG.warn('Problem informing dcorch of subcloud '
                         'state change, subcloud: %s' % subcloud_name)

            if avail_to_set == consts.AVAILABILITY_OFFLINE:
                # Subcloud is going offline, set all endpoint statuses to
                # unknown.
                try:
                    self.subcloud_manager.update_subcloud_endpoint_status(
                        self.context,
                        subcloud_name=subcloud_name,
                        endpoint_type=None,
                        sync_status=consts.SYNC_STATUS_UNKNOWN)
                except exceptions.SubcloudNotFound:
                    LOG.info('Ignoring SubcloudNotFound when attempting '
                             'sync_status update: %s' % subcloud_name)
                    continue

        elif audit_fail_count != subcloud.audit_fail_count:
            try:
                db_api.subcloud_update(self.context, subcloud_id,
                                       management_state=None,
                                       availability_status=None,
                                       software_version=None,
                                       description=None, location=None,
                                       audit_fail_count=audit_fail_count)
            except exceptions.SubcloudNotFound:
                # slim possibility subcloud could have been deleted since
                # we found it in the db, ignore this benign error.
                LOG.info('Ignoring SubcloudNotFound when attempting '
                         'audit_fail_count update: %s' % subcloud_name)
                continue
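
# A standalone sketch (hypothetical names, not part of the original module) of
# the availability decision above: a subcloud is considered online only when
# every service group that reports an inactive instance also has an active
# one, and an online subcloud is not declared offline until the audit has
# failed several times in a row. The threshold value here is illustrative,
# standing in for consts.AVAIL_FAIL_COUNT_TO_ALARM.
AVAIL_FAIL_COUNT_TO_ALARM = 2


def evaluate_availability(service_groups, currently_online, audit_fail_count):
    """Return (online, new_fail_count) for one audit pass.

    service_groups is a list of (name, is_active) tuples.
    """
    active = {name for name, is_active in service_groups if is_active}
    inactive_only = [name for name, is_active in service_groups
                     if not is_active and name not in active]
    online = bool(active) and not inactive_only

    if online:
        # a successful audit resets the failure counter
        return True, 0

    audit_fail_count += 1
    if currently_online and audit_fail_count < AVAIL_FAIL_COUNT_TO_ALARM:
        # tolerate a one-off blip before reporting the subcloud offline
        return True, audit_fail_count
    return False, audit_fail_count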
def update_subcloud_endpoint_status(
        self, context, subcloud_name=None, endpoint_type=None,
        sync_status=consts.SYNC_STATUS_OUT_OF_SYNC, alarmable=True):
    """Update subcloud endpoint status

    :param context: request context object
    :param subcloud_name: name of subcloud to update
    :param endpoint_type: endpoint type to update
    :param sync_status: sync status to set
    """

    subcloud = None
    if subcloud_name:
        try:
            subcloud = db_api.subcloud_get_by_name(context, subcloud_name)
        except Exception as e:
            LOG.exception(e)
            raise e

        # Only allow updating the sync status if managed and online.
        # This means if a subcloud is going offline or unmanaged, then
        # the sync status update must be done first.
        if (((subcloud.availability_status ==
              consts.AVAILABILITY_ONLINE) and
             (subcloud.management_state ==
              consts.MANAGEMENT_MANAGED)) or
                (sync_status != consts.SYNC_STATUS_IN_SYNC)):
            # update a single subcloud
            try:
                self._update_endpoint_status_for_subcloud(
                    context, subcloud.id, endpoint_type, sync_status,
                    alarmable)
            except Exception as e:
                LOG.exception(e)
                raise e
        else:
            LOG.info("Ignoring unmanaged/offline subcloud sync_status "
                     "update for subcloud:%s endpoint:%s sync:%s" %
                     (subcloud_name, endpoint_type, sync_status))
    else:
        # update all subclouds
        for subcloud in db_api.subcloud_get_all(context):
            if (((subcloud.availability_status ==
                  consts.AVAILABILITY_ONLINE) and
                 (subcloud.management_state ==
                  consts.MANAGEMENT_MANAGED)) or
                    (sync_status != consts.SYNC_STATUS_IN_SYNC)):
                try:
                    self._update_endpoint_status_for_subcloud(
                        context, subcloud.id, endpoint_type, sync_status,
                        alarmable)
                except Exception as e:
                    LOG.exception(e)
                    raise e
            else:
                LOG.info("Ignoring unmanaged/offline subcloud sync_status "
                         "update for subcloud:%s endpoint:%s sync:%s" %
                         (subcloud.name, endpoint_type, sync_status))
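
# A standalone sketch (hypothetical helper, not part of the original module)
# of the gate applied above before any endpoint status update: an in-sync
# status may only be recorded for subclouds that are both managed and online,
# while out-of-sync or unknown statuses are always allowed through. The
# constant string values are illustrative stand-ins for the consts module.
AVAILABILITY_ONLINE = 'online'
MANAGEMENT_MANAGED = 'managed'
SYNC_STATUS_IN_SYNC = 'in-sync'


def may_update_sync_status(availability_status, management_state, sync_status):
    """Return True if the requested sync status update should be applied."""
    managed_and_online = (availability_status == AVAILABILITY_ONLINE and
                          management_state == MANAGEMENT_MANAGED)
    return managed_and_online or sync_status != SYNC_STATUS_IN_SYNC


# Examples:
#   may_update_sync_status('online', 'managed', 'in-sync')      -> True
#   may_update_sync_status('offline', 'managed', 'in-sync')     -> False
#   may_update_sync_status('offline', 'managed', 'out-of-sync') -> True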