def _add_missing_subinterfaces(self, context, edge_id, vnic_binds,
                               backend_vnics, if_changed, readonly):
    """Check that each vnic binding is attached as a backend subinterface.

    For every entry of ``vnic_binds`` (skipping bindings whose network is
    one of the internal INTER_EDGE_PURPOSE networks), the backend vnic with
    the same index is searched for a subinterface whose ``tunnelId`` equals
    the binding's tunnel index. A subinterface attached to a different
    logical switch, or no subinterface at all, is reported as an error and,
    unless ``readonly`` is set, recreated.

    :param context: request context; ``context.session`` is used for DB
        access
    :param edge_id: id of the edge being checked (used in the reports and
        passed to the recreate helper)
    :param vnic_binds: vnic binding records exposing ``network_id``,
        ``vnic_index`` and ``tunnel_index``
    :param backend_vnics: vnic dicts as read from the backend
    :param if_changed: dict keyed by vnic index; set to True for every
        vnic on which a problem was detected
    :param readonly: when True, only report and do not repair
    """
    # Verify that all the entries in nsxv_edge_vnic_bindings are attached
    # on the Edge. A set gives constant-time membership tests below.
    metadata_nets = {
        net['network_id']
        for net in nsxv_db.get_nsxv_internal_networks(
            context.session,
            vcns_const.InternalEdgePurposes.INTER_EDGE_PURPOSE)
    }

    for vnic_bind in vnic_binds:
        if vnic_bind['network_id'] in metadata_nets:
            continue

        for vnic in backend_vnics:
            if vnic['index'] != vnic_bind['vnic_index']:
                continue

            found = False
            tunnel_index = vnic_bind['tunnel_index']
            network_id = vnic_bind['network_id']

            # 'subInterfaces' may be present but empty/None (the sibling
            # validation treats it as optional), so fall back explicitly
            # instead of relying on the .get() default.
            sub_ifs = ((vnic.get('subInterfaces') or {})
                       .get('subInterfaces') or [])
            for sub_if in sub_ifs:
                if sub_if['tunnelId'] != tunnel_index:
                    continue
                found = True
                if sub_if.get('logicalSwitchName') != network_id:
                    self.error_count += 1
                    self.error_info = base_job.housekeeper_warning(
                        self.error_info,
                        'subinterface %s on vnic %s on edge %s '
                        'should be connected to network %s',
                        tunnel_index, vnic['index'], edge_id,
                        network_id)
                    if_changed[vnic['index']] = True
                    if not readonly:
                        self._recreate_vnic_subinterface(
                            context, network_id, edge_id, vnic,
                            tunnel_index)
                        self.fixed_count += 1
                    sub_if['name'] = network_id

            if not found:
                self.error_count += 1
                self.error_info = base_job.housekeeper_warning(
                    self.error_info,
                    'subinterface %s on vnic %s on edge %s should be '
                    'connected to network %s but is missing',
                    tunnel_index, vnic['index'], edge_id, network_id)
                if_changed[vnic['index']] = True
                if not readonly:
                    self._recreate_vnic_subinterface(
                        context, network_id, edge_id, vnic,
                        tunnel_index)
                    self.fixed_sub_if_count += 1
def run(self, context, readonly=False):
    """Report mismatched logical ports and optionally repair them.

    Only address-binding mismatches are repaired; every other error type
    is reported as not automatically fixable. Nothing is repaired when
    ``readonly`` is True.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the problems
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(MismatchLogicalportJob, self).run(context)

    # Collect the logical ports which do not match their neutron ports
    problems = v3_utils.get_mismatch_logical_ports(
        context, self.plugin.nsxlib, self.plugin)
    report = ""
    if not problems:
        report = base_job.housekeeper_info(
            report, 'No mismatched logical ports detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(problems)
    header = ("Found %(len)s mismatched logical port%(plural)s:" %
              {'len': total,
               'plural': 's' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    repaired = 0
    for problem in problems:
        line = ("Logical port %(nsx_id)s "
                "[neutron id: %(id)s] error: %(err)s" %
                {'nsx_id': problem['nsx_id'],
                 'id': problem['neutron_id'],
                 'err': problem['error']})
        if not readonly:
            # Currently only address bindings mismatches are mitigated
            if problem['error_type'] == v3_utils.PORT_ERROR_TYPE_BINDINGS:
                # Create the missing address bindings on the backend
                neutron_port = problem['port']
                try:
                    bindings = self.plugin._build_address_bindings(
                        neutron_port)
                    self.plugin.nsxlib.logical_port.update(
                        problem['nsx_id'], problem['neutron_id'],
                        address_bindings=bindings)
                except Exception as e:
                    line = "%s failed to be fixed: %s" % (line, e)
                else:
                    repaired += 1
                    line = "%s was fixed." % line
            else:
                line = "%s cannot be fixed automatically." % line
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': repaired}
def _validate_edge_subinterfaces(self, context, edge_id, backend_vnics, vnic_dict, if_changed): # Validate that all the interfaces on the Edge # appliance are registered in nsxv_edge_vnic_bindings for vnic in backend_vnics: if_changed[vnic['index']] = False if (vnic['isConnected'] and vnic['type'] == 'trunk' and vnic['subInterfaces']): for sub_if in vnic['subInterfaces']['subInterfaces']: # Subinterface name field contains the net id vnic_bind = vnic_dict.get(sub_if['logicalSwitchName']) if (vnic_bind and vnic_bind['vnic_index'] == vnic['index'] and vnic_bind['tunnel_index'] == sub_if['tunnelId']): pass else: self.error_count += 1 self.error_info = base_job.housekeeper_warning( self.error_info, 'subinterface %s for vnic %s on edge %s is not ' 'defined in nsxv_edge_vnic_bindings', sub_if['tunnelId'], vnic['index'], edge_id) self.fixed_sub_if_count += 1 if_changed[vnic['index']] = True vnic['subInterfaces']['subInterfaces'].remove(sub_if)
def run(self, context, readonly=False):
    """Detect LBaaS objects which are stuck in a PENDING state.

    Objects found in a PENDING_* provisioning status are tracked in
    ``self.lbaas_objects`` between runs. An object which has been tracked
    for longer than ELEMENT_LIFETIME is reported and, unless ``readonly``
    is set, moved to ERROR status and dropped from tracking. Tracked
    objects which were not refreshed in this run are dropped as well.

    :param context: request context; ``context.session`` is queried
    :param readonly: when True, only report the stuck objects
    :returns: dict with ``error_count``, ``fixed_count`` and
        ``error_info``
    """
    super(LbaasPendingJob, self).run(context)
    curr_time = time.time()
    error_count = 0
    fixed_count = 0
    error_info = ''

    pending_states = [constants.PENDING_CREATE,
                      constants.PENDING_UPDATE,
                      constants.PENDING_DELETE]
    for model in self.lbaas_models:
        sess = context.session
        elements = sess.query(model).filter(
            model.provisioning_status.in_(pending_states)).all()

        for element in elements:
            if element['id'] in self.lbaas_objects:
                obj = self.lbaas_objects[element['id']]
                lifetime = curr_time - obj['time_added']
                if lifetime > ELEMENT_LIFETIME:
                    # Entry has been pending for more than lifetime.
                    # Report and remove when in R/W mode
                    error_count += 1
                    error_info = base_job.housekeeper_warning(
                        error_info,
                        'LBaaS %s %s is stuck in pending state',
                        model.NAME, element['id'])

                    if not readonly:
                        element['provisioning_status'] = constants.ERROR
                        fixed_count += 1
                        del self.lbaas_objects[element['id']]
                else:
                    # Entry is still pending but haven't reached lifetime
                    LOG.debug('Housekeeping: LBaaS object %s %s in '
                              'PENDING state for %d seconds',
                              model.NAME, element['id'], lifetime)
                    obj['time_seen'] = curr_time
            else:
                # Entry wasn't seen before this iteration - add to dict
                LOG.debug('Housekeeping: monitoring PENDING state for '
                          'LBaaS object %s %s',
                          model.NAME, element['id'])
                self.lbaas_objects[element.id] = {
                    'model': model,
                    'time_added': curr_time,
                    'time_seen': curr_time}

    # Look for dictionary entries which weren't seen in this iteration.
    # Such entries were either removed from DB or their state was changed.
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for obj_id in list(self.lbaas_objects):
        if self.lbaas_objects[obj_id]['time_seen'] != curr_time:
            LOG.debug('Housekeeping: LBaaS %s %s is back to normal',
                      self.lbaas_objects[obj_id]['model'].NAME, obj_id)
            del self.lbaas_objects[obj_id]

    if error_count == 0:
        error_info = 'No LBaaS objects in pending state'
    return {'error_count': error_count,
            'fixed_count': fixed_count,
            'error_info': error_info}
def run(self, context, readonly=False):
    """Report backup edges in ERROR state and optionally handle them.

    :param context: request context; ``context.session`` is used for the
        router bindings lookup
    :param readonly: when True, only report the broken backup edges
    :returns: dict with ``error_count``, ``fixed_count`` and
        ``error_info``
    """
    super(ErrorBackupEdgeJob, self).run(context)

    # Gather ERROR state backup edges
    status_filter = {'status': [constants.ERROR]}
    prefix_filter = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}
    with locking.LockManager.get_lock('nsx-edge-backup-pool'):
        broken_bindings = nsxv_db.get_nsxv_router_bindings(
            context.session,
            filters=status_filter,
            like_filters=prefix_filter)

    if not broken_bindings:
        LOG.debug('Housekeeping: no backup edges in ERROR state detected')
        return {'error_count': 0,
                'fixed_count': 0,
                'error_info': 'No backup edges in ERROR state detected'}

    found = 0
    repaired = 0
    report = ''
    # Work on the list gathered above - the set of broken backup edges
    # may change while housekeeping is running
    for binding in broken_bindings:
        found += 1
        report = base_job.housekeeper_warning(
            report, 'Backup Edge appliance %s is in ERROR state',
            binding['edge_id'])

        if readonly:
            continue
        with locking.LockManager.get_lock(binding['edge_id']):
            if self._handle_backup_edge(context, binding):
                repaired += 1

    return {'error_count': found,
            'fixed_count': repaired,
            'error_info': report}
def run(self, context, readonly=False):
    """Report orphaned logical switches and optionally delete them.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the orphaned switches
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedLogicalSwitchJob, self).run(context)

    # Collect the orphaned logical switches
    orphans = v3_utils.get_orphaned_networks(context, self.plugin.nsxlib)
    report = ""
    if not orphans:
        report = base_job.housekeeper_info(
            report, 'No orphaned logical switches detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(orphans)
    header = ("Found %(len)s orphaned logical switch%(plural)s:" %
              {'len': total,
               'plural': 'es' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    removed = 0
    for switch in orphans:
        line = ("Logical switch %(name)s [id: %(id)s] "
                "(neutron network: %(net)s)" %
                {'name': switch['display_name'],
                 'id': switch['id'],
                 'net': switch['neutron_net_id'] or 'Unknown'})
        if not readonly:
            try:
                self.plugin.nsxlib.logical_switch.delete(switch['id'])
            except Exception as e:
                line = "%s failed to be removed: %s." % (line, e)
            else:
                removed += 1
                line = "%s was removed." % (line)
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': removed}
def run(self, context, readonly=False):
    """Report orphaned logical routers and optionally delete them.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the orphaned routers
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedLogicalRouterJob, self).run(context)

    # Collect the orphaned logical routers
    orphans = v3_utils.get_orphaned_routers(context, self.plugin.nsxlib)
    report = ""
    if not orphans:
        report = base_job.housekeeper_info(
            report, 'No orphaned logical routers detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(orphans)
    header = ("Found %(len)s orphaned logical router%(plural)s:" %
              {'len': total,
               'plural': 's' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    removed = 0
    for router in orphans:
        line = ("Logical router %(name)s [id: %(id)s] "
                "(neutron router: %(rtr)s)" %
                {'name': router['display_name'],
                 'id': router['id'],
                 'rtr': router['neutron_router_id'] or 'Unknown'})
        if not readonly:
            success, error = v3_utils.delete_orphaned_router(
                self.plugin.nsxlib, router['id'])
            if not success:
                line = "%s failed to be removed: %s." % (line, error)
            else:
                removed += 1
                line = "%s was removed." % line
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': removed}
def run(self, context, readonly=False):
    """Report orphaned firewall sections and optionally delete them.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the orphaned sections
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedFirewallSectionJob, self).run(context)

    # Collect the orphaned firewall sections
    orphans = v3_utils.get_orphaned_firewall_sections(
        context, self.plugin.nsxlib)
    report = ""
    if not orphans:
        report = base_job.housekeeper_info(
            report, 'No orphaned firewall sections detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(orphans)
    header = ("Found %(len)s orphaned firewall section%(plural)s:" %
              {'len': total,
               'plural': 's' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    removed = 0
    for section in orphans:
        line = ("Firewall section %(name)s [id: %(id)s] "
                "neutron security group: %(sg)s" %
                {'name': section['display_name'],
                 'id': section['id'],
                 'sg': section['neutron_sg_id'] or 'Unknown'})
        if not readonly:
            try:
                self.plugin.nsxlib.firewall_section.delete(section['id'])
            except Exception as e:
                line = "%s failed to be removed: %s." % (line, e)
            else:
                removed += 1
                line = "%s was removed." % line
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': removed}
def run(self, context, readonly=False):
    """Validate every DHCP edge whose router binding is in ERROR state.

    The job counters kept on ``self`` are reset, the ERROR router bindings
    with the DHCP edge prefix are grouped per edge, and each edge is then
    validated. A failure while validating one edge is reported and does
    not stop the remaining edges from being processed.

    :param context: request context; ``context.session`` is used for the
        router bindings lookup
    :param readonly: passed through to the per-edge validation
    :returns: dict with ``error_count``, ``fixed_count`` and
        ``error_info``
    """
    super(ErrorDhcpEdgeJob, self).run(context)
    self.error_count = 0
    self.fixed_count = 0
    self.fixed_sub_if_count = 0
    self.error_info = ''

    # Gather ERROR state DHCP edges
    error_filter = {'status': [constants.ERROR]}
    broken_bindings = nsxv_db.get_nsxv_router_bindings(
        context.session, filters=error_filter)

    if not broken_bindings:
        LOG.debug('Housekeeping: no DHCP edges in ERROR state detected')
        return {'error_count': self.error_count,
                'fixed_count': self.fixed_count,
                'error_info': 'No DHCP error state edges detected'}

    with locking.LockManager.get_lock('nsx-dhcp-edge-pool'):
        # Group the DHCP router bindings by the edge they live on
        edge_dict = {}
        for binding in broken_bindings:
            if not binding['router_id'].startswith(
                    vcns_const.DHCP_EDGE_PREFIX):
                continue
            edge_dict.setdefault(binding['edge_id'], []).append(binding)

    # Get valid neutron networks and create a prefix dict.
    networks = [net['id'] for net in
                self.plugin.get_networks(context, fields=['id'])]
    pfx_len = 36 - len(vcns_const.DHCP_EDGE_PREFIX)
    pfx_dict = {net[:pfx_len]: net for net in networks}

    for edge_id in edge_dict:
        try:
            self._validate_dhcp_edge(context, edge_dict, pfx_dict,
                                     networks, edge_id, readonly)
        except Exception as e:
            self.error_count += 1
            self.error_info = base_job.housekeeper_warning(
                self.error_info,
                'Failed to recover DHCP Edge %s (%s)', edge_id, e)

    return {'error_count': self.error_count,
            'fixed_count': self.fixed_count,
            'error_info': self.error_info}
def run(self, context, readonly=False):
    """Report orphaned DHCP servers and optionally delete them.

    :param context: request context handed to the lookup and delete
        utilities
    :param readonly: when True, only report the orphaned servers
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedDhcpServerJob, self).run(context)

    # get all orphaned DHCP servers
    orphaned_servers = v3_utils.get_orphaned_dhcp_servers(
        context, self.plugin, self.plugin.nsxlib)
    info = ""
    if not orphaned_servers:
        msg = 'No orphaned DHCP servers detected.'
        info = base_job.housekeeper_info(info, msg)
        # Return the info built by the housekeeper helper (not the raw
        # message), consistently with the other orphaned-resource jobs.
        return {'error_count': 0, 'fixed_count': 0, 'error_info': info}

    msg = ("Found %(len)s orphaned DHCP server%(plural)s:" %
           {'len': len(orphaned_servers),
            'plural': 's' if len(orphaned_servers) > 1 else ''})
    info = base_job.housekeeper_warning(info, msg)

    fixed_count = 0
    for server in orphaned_servers:
        msg = ("DHCP server %(name)s [id: %(id)s] "
               "(neutron network: %(net)s)" %
               {'name': server['display_name'],
                'id': server['id'],
                'net': server['neutron_net_id']
                if server.get('neutron_net_id') else 'Unknown'})
        if not readonly:
            success, error = v3_utils.delete_orphaned_dhcp_server(
                context, self.plugin.nsxlib, server)
            if success:
                msg = "%s was removed." % msg
                fixed_count = fixed_count + 1
            else:
                msg = "%s failed to be removed: %s." % (msg, error)
        info = base_job.housekeeper_warning(info, msg)

    return {'error_count': len(orphaned_servers),
            'error_info': info,
            'fixed_count': fixed_count}
def run(self, context, readonly=False):
    """Report orphaned firewall sections and optionally delete them.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the orphaned sections
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedFirewallSectionJob, self).run(context)

    # Collect the orphaned firewall sections
    orphans = v3_utils.get_orphaned_firewall_sections(
        context, self.plugin.nsxlib)
    report = ""
    if not orphans:
        report = base_job.housekeeper_info(
            report, 'No orphaned firewall sections detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(orphans)
    header = ("Found %(len)s orphaned firewall section%(plural)s:" %
              {'len': total,
               'plural': 's' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    removed = 0
    for section in orphans:
        line = ("Firewall section %(name)s [id: %(id)s] "
                "neutron security group: %(sg)s" %
                {'name': section['display_name'],
                 'id': section['id'],
                 'sg': section['neutron_sg_id'] or 'Unknown'})
        if not readonly:
            try:
                self.plugin.nsxlib.firewall_section.delete(section['id'])
            except Exception as e:
                line = "%s failed to be removed: %s." % (line, e)
            else:
                removed += 1
                line = "%s was removed." % line
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': removed}
def run(self, context, readonly=False):
    """Report orphaned logical switches and optionally delete them.

    :param context: request context handed to the lookup utility
    :param readonly: when True, only report the orphaned switches
    :returns: dict with ``error_count``, ``error_info`` and
        ``fixed_count``
    """
    super(OrphanedLogicalSwitchJob, self).run(context)

    # Collect the orphaned logical switches
    orphans = v3_utils.get_orphaned_networks(context, self.plugin.nsxlib)
    report = ""
    if not orphans:
        report = base_job.housekeeper_info(
            report, 'No orphaned logical switches detected.')
        return {'error_count': 0, 'fixed_count': 0, 'error_info': report}

    total = len(orphans)
    header = ("Found %(len)s orphaned logical switch%(plural)s:" %
              {'len': total,
               'plural': 'es' if total > 1 else ''})
    report = base_job.housekeeper_warning(report, header)

    removed = 0
    for switch in orphans:
        line = ("Logical switch %(name)s [id: %(id)s] "
                "(neutron network: %(net)s)" %
                {'name': switch['display_name'],
                 'id': switch['id'],
                 'net': switch['neutron_net_id'] or 'Unknown'})
        if not readonly:
            try:
                self.plugin.nsxlib.logical_switch.delete(switch['id'])
            except Exception as e:
                line = "%s failed to be removed: %s." % (line, e)
            else:
                removed += 1
                line = "%s was removed." % (line)
        report = base_job.housekeeper_warning(report, line)

    return {'error_count': total,
            'error_info': report,
            'fixed_count': removed}
def run(self, context, readonly=False):
    """Report backup edges in ERROR state and optionally handle them.

    :param context: request context; ``context.session`` is used for the
        router bindings lookup
    :param readonly: when True, only report the broken backup edges
    :returns: dict with ``error_count``, ``fixed_count`` and
        ``error_info``
    """
    super(ErrorBackupEdgeJob, self).run(context)

    # Gather ERROR state backup edges
    status_filter = {'status': [constants.ERROR]}
    prefix_filter = {'router_id': vcns_const.BACKUP_ROUTER_PREFIX + "%"}
    with locking.LockManager.get_lock('nsx-edge-backup-pool'):
        broken_bindings = nsxv_db.get_nsxv_router_bindings(
            context.session,
            filters=status_filter,
            like_filters=prefix_filter)

    if not broken_bindings:
        LOG.debug('Housekeeping: no backup edges in ERROR state detected')
        return {'error_count': 0,
                'fixed_count': 0,
                'error_info': 'No backup edges in ERROR state detected'}

    found = 0
    repaired = 0
    report = ''
    # Work on the list gathered above - the set of broken backup edges
    # may change while housekeeping is running
    for binding in broken_bindings:
        found += 1
        report = base_job.housekeeper_warning(
            report, 'Backup Edge appliance %s is in ERROR state',
            binding['edge_id'])

        if readonly:
            continue
        with locking.LockManager.get_lock(binding['edge_id']):
            if self._handle_backup_edge(context, binding):
                repaired += 1

    return {'error_count': found,
            'fixed_count': repaired,
            'error_info': report}
def _validate_dhcp_edge(self, context, edge_dict, pfx_dict, networks,
                        edge_id, readonly):
    """Validate a single DHCP edge against Neutron and the backend.

    Works in three steps while holding the edge lock:

    (A) router bindings with no matching neutron network are reported
        (and deleted), and missing vnic bindings are reported (and
        allocated);
    (B) vnic bindings referring to networks which are not in ``networks``
        are reported (and freed);
    (C) the backend vnics are compared with the vnic bindings, and
        changed interfaces are pushed to the backend.

    Nothing is modified when ``readonly`` is True. Results are
    accumulated in ``self.error_count``, ``self.error_info`` and
    ``self.fixed_count``.

    :param context: request context; ``context.session`` is used for DB
        access
    :param edge_dict: dict mapping an edge id to its router bindings
    :param pfx_dict: dict mapping a network id prefix to the network id
    :param networks: valid neutron network ids
    :param edge_id: id of the edge to validate
    :param readonly: when True, only report and do not repair
    """
    # Also metadata network should be a valid network for the edge
    az_name = self.plugin.get_availability_zone_name_by_edge(
        context, edge_id)
    with locking.LockManager.get_lock(edge_id):
        vnic_binds = nsxv_db.get_edge_vnic_bindings_by_edge(
            context.session, edge_id)
        edge_networks = [bind['network_id'] for bind in vnic_binds]

        # Step (A)
        # Find router bindings which are mapped to dead networks, or
        # do not have interfaces registered in nsxv tables
        for binding in edge_dict[edge_id]:
            router_id = binding['router_id']

            net_pfx = router_id[len(vcns_const.DHCP_EDGE_PREFIX):]
            net_id = pfx_dict.get(net_pfx)

            if net_id is None:
                # Delete router binding as we do not have such network
                # in Neutron
                self.error_count += 1
                self.error_info = base_job.housekeeper_warning(
                    self.error_info,
                    'router binding %s for edge %s has no matching '
                    'neutron network', router_id, edge_id)

                if not readonly:
                    nsxv_db.delete_nsxv_router_binding(
                        context.session, binding['router_id'])
                    self.fixed_count += 1
            elif net_id not in edge_networks:
                # Create vNic bind here
                self.error_count += 1
                self.error_info = base_job.housekeeper_warning(
                    self.error_info,
                    'edge %s vnic binding missing for network %s',
                    edge_id, net_id)

                if not readonly:
                    nsxv_db.allocate_edge_vnic_with_tunnel_index(
                        context.session, edge_id, net_id, az_name)
                    self.fixed_count += 1

        # Step (B)
        # Find vNic bindings which reference invalid networks or aren't
        # bound to any router binding

        # Reread vNic binds as we might created more or deleted some in
        # step (A)
        vnic_binds = nsxv_db.get_edge_vnic_bindings_by_edge(
            context.session, edge_id)

        for bind in vnic_binds:
            if bind['network_id'] not in networks:
                self.error_count += 1
                self.error_info = base_job.housekeeper_warning(
                    self.error_info,
                    'edge vnic binding for edge %s is for invalid '
                    'network id %s', edge_id, bind['network_id'])

                if not readonly:
                    nsxv_db.free_edge_vnic_by_network(
                        context.session, edge_id, bind['network_id'])
                    self.fixed_count += 1

        # Step (C)
        # Verify that backend is in sync with Neutron

        # Reread vNic binds as we might deleted some in step (B)
        vnic_binds = nsxv_db.get_edge_vnic_bindings_by_edge(
            context.session, edge_id)

        # Transform to network-keyed dict
        vnic_dict = {
            vnic['network_id']: {
                'vnic_index': vnic['vnic_index'],
                'tunnel_index': vnic['tunnel_index'],
            }
            for vnic in vnic_binds
        }

        backend_vnics = self.plugin.nsx_v.vcns.get_interfaces(
            edge_id)[1].get('vnics', [])
        if_changed = {}

        # The subinterface counter is per edge: restart it here so that
        # repairs counted for previously validated edges are not added to
        # fixed_count a second time below.
        self.fixed_sub_if_count = 0
        self._validate_edge_subinterfaces(
            context, edge_id, backend_vnics, vnic_dict, if_changed)
        self._add_missing_subinterfaces(
            context, edge_id, vnic_binds, backend_vnics, if_changed,
            readonly)

        if not readonly:
            for vnic in backend_vnics:
                if if_changed[vnic['index']]:
                    self.plugin.nsx_v.vcns.update_interface(edge_id, vnic)

            self._update_router_bindings(context, edge_id)

            self.fixed_count += self.fixed_sub_if_count
def run(self, context, readonly=False):
    """Detect LBaaS objects which are stuck in a PENDING state.

    Objects found in a PENDING_* provisioning status are tracked in
    ``self.lbaas_objects`` between runs. An object which has been tracked
    for longer than ELEMENT_LIFETIME is reported and, unless ``readonly``
    is set, moved to ERROR status and dropped from tracking. Tracked
    objects which were not refreshed in this run are dropped as well.

    :param context: request context; ``context.session`` is queried
    :param readonly: when True, only report the stuck objects
    :returns: dict with ``error_count``, ``fixed_count`` and
        ``error_info``
    """
    super(LbaasPendingJob, self).run(context)
    curr_time = time.time()
    error_count = 0
    fixed_count = 0
    error_info = ''

    pending_states = [constants.PENDING_CREATE,
                      constants.PENDING_UPDATE,
                      constants.PENDING_DELETE]
    for model in self.lbaas_models:
        sess = context.session
        elements = sess.query(model).filter(
            model.provisioning_status.in_(pending_states)).all()

        for element in elements:
            if element['id'] in self.lbaas_objects:
                obj = self.lbaas_objects[element['id']]
                lifetime = curr_time - obj['time_added']
                if lifetime > ELEMENT_LIFETIME:
                    # Entry has been pending for more than lifetime.
                    # Report and remove when in R/W mode
                    error_count += 1
                    error_info = base_job.housekeeper_warning(
                        error_info,
                        'LBaaS %s %s is stuck in pending state',
                        model.NAME, element['id'])

                    if not readonly:
                        element['provisioning_status'] = constants.ERROR
                        fixed_count += 1
                        del self.lbaas_objects[element['id']]
                else:
                    # Entry is still pending but haven't reached lifetime
                    LOG.debug(
                        'Housekeeping: LBaaS object %s %s in '
                        'PENDING state for %d seconds',
                        model.NAME, element['id'], lifetime)
                    obj['time_seen'] = curr_time
            else:
                # Entry wasn't seen before this iteration - add to dict
                LOG.debug(
                    'Housekeeping: monitoring PENDING state for '
                    'LBaaS object %s %s', model.NAME, element['id'])
                self.lbaas_objects[element.id] = {
                    'model': model,
                    'time_added': curr_time,
                    'time_seen': curr_time
                }

    # Look for dictionary entries which weren't seen in this iteration.
    # Such entries were either removed from DB or their state was changed.
    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating its live key view raises RuntimeError on Python 3.
    for obj_id in list(self.lbaas_objects):
        if self.lbaas_objects[obj_id]['time_seen'] != curr_time:
            LOG.debug('Housekeeping: LBaaS %s %s is back to normal',
                      self.lbaas_objects[obj_id]['model'].NAME, obj_id)
            del self.lbaas_objects[obj_id]

    if error_count == 0:
        error_info = 'No LBaaS objects in pending state'
    return {
        'error_count': error_count,
        'fixed_count': fixed_count,
        'error_info': error_info
    }