def is_okay(self):
    """Report whether this network is created and its DHCP is healthy.

    Returns False when the network has not been created, or when the
    network provides DHCP, this host is the network node, and dnsmasq is
    not running. Returns True in every other case.
    """
    # TODO(andy):This will be built upon with further code re-design
    if not self.is_created():
        return False

    # dnsmasq is only consulted when DHCP is provided and this host is the
    # network node; the short-circuit keeps the checks in that order.
    dhcp_expected_here = (
        self.db_entry['provide_dhcp'] and util.is_network_node())
    if dhcp_expected_here and not self.is_dnsmasq_running():
        return False

    return True
def run(self):
    """Run the daemon loop forever.

    On the network node each pass processes network node work items; on
    any other node the pass sleeps out the remainder of the 30 second
    maintenance interval. Whenever more than 30 seconds have passed since
    the last maintenance run, networks are maintained again.
    """
    LOG.info('Starting')
    maintained_at = 0

    while True:
        if not util.is_network_node():
            # Nothing to process here, so wait until maintenance is due
            elapsed = time.time() - maintained_at
            time.sleep(max(0, 30 - elapsed))
        else:
            self._process_network_node_workitems()

        if time.time() - maintained_at > 30:
            self._maintain_networks()
            maintained_at = time.time()
def remove_dhcp(self):
    """Remove DHCP for this network, or ask the network node to do so.

    Off the network node a 'remove_dhcp' work item is queued on the
    'networknode' queue and an 'enqueued' event is recorded. On the
    network node the DHCP service is removed while holding the network
    lock.
    """
    if not util.is_network_node():
        workitem = {
            'type': 'remove_dhcp',
            'network_uuid': self.uuid
        }
        db.enqueue('networknode', workitem)
        db.add_event('network', self.uuid, 'remove dhcp', 'enqueued',
                     None, None)
        return

    subst = self.subst_dict()
    with util.RecordedOperation('remove dhcp', self):
        with db.get_lock('network', None, self.uuid, ttl=120):
            dhcp.DHCP(self.uuid, subst['vx_veth_inner']).remove_dhcpd()
def remove_dhcp(self):
    """Remove DHCP for this network, or ask the network node to do so.

    Off the network node a RemoveDHCPNetworkTask is queued on the
    'networknode' queue and an 'enqueued' event is recorded. On the
    network node the DHCP service is removed while holding this object's
    lock.
    """
    if not util.is_network_node():
        db.enqueue('networknode',
                   RemoveDHCPNetworkTask(self.db_entry['uuid']))
        db.add_event('network', self.db_entry['uuid'], 'remove dhcp',
                     'enqueued', None, None)
        return

    subst = self.subst_dict()
    with util.RecordedOperation('remove dhcp', self):
        with db.get_object_lock(self, ttl=120, op='Network remove DHCP'):
            dhcp.DHCP(self, subst['vx_veth_inner']).remove_dhcpd()
def update_dhcp(self):
    """Restart DHCP for this network, or ask the network node to do so.

    Does nothing when the network does not provide DHCP. Off the network
    node an UpdateDHCPNetworkTask is queued on the 'networknode' queue and
    an 'enqueued' event is recorded. On the network node dhcpd is
    restarted while holding this object's lock.
    """
    if not self.db_entry['provide_dhcp']:
        return

    if not util.is_network_node():
        db.enqueue('networknode',
                   UpdateDHCPNetworkTask(self.db_entry['uuid']))
        db.add_event('network', self.db_entry['uuid'], 'update dhcp',
                     'enqueued', None, None)
        return

    subst = self.subst_dict()
    with util.RecordedOperation('update dhcp', self):
        with db.get_object_lock(self, ttl=120, op='Network update DHCP'):
            dhcp.DHCP(self, subst['vx_veth_inner']).restart_dhcpd()
def update_dhcp(self):
    """Restart DHCP for this network, or ask the network node to do so.

    Does nothing when the network does not provide DHCP. Off the network
    node an 'update_dhcp' work item is queued on the 'networknode' queue
    and an 'enqueued' event is recorded. On the network node the mesh is
    ensured first, then dhcpd is restarted while holding the network lock.
    """
    if not self.provide_dhcp:
        return

    if not util.is_network_node():
        workitem = {
            'type': 'update_dhcp',
            'network_uuid': self.uuid
        }
        db.enqueue('networknode', workitem)
        db.add_event('network', self.uuid, 'update dhcp', 'enqueued',
                     None, None)
        return

    self.ensure_mesh()
    subst = self.subst_dict()
    with util.RecordedOperation('update dhcp', self):
        with db.get_lock('network', None, self.uuid, ttl=120):
            dhcp.DHCP(self.uuid, subst['vx_veth_inner']).restart_dhcpd()
def delete(self):
    """Tear down this network's interfaces on the local node.

    While holding this object's lock, the vxlan bridge and vxlan interface
    are deleted if present. On the network node the router veth, physical
    veth and network namespace are also removed if present, and any
    floating gateway address is released back to the 'floating' IP
    manager.
    """
    subst = self.subst_dict()
    LOG.withFields(subst).debug('net.delete()')

    def _delete_link(interface_key, operation, command_template):
        # Only run the delete command when the interface still exists
        if util.check_for_interface(subst[interface_key]):
            with util.RecordedOperation(operation, self):
                util.execute(None, command_template % subst)

    # Cleanup local node
    with db.get_object_lock(self, ttl=120, op='Network delete'):
        _delete_link('vx_bridge', 'delete vxlan bridge',
                     'ip link delete %(vx_bridge)s')
        _delete_link('vx_interface', 'delete vxlan interface',
                     'ip link delete %(vx_interface)s')

        # If this is the network node do additional cleanup
        if util.is_network_node():
            _delete_link('vx_veth_outer', 'delete router veth',
                         'ip link delete %(vx_veth_outer)s')
            _delete_link('physical_veth_outer', 'delete physical veth',
                         'ip link delete %(physical_veth_outer)s')

            if os.path.exists('/var/run/netns/%(netns)s' % subst):
                with util.RecordedOperation('delete netns', self):
                    util.execute(None, 'ip netns del %(netns)s' % subst)

            if self.db_entry['floating_gateway']:
                with db.get_lock('ipmanager', None, 'floating', ttl=120,
                                 op='Network delete'):
                    floating_ipm = db.get_ipmanager('floating')
                    floating_ipm.release(self.db_entry['floating_gateway'])
                    db.persist_ipmanager('floating', floating_ipm.save())
def create(self):
    """Create this network's interfaces on the local node.

    While holding this object's lock, the vxlan interface and the vxlan
    bridge are created if they are not already present. On the network
    node the network namespace, router veth pair and physical veth pair
    are then created if missing, followed by deploy_nat() and
    update_dhcp(). On any other node a DeployNetworkTask is queued on the
    'networknode' queue and an 'enqueued' event is recorded.

    Raises:
        DeadNetwork: if is_dead() reports true once the lock is held.
    """
    subst = self.subst_dict()

    with db.get_object_lock(self, ttl=120, op='Network create'):
        # Ensure network was not deleted whilst waiting for the lock.
        if self.is_dead():
            raise DeadNetwork('network=%s' % self)

        if not util.check_for_interface(subst['vx_interface']):
            with util.RecordedOperation('create vxlan interface', self):
                util.create_interface(
                    subst['vx_interface'], 'vxlan',
                    'id %(vx_id)s dev %(physical_interface)s dstport 0'
                    % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.'
                    '%(vx_interface)s.arp_notify=1' % subst)

        if not util.check_for_interface(subst['vx_bridge']):
            with util.RecordedOperation('create vxlan bridge', self):
                util.create_interface(subst['vx_bridge'], 'bridge', '')
                # Attach the vxlan interface to the bridge, then bring
                # both links up
                util.execute(
                    None,
                    'ip link set %(vx_interface)s '
                    'master %(vx_bridge)s' % subst)
                util.execute(None, 'ip link set %(vx_interface)s up' % subst)
                util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.'
                    '%(vx_bridge)s.arp_notify=1' % subst)
                # Bridge tuning: zero forward delay, STP off, zero ageing
                util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                util.execute(None, 'brctl setageing %(vx_bridge)s 0' % subst)

    # NOTE(review): update_dhcp() acquires this object's lock itself, so
    # this section presumably has to run after the lock above has been
    # released -- confirm against the original indentation.
    if util.is_network_node():
        if not os.path.exists('/var/run/netns/%(netns)s' % subst):
            with util.RecordedOperation('create netns', self):
                util.execute(None, 'ip netns add %(netns)s' % subst)

        if not util.check_for_interface(subst['vx_veth_outer']):
            with util.RecordedOperation('create router veth', self):
                util.create_interface(
                    subst['vx_veth_outer'], 'veth',
                    'peer name %(vx_veth_inner)s' % subst)
                # Inner end moves into the namespace, outer end joins the
                # vxlan bridge
                util.execute(
                    None,
                    'ip link set %(vx_veth_inner)s netns %(netns)s' % subst)
                util.execute(
                    None,
                    'brctl addif %(vx_bridge)s %(vx_veth_outer)s' % subst)
                util.execute(None, 'ip link set %(vx_veth_outer)s up' % subst)
                util.execute(
                    None,
                    '%(in_netns)s ip link set %(vx_veth_inner)s up' % subst)
                # The router address is assigned to the inner end
                util.execute(
                    None,
                    '%(in_netns)s ip addr add %(router)s/%(netmask)s '
                    'dev %(vx_veth_inner)s' % subst)

        if not util.check_for_interface(subst['physical_veth_outer']):
            with util.RecordedOperation('create physical veth', self):
                util.create_interface(
                    subst['physical_veth_outer'], 'veth',
                    'peer name %(physical_veth_inner)s' % subst)
                # Outer end joins the physical bridge, inner end moves
                # into the namespace
                util.execute(
                    None,
                    'brctl addif %(physical_bridge)s '
                    '%(physical_veth_outer)s' % subst)
                util.execute(
                    None,
                    'ip link set %(physical_veth_outer)s up' % subst)
                util.execute(
                    None,
                    'ip link set %(physical_veth_inner)s '
                    'netns %(netns)s' % subst)

        self.deploy_nat()
        self.update_dhcp()
    else:
        # Other nodes hand the network node portion of the work over
        db.enqueue('networknode', DeployNetworkTask(self.db_entry['uuid']))
        db.add_event('network', self.db_entry['uuid'], 'deploy', 'enqueued',
                     None, None)
def main():
    """Start all daemons on this node and supervise them forever.

    Logs the configuration, clears stale locks, checks the node in and
    restarts queues. On the network node it also bootstraps the floating
    network record and, if missing, the physical bridge with its
    forwarding and NAT rules. It then forks one child per entry in
    DAEMON_IMPLEMENTATIONS and loops every 10 seconds, reaping dead
    children, restarting missing daemons and checking the node in.

    This function does not return.
    """
    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    setproctitle.setproctitle(daemon.process_name('main'))

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items
    db.clear_stale_locks()
    db.see_this_node()
    db.restart_queues()

    def _start_daemon(d):
        # Fork a child running daemon d and remember its pid.
        pid = os.fork()
        if pid == 0:
            # NOTE(review): if run() ever returns, the child falls through
            # and carries on executing the parent's code -- confirm that
            # run() never returns.
            DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
        DAEMON_PIDS[pid] = d
        LOG.withField('pid', pid).info('Started %s' % d)

    # Resource usage publisher, we need this early because scheduling decisions
    # might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if util.is_network_node():
        # Bootstrap the floating network in the Networks table
        floating_network = db.get_network('floating')
        if not floating_network:
            db.create_floating_network(config.get('FLOATING_NETWORK'))
            floating_network = net.from_db('floating')

        subst = {
            'physical_bridge': util.get_safe_interface_name(
                'phy-br-%s' % config.get('NODE_EGRESS_NIC')),
            'physical_nic': config.get('NODE_EGRESS_NIC')
        }

        if not util.check_for_interface(subst['physical_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and the wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = db.get_ipmanager('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                util.create_interface(subst['physical_bridge'], 'bridge', '')
                util.execute(None,
                             'ip link set %(physical_bridge)s up' % subst)
                util.execute(
                    None,
                    'ip addr add %(master_float)s/%(netmask)s '
                    'dev %(physical_bridge)s' % subst)

                util.execute(
                    None,
                    'iptables -A FORWARD -o %(physical_nic)s '
                    '-i %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None,
                    'iptables -A FORWARD -i %(physical_nic)s '
                    '-o %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None,
                    'iptables -t nat -A POSTROUTING '
                    '-o %(physical_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        # Start any daemon with no recorded pid, then replace any daemon
        # whose recorded pid no longer exists.
        running_daemons = set(DAEMON_PIDS.values())

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        # Iterate over a snapshot: _start_daemon() inserts into DAEMON_PIDS
        # and changing a dict while iterating it raises RuntimeError.
        for pid, name in list(DAEMON_PIDS.items()):
            if not psutil.pid_exists(pid):
                LOG.warning('%s pid is missing, restarting' % name)
                # Forget the stale pid, otherwise the same daemon would be
                # restarted again on every later audit.
                del DAEMON_PIDS[pid]
                _start_daemon(name)

    def _reap_daemons():
        # Collect every child which has exited, without blocking.
        while True:
            try:
                wpid, _ = os.waitpid(-1, os.WNOHANG)
            except ChildProcessError:
                # No children are left at all; the audit restarts them
                return

            if wpid == 0:
                return

            # pop() tolerates children which were never recorded, matching
            # the 'unknown' fallback in the log message.
            LOG.warning('%s died (pid %d)'
                        % (DAEMON_PIDS.pop(wpid, 'unknown'), wpid))

    _audit_daemons()
    restore_instances()

    while True:
        time.sleep(10)

        _reap_daemons()
        _audit_daemons()
        db.see_this_node()
def _get_stats():
    """Collect a snapshot of this node's resource usage.

    Gathers CPU, load average, memory, KSM, disk, network, per-instance
    and queue length figures from libvirt, psutil, /sys/kernel/mm/ksm,
    the storage path and the database.

    Returns:
        dict: statistic name to value. Memory figures are in MB; disk and
        network figures are the byte counts reported by the OS.
    """
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # The connection is closed explicitly once the statistics have been
    # read, rather than being left open after every call.
    try:
        # CPU info
        present_cpus, _, available_cpus = conn.getCPUMap()
        retval.update({
            'cpu_max': present_cpus,
            'cpu_available': available_cpus,
        })

        retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

        # This is disabled as data we don't currently use
        # for i in range(present_cpus):
        #    per_cpu_stats = conn.getCPUStats(i)
        #    for key in per_cpu_stats:
        #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

        try:
            load_1, load_5, load_15 = psutil.getloadavg()
            retval.update({
                'cpu_load_1': load_1,
                'cpu_load_5': load_5,
                'cpu_load_15': load_15,
            })
        except Exception as e:
            # Load average is best effort, the other stats still matter
            util.ignore_exception('load average', e)

        # System memory info, converting bytes to mb
        stats = psutil.virtual_memory()
        retval.update({
            'memory_max': stats.total // 1024 // 1024,
            'memory_available': stats.available // 1024 // 1024
        })

        # libvirt memory info, converting kb to mb
        memory_status = conn.getMemoryStats(
            libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
        retval.update({
            'memory_max_libvirt': memory_status['total'] // 1024,
            'memory_available_libvirt': memory_status['free'] // 1024,
        })

        # Kernel Shared Memory (KSM) information
        ksm_details = {}
        for ent in os.listdir('/sys/kernel/mm/ksm'):
            with open('/sys/kernel/mm/ksm/%s' % ent) as f:
                ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
        retval.update(ksm_details)

        # Disk info
        s = os.statvfs(config.get('STORAGE_PATH'))
        disk_counters = psutil.disk_io_counters()
        retval.update({
            'disk_total': s.f_frsize * s.f_blocks,
            'disk_free': s.f_frsize * s.f_bavail,
            'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
            'disk_read_bytes': disk_counters.read_bytes,
            'disk_write_bytes': disk_counters.write_bytes,
        })

        # Network info
        net_counters = psutil.net_io_counters()
        retval.update({
            'network_read_bytes': net_counters.bytes_recv,
            'network_write_bytes': net_counters.bytes_sent,
        })

        # Virtual machine consumption info
        total_instances = 0
        total_active_instances = 0
        total_instance_max_memory = 0
        total_instance_actual_memory = 0
        total_instance_vcpus = 0
        total_instance_cpu_time = 0

        for guest in conn.listAllDomains():
            try:
                active = guest.isActive() == 1
                if active:
                    _, maxmem, mem, cpus, cpu_time = guest.info()

            except libvirt.libvirtError as e:
                # A guest which errors here is left out of the totals
                LOG.debug(
                    'During resource calc ignored libvirt error: %s' % e)
                active = False

            if active:
                total_instances += 1
                total_active_instances += 1
                total_instance_max_memory += maxmem
                total_instance_actual_memory += mem
                total_instance_vcpus += cpus
                total_instance_cpu_time += cpu_time

        # Queue health statistics
        node_queue_processing, node_queue_waiting = db.get_queue_length(
            config.NODE_NAME)

        retval.update({
            'cpu_total_instance_vcpus': total_instance_vcpus,
            'cpu_total_instance_cpu_time': total_instance_cpu_time,
            'memory_total_instance_max': total_instance_max_memory // 1024,
            'memory_total_instance_actual':
                total_instance_actual_memory // 1024,
            'instances_total': total_instances,
            'instances_active': total_active_instances,
            'node_queue_processing': node_queue_processing,
            'node_queue_waiting': node_queue_waiting,
        })

        if util.is_network_node():
            network_queue_processing, network_queue_waiting = (
                db.get_queue_length('networknode'))

            retval.update({
                'network_queue_processing': network_queue_processing,
                'network_queue_waiting': network_queue_waiting,
            })
    finally:
        conn.close()

    return retval
def restart_queues():
    """Requeue work items which were left in processing.

    Restarts this node's own queue, and also the 'networknode' queue when
    this host is the network node.
    """
    # Move things which were in processing back to the queue because
    # we didn't complete them before crashing.
    on_network_node = util.is_network_node()
    if on_network_node:
        _restart_queue('networknode')

    _restart_queue(config.NODE_NAME)
def test_is_network_node_no(self):
    """Expect util.is_network_node() to report a false value."""
    result = util.is_network_node()
    self.assertFalse(result)
def create(self):
    """Create this network's interfaces on the local node.

    While holding the network lock, the vxlan interface and the vxlan
    bridge are created if they are not already present. On the network
    node the network namespace, router veth pair and physical veth pair
    are then created if missing, followed by deploy_nat() and
    update_dhcp(). On any other node a 'deploy' work item is queued on the
    'networknode' queue and an 'enqueued' event is recorded.
    """
    subst = self.subst_dict()

    with db.get_lock('network', None, self.uuid, ttl=120):
        if not util.check_for_interface(subst['vx_interface']):
            with util.RecordedOperation('create vxlan interface', self):
                util.execute(
                    None,
                    'ip link add %(vx_interface)s type vxlan id %(vx_id)s '
                    'dev %(physical_interface)s dstport 0' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.%(vx_interface)s.arp_notify=1' % subst)

        if not util.check_for_interface(subst['vx_bridge']):
            with util.RecordedOperation('create vxlan bridge', self):
                util.execute(
                    None, 'ip link add %(vx_bridge)s type bridge' % subst)
                # Attach the vxlan interface to the bridge, then bring
                # both links up
                util.execute(
                    None,
                    'ip link set %(vx_interface)s master %(vx_bridge)s' % subst)
                util.execute(None, 'ip link set %(vx_interface)s up' % subst)
                util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                util.execute(
                    None,
                    'sysctl -w net.ipv4.conf.%(vx_bridge)s.arp_notify=1' % subst)
                # Bridge tuning: zero forward delay, STP off, zero ageing
                util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                util.execute(None, 'brctl setageing %(vx_bridge)s 0' % subst)

    # NOTE(review): update_dhcp() acquires the same network lock itself, so
    # this section presumably has to run after the lock above has been
    # released -- confirm against the original indentation.
    if util.is_network_node():
        if not os.path.exists('/var/run/netns/%(netns)s' % subst):
            with util.RecordedOperation('create netns', self):
                util.execute(None, 'ip netns add %(netns)s' % subst)

        if not util.check_for_interface(subst['vx_veth_outer']):
            with util.RecordedOperation('create router veth', self):
                util.execute(
                    None,
                    'ip link add %(vx_veth_outer)s type veth peer name %(vx_veth_inner)s' % subst)
                # Inner end moves into the namespace, outer end joins the
                # vxlan bridge
                util.execute(
                    None,
                    'ip link set %(vx_veth_inner)s netns %(netns)s' % subst)
                util.execute(
                    None,
                    'brctl addif %(vx_bridge)s %(vx_veth_outer)s' % subst)
                util.execute(None, 'ip link set %(vx_veth_outer)s up' % subst)
                util.execute(
                    None,
                    '%(in_netns)s ip link set %(vx_veth_inner)s up' % subst)
                # The router address is assigned to the inner end
                util.execute(
                    None,
                    '%(in_netns)s ip addr add %(router)s/%(netmask)s dev %(vx_veth_inner)s' % subst)

        if not util.check_for_interface(subst['physical_veth_outer']):
            with util.RecordedOperation('create physical veth', self):
                util.execute(
                    None,
                    'ip link add %(physical_veth_outer)s type veth peer name '
                    '%(physical_veth_inner)s' % subst)
                # Outer end joins the physical bridge, inner end moves
                # into the namespace
                util.execute(
                    None,
                    'brctl addif %(physical_bridge)s %(physical_veth_outer)s' % subst)
                util.execute(
                    None,
                    'ip link set %(physical_veth_outer)s up' % subst)
                util.execute(
                    None,
                    'ip link set %(physical_veth_inner)s netns %(netns)s' % subst)

        self.deploy_nat()
        self.update_dhcp()
    else:
        # Other nodes hand the network node portion of the work over
        db.enqueue('networknode', {
            'type': 'deploy',
            'network_uuid': self.uuid
        })
        db.add_event('network', self.uuid, 'deploy', 'enqueued', None, None)
def test_is_network_node_no(self):
    """Parse the config, then expect is_network_node() to be false."""
    config.parsed.parse()
    result = util.is_network_node()
    self.assertFalse(result)
def test_is_network_node_yes(self):
    """Parse the config, then expect is_network_node() to be true."""
    config.parsed.parse()
    result = util.is_network_node()
    self.assertTrue(result)
def _maintain_networks(self):
    """Bring this node's networks in line with the database.

    Works out which networks this node should host: the networks of local
    instances on a normal node, or every network on the network node. The
    network node also hard deletes network interfaces whose instance is
    missing or in state 'deleted', 'error' or 'unknown'. Each hosted
    network which is not okay is recreated, and its mesh is ensured.
    Networks whose state changed within the last 60 seconds are skipped
    for this pass. Finally, vxlans present on the node but not accounted
    for are logged, and the discovered vxid mapping is persisted.

    A lock failure on one network is logged and does not stop the pass.
    """
    LOG.info('Maintaining networks')

    # Discover what networks are present
    _, _, vxid_to_mac = util.discover_interfaces()

    # Determine what networks we should be on
    host_networks = []
    seen_vxids = []

    if not util.is_network_node():
        # For normal nodes, just the ones we have instances for
        for inst in list(db.get_instances(
                only_node=config.parsed.get('NODE_NAME'))):
            for iface in db.get_instance_interfaces(inst['uuid']):
                if iface['network_uuid'] not in host_networks:
                    host_networks.append(iface['network_uuid'])
    else:
        # For network nodes, its all networks
        for n in db.get_networks():
            host_networks.append(n['uuid'])

            # Network nodes also look for interfaces for absent instances
            # and delete them
            for ni in db.get_network_interfaces(n['uuid']):
                inst = db.get_instance(ni['instance_uuid'])
                if (not inst
                        or inst.get('state', 'unknown') in ['deleted',
                                                            'error',
                                                            'unknown']):
                    db.hard_delete_network_interface(ni['uuid'])
                    LOG.withInstance(
                        ni['instance_uuid']).withNetworkInterface(
                        ni['uuid']).info('Hard deleted stray network interface')

    # Ensure we are on every network we have a host for
    for network in host_networks:
        try:
            n = net.from_db(network)
            if not n:
                continue

            # Age is "now minus last update". The operands used to be the
            # other way around, which gave a negative number, so every
            # network was skipped and none was ever repaired.
            if time.time() - n.db_entry['state_updated'] < 60:
                # Network state changed in the last minute, punt for now
                continue

            if not n.is_okay():
                LOG.withObj(n).info('Recreating not okay network')
                n.create()

            n.ensure_mesh()
            seen_vxids.append(n.vxlan_id)

        except exceptions.LockException as e:
            LOG.warning(
                'Failed to acquire lock while maintaining networks: %s' % e)

    # Determine if there are any extra vxids
    extra_vxids = set(vxid_to_mac.keys()) - set(seen_vxids)

    # Delete "deleted" SF networks and log unknown vxlans
    if extra_vxids:
        LOG.withField('vxids', extra_vxids).warning(
            'Extra vxlans present!')

        # Determine the network uuids for those vxids
        # vxid_to_uuid = {}
        # for n in db.get_networks():
        #     vxid_to_uuid[n['vxid']] = n['uuid']

        # for extra in extra_vxids:
        #     if extra in vxid_to_uuid:
        #         with db.get_lock('network', None, vxid_to_uuid[extra],
        #                          ttl=120, op='Network reap VXLAN'):
        #             n = net.from_db(vxid_to_uuid[extra])
        #             n.delete()
        #             LOG.info('Extra vxlan %s (network %s) removed.'
        #                      % (extra, vxid_to_uuid[extra]))
        #     else:
        #         LOG.error('Extra vxlan %s does not map to any network.'
        #                   % extra)

    # And record vxids in the database
    db.persist_node_vxid_mapping(
        config.parsed.get('NODE_NAME'), vxid_to_mac)
def test_is_network_node_yes(self):
    """Expect util.is_network_node() to report a true value."""
    result = util.is_network_node()
    self.assertTrue(result)