def test_from_db(self, mock_get, mock_time):
    """Check the in_use map of an IPManager loaded for the floating network.

    mock_get and mock_time are supplied by decorators outside this block;
    presumably they stub the database read and the clock -- TODO confirm.
    """
    # Every reservation is expected to carry the same owner and timestamp,
    # so the expected map is derived from the list of addresses.
    reserved_addresses = (
        '192.168.20.0',
        '192.168.20.1',
        '192.168.20.101',
        '192.168.20.255',
        '192.168.20.75',
    )
    expected_in_use = {
        address: {
            'user': ('ipmanager', 'uuid'),
            'when': 1632261535.027476
        }
        for address in reserved_addresses
    }

    ipm = IPManager.from_db('floating')
    self.assertEqual(expected_in_use, ipm.in_use)
def delete_on_network_node(self):
    """Tear this network down on the network node and mark it deleted.

    Under the network's own lock: removes the router and egress veth
    interfaces and the network namespace if they exist, releases the
    floating gateway (if any) back to the floating IPManager, and sets the
    state to STATE_DELETED. Afterwards a destroy task is enqueued for every
    active node, DHCP and NAT are removed, and this network's own IPManager
    record is deleted.
    """
    # (subst key of the interface, recorded operation name, command template)
    veth_teardowns = (
        ('vx_veth_outer', 'delete router veth',
         'ip link delete %(vx_veth_outer)s'),
        ('egress_veth_outer', 'delete egress veth',
         'ip link delete %(egress_veth_outer)s'),
    )

    with self.get_lock(op='Network delete'):
        subst = self.subst_dict()

        for subst_key, operation, command in veth_teardowns:
            if util_network.check_for_interface(subst[subst_key]):
                with util_general.RecordedOperation(operation, self):
                    util_process.execute(None, command % subst)

        if os.path.exists('/var/run/netns/%s' % self.uuid):
            with util_general.RecordedOperation('delete netns', self):
                util_process.execute(
                    None, 'ip netns del %s' % self.uuid)

        if self.floating_gateway:
            with db.get_lock('ipmanager', None, 'floating', ttl=120,
                             op='Network delete'):
                floating_ipm = IPManager.from_db('floating')
                floating_ipm.release(self.floating_gateway)
                floating_ipm.persist()
                self.update_floating_gateway(None)

        self.state = self.STATE_DELETED

    # Ensure that all hypervisors remove this network. This is really
    # just catching strays, apart from on the network node where we
    # absolutely need to do this thing.
    for node in Nodes([active_nodes]):
        destroy_request = {'tasks': [
            HypervisorDestroyNetworkTask(self.uuid)
        ]}
        etcd.enqueue(node.uuid, destroy_request)

    self.remove_dhcp()
    self.remove_nat()

    # Finally drop the address bookkeeping for this network itself.
    IPManager.from_db(self.uuid).delete()
def remove_nat(self):
    """Release this network's floating gateway, or ask the network node to.

    On a node that is not the network node a RemoveNATNetworkTask is queued
    for 'networknode'. On the network node the floating gateway (if one is
    set) is released in the floating IPManager and cleared on the network.
    """
    if not config.NODE_IS_NETWORK_NODE:
        etcd.enqueue('networknode', RemoveNATNetworkTask(self.uuid))
        return

    if not self.floating_gateway:
        # Nothing was allocated, so there is nothing to release.
        return

    with db.get_lock('ipmanager', None, 'floating', ttl=120,
                     op='Remove NAT'):
        floating_ipm = IPManager.from_db('floating')
        floating_ipm.release(self.floating_gateway)
        floating_ipm.persist()
        self.update_floating_gateway(None)
def assign_floating_ip(ni):
    """Allocate a floating address for a network interface and record it.

    Returns the api_base 404 error if the floating network does not exist;
    otherwise returns None after storing the address on ``ni.floating``.
    """
    if not net.Network.from_db('floating'):
        return api_base.error(404, 'floating network not found')

    # Address is allocated and added to the record here, so the job has it later.
    db.add_event('interface', ni.uuid, 'api', 'float', None, None)

    floating_lock = db.get_lock('ipmanager', None, 'floating', ttl=120,
                                op='Interface float')
    with floating_lock:
        floating_ipm = IPManager.from_db('floating')
        address = floating_ipm.get_random_free_address(ni.unique_label())
        floating_ipm.persist()

    ni.floating = address
def delete(self):
    """Release this interface's addresses and mark it deleted.

    If the interface has a floating address, a DefloatNetworkInterfaceTask
    is queued for 'networknode'. The fixed IPv4 address is then released in
    the IPManager of the interface's network (under that IPManager's lock)
    and the state is set to dbo.STATE_DELETED.
    """
    # Read the key with .get() like the other users of the floating
    # attribute do, so a record without 'floating_address' is treated as
    # "not floating" instead of raising KeyError before the fixed address
    # has been released.
    if self.floating.get('floating_address'):
        etcd.enqueue(
            'networknode',
            DefloatNetworkInterfaceTask(self.network_uuid, self.uuid))

    with db.get_lock('ipmanager', None, self.network_uuid, ttl=120,
                     op='Release fixed IP'):
        ipm = IPManager.from_db(self.network_uuid)
        ipm.release(self.ipv4)
        ipm.persist()

    self.state = dbo.STATE_DELETED
def _process_networkinterface_workitem(self, log_ctx, workitem):
    """Handle a float or defloat work item for a network interface.

    The work item is dropped (with a log message) when its network or
    interface no longer exists. Defloat is still performed for a network
    that is dead but in STATE_DELETE_WAIT; float is only performed for a
    network that is not dead.
    """
    log_ctx = log_ctx.with_networkinterface(workitem.interface_uuid())

    network = net.Network.from_db(workitem.network_uuid())
    if not network:
        log_ctx.warning('Received work item for non-existent network')
        return

    interface = NetworkInterface.from_db(workitem.interface_uuid())
    if not interface:
        log_ctx.warning(
            'Received work item for non-existent network interface')
        return

    # Tasks that should not operate on a dead or delete waiting network
    if network.is_dead():
        if network.state.value != net.Network.STATE_DELETE_WAIT:
            log_ctx.with_fields({
                'state': network.state,
                'workitem': workitem
            }).info('Received work item for a completely dead network')
            return

    if isinstance(workitem, DefloatNetworkInterfaceTask):
        network.remove_floating_ip(
            interface.floating.get('floating_address'), interface.ipv4)

        db.add_event('interface', interface.uuid, 'api', 'defloat',
                     None, None)
        with db.get_lock('ipmanager', None, 'floating', ttl=120,
                         op='Instance defloat'):
            floating_ipm = IPManager.from_db('floating')
            floating_ipm.release(interface.floating.get('floating_address'))
            floating_ipm.persist()

        interface.floating = None

    # Tasks that should not operate on a dead network
    if network.is_dead():
        log_ctx.with_fields({
            'state': network.state,
            'workitem': workitem
        }).info('Received work item for a dead network')
        return

    if isinstance(workitem, FloatNetworkInterfaceTask):
        network.add_floating_ip(
            interface.floating.get('floating_address'), interface.ipv4)
def __init__(self, static_values):
    """Populate the network from its static values and its IPManager.

    static_values is read with .get(), so missing keys yield None, except
    for 'egress_nic' and 'mesh_nic' which fall back to the node's configured
    NICs. Addressing details are taken from the IPManager stored under this
    network's uuid.
    """
    lookup = static_values.get
    super(Network, self).__init__(lookup('uuid'), lookup('version'))

    # Values copied straight from the stored record.
    self.__name = lookup('name')
    self.__namespace = lookup('namespace')
    self.__netblock = lookup('netblock')
    self.__provide_dhcp = lookup('provide_dhcp')
    self.__provide_nat = lookup('provide_nat')
    self.__vxid = lookup('vxid')

    # NICs default to this node's configuration when not recorded.
    self.egress_nic = lookup('egress_nic', config.NODE_EGRESS_NIC)
    self.mesh_nic = lookup('mesh_nic', config.NODE_MESH_NIC)

    # Addressing derived from the network's address manager.
    address_manager = IPManager.from_db(self.uuid)
    self.__ipblock = address_manager.network_address
    self.__router = address_manager.get_address_at_index(1)
    self.__dhcp_start = address_manager.get_address_at_index(2)
    self.__netmask = address_manager.netmask
    self.__broadcast = address_manager.broadcast_address
    self.__network_address = address_manager.network_address
def _reap_leaked_floating_ips(self):
    """Reconcile the floating IPManager with the addresses actually in use.

    Waits until the 'networknode' queue is empty, then, under the floating
    IPManager lock:

    * reserves any floating gateway or interface floating address that is
      in use by an active network / interface but marked free;
    * releases any reserved address that nothing refers to, provided it was
      allocated more than 300 seconds ago.
    """
    # Block until the network node queue is idle to avoid races
    processing, waiting = etcd.get_queue_length('networknode')
    while processing + waiting > 0:
        self.exit.wait(60)
        processing, waiting = etcd.get_queue_length('networknode')

    # Ensure we haven't leaked any floating IPs (because we used to)
    with db.get_lock('ipmanager', None, 'floating', ttl=120,
                     op='Cleanup leaks'):
        floating_ipm = IPManager.from_db('floating')

        # Collect floating gateways and floating IPs, while ensuring that
        # they are correctly reserved on the floating network as well
        floating_gateways = []
        for n in net.Networks([baseobject.active_states_filter]):
            fg = n.floating_gateway
            if fg:
                floating_gateways.append(fg)
                if floating_ipm.is_free(fg):
                    floating_ipm.reserve(fg, n.unique_label())
                    floating_ipm.persist()
                    LOG.with_fields({
                        'network': n.uuid,
                        'address': fg
                    }).error('Floating gateway not reserved correctly')

        LOG.info('Found floating gateways: %s' % floating_gateways)

        floating_addresses = []
        for ni in networkinterface.NetworkInterfaces(
                [baseobject.active_states_filter]):
            fa = ni.floating.get('floating_address')
            if fa:
                floating_addresses.append(fa)
                if floating_ipm.is_free(fa):
                    floating_ipm.reserve(fa, ni.unique_label())
                    floating_ipm.persist()
                    LOG.with_fields({
                        'networkinterface': ni.uuid,
                        'address': fa
                    }).error('Floating address not reserved correctly')

        LOG.info('Found floating addresses: %s' % floating_addresses)

        floating_reserved = [
            floating_ipm.get_address_at_index(0),
            floating_ipm.get_address_at_index(1),
            floating_ipm.broadcast_address,
            floating_ipm.network_address
        ]

        LOG.info('Found floating reservations: %s' % floating_reserved)

        # Now the reverse check. Test if there are any reserved IPs which
        # are not actually in use. Free any we find. The set of legitimate
        # addresses is built once here rather than re-chaining and linearly
        # scanning the three lists for every in-use address.
        accounted_for = set(itertools.chain(
            floating_gateways, floating_addresses, floating_reserved))

        leaks = []
        for ip in floating_ipm.in_use:
            if ip not in accounted_for:
                LOG.error('Floating IP %s has leaked.' % ip)

                # This IP needs to have been allocated more than 300 seconds
                # ago to ensure that the network setup isn't still queued.
                if time.time() - floating_ipm.in_use[ip]['when'] > 300:
                    leaks.append(ip)

        # Release after the scan so in_use is not modified while iterating.
        for ip in leaks:
            LOG.error('Leaked floating IP %s has been released.' % ip)
            floating_ipm.release(ip)
        floating_ipm.persist()
def main():
    """Run the main supervisor process for this node.

    Marks the node as created, logs the configuration, clears stale locks
    and restarts queues, forks the 'resources' daemon, performs network
    node bootstrap when this node is the network node (floating network
    record plus the egress bridge and its iptables rules), starts all
    remaining daemons and restores instances.

    It then loops every five seconds: reaping exited children, and either
    auditing daemons (normal operation) or, once the node is stopping,
    sending SIGTERM to the daemons once and returning after all of them
    have exited and the node has been marked stopped.
    """
    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    LOG.info('Starting...')
    setproctitle.setproctitle(
        daemon.process_name('main') + '-v%s' % util_general.get_version())

    # If you ran this, it means we're not shutting down any more
    n = Node.new(config.NODE_NAME, config.NODE_MESH_IP)
    n.state = Node.STATE_CREATED

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items.
    etcd.clear_stale_locks()
    Node.observe_this_node()
    etcd.restart_queues()

    def _start_daemon(d):
        # Fork a child which runs the named daemon's monitor; the parent
        # records the child's pid against the daemon name.
        pid = os.fork()
        if pid == 0:
            try:
                DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
                sys.exit(0)
            except Exception as e:
                util_general.ignore_exception('daemon creation', e)
                sys.exit(1)

        DAEMON_PIDS[pid] = d
        LOG.with_field('pid', pid).info('Started %s' % d)

    # Resource usage publisher, we need this early because scheduling decisions
    # might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if config.NODE_IS_NETWORK_NODE:
        # Bootstrap the floating network in the Networks table
        floating_network = net.Network.from_db('floating')
        if not floating_network:
            floating_network = net.Network.create_floating_network(
                config.FLOATING_NETWORK)

        subst = {
            'egress_bridge': util_network.get_safe_interface_name(
                'egr-br-%s' % config.NODE_EGRESS_NIC),
            'egress_nic': config.NODE_EGRESS_NIC
        }

        if not util_network.check_for_interface(subst['egress_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and the wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util_general.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = IPManager.from_db('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                # We need to copy the MTU of the interface we are bridging to
                # or weird networking things happen.
                mtu = util_network.get_interface_mtu(config.NODE_EGRESS_NIC)
                util_network.create_interface(
                    subst['egress_bridge'], 'bridge', '', mtu=mtu)

                util_process.execute(None,
                                     'ip link set %(egress_bridge)s up' % subst)
                util_process.execute(None,
                                     'ip addr add %(master_float)s/%(netmask)s '
                                     'dev %(egress_bridge)s' % subst)

                util_process.execute(None,
                                     'iptables -A FORWARD -o %(egress_nic)s '
                                     '-i %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(None,
                                     'iptables -A FORWARD -i %(egress_nic)s '
                                     '-o %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(None,
                                     'iptables -t nat -A POSTROUTING '
                                     '-o %(egress_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        # Start any daemon implementation which has no recorded pid.
        running_daemons = []
        for pid in DAEMON_PIDS:
            running_daemons.append(DAEMON_PIDS[pid])

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        # Restart any daemon whose recorded pid no longer exists. Iterate a
        # snapshot because the table is modified inside the loop.
        for pid in list(DAEMON_PIDS):
            if not psutil.pid_exists(pid):
                name = DAEMON_PIDS[pid]
                LOG.warning('%s pid is missing, restarting' % name)

                # Forget the vanished pid before restarting. If it stayed in
                # the table every later audit would restart the daemon again
                # and shutdown could never observe an empty table.
                del DAEMON_PIDS[pid]
                _start_daemon(name)

    _audit_daemons()
    restore_instances()

    running = True
    while True:
        time.sleep(5)

        try:
            # Reap every child that has exited without blocking.
            wpid, _ = os.waitpid(-1, os.WNOHANG)
            while wpid != 0:
                LOG.warning('%s exited (pid %d)'
                            % (DAEMON_PIDS.get(wpid, 'unknown'), wpid))
                if wpid in DAEMON_PIDS:
                    del DAEMON_PIDS[wpid]
                wpid, _ = os.waitpid(-1, os.WNOHANG)

        except ChildProcessError:
            # We get this if there are no child processes
            pass

        n = Node.from_db(config.NODE_NAME)
        if n.state.value not in [Node.STATE_STOPPING, Node.STATE_STOPPED]:
            _audit_daemons()
            Node.observe_this_node()

        elif len(DAEMON_PIDS) == 0:
            n.state = Node.STATE_STOPPED
            return

        else:
            # Stopping with daemons still alive: signal them once, then keep
            # looping until they have all been reaped.
            if running:
                for pid in DAEMON_PIDS:
                    try:
                        os.kill(pid, signal.SIGTERM)
                        LOG.info('Sent SIGTERM to %s (pid %s)'
                                 % (DAEMON_PIDS.get(pid, 'unknown'), pid))
                    except OSError as e:
                        # LOG.warning rather than the deprecated LOG.warn
                        # alias, matching the rest of this function.
                        LOG.warning('Failed to send SIGTERM to %s: %s'
                                    % (pid, e))

                running = False
def create_on_network_node(self):
    """Create this network's router namespace and plumbing on the network node.

    Does nothing for the 'floating' network. Otherwise, under the network's
    lock, it runs the common creation step, creates the network namespace,
    the router veth pair and the egress veth pair if they are missing, and,
    when NAT is provided, allocates a floating gateway if needed, configures
    it inside the namespace, fixes up the default route and enables NAT.
    DHCP is then updated and the state set to STATE_CREATED.

    Raises:
        DeadNetwork: if the network is dead when the lock is taken, or is
            found to be dead after setup has completed.
    """
    # The floating network does not have a vxlan mesh
    if self.uuid == 'floating':
        return

    with self.get_lock(op='create_on_network_node'):
        if self.is_dead():
            raise DeadNetwork('network=%s' % self)

        self._create_common()

        subst = self.subst_dict()
        if not os.path.exists('/var/run/netns/%s' % self.uuid):
            with util_general.RecordedOperation('create netns', self):
                util_process.execute(None, 'ip netns add %s' % self.uuid)

        if not util_network.check_for_interface(subst['vx_veth_outer']):
            with util_general.RecordedOperation('create router veth', self):
                # Create the veth pair and move the inner end into the
                # network's namespace.
                util_network.create_interface(
                    subst['vx_veth_outer'], 'veth',
                    'peer name %(vx_veth_inner)s' % subst)
                util_process.execute(
                    None, 'ip link set %(vx_veth_inner)s netns %(netns)s' % subst)

                # Refer to bug 952 for more details here, but it turns out
                # that adding an interface to a bridge overwrites the MTU of
                # the bridge in an undesirable way. So we lookup the existing
                # MTU and then re-specify it here.
                subst['vx_bridge_mtu'] = util_network.get_interface_mtu(
                    subst['vx_bridge'])
                util_process.execute(
                    None,
                    'ip link set %(vx_veth_outer)s master %(vx_bridge)s '
                    'mtu %(vx_bridge_mtu)s' % subst)

                util_process.execute(
                    None, 'ip link set %(vx_veth_outer)s up' % subst)
                util_process.execute(
                    None, 'ip link set %(vx_veth_inner)s up' % subst,
                    namespace=self.uuid)
                # The inner end carries the router address for the network.
                util_process.execute(
                    None,
                    'ip addr add %(router)s/%(netmask)s '
                    'dev %(vx_veth_inner)s' % subst,
                    namespace=self.uuid)

        if not util_network.check_for_interface(subst['egress_veth_outer']):
            with util_general.RecordedOperation('create egress veth', self):
                util_network.create_interface(
                    subst['egress_veth_outer'], 'veth',
                    'peer name %(egress_veth_inner)s' % subst)

                # Refer to bug 952 for more details here, but it turns out
                # that adding an interface to a bridge overwrites the MTU of
                # the bridge in an undesirable way. So we lookup the existing
                # MTU and then re-specify it here.
                subst['egress_bridge_mtu'] = util_network.get_interface_mtu(
                    subst['egress_bridge'])
                util_process.execute(
                    None,
                    'ip link set %(egress_veth_outer)s master %(egress_bridge)s '
                    'mtu %(egress_bridge_mtu)s' % subst)

                util_process.execute(
                    None, 'ip link set %(egress_veth_outer)s up' % subst)
                util_process.execute(
                    None, 'ip link set %(egress_veth_inner)s netns %(netns)s' % subst)

        if self.provide_nat:
            # We don't always need this lock, but acquiring it here means
            # we don't need to construct two identical ipmanagers one after
            # the other.
            with db.get_lock('ipmanager', None, 'floating', ttl=120,
                             op='Network deploy NAT'):
                ipm = IPManager.from_db('floating')
                # Only allocate (and persist) a gateway when none is recorded.
                if not self.floating_gateway:
                    self.update_floating_gateway(
                        ipm.get_random_free_address(self.unique_label()))
                    ipm.persist()

                subst['floating_router'] = ipm.get_address_at_index(1)
                subst['floating_gateway'] = self.floating_gateway
                subst['floating_netmask'] = ipm.netmask

            with util_general.RecordedOperation('enable virtual routing', self):
                # Add the floating gateway to the inner egress veth only if
                # it is not already configured there.
                addresses = util_network.get_interface_addresses(
                    subst['egress_veth_inner'], namespace=subst['netns'])
                if not subst['floating_gateway'] in list(addresses):
                    util_process.execute(
                        None,
                        'ip addr add %(floating_gateway)s/%(floating_netmask)s '
                        'dev %(egress_veth_inner)s' % subst,
                        namespace=self.uuid)
                    util_process.execute(
                        None, 'ip link set %(egress_veth_inner)s up' % subst,
                        namespace=self.uuid)

                # Replace the default route(s) unless the only one present
                # already points at the floating router.
                default_routes = util_network.get_default_routes(
                    subst['netns'])
                if default_routes != [subst['floating_router']]:
                    if default_routes:
                        for default_route in default_routes:
                            util_process.execute(
                                None, 'route del default gw %s' % default_route,
                                namespace=self.uuid)

                    util_process.execute(
                        None, 'route add default gw %(floating_router)s' % subst,
                        namespace=self.uuid)

            self.enable_nat()

    self.update_dhcp()

    # A final check to ensure we haven't raced with a delete
    if self.is_dead():
        raise DeadNetwork('network=%s' % self)
    self.state = self.STATE_CREATED