Esempio n. 1
0
    def _create_common(self):
        """Ensure the vxlan interface and vxlan bridge for this network exist.

        Each of the two links is only created when
        util_network.check_for_interface() reports it as missing. Nothing is
        done for the network whose uuid is 'floating'.
        """
        if self.uuid == 'floating':
            # The floating network does not have a vxlan mesh
            return

        subst = self.subst_dict()

        vxlan_name = subst['vx_interface']
        if not util_network.check_for_interface(vxlan_name):
            with util_general.RecordedOperation('create vxlan interface', self):
                util_network.create_interface(
                    vxlan_name, 'vxlan',
                    'id %(vx_id)s dev %(mesh_interface)s dstport 0' % subst)
                util_process.execute(
                    None,
                    'sysctl -w net.ipv4.conf.%(vx_interface)s.arp_notify=1'
                    % subst)

        bridge_name = subst['vx_bridge']
        if not util_network.check_for_interface(bridge_name):
            # These run in order once the bridge exists; each template is
            # only formatted immediately before it is executed.
            bridge_setup = (
                'ip link set %(vx_interface)s master %(vx_bridge)s',
                'ip link set %(vx_interface)s up',
                'ip link set %(vx_bridge)s up',
                'sysctl -w net.ipv4.conf.%(vx_bridge)s.arp_notify=1',
                'brctl setfd %(vx_bridge)s 0',
                'brctl stp %(vx_bridge)s off',
                'brctl setageing %(vx_bridge)s 0',
            )
            with util_general.RecordedOperation('create vxlan bridge', self):
                util_network.create_interface(bridge_name, 'bridge', '')
                for template in bridge_setup:
                    util_process.execute(None, template % subst)
Esempio n. 2
0
    def delete_on_hypervisor(self):
        """Delete this network's vxlan bridge and vxlan interface if present.

        Both checks and deletions happen while holding the network lock.
        """
        # (subst key, operation name, command template); the bridge is
        # handled before the vxlan interface.
        teardown = (
            ('vx_bridge', 'delete vxlan bridge',
             'ip link delete %(vx_bridge)s'),
            ('vx_interface', 'delete vxlan interface',
             'ip link delete %(vx_interface)s'),
        )

        with self.get_lock(op='Network delete'):
            subst = self.subst_dict()

            for key, operation, template in teardown:
                if not util_network.check_for_interface(subst[key]):
                    continue
                with util_general.RecordedOperation(operation, self):
                    util_process.execute(None, template % subst)
Esempio n. 3
0
    def create(self, iface_uuids, lock=None):
        """Prepare on-disk state and block devices, then power the instance on.

        The instance is moved to STATE_CREATING, its interfaces are recorded,
        and block devices are configured (lock is passed through to
        _configure_block_devices). If the instance ends up powered on it is
        moved to STATE_CREATED; otherwise a delete-due-to-error is enqueued.
        """
        self.state = self.STATE_CREATING
        self.interfaces = iface_uuids

        # Ensure we have state on disk
        os.makedirs(self.instance_path, exist_ok=True)

        # Configure block devices, include config drive creation
        self._configure_block_devices(lock)

        # Create the actual instance. Sometimes on Ubuntu 20.04 we need to wait
        # for port binding to work. Revisiting this is tracked by issue 320 on
        # github.
        with util_general.RecordedOperation('create domain', self):
            powered_on = self.power_on()
            retries = 0
            while not powered_on:
                # Another attempt is made immediately; we only warn and
                # sleep when that attempt also fails and retries remain.
                powered_on = self.power_on()
                if powered_on or retries >= 5:
                    break
                self.log.warning(
                    'Instance required an additional attempt to power on')
                time.sleep(5)
                retries += 1

        if not self.is_powered_on():
            self.log.info('Instance failed to power on')
            self.enqueue_delete_due_error('Instance failed to power on')
            return

        self.log.info('Instance now powered on')
        self.state = self.STATE_CREATED
Esempio n. 4
0
 def remove_dhcp(self):
     """Remove the DHCP daemon for this network.

     On the network node the removal happens directly, under the network
     lock. On any other node a RemoveDHCPNetworkTask is enqueued to the
     'networknode' queue instead.
     """
     if not config.NODE_IS_NETWORK_NODE:
         etcd.enqueue('networknode', RemoveDHCPNetworkTask(self.uuid))
         return

     subst = self.subst_dict()
     with util_general.RecordedOperation('remove dhcp', self), \
             self.get_lock(op='Network remove DHCP'):
         dhcp.DHCP(self, subst['vx_veth_inner']).remove_dhcpd()
Esempio n. 5
0
    def delete_on_network_node(self):
        """Tear this network down on the network node.

        Under the network lock: delete the router and egress veths and the
        network namespace when they exist, release the floating gateway
        address if one is set, and mark the network STATE_DELETED. After the
        lock is released: ask every active node to destroy the network,
        remove DHCP and NAT, and delete the network's IPManager.
        """
        # (subst key, operation name, command template), in execution order
        veth_teardown = (
            ('vx_veth_outer', 'delete router veth',
             'ip link delete %(vx_veth_outer)s'),
            ('egress_veth_outer', 'delete egress veth',
             'ip link delete %(egress_veth_outer)s'),
        )

        with self.get_lock(op='Network delete'):
            subst = self.subst_dict()

            for key, operation, template in veth_teardown:
                if util_network.check_for_interface(subst[key]):
                    with util_general.RecordedOperation(operation, self):
                        util_process.execute(None, template % subst)

            netns_path = '/var/run/netns/%s' % self.uuid
            if os.path.exists(netns_path):
                with util_general.RecordedOperation('delete netns', self):
                    util_process.execute(None, 'ip netns del %s' % self.uuid)

            if self.floating_gateway:
                floating_lock = db.get_lock(
                    'ipmanager', None, 'floating', ttl=120,
                    op='Network delete')
                with floating_lock:
                    floating_ipm = IPManager.from_db('floating')
                    floating_ipm.release(self.floating_gateway)
                    floating_ipm.persist()
                    self.update_floating_gateway(None)

            self.state = self.STATE_DELETED

        # Ensure that all hypervisors remove this network. This is really
        # just catching strays, apart from on the network node where we
        # absolutely need to do this thing.
        for node in Nodes([active_nodes]):
            etcd.enqueue(
                node.uuid,
                {'tasks': [HypervisorDestroyNetworkTask(self.uuid)]})

        self.remove_dhcp()
        self.remove_nat()

        IPManager.from_db(self.uuid).delete()
Esempio n. 6
0
    def delete(self):
        """Delete this instance's domain, disks and ports on this node.

        Domain and disk removal are best effort: exceptions from either step
        are handed to util_general.ignore_exception(). The final state is
        STATE_ERROR when the current state value ends with '-<STATE_ERROR>',
        otherwise STATE_DELETED.
        """
        # Touch the image cache files we used so that they count as recently
        # used and linger a little for possible future users.
        for disk in self.block_devices.get('devices', []):
            if not disk.get('blob_uuid'):
                continue
            cache_stem = os.path.join(
                config.STORAGE_PATH, 'image_cache', disk['blob_uuid'])
            cached_image_path = util_general.file_permutation_exists(
                cache_stem, ['iso', 'qcow2'])
            if cached_image_path:
                pathlib.Path(cached_image_path).touch(exist_ok=True)

        with util_general.RecordedOperation('delete domain', self):
            try:
                self.power_off()

                nvram_path = os.path.join(self.instance_path, 'nvram')
                if os.path.exists(nvram_path):
                    os.unlink(nvram_path)
                if self.nvram_template:
                    blob.Blob.from_db(self.nvram_template).ref_count_dec()

                domain = self._get_domain()
                if domain:
                    domain.undefine()
            except Exception as e:
                util_general.ignore_exception(
                    'instance delete domain %s' % self, e)

        with util_general.RecordedOperation('delete disks', self):
            try:
                if os.path.exists(self.instance_path):
                    shutil.rmtree(self.instance_path)
            except Exception as e:
                util_general.ignore_exception(
                    'instance delete disks %s' % self, e)

        self.deallocate_instance_ports()

        error_suffix = '-%s' % self.STATE_ERROR
        if not self.state.value.endswith(error_suffix):
            self.state = self.STATE_DELETED
        else:
            self.state = self.STATE_ERROR
Esempio n. 7
0
    def update_dhcp(self):
        """Restart the DHCP daemon for this network, if it provides DHCP.

        On the network node the restart happens directly, under the network
        lock. On any other node an UpdateDHCPNetworkTask is enqueued to the
        'networknode' queue instead.
        """
        if not self.provide_dhcp:
            return

        if not config.NODE_IS_NETWORK_NODE:
            etcd.enqueue('networknode', UpdateDHCPNetworkTask(self.uuid))
            return

        subst = self.subst_dict()
        with util_general.RecordedOperation('update dhcp', self), \
                self.get_lock(op='Network update DHCP'):
            dhcp.DHCP(self, subst['vx_veth_inner']).restart_dhcpd()
Esempio n. 8
0
def instance_start(inst, network):
    """Connect an instance's networks on this node and then create it.

    inst is the instance to start and network is an iterable of dicts, each
    with 'iface_uuid' and 'network_uuid' keys. Instances in an error or
    deleted state are refused with a warning. A missing or non-created
    network, or an InvalidStateException during the work, results in a
    delete-due-to-error being enqueued for the instance.
    """
    if inst.state.value.endswith('-error'):
        LOG.with_instance(inst).warning(
            'You cannot start an instance in an error state.')
        return

    deleted_states = (dbo.STATE_DELETE_WAIT, dbo.STATE_DELETED)
    if inst.state.value in deleted_states:
        LOG.with_instance(inst).warning(
            'You cannot start an instance which has been deleted.')
        return

    with inst.get_lock(ttl=900, op='Instance start') as lock:
        try:
            # Ensure networks are connected to this node
            iface_uuids = []
            for netdesc in network:
                iface_uuid = netdesc['iface_uuid']
                iface_uuids.append(iface_uuid)

                network_uuid = netdesc['network_uuid']
                n = net.Network.from_db(network_uuid)
                if not n:
                    inst.enqueue_delete_due_error(
                        'missing network: %s' % netdesc['network_uuid'])
                    return

                if n.state.value != dbo.STATE_CREATED:
                    inst.enqueue_delete_due_error(
                        'network is not active: %s' % n.uuid)
                    return

                # We must record interfaces very early for the vxlan leak
                # detection code in the net daemon to work correctly.
                iface = networkinterface.NetworkInterface.from_db(
                    netdesc['iface_uuid'])
                iface.state = dbo.STATE_CREATED

                n.create_on_hypervisor()
                n.ensure_mesh()
                n.update_dhcp()

            # Allocate console and VDI ports
            inst.allocate_instance_ports()

            # Now we can start the instance
            with util_general.RecordedOperation('instance creation', inst):
                inst.create(iface_uuids, lock=lock)

        except exceptions.InvalidStateException as e:
            # This instance is in an error or deleted state. Given the check
            # at the top of this method, that indicates a race.
            inst.enqueue_delete_due_error('invalid state transition: %s' % e)
            return
Esempio n. 9
0
def snapshot_disk(disk, blob_uuid, related_object=None):
    """Snapshot a disk into a new blob and return that blob.

    disk is a dict with at least 'path' and 'device' keys. When the disk's
    path does not exist nothing is done and None is returned. related_object
    is handed to RecordedOperation for the snapshot step.
    """
    if not os.path.exists(disk['path']):
        return None

    ensure_blob_path()
    snapshot_path = Blob.filepath(blob_uuid)

    # Actually make the snapshot
    operation = 'snapshot %s' % disk['device']
    with util_general.RecordedOperation(operation, related_object):
        util_image.snapshot(None, disk['path'], snapshot_path)
        snapshot_stat = os.stat(snapshot_path)

    # And make the associated blob
    new_blob = Blob.new(
        blob_uuid, snapshot_stat.st_size, time.time(), time.time())
    new_blob.state = Blob.STATE_CREATED
    new_blob.observe()
    new_blob.request_replication()
    return new_blob
Esempio n. 10
0
    def _http_get_inner(self, lock, url, checksum, checksum_type):
        """Fetch url into a new blob, verify its checksum, return the blob.

        A fresh blob uuid is generated for every call. If verify_checksum()
        rejects the fetched file, a 'bad checksum' event is added to the
        instance and exceptions.BadCheckSum is raised. The blob is only
        observed and queued for replication after verification succeeds.
        """

        with util_general.RecordedOperation('fetch image', self.instance):
            resp = self._open_connection(url)
            blob_uuid = str(uuid.uuid4())

            fetch_log = self.log.with_object(self.__artifact).with_fields(
                {'blob': blob_uuid, 'url': url})
            fetch_log.info('Commencing HTTP fetch to blob')

            b = blob.http_fetch(resp, blob_uuid, [lock], self.log)

            # Ensure checksum is correct
            fetched_path = os.path.join(config.STORAGE_PATH, 'blobs', b.uuid)
            checksum_ok = verify_checksum(
                fetched_path, checksum, checksum_type)
            if not checksum_ok:
                self.instance.add_event('fetch image', 'bad checksum')
                raise exceptions.BadCheckSum('url=%s' % url)

            # Only persist values after the file has been verified.
            b.observe()
            b.request_replication()
            return b
Esempio n. 11
0
    def enable_nat(self):
        """Install forwarding and masquerade rules for this network.

        Only acts on the network node, and only when
        util_network.nat_rules_for_ipblock() reports nothing for this
        network's address. IP forwarding is enabled first, then the iptables
        rules are added inside the network's namespace.
        """
        if not config.NODE_IS_NETWORK_NODE:
            return

        subst = self.subst_dict()
        if util_network.nat_rules_for_ipblock(self.network_address):
            return

        # Executed in this order with namespace=self.uuid
        namespaced_rules = (
            'iptables -A FORWARD -o %(egress_veth_inner)s '
            '-i %(vx_veth_inner)s -j ACCEPT',
            'iptables -A FORWARD -i %(egress_veth_inner)s '
            '-o %(vx_veth_inner)s -j ACCEPT',
            'iptables -t nat -A POSTROUTING -s %(ipblock)s/%(netmask)s '
            '-o %(egress_veth_inner)s -j MASQUERADE',
        )

        with util_general.RecordedOperation('enable nat', self):
            util_process.execute(
                None, 'echo 1 > /proc/sys/net/ipv4/ip_forward')
            for rule in namespaced_rules:
                util_process.execute(
                    None, rule % subst, namespace=self.uuid)
Esempio n. 12
0
def restore_instances():
    """Restore networks and instances for this node, e.g. on daemon start.

    Three passes are made:

    1. Walk the healthy instances on this node, collecting the networks
       their interfaces use and repairing each instance's recorded list of
       interface uuids where entries are missing.
    2. Recreate every collected network that is not dead on this hypervisor
       and ensure its mesh.
    3. Recreate each instance whose power state suggests it should be
       running.

    Failures in passes 2 and 3 are best effort: the exception is handed to
    util_general.ignore_exception(), and for instances a
    delete-due-to-error is additionally enqueued.
    """
    # Ensure all instances for this node are defined and have up to date data.
    networks = []
    instances = []
    for inst in instance.Instances([instance.this_node_filter,
                                    instance.healthy_states_filter]):
        instance_problems = []
        inst_interfaces = inst.interfaces or []
        updated_interfaces = False

        for ni in interfaces_for_instance(inst):
            if ni.network_uuid not in networks:
                networks.append(ni.network_uuid)
            if ni.uuid not in inst_interfaces:
                inst_interfaces.append(ni.uuid)
                updated_interfaces = True

        # We do not need a lock here because this loop only runs on the node
        # with the instance, and interfaces don't change post instance
        # creation.
        if updated_interfaces:
            inst.interfaces = inst_interfaces

        # TODO(mikal): do better here.
        # for disk in inst.disk_spec:
        #     if disk.get('base'):
        #         img = images.Image.new(disk['base'])
        #         # NOTE(mikal): this check isn't great -- it checks for the original
        #         # downloaded image, not the post transcode version
        #         if (img.state in [dbo.STATE_DELETED, dbo.STATE_ERROR] or
        #                 not os.path.exists(img.version_image_path())):
        #             instance_problems.append(
        #                 '%s missing from image cache' % disk['base'])
        #             img.delete()

        if instance_problems:
            inst.enqueue_delete_due_error(
                'instance bad on startup: %s' % '; '.join(instance_problems))
        else:
            instances.append(inst)

    with util_general.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                n = net.Network.from_db(network)
                if not n:
                    # from_db() can return a falsy value for an unknown
                    # network; skip it explicitly rather than relying on an
                    # AttributeError being swallowed below.
                    LOG.warning(
                        'Network %s not found, skipping restore' % network)
                    continue

                if not n.is_dead():
                    LOG.with_object(n).info('Restoring network')
                    n.create_on_hypervisor()
                    n.ensure_mesh()
            except Exception as e:
                util_general.ignore_exception(
                    'restore network %s' % network, e)

    # Power states for which an instance is recreated on the hypervisor
    started = ['on', 'transition-to-on',
               instance.Instance.STATE_INITIAL, 'unknown']

    with util_general.RecordedOperation('restore instances', None):
        for inst in instances:
            try:
                with inst.get_lock(ttl=120, timeout=120, op='Instance restore'):
                    if inst.power_state not in started:
                        continue

                    LOG.with_object(inst).info('Restoring instance')
                    inst.create_on_hypervisor()
            except Exception as e:
                util_general.ignore_exception(
                    'restore instance %s' % inst, e)
                # enqueue_delete_due_error() is called directly on the
                # instance, matching every other call site.
                inst.enqueue_delete_due_error(
                    'exception while restoring instance on daemon restart')
Esempio n. 13
0
    def transcode_image(self, lock, b):
        """Make sure blob b has an entry in the image cache.

        If a cached '.iso' or '.qcow2' permutation already exists it is only
        touched. Otherwise ISO mime types are linked into the cache as
        '.iso', gzip blobs are decompressed first, and the result is either
        linked (already qcow2 with the expected cluster size) or transcoded
        to '.qcow2'. New cache files are chowned to the configured libvirt
        user and group. The artifact is marked STATE_CREATED at the end.
        """
        # NOTE(mikal): it is assumed the caller holds a lock on the artifact,
        # and passes it in lock.

        # If this blob uuid is not the most recent index for the artifact,
        # set that
        latest_blob_uuid = self.__artifact.most_recent_index.get('blob_uuid')
        if latest_blob_uuid != b.uuid:
            self.__artifact.add_index(b.uuid)

        # Transcode if required, placing the transcoded file in a well known
        # location.
        cache_dir = os.path.join(config.STORAGE_PATH, 'image_cache')
        os.makedirs(cache_dir, exist_ok=True)
        existing = util_general.file_permutation_exists(
            os.path.join(cache_dir, b.uuid), ['iso', 'qcow2'])

        if existing:
            # We touch the file here, because we want to know when it was
            # last used.
            pathlib.Path(existing).touch(exist_ok=True)

        else:
            source_path = os.path.join(config.STORAGE_PATH, 'blobs', b.uuid)
            mimetype = b.info.get('mime-type', '')
            iso_mimetypes = (
                'application/x-cd-image', 'application/x-iso9660-image')

            if mimetype in iso_mimetypes:
                cache_path = os.path.join(cache_dir, b.uuid + '.iso')
                util_general.link(source_path, cache_path)

            else:
                if mimetype == 'application/gzip':
                    # Decompress next to the cache entry and continue from
                    # the decompressed file.
                    decompressed_path = os.path.join(cache_dir, b.uuid)
                    with util_general.RecordedOperation(
                            'decompress image', self.instance):
                        util_process.execute(
                            [lock],
                            'gunzip -k -q -c %s > %s'
                            % (source_path, decompressed_path))
                    source_path = decompressed_path

                cache_path = os.path.join(cache_dir, b.uuid + '.qcow2')
                image_info = util_image.identify(source_path)

                # Convert the cluster size from qemu format to an int
                unit_multipliers = {'M': MiB, 'K': KiB}
                unit = QCOW2_CLUSTER_SIZE[-1:]
                if unit in unit_multipliers:
                    wanted_cluster_size = (
                        int(QCOW2_CLUSTER_SIZE[:-1]) * unit_multipliers[unit])
                else:
                    wanted_cluster_size = int(QCOW2_CLUSTER_SIZE)

                already_suitable = (
                    image_info.get('file format', '') == 'qcow2'
                    and image_info.get('cluster_size', 0)
                    == wanted_cluster_size)

                if already_suitable:
                    util_general.link(source_path, cache_path)
                else:
                    with util_general.RecordedOperation(
                            'transcode image', self.instance):
                        self.log.with_object(b).info(
                            'Transcoding %s -> %s' % (source_path, cache_path))
                        util_image.create_qcow2(
                            [lock], source_path, cache_path)

            shutil.chown(cache_path, config.LIBVIRT_USER, config.LIBVIRT_GROUP)
            stat_fields = util_general.stat_log_fields(cache_path)
            self.log.with_fields(stat_fields).info(
                'Cache file %s created' % cache_path)

        self.__artifact.state = Artifact.STATE_CREATED
Esempio n. 14
0
    def _configure_block_devices(self, lock):
        """Create and finalize the on-disk block devices for this instance.

        Runs under the 'block_devices' attribute lock. Block devices are
        initialized if the instance has none yet, an openstack-disk config
        drive is generated when requested, and, unless the devices are
        already marked as finalized, each disk is prepared from the image
        cache (cdrom link, flat NVMe image, or copy-on-write layer) or
        created blank. The finalized description is then written back to the
        'block_devices' attribute.

        lock is passed (wrapped in a list) to the util_image helpers.

        Raises:
            exceptions.ArtifactHasNoBlobs: a disk's base artifact has no
                blob in its most recent index.
            exceptions.ImageMissingFromCache: the blob for a disk has no
                '.iso' or '.qcow2' file in the image cache.
        """
        with self.get_lock_attr('block_devices', 'Initialize block devices'):
            # Create block devices if required
            block_devices = self.block_devices
            if not block_devices:
                block_devices = self._initialize_block_devices()

            # Generate a config drive
            if self.configdrive == 'openstack-disk':
                with util_general.RecordedOperation('make config drive', self):
                    self._make_config_drive_openstack_disk(
                        os.path.join(self.instance_path,
                                     block_devices['devices'][1]['path']))

            # Prepare disks. At this point we have a file for each blob in the
            # image cache at a well known location (the blob uuid with .qcow2
            # appended).
            if not block_devices['finalized']:
                modified_disks = []
                for disk in block_devices['devices']:
                    disk['source'] = "<source file='%s'/>" % disk['path']
                    disk['source_type'] = 'file'

                    # All disk bases must have an associated blob, force that
                    # if an image had to be fetched from outside the cluster.
                    disk_base = None
                    if disk.get('blob_uuid'):
                        disk_base = '%s%s' % (artifact.BLOB_URL,
                                              disk['blob_uuid'])
                    elif disk.get('base') and not util_general.noneish(
                            disk.get('base')):
                        a = artifact.Artifact.from_url(
                            artifact.Artifact.TYPE_IMAGE, disk['base'])
                        mri = a.most_recent_index

                        if 'blob_uuid' not in mri:
                            raise exceptions.ArtifactHasNoBlobs(
                                'Artifact %s of type %s has no versions' %
                                (a.uuid, a.artifact_type))

                        disk['blob_uuid'] = mri['blob_uuid']
                        disk_base = '%s%s' % (artifact.BLOB_URL,
                                              disk['blob_uuid'])

                    if disk_base:
                        cached_image_path = util_general.file_permutation_exists(
                            os.path.join(config.STORAGE_PATH, 'image_cache',
                                         disk['blob_uuid']), ['iso', 'qcow2'])
                        if not cached_image_path:
                            raise exceptions.ImageMissingFromCache(
                                'Image %s is missing' % disk['blob_uuid'])

                        with util_general.RecordedOperation(
                                'detect cdrom images', self):
                            try:
                                cd = pycdlib.PyCdlib()
                                cd.open(cached_image_path)
                                disk['present_as'] = 'cdrom'
                                # We only needed to know the image opens as
                                # an ISO; release the file handle pycdlib is
                                # holding instead of leaking one per disk.
                                cd.close()
                            except Exception:
                                # Deliberately best effort: any failure here
                                # is treated as "not a cdrom image".
                                pass

                        # NOTE(review): because of the 'cdrom' default, a disk
                        # with no 'present_as' key at all is also treated as a
                        # cdrom here -- confirm that is intended.
                        if disk.get('present_as', 'cdrom') == 'cdrom':
                            # There is no point in resizing or COW'ing a cdrom
                            disk['path'] = disk['path'].replace(
                                '.qcow2', '.raw')
                            disk['type'] = 'raw'
                            disk['snapshot_ignores'] = True
                            util_general.link(cached_image_path, disk['path'])

                            # qemu does not support removable media on virtio buses. It also
                            # only supports one IDE bus. This is quite limiting. Instead, we
                            # use USB for cdrom drives, unless you've specified a bus other
                            # than virtio in the creation request.
                            if disk['bus'] == 'virtio':
                                disk['bus'] = 'usb'
                                disk['device'] = _get_disk_device(
                                    disk['bus'],
                                    LETTERS.find(disk['device'][-1]))

                        elif disk['bus'] == 'nvme':
                            # NVMe disks do not currently support a COW layer for the instance
                            # disk. This is because we don't have a libvirt <disk/> element for
                            # them and therefore can't specify their backing store. Instead we
                            # produce a flat layer here.
                            util_image.create_qcow2([lock],
                                                    cached_image_path,
                                                    disk['path'],
                                                    disk_size=disk['size'])

                        else:
                            with util_general.RecordedOperation(
                                    'create copy on write layer', self):
                                util_image.create_cow([lock],
                                                      cached_image_path,
                                                      disk['path'],
                                                      disk['size'])
                            self.log.with_fields(
                                util_general.stat_log_fields(
                                    disk['path'])).info(
                                        'COW layer %s created' % disk['path'])

                            # Record the backing store for modern libvirts
                            disk['backing'] = (
                                '<backingStore type=\'file\'>\n'
                                '        <format type=\'qcow2\'/>\n'
                                '        <source file=\'%s\'/>\n'
                                '      </backingStore>\n' %
                                (cached_image_path))

                    elif not os.path.exists(disk['path']):
                        util_image.create_blank([lock], disk['path'],
                                                disk['size'])

                    # Use the configured libvirt user and group rather than a
                    # hard-coded distro-specific account name.
                    shutil.chown(disk['path'], config.LIBVIRT_USER,
                                 config.LIBVIRT_GROUP)
                    modified_disks.append(disk)

                block_devices['devices'] = modified_disks
                block_devices['finalized'] = True
                self._db_set_attribute('block_devices', block_devices)
Esempio n. 15
0
def main():
    """Entry point for the main daemon process.

    Marks this node as created, resets stale locks and queues, forks one
    child process per entry in DAEMON_IMPLEMENTATIONS, performs network node
    bootstrap (floating network and egress bridge) when configured as the
    network node, restores instances, and then loops forever: reaping exited
    children, restarting missing daemons, and checking in. Once the node
    state is stopping or stopped, the children are sent SIGTERM once and the
    function returns after they have all exited.
    """
    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    LOG.info('Starting...')
    setproctitle.setproctitle(
        daemon.process_name('main') + '-v%s' % util_general.get_version())

    # If you ran this, it means we're not shutting down any more
    n = Node.new(config.NODE_NAME, config.NODE_MESH_IP)
    n.state = Node.STATE_CREATED

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items.
    etcd.clear_stale_locks()
    Node.observe_this_node()
    etcd.restart_queues()

    def _start_daemon(d):
        # Fork a child that runs daemon d; the parent records the child pid.
        pid = os.fork()
        if pid == 0:
            # Child: run the daemon and never return into the parent's code.
            try:
                DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
                sys.exit(0)
            except Exception as e:
                util_general.ignore_exception('daemon creation', e)
                sys.exit(1)

        DAEMON_PIDS[pid] = d
        LOG.with_field('pid', pid).info('Started %s' % d)

    # Resource usage publisher, we need this early because scheduling decisions
    # might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if config.NODE_IS_NETWORK_NODE:
        # Bootstrap the floating network in the Networks table
        floating_network = net.Network.from_db('floating')
        if not floating_network:
            floating_network = net.Network.create_floating_network(
                config.FLOATING_NETWORK)

        subst = {
            'egress_bridge': util_network.get_safe_interface_name(
                'egr-br-%s' % config.NODE_EGRESS_NIC),
            'egress_nic': config.NODE_EGRESS_NIC
        }

        if not util_network.check_for_interface(subst['egress_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and the wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util_general.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = IPManager.from_db('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                # We need to copy the MTU of the interface we are bridging to
                # or weird networking things happen.
                mtu = util_network.get_interface_mtu(config.NODE_EGRESS_NIC)

                util_network.create_interface(
                    subst['egress_bridge'], 'bridge', '', mtu=mtu)

                util_process.execute(None,
                                     'ip link set %(egress_bridge)s up' % subst)
                util_process.execute(None,
                                     'ip addr add %(master_float)s/%(netmask)s '
                                     'dev %(egress_bridge)s' % subst)

                util_process.execute(None,
                                     'iptables -A FORWARD -o %(egress_nic)s '
                                     '-i %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(None,
                                     'iptables -A FORWARD -i %(egress_nic)s '
                                     '-o %(egress_bridge)s -j ACCEPT' % subst)
                util_process.execute(None,
                                     'iptables -t nat -A POSTROUTING '
                                     '-o %(egress_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        # Start any daemon with no recorded pid, then restart any daemon
        # whose recorded pid no longer exists.
        running_daemons = list(DAEMON_PIDS.values())

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        for pid in list(DAEMON_PIDS):
            if not psutil.pid_exists(pid):
                # Drop the stale pid before restarting, otherwise the same
                # missing pid is found again on every audit and the daemon
                # is restarted over and over.
                name = DAEMON_PIDS.pop(pid)
                LOG.warning('%s pid is missing, restarting' % name)
                _start_daemon(name)

    _audit_daemons()
    restore_instances()

    # True until SIGTERM has been sent to the children during shutdown
    running = True
    while True:
        time.sleep(5)

        try:
            # Reap every child that has exited without blocking
            wpid, _ = os.waitpid(-1, os.WNOHANG)
            while wpid != 0:
                LOG.warning('%s exited (pid %d)'
                            % (DAEMON_PIDS.get(wpid, 'unknown'), wpid))
                if wpid in DAEMON_PIDS:
                    del DAEMON_PIDS[wpid]
                wpid, _ = os.waitpid(-1, os.WNOHANG)

        except ChildProcessError:
            # We get this if there are no child processes
            pass

        n = Node.from_db(config.NODE_NAME)
        if n.state.value not in [Node.STATE_STOPPING, Node.STATE_STOPPED]:
            _audit_daemons()
            Node.observe_this_node()

        elif len(DAEMON_PIDS) == 0:
            n.state = Node.STATE_STOPPED
            return

        else:
            if running:
                for pid in DAEMON_PIDS:
                    try:
                        os.kill(pid, signal.SIGTERM)
                        LOG.info('Sent SIGTERM to %s (pid %s)'
                                 % (DAEMON_PIDS.get(pid, 'unknown'), pid))
                    except OSError as e:
                        LOG.warning(
                            'Failed to send SIGTERM to %s: %s' % (pid, e))

            running = False
Esempio n. 16
0
    def create_on_network_node(self):
        """Create the network node side of this network.

        Does nothing for the 'floating' network. Otherwise, while holding the
        network lock, this creates the common vxlan plumbing, the network
        namespace, a veth pair joining the vxlan bridge to the namespace, and
        a veth pair joining the egress bridge to the namespace. When
        provide_nat is set it also ensures a floating gateway address is
        allocated and configured inside the namespace, points the namespace's
        default route at the floating router, and enables NAT.

        After the lock is released DHCP is updated and the network is moved
        to STATE_CREATED.

        Raises:
            DeadNetwork: if is_dead() is true once the lock is held, or
                again after DHCP has been updated.
        """
        # The floating network does not have a vxlan mesh
        if self.uuid == 'floating':
            return

        with self.get_lock(op='create_on_network_node'):
            if self.is_dead():
                raise DeadNetwork('network=%s' % self)

            self._create_common()

            subst = self.subst_dict()

            netns_path = '/var/run/netns/%s' % self.uuid
            if not os.path.exists(netns_path):
                with util_general.RecordedOperation('create netns', self):
                    util_process.execute(None, 'ip netns add %s' % self.uuid)

            have_router_veth = util_network.check_for_interface(
                subst['vx_veth_outer'])
            if not have_router_veth:
                with util_general.RecordedOperation('create router veth', self):
                    util_network.create_interface(
                        subst['vx_veth_outer'], 'veth',
                        'peer name %(vx_veth_inner)s' % subst)

                    # The inner end of the pair lives inside the namespace.
                    move_inner = (
                        'ip link set %(vx_veth_inner)s netns %(netns)s' % subst)
                    util_process.execute(None, move_inner)

                    # Bug 952: enslaving an interface to a bridge clobbers
                    # the bridge's MTU in an unwanted way, so read the
                    # current MTU and state it explicitly while attaching.
                    subst['vx_bridge_mtu'] = util_network.get_interface_mtu(
                        subst['vx_bridge'])
                    attach_outer = (
                        'ip link set %(vx_veth_outer)s master %(vx_bridge)s '
                        'mtu %(vx_bridge_mtu)s' % subst)
                    util_process.execute(None, attach_outer)

                    outer_up = 'ip link set %(vx_veth_outer)s up' % subst
                    util_process.execute(None, outer_up)

                    inner_up = 'ip link set %(vx_veth_inner)s up' % subst
                    util_process.execute(None, inner_up, namespace=self.uuid)

                    router_address = (
                        'ip addr add %(router)s/%(netmask)s '
                        'dev %(vx_veth_inner)s' % subst)
                    util_process.execute(
                        None, router_address, namespace=self.uuid)

            have_egress_veth = util_network.check_for_interface(
                subst['egress_veth_outer'])
            if not have_egress_veth:
                with util_general.RecordedOperation('create egress veth', self):
                    util_network.create_interface(
                        subst['egress_veth_outer'], 'veth',
                        'peer name %(egress_veth_inner)s' % subst)

                    # Bug 952 again: re-assert the egress bridge's existing
                    # MTU while attaching the new interface to it.
                    subst['egress_bridge_mtu'] = util_network.get_interface_mtu(
                        subst['egress_bridge'])
                    attach_egress = (
                        'ip link set %(egress_veth_outer)s master %(egress_bridge)s '
                        'mtu %(egress_bridge_mtu)s' % subst)
                    util_process.execute(None, attach_egress)

                    egress_up = 'ip link set %(egress_veth_outer)s up' % subst
                    util_process.execute(None, egress_up)

                    move_egress = (
                        'ip link set %(egress_veth_inner)s netns %(netns)s'
                        % subst)
                    util_process.execute(None, move_egress)

            if self.provide_nat:
                # The lock is only strictly required when allocating a new
                # gateway, but taking it unconditionally lets a single
                # ipmanager serve both the allocation and the lookups below.
                with db.get_lock('ipmanager', None, 'floating', ttl=120,
                                 op='Network deploy NAT'):
                    floating_ipm = IPManager.from_db('floating')
                    if not self.floating_gateway:
                        new_gateway = floating_ipm.get_random_free_address(
                            self.unique_label())
                        self.update_floating_gateway(new_gateway)
                        floating_ipm.persist()

                    subst['floating_router'] = (
                        floating_ipm.get_address_at_index(1))
                    subst['floating_gateway'] = self.floating_gateway
                    subst['floating_netmask'] = floating_ipm.netmask

                with util_general.RecordedOperation('enable virtual routing', self):
                    inner_addresses = list(
                        util_network.get_interface_addresses(
                            subst['egress_veth_inner'],
                            namespace=subst['netns']))
                    if subst['floating_gateway'] not in inner_addresses:
                        gateway_address = (
                            'ip addr add %(floating_gateway)s/%(floating_netmask)s '
                            'dev %(egress_veth_inner)s' % subst)
                        util_process.execute(
                            None, gateway_address, namespace=self.uuid)

                        egress_inner_up = (
                            'ip link set  %(egress_veth_inner)s up' % subst)
                        util_process.execute(
                            None, egress_inner_up, namespace=self.uuid)

                    # Replace whatever default routes exist unless the only
                    # one is already the floating router.
                    current_routes = util_network.get_default_routes(
                        subst['netns'])
                    if current_routes != [subst['floating_router']]:
                        for stale_gateway in current_routes or []:
                            util_process.execute(
                                None,
                                'route del default gw %s' % stale_gateway,
                                namespace=self.uuid)

                        add_default = (
                            'route add default gw %(floating_router)s' % subst)
                        util_process.execute(
                            None, add_default, namespace=self.uuid)

                self.enable_nat()

        self.update_dhcp()

        # A final check to ensure we haven't raced with a delete
        if self.is_dead():
            raise DeadNetwork('network=%s' % self)
        self.state = self.STATE_CREATED
Esempio n. 17
0
def instance_delete(inst):
    """Delete an instance and tidy up the networks it was attached to.

    While holding the instance lock this: moves the instance to the
    delete-wait state (unless it is already in an ``-error`` state), records
    a 'queued' delete event, powers the instance off, deletes its network
    interfaces and then deletes the instance itself. Finally, each network
    the instance used is either refreshed via update_dhcp() when another
    active instance on this node still uses it, or removed from this
    hypervisor when no other instance does.

    Args:
        inst: the instance object to delete.
    """
    with inst.get_lock(op='Instance delete'):
        # There are two delete state flows:
        #   - error transition states (preflight-error etc) to error
        #   - created to deleted
        #
        # We don't need delete_wait for the error states as they're already
        # in a transition state.
        if not inst.state.value.endswith('-error'):
            inst.state = dbo.STATE_DELETE_WAIT
        db.add_event('instance', inst.uuid, 'queued', 'delete', None, None)

        # Create list of networks used by instance. We cannot use the
        # interfaces cached in the instance here, because the instance
        # may have failed to get to the point where it populates that
        # field (an image fetch failure for example).
        instance_networks = []
        interfaces = []
        for ni in networkinterface.interfaces_for_instance(inst):
            if ni:
                interfaces.append(ni)
                if ni.network_uuid not in instance_networks:
                    instance_networks.append(ni.network_uuid)

        # Stop the instance
        inst.power_off()

        # Delete the instance's interfaces
        with util_general.RecordedOperation('release network addresses', inst):
            for ni in interfaces:
                ni.delete()

        # Collect the networks used by every *other* active instance on this
        # node. We must look at the other instance's interfaces, not those of
        # the instance being deleted; otherwise the deleted instance's own
        # networks always appear to be in use and are never removed.
        host_networks = set()
        for i in instance.Instances(
                [instance.this_node_filter, instance.active_states_filter]):
            if i.uuid == inst.uuid:
                continue
            for iface_uuid in i.interfaces:
                ni = networkinterface.NetworkInterface.from_db(iface_uuid)
                if ni:
                    host_networks.add(ni.network_uuid)

        inst.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.Network.from_db(network)
            if not n:
                continue

            if network in host_networks:
                # Network still used by another instance, so only update it
                if n.state.value == dbo.STATE_DELETE_WAIT:
                    # Do not update a network about to be deleted
                    continue
                with util_general.RecordedOperation(
                        'deallocate ip address', inst):
                    n.update_dhcp()
            else:
                # Network not used by any other instance therefore delete
                with util_general.RecordedOperation(
                        'remove network from node', n):
                    n.delete_on_hypervisor()