Example no. 1
    def run(self):
        LOG.info('Starting')
        util.execute(None, (config.get('API_COMMAND_LINE') % {
                            'port': config.get('API_PORT'),
                            'timeout': config.get('API_TIMEOUT'),
                            'name': daemon.process_name('api')
                            }),
                     env_variables=os.environ)
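As a standalone sketch of the %-style dictionary substitution used above: the template string and values below are illustrative stand-ins, not the project's real API_COMMAND_LINE configuration.

command_template = ('gunicorn --bind 0.0.0.0:%(port)s --timeout %(timeout)s '
                    '--name "%(name)s"')
print(command_template % {'port': 13000, 'timeout': 30, 'name': 'sf api'})
# gunicorn --bind 0.0.0.0:13000 --timeout 30 --name "sf api"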
Example no. 2
    def __init__(self, db_entry):
        self.db_entry = db_entry

        self.instance_path = os.path.join(config.get('STORAGE_PATH'),
                                          'instances', self.db_entry['uuid'])
        self.snapshot_path = os.path.join(config.get('STORAGE_PATH'),
                                          'snapshots')
        self.xml_file = os.path.join(self.instance_path, 'libvirt.xml')

        if not self.db_entry['block_devices']:
            self._populate_block_devices()
Example no. 3
    def run(self):
        LOG.info('Starting Monitor Daemon')
        observers = {}

        while True:
            # Cleanup terminated observers
            all_observers = list(observers.keys())
            for instance_uuid in all_observers:
                if not observers[instance_uuid].is_alive():
                    # Reap process
                    observers[instance_uuid].join(1)
                    LOG.withInstance(instance_uuid).info(
                        'Trigger observer has terminated')
                    db.add_event('instance', instance_uuid, 'trigger monitor',
                                 'crashed', None, None)
                    del observers[instance_uuid]

            # Start missing observers
            extra_instances = list(observers.keys())

            for inst in db.get_instances(only_node=config.NODE_NAME):
                if inst['uuid'] in extra_instances:
                    extra_instances.remove(inst['uuid'])

                if inst['state'] != 'created':
                    continue

                if inst['uuid'] not in observers:
                    console_path = os.path.join(config.get('STORAGE_PATH'),
                                                'instances', inst['uuid'],
                                                'console.log')
                    p = multiprocessing.Process(
                        target=observe,
                        args=(console_path, inst['uuid']),
                        name='%s-%s' %
                        (daemon.process_name('triggers'), inst['uuid']))
                    p.start()

                    observers[inst['uuid']] = p
                    LOG.withInstance(
                        inst['uuid']).info('Started trigger observer')
                    db.add_event('instance', inst['uuid'], 'trigger monitor',
                                 'started', None, None)

            # Cleanup extra observers
            for instance_uuid in extra_instances:
                p = observers[instance_uuid]
                try:
                    os.kill(p.pid, signal.SIGKILL)
                    observers[instance_uuid].join(1)
                except Exception:
                    pass

                del observers[instance_uuid]
                LOG.withInstance(instance_uuid).info(
                    'Finished trigger observer')
                db.add_event('instance', instance_uuid, 'trigger monitor',
                             'finished', None, None)

            time.sleep(1)
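A minimal, self-contained sketch of the observer bookkeeping above, showing how multiprocessing.Process exposes is_alive() and join() for reaping a finished child; the observe target here is a placeholder, not the project's trigger observer.

import multiprocessing
import time


def observe():
    # Placeholder target; the real observer tails a console log.
    time.sleep(0.1)


if __name__ == '__main__':
    p = multiprocessing.Process(target=observe, name='demo-observer')
    p.start()
    time.sleep(0.5)
    print(p.is_alive())  # False once the target has returned
    p.join(1)            # reap the terminated child, as in the cleanup loop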
Example no. 4
    def _has_sufficient_cpu(self, cpus, node):
        max_cpu = (self.metrics[node].get('cpu_max', 0) *
                   config.get('CPU_OVERCOMMIT_RATIO'))
        current_cpu = self.metrics[node].get('cpu_total_instance_vcpus', 0)
        if current_cpu + cpus > max_cpu:
            return False
        return True
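A worked sketch of the same overcommit arithmetic with made-up numbers: 8 physical CPUs and an assumed CPU_OVERCOMMIT_RATIO of 16 allow 128 scheduled vCPUs in total.

def has_sufficient_cpu(requested, cpu_max, committed, overcommit_ratio=16):
    # Same test as above, without the metrics/config plumbing.
    return committed + requested <= cpu_max * overcommit_ratio


print(has_sufficient_cpu(4, cpu_max=8, committed=100))  # True, 104 <= 128
print(has_sufficient_cpu(4, cpu_max=8, committed=126))  # False, 130 > 128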
Example no. 5
    def run(self):
        LOG.info('Starting')

        # Delay first compaction until system startup load has reduced
        last_compaction = time.time() - random.randint(1, 20*60)

        while True:
            # Update power state of all instances on this hypervisor
            LOG.info('Updating power states')
            self._update_power_states()

            # Cleanup soft deleted instances and networks
            delay = config.get('CLEANER_DELAY')

            for i in db.get_stale_instances(delay):
                LOG.withInstance(i['uuid']).info('Hard deleting instance')
                db.hard_delete_instance(i['uuid'])

            for n in db.get_stale_networks(delay):
                LOG.withNetwork(n['uuid']).info('Hard deleting network')
                db.hard_delete_network(n['uuid'])

            for ni in db.get_stale_network_interfaces(delay):
                LOG.withNetworkInterface(
                    ni['uuid']).info('Hard deleting network interface')
                db.hard_delete_network_interface(ni['uuid'])

            # Perform etcd maintenance
            if time.time() - last_compaction > 1800:
                LOG.info('Compacting etcd')
                self._compact_etcd()
                last_compaction = time.time()

            time.sleep(60)
Example no. 6
    def _create_domain_xml(self):
        """Create the domain XML for the instance."""

        if os.path.exists(self.xml_file):
            return

        with open(os.path.join(config.get('STORAGE_PATH'),
                               'libvirt.tmpl')) as f:
            t = jinja2.Template(f.read())

        networks = []
        for iface in list(db.get_instance_interfaces(self.db_entry['uuid'])):
            n = net.from_db(iface['network_uuid'])
            networks.append({
                'macaddr': iface['macaddr'],
                'bridge': n.subst_dict()['vx_bridge'],
                'model': iface['model']
            })

        # NOTE(mikal): the database stores memory allocations in MB, but the
        # domain XML takes them in KB. That wouldn't be worth a comment here if
        # I hadn't spent _ages_ finding a bug related to it.
        xml = t.render(uuid=self.db_entry['uuid'],
                       memory=self.db_entry['memory'] * 1024,
                       vcpus=self.db_entry['cpus'],
                       disks=self.db_entry['block_devices']['devices'],
                       networks=networks,
                       instance_path=self.instance_path,
                       console_port=self.db_entry['console_port'],
                       vdi_port=self.db_entry['vdi_port'],
                       video_model=self.db_entry['video']['model'],
                       video_memory=self.db_entry['video']['memory'])

        with open(self.xml_file, 'w') as f:
            f.write(xml)
Example no. 7
def _get_cache_path():
    image_cache_path = os.path.join(config.get('STORAGE_PATH'), 'image_cache')
    if not os.path.exists(image_cache_path):
        LOG.withField('image_cache_path',
                      image_cache_path).debug('Creating image cache')
        os.makedirs(image_cache_path, exist_ok=True)
    return image_cache_path
Example no. 8
    def _populate_block_devices(self):
        if not self.disk_spec:
            # This should not occur since the API will filter for zero disks.
            LOG.withObj(self).error('Found disk spec empty')

            # Stop continuous crashing by falsely claiming disks are configured.
            self.block_devices = {'finalized': True}
            return

        bus = _get_defaulted_disk_bus(self.disk_spec[0])
        root_device = _get_disk_device_base(bus) + 'a'
        config_device = _get_disk_device_base(bus) + 'b'

        disk_type = 'qcow2'
        if config.get('DISK_FORMAT') == 'flat':
            disk_type = 'raw'

        self.block_devices = {
            'devices': [
                {
                    'type': disk_type,
                    'size': _safe_int_cast(self.disk_spec[0].get('size')),
                    'device': root_device,
                    'bus': bus,
                    'path': os.path.join(self.instance_path(), root_device),
                    'base': self.disk_spec[0].get('base'),
                    'present_as': _get_defaulted_disk_type(self.disk_spec[0]),
                    'snapshot_ignores': False
                },
                {
                    'type': 'raw',
                    'device': config_device,
                    'bus': bus,
                    'path': os.path.join(self.instance_path(), config_device),
                    'present_as': 'disk',
                    'snapshot_ignores': True
                }
            ]
        }

        i = 0
        for d in self.disk_spec[1:]:
            bus = _get_defaulted_disk_bus(d)
            device = _get_disk_device_base(bus) + chr(ord('c') + i)
            self.block_devices['devices'].append({
                'type': disk_type,
                'size': _safe_int_cast(d.get('size')),
                'device': device,
                'bus': bus,
                'path': os.path.join(self.instance_path(), device),
                'base': d.get('base'),
                'present_as': _get_defaulted_disk_type(d),
                'snapshot_ignores': False
            })
            i += 1

        self.block_devices['finalized'] = False
Example no. 9
    def _has_sufficient_ram(self, memory, node):
        # There are two things to track here... We must always have
        # RAM_SYSTEM_RESERVATION GB of RAM for operating system tasks -- assume
        # there is no overlap with existing VMs when checking this. Note as
        # well that metrics are in MB...
        available = (self.metrics[node].get('memory_available', 0) -
                     (config.get('RAM_SYSTEM_RESERVATION') * 1024))
        if available - memory < 0.0:
            return False

        # ...Secondly, if we're using KSM and over committing memory, we
        # shouldn't overcommit more than by RAM_OVERCOMMIT_RATIO
        instance_memory = (
            self.metrics[node].get('memory_total_instance_actual', 0) + memory)
        if (instance_memory / self.metrics[node].get('memory_max', 0) >
                config.get('RAM_OVERCOMMIT_RATIO')):
            return False

        return True
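For illustration, the two-part memory test above can be exercised in isolation; the 5 GB reservation and 1.5 overcommit ratio below are assumed sample values, not project defaults, and all other quantities are in MB.

def has_sufficient_ram(requested_mb, available_mb, max_mb, committed_mb,
                       reservation_gb=5, overcommit_ratio=1.5):
    # Keep the system reservation out of what is considered available...
    if (available_mb - reservation_gb * 1024) - requested_mb < 0:
        return False
    # ...and cap committed instance memory at a ratio of physical RAM.
    return (committed_mb + requested_mb) / max_mb <= overcommit_ratio


# 16 GB node, 10 GB reported available, 4 GB already committed to instances.
print(has_sufficient_ram(2048, 10240, 16384, 4096))  # True
print(has_sufficient_ram(8192, 10240, 16384, 4096))  # False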
Example no. 10
def resolve(name):
    resp = requests.get(CIRROS_URL,
                        headers={'User-Agent': util.get_user_agent()})
    if resp.status_code != 200:
        raise exceptions.HTTPError(
            'Failed to fetch %s, status code %d'
            % (CIRROS_URL, resp.status_code))

    if name == 'cirros':
        versions = []
        dir_re = re.compile(r'.*<a href="([0-9]+\.[0-9]+\.[0-9]+)/">.*/</a>.*')
        for line in resp.text.split('\n'):
            m = dir_re.match(line)
            if m:
                versions.append(m.group(1))
        LOG.withField('versions', versions).info('Found cirros versions')
        vernum = versions[-1]
    else:
        try:
            # Name is assumed to be in the form cirros:0.4.0
            _, vernum = name.split(':')
        except Exception:
            raise exceptions.VersionSpecificationError(
                'Cannot parse version: %s' % name)

    url = config.get('DOWNLOAD_URL_CIRROS') % {'vernum': vernum}
    log = LOG.withField('url', url)

    # Retrieve check sum file
    checksum_url = CIRROS_URL + '/' + vernum + '/MD5SUMS'
    resp = requests.get(checksum_url,
                        headers={'User-Agent': util.get_user_agent()})
    log.withField('checksum_url', checksum_url).withField(
        'resp', resp).debug("Checksum request response")
    if resp.status_code != 200:
        # Cirros does not always have a checksum file available
        log.info('Unable to retrieve MD5SUMS for cirros image')
        return url, None

    sum_re = re.compile(r'^([0-9a-f]+) .*' + 'cirros-' + vernum +
                        '-x86_64-disk.img')
    checksum = None
    for line in resp.text.split('\n'):
        m = sum_re.match(line)
        if m:
            checksum = m.group(1)
            break
    if not checksum:
        log.warning('Did not find checksum')

    log.withField('checksum', checksum).info('Checksum retrieval')

    return (url, checksum)
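To show what the MD5SUMS parsing above matches, here is the same regular expression applied to a single sample line in the usual 'checksum  filename' layout; the checksum value is fabricated for illustration.

import re

vernum = '0.5.1'
sample = 'd41d8cd98f00b204e9800998ecf8427e  cirros-0.5.1-x86_64-disk.img'
sum_re = re.compile(r'^([0-9a-f]+) .*' + 'cirros-' + vernum + '-x86_64-disk.img')
m = sum_re.match(sample)
print(m.group(1) if m else None)
# d41d8cd98f00b204e9800998ecf8427e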
Example no. 11
    def __init__(self, network, interface):
        self.network_uuid = network.db_entry['uuid']

        self.subst = {
            'config_dir':
            os.path.join(config.get('STORAGE_PATH'), 'dhcp',
                         self.network_uuid),
            'zone':
            config.get('ZONE'),
            'router':
            network.router,
            'dhcp_start':
            network.dhcp_start,
            'netmask':
            network.netmask,
            'broadcast':
            network.broadcast,
            'in_netns':
            'ip netns exec %s' % self.network_uuid,
            'interface':
            interface
        }
Example no. 12
def set_log_level(log, name):
    # Check that name is a valid process name
    process_name(name)

    # Check for configuration override
    level = config.get('LOGLEVEL_' + name.upper())
    if level:
        numeric_level = getattr(logging, level.upper(), None)
        if not isinstance(numeric_level, int):
            raise ValueError('Invalid log level: %s' % level)
    else:
        numeric_level = logging.INFO

    log.setLevel(numeric_level)
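The level lookup above relies on the standard library exposing level names as integer attributes of the logging module; a quick standalone check:

import logging

for level in ('debug', 'INFO', 'warning', 'bogus'):
    numeric_level = getattr(logging, level.upper(), None)
    print(level, numeric_level, isinstance(numeric_level, int))
# debug 10 True
# INFO 20 True
# warning 30 True
# bogus None False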
Example no. 13
    def process(self, msg, kwargs):
        msg = '%s[%s] %s' % (setproctitle.getproctitle(), os.getpid(), msg)
        kwargs['extra'] = self.extra

        if config.get('LOG_METHOD_TRACE'):
            # Determine the name of the calling method
            frame = traceback.extract_stack()[-4]
            filename = frame.filename
            f_match = self.FILENAME_RE.match(filename)
            if f_match:
                filename = f_match.group(1)
            caller = '%s:%s:%s()' % (filename, frame.lineno, frame.name)
            self.extra['method'] = caller

        return msg, kwargs
Example no. 14
    def __enter__(self):
        start_time = time.time()
        slow_warned = False
        threshold = int(config.get('SLOW_LOCK_THRESHOLD'))

        while time.time() - start_time < self.timeout:
            res = self.acquire()
            if res:
                duration = time.time() - start_time
                if duration > threshold:
                    db.add_event(self.objecttype, self.objectname,
                                 'lock', 'acquired', None,
                                 'Waited %d seconds for lock' % duration)
                    self.log_ctx.withField('duration', duration
                                           ).info('Acquiring a lock was slow')
                return self

            duration = time.time() - start_time
            if (duration > threshold and not slow_warned):
                db.add_event(self.objecttype, self.objectname,
                             'lock', 'acquire', None,
                             'Waiting for lock more than threshold')

                node, pid = self.get_holder()
                self.log_ctx.withFields({'duration': duration,
                                         'threshold': threshold,
                                         'holder-pid': pid,
                                         'holder-node': node,
                                         }).info('Waiting for lock')
                slow_warned = True

            time.sleep(1)

        duration = time.time() - start_time
        db.add_event(self.objecttype, self.objectname,
                     'lock', 'failed', None,
                     'Failed to acquire lock after %.02f seconds' % duration)

        node, pid = self.get_holder()
        self.log_ctx.withFields({'duration': duration,
                                 'holder-pid': pid,
                                 'holder-node': node,
                                 }).info('Failed to acquire lock')

        raise exceptions.LockException(
            'Cannot acquire lock %s, timed out after %.02f seconds'
            % (self.name, duration))
Example no. 15
    def subst_dict(self):
        retval = {
            'vx_id': self.db_entry['vxid'],
            'vx_interface': 'vxlan-%s' % self.db_entry['vxid'],
            'vx_bridge': 'br-vxlan-%s' % self.db_entry['vxid'],
            'vx_veth_outer': 'veth-%s-o' % self.db_entry['vxid'],
            'vx_veth_inner': 'veth-%s-i' % self.db_entry['vxid'],
            'physical_interface': self.physical_nic,
            'physical_bridge': 'phy-br-%s' % config.get('NODE_EGRESS_NIC'),
            'physical_veth_outer': 'phy-%s-o' % self.db_entry['vxid'],
            'physical_veth_inner': 'phy-%s-i' % self.db_entry['vxid'],
            'netns': self.db_entry['uuid'],
            'in_netns': 'ip netns exec %s' % self.db_entry['uuid'],
            'ipblock': self.ipblock,
            'netmask': self.netmask,
            'router': self.router,
            'broadcast': self.broadcast,
        }
        return retval
Example no. 16
    def __init__(self, db_entry):
        self.db_entry = db_entry
        self.physical_nic = config.get('NODE_EGRESS_NIC')

        with db.get_lock('ipmanager',
                         None,
                         self.db_entry['uuid'],
                         ttl=120,
                         op='Network object initialization'):
            ipm = db.get_ipmanager(self.db_entry['uuid'])

            self.ipblock = ipm.network_address
            self.router = ipm.get_address_at_index(1)
            self.dhcp_start = ipm.get_address_at_index(2)
            self.netmask = ipm.netmask
            self.broadcast = ipm.broadcast_address
            self.network_address = ipm.network_address

            ipm.reserve(self.router)
            db.persist_ipmanager(self.db_entry['uuid'], ipm.save())
Example no. 17
    def run(self):
        LOG.info('Starting')
        gauges = {
            'updated_at': Gauge('updated_at',
                                'The last time metrics were updated')
        }

        last_metrics = 0

        def update_metrics():
            stats = _get_stats()
            for metric in stats:
                if metric not in gauges:
                    gauges[metric] = Gauge(metric, '')
                gauges[metric].set(stats[metric])

            db.update_metrics_bulk(stats)
            gauges['updated_at'].set_to_current_time()

        while True:
            try:
                jobname, _ = db.dequeue('%s-metrics' % config.NODE_NAME)
                if jobname:
                    if time.time() - last_metrics > 2:
                        update_metrics()
                        last_metrics = time.time()
                    db.resolve('%s-metrics' % config.NODE_NAME, jobname)
                else:
                    time.sleep(0.2)

                timer = time.time() - last_metrics
                if timer > config.get('SCHEDULER_CACHE_TIMEOUT'):
                    update_metrics()
                    last_metrics = time.time()

            except Exception as e:
                util.ignore_exception('resource statistics', e)
Example no. 18
def main():
    global DAEMON_IMPLEMENTATIONS
    global DAEMON_PIDS

    setproctitle.setproctitle(daemon.process_name('main'))

    # Log configuration on startup
    for key, value in config.dict().items():
        LOG.info('Configuration item %s = %s' % (key, value))

    daemon.set_log_level(LOG, 'main')

    # Check in early and often, also reset processing queue items
    db.clear_stale_locks()
    db.see_this_node()
    db.restart_queues()

    def _start_daemon(d):
        pid = os.fork()
        if pid == 0:
            DAEMON_IMPLEMENTATIONS[d].Monitor(d).run()
        DAEMON_PIDS[pid] = d
        LOG.withField('pid', pid).info('Started %s' % d)

    # Resource usage publisher. We need this early because scheduling
    # decisions might happen quite early on.
    _start_daemon('resources')

    # If I am the network node, I need some setup
    if util.is_network_node():
        # Bootstrap the floating network in the Networks table
        floating_network = db.get_network('floating')
        if not floating_network:
            db.create_floating_network(config.get('FLOATING_NETWORK'))
            floating_network = net.from_db('floating')

        subst = {
            'physical_bridge':
            util.get_safe_interface_name('phy-br-%s' %
                                         config.get('NODE_EGRESS_NIC')),
            'physical_nic':
            config.get('NODE_EGRESS_NIC')
        }

        if not util.check_for_interface(subst['physical_bridge']):
            # NOTE(mikal): Adding the physical interface to the physical bridge
            # is considered outside the scope of the orchestration software as
            # it will cause the node to lose network connectivity. So instead
            # all we do is create a bridge if it doesn't exist and then wire
            # everything up to it. We can do egress NAT in that state, even if
            # floating IPs don't work.
            with util.RecordedOperation('create physical bridge', None):
                # No locking as read only
                ipm = db.get_ipmanager('floating')
                subst['master_float'] = ipm.get_address_at_index(1)
                subst['netmask'] = ipm.netmask

                util.create_interface(subst['physical_bridge'], 'bridge', '')
                util.execute(None,
                             'ip link set %(physical_bridge)s up' % subst)
                util.execute(
                    None, 'ip addr add %(master_float)s/%(netmask)s '
                    'dev %(physical_bridge)s' % subst)

                util.execute(
                    None, 'iptables -A FORWARD -o %(physical_nic)s '
                    '-i %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None, 'iptables -A FORWARD -i %(physical_nic)s '
                    '-o %(physical_bridge)s -j ACCEPT' % subst)
                util.execute(
                    None, 'iptables -t nat -A POSTROUTING '
                    '-o %(physical_nic)s -j MASQUERADE' % subst)

    def _audit_daemons():
        running_daemons = []
        for pid in DAEMON_PIDS:
            running_daemons.append(DAEMON_PIDS[pid])

        for d in DAEMON_IMPLEMENTATIONS:
            if d not in running_daemons:
                _start_daemon(d)

        for d in list(DAEMON_PIDS):
            if not psutil.pid_exists(d):
                LOG.warning('%s pid is missing, restarting' % DAEMON_PIDS[d])
                _start_daemon(DAEMON_PIDS[d])
                # Forget the dead pid so we don't restart it again next audit
                del DAEMON_PIDS[d]

    _audit_daemons()
    restore_instances()

    while True:
        time.sleep(10)

        wpid, _ = os.waitpid(-1, os.WNOHANG)
        while wpid != 0:
            LOG.warning('%s died (pid %d)' %
                        (DAEMON_PIDS.get(wpid, 'unknown'), wpid))
            del DAEMON_PIDS[wpid]
            wpid, _ = os.waitpid(-1, os.WNOHANG)

        _audit_daemons()
        db.see_this_node()
Example no. 19
def _get_stats():
    libvirt = util.get_libvirt()
    retval = {}
    conn = libvirt.open(None)

    # CPU info
    present_cpus, _, available_cpus = conn.getCPUMap()
    retval.update({
        'cpu_max': present_cpus,
        'cpu_available': available_cpus,
    })

    retval['cpu_max_per_instance'] = conn.getMaxVcpus(None)

    # This is disabled because we don't currently use this data
    # for i in range(present_cpus):
    #    per_cpu_stats = conn.getCPUStats(i)
    #    for key in per_cpu_stats:
    #        retval['cpu_core%d_%s' % (i, key)] = per_cpu_stats[key]

    try:
        load_1, load_5, load_15 = psutil.getloadavg()
        retval.update({
            'cpu_load_1': load_1,
            'cpu_load_5': load_5,
            'cpu_load_15': load_15,
        })
    except Exception as e:
        util.ignore_exception('load average', e)

    # System memory info, converting bytes to mb
    stats = psutil.virtual_memory()
    retval.update({
        'memory_max': stats.total // 1024 // 1024,
        'memory_available': stats.available // 1024 // 1024
    })

    # libvirt memory info, converting kb to mb
    memory_status = conn.getMemoryStats(
        libvirt.VIR_NODE_MEMORY_STATS_ALL_CELLS)
    retval.update({
        'memory_max_libvirt': memory_status['total'] // 1024,
        'memory_available_libvirt': memory_status['free'] // 1024,
    })

    # Kernel Shared Memory (KSM) information
    ksm_details = {}
    for ent in os.listdir('/sys/kernel/mm/ksm'):
        with open('/sys/kernel/mm/ksm/%s' % ent) as f:
            ksm_details['memory_ksm_%s' % ent] = int(f.read().rstrip())
    retval.update(ksm_details)

    # Disk info
    s = os.statvfs(config.get('STORAGE_PATH'))
    disk_counters = psutil.disk_io_counters()
    retval.update({
        'disk_total': s.f_frsize * s.f_blocks,
        'disk_free': s.f_frsize * s.f_bavail,
        'disk_used': s.f_frsize * (s.f_blocks - s.f_bfree),
        'disk_read_bytes': disk_counters.read_bytes,
        'disk_write_bytes': disk_counters.write_bytes,
    })

    # Network info
    net_counters = psutil.net_io_counters()
    retval.update({
        'network_read_bytes': net_counters.bytes_recv,
        'network_write_bytes': net_counters.bytes_sent,
    })

    # Virtual machine consumption info
    total_instances = 0
    total_active_instances = 0
    total_instance_max_memory = 0
    total_instance_actual_memory = 0
    total_instance_vcpus = 0
    total_instance_cpu_time = 0

    for guest in conn.listAllDomains():
        try:
            active = guest.isActive() == 1
            if active:
                _, maxmem, mem, cpus, cpu_time = guest.info()

        except libvirt.libvirtError as e:
            LOG.debug('During resource calc ignored libvirt error: %s' % e)
            active = False

        total_instances += 1
        if active:
            total_active_instances += 1
            total_instance_max_memory += maxmem
            total_instance_actual_memory += mem
            total_instance_vcpus += cpus
            total_instance_cpu_time += cpu_time

    # Queue health statistics
    node_queue_processing, node_queue_waiting = db.get_queue_length(
        config.NODE_NAME)

    retval.update({
        'cpu_total_instance_vcpus':
        total_instance_vcpus,
        'cpu_total_instance_cpu_time':
        total_instance_cpu_time,
        'memory_total_instance_max':
        total_instance_max_memory // 1024,
        'memory_total_instance_actual':
        total_instance_actual_memory // 1024,
        'instances_total':
        total_instances,
        'instances_active':
        total_active_instances,
        'node_queue_processing':
        node_queue_processing,
        'node_queue_waiting':
        node_queue_waiting,
    })

    if util.is_network_node():
        network_queue_processing, network_queue_waiting = db.get_queue_length(
            'networknode')

        retval.update({
            'network_queue_processing': network_queue_processing,
            'network_queue_waiting': network_queue_waiting,
        })

    return retval
Example no. 20
    def create(self, lock=None):
        self.update_instance_state('creating')

        # Ensure we have state on disk
        if not os.path.exists(self.instance_path()):
            LOG.withObj(self).debug(
                'Creating instance storage at %s' % self.instance_path())
            os.makedirs(self.instance_path(), exist_ok=True)

        # Generate a config drive
        with util.RecordedOperation('make config drive', self):
            self._make_config_drive(os.path.join(
                self.instance_path(), self.block_devices['devices'][1]['path']))

        # Prepare disks
        if not self.block_devices['finalized']:
            modified_disks = []
            for disk in self.block_devices['devices']:
                if disk.get('base'):
                    img = images.Image.from_url(disk['base'])
                    hashed_image_path = img.version_image_path()

                    with util.RecordedOperation('detect cdrom images', self):
                        try:
                            cd = pycdlib.PyCdlib()
                            cd.open(hashed_image_path)
                            disk['present_as'] = 'cdrom'
                        except Exception:
                            pass

                    if disk.get('present_as', 'cdrom') == 'cdrom':
                        # There is no point in resizing or COW'ing a cdrom
                        disk['path'] = disk['path'].replace('.qcow2', '.raw')
                        disk['type'] = 'raw'
                        disk['snapshot_ignores'] = True

                        try:
                            os.link(hashed_image_path, disk['path'])
                        except OSError:
                            # Different filesystems
                            util.execute(
                                [lock], 'cp %s %s' % (hashed_image_path, disk['path']))

                        # Due to limitations in some installers, cdroms are always on IDE
                        disk['device'] = 'hd%s' % disk['device'][-1]
                        disk['bus'] = 'ide'
                    else:
                        if config.get('DISK_FORMAT') == 'qcow':
                            with util.RecordedOperation('create copy on write layer', self):
                                images.create_cow([lock], hashed_image_path,
                                                  disk['path'], disk['size'])

                            # Record the backing store for modern libvirts
                            disk['backing'] = (
                                '<backingStore type=\'file\'>\n'
                                '        <format type=\'qcow2\'/>\n'
                                '        <source file=\'%s\'/>\n'
                                '      </backingStore>\n'
                                % (hashed_image_path))

                        elif config.get('DISK_FORMAT') == 'qcow_flat':
                            with util.RecordedOperation('resize image', self):
                                resized_image_path = img.resize(
                                    [lock], disk['size'])

                            with util.RecordedOperation('create flat layer', self):
                                images.create_flat(
                                    [lock], resized_image_path, disk['path'])

                        elif config.get('DISK_FORMAT') == 'flat':
                            with util.RecordedOperation('resize image', self):
                                resized_image_path = img.resize(
                                    [lock], disk['size'])

                            with util.RecordedOperation('create raw disk', self):
                                images.create_raw(
                                    [lock], resized_image_path, disk['path'])

                        else:
                            raise Exception('Unknown disk format')

                elif not os.path.exists(disk['path']):
                    util.execute(None, 'qemu-img create -f qcow2 %s %sG'
                                 % (disk['path'], disk['size']))

                modified_disks.append(disk)

            self.block_devices['devices'] = modified_disks
            self.block_devices['finalized'] = True

        self.persist()

        # Create the actual instance
        with util.RecordedOperation('create domain XML', self):
            self._create_domain_xml()

        # Sometimes on Ubuntu 20.04 we need to wait for port binding to work.
        # Revisiting this is tracked by issue 320 on github.
        with util.RecordedOperation('create domain', self):
            if not self.power_on():
                attempts = 0
                while not self.power_on() and attempts < 100:
                    LOG.withObj(self).warning(
                        'Instance required an additional attempt to power on')
                    time.sleep(5)
                    attempts += 1

        if self.is_powered_on():
            LOG.withObj(self).info('Instance now powered on')
        else:
            LOG.withObj(self).info('Instance failed to power on')
        self.update_instance_state('created')
Example no. 21
    def _update_power_states(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                db.place_instance(instance_uuid, config.NODE_NAME)
                seen.append(domain.name())

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    db.instance_enforced_deletes_increment(instance_uuid)
                    attempts = instance.get('enforced_deletes', 0)

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        db.add_event('instance', instance_uuid,
                                     'enforced delete', 'complete', None, None)
                    else:
                        i = virt.from_db(instance_uuid)
                        i.delete()
                        i.update_instance_state('deleted')

                    log_ctx.withField(
                        'attempt', attempts).warning('Deleting stray instance')

                    continue

                state = util.extract_power_state(libvirt, domain)
                db.update_instance_power_state(instance_uuid, state)
                if state == 'crashed':
                    db.update_instance_state(instance_uuid, 'error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.withInstance(instance_uuid)
                    instance = db.get_instance(instance_uuid)

                    if not instance:
                        # Instance is SF but not in database. Kill because unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        continue

                    if instance.get('state') == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance['state_updated'] < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        log_ctx.info('Detected stray instance')
                        db.add_event('instance', instance_uuid,
                                     'deleted stray', 'complete', None, None)
                        continue

                    db.place_instance(instance_uuid, config.NODE_NAME)
                    instance_path = os.path.join(config.get('STORAGE_PATH'),
                                                 'instances', instance_uuid)

                    if not os.path.exists(instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        db.update_instance_state(instance_uuid, 'error')

                    elif instance.get('power_state') != 'off':
                        log_ctx.info('Detected power off for instance')
                        db.update_instance_power_state(instance_uuid, 'off')
                        db.add_event('instance', instance_uuid,
                                     'detected poweroff', 'complete', None,
                                     None)

        except libvirt.libvirtError as e:
            LOG.error('Failed to lookup all domains: %s' % e)
Example no. 22
    def _read_template(self, template):
        with open(os.path.join(config.get('STORAGE_PATH'), template)) as f:
            return jinja2.Template(f.read())
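A template loaded this way is rendered with keyword arguments, as in the domain XML example earlier; the template text and values below are illustrative, not the project's libvirt.tmpl.

import jinja2

t = jinja2.Template('<domain><name>{{ name }}</name>'
                    '<memory unit="KiB">{{ memory }}</memory></domain>')
print(t.render(name='example', memory=2048 * 1024))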
Example no. 23
    def place_instance(self, instance, network, candidates=None):
        with util.RecordedOperation('schedule', instance):
            log_ctx = LOG.withObj(instance)

            diff = time.time() - self.metrics_updated
            if diff > config.get('SCHEDULER_CACHE_TIMEOUT'):
                self.refresh_metrics()

            if candidates:
                log_ctx.info('Scheduling %s forced as candidates' % candidates)
                instance.add_event('schedule', 'Forced candidates', None,
                                   str(candidates))
                for node in candidates:
                    if node not in self.metrics:
                        raise exceptions.CandidateNodeNotFoundException(node)
            else:
                candidates = []
                for node in self.metrics.keys():
                    candidates.append(node)
            log_ctx.info('Scheduling %s start as candidates' % candidates)
            instance.add_event('schedule', 'Initial candidates', None,
                               str(candidates))
            if not candidates:
                raise exceptions.LowResourceException('No nodes with metrics')

            # Can we host that many vCPUs?
            for node in copy.copy(candidates):
                max_cpu = self.metrics[node].get('cpu_max_per_instance', 0)
                if instance.cpus > max_cpu:
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough actual CPU' % candidates)
            instance.add_event('schedule', 'Have enough actual CPU', None,
                               str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'Requested vCPUs exceeds vCPU limit')

            # Do we have enough idle CPU?
            for node in copy.copy(candidates):
                if not self._has_sufficient_cpu(instance.cpus, node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle CPU' % candidates)
            instance.add_event('schedule', 'Have enough idle CPU', None,
                               str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough idle CPU')

            # Do we have enough idle RAM?
            for node in copy.copy(candidates):
                if not self._has_sufficient_ram(instance.memory, node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle RAM' % candidates)
            instance.add_event('schedule', 'Have enough idle RAM', None,
                               str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough idle RAM')

            # Do we have enough idle disk?
            for node in copy.copy(candidates):
                if not self._has_sufficient_disk(instance, node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle disk' % candidates)
            instance.add_event('schedule', 'Have enough idle disk', None,
                               str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough disk space')

            # What nodes have the highest number of networks already present?
            if network:
                requested_networks = []
                for net in network:
                    network_uuid = net['network_uuid']
                    if network_uuid not in requested_networks:
                        requested_networks.append(network_uuid)

                candidates = self._find_most_matching_networks(
                    requested_networks, candidates)
                log_ctx.info('Scheduling %s have most matching networks' %
                             candidates)
                instance.add_event('schedule', 'Have most matching networks',
                                   None, str(candidates))

            # What nodes have the base image already?
            requested_images = []
            for disk in instance.block_devices['devices']:
                if disk.get('base'):
                    requested_images.append(disk.get('base'))

            candidates = self._find_most_matching_images(
                requested_images, candidates)
            log_ctx.info('Scheduling %s have most matching images' %
                         candidates)
            instance.add_event('schedule', 'Have most matching images', None,
                               str(candidates))

            # Avoid allocating to network node if possible
            net_node = db.get_network_node()
            if len(candidates) > 1 and net_node['fqdn'] in candidates:
                candidates.remove(net_node['fqdn'])
                log_ctx.info('Scheduling %s are non-network nodes' %
                             candidates)
                instance.add_event('schedule', 'Are non-network nodes', None,
                                   str(candidates))

            # Return a shuffled list of options
            random.shuffle(candidates)
            return candidates
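A minimal sketch of the filtering pattern repeated above: iterating over copy.copy(candidates) so nodes can be removed from the original list mid-loop without skipping entries. Node names are placeholders.

import copy

candidates = ['node1', 'node2', 'node3', 'node4']
for node in copy.copy(candidates):
    # Safe to mutate the original list because we iterate over the copy.
    if node in ('node2', 'node3'):
        candidates.remove(node)
print(candidates)  # ['node1', 'node4']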
Example no. 24
def resolve(name):
    resp = requests.get(UBUNTU_URL,
                        headers={'User-Agent': util.get_user_agent()})
    if resp.status_code != 200:
        raise exceptions.HTTPError('Failed to fetch %s, status code %d' %
                                   (UBUNTU_URL, resp.status_code))

    num_to_name = {}
    name_to_num = {}
    dir_re = re.compile(
        r'.*<a href="(.*)/">.*Ubuntu Server ([0-9]+\.[0-9]+).*')
    for line in resp.text.split('\n'):
        m = dir_re.match(line)
        if m:
            num_to_name[m.group(2)] = m.group(1)
            name_to_num[m.group(1)] = m.group(2)
    LOG.withField('versions', num_to_name).info('Found ubuntu versions')

    vernum = None
    vername = None

    if name == 'ubuntu':
        vernum = sorted(num_to_name.keys())[-1]
        vername = num_to_name[vernum]
    else:
        try:
            # Name is assumed to be in the form ubuntu:18.04 or ubuntu:bionic
            _, version = name.split(':')
            if version in num_to_name:
                vernum = version
                vername = num_to_name[version]
            else:
                vername = version
                vernum = name_to_num[version]
        except Exception:
            raise exceptions.VersionSpecificationError(
                'Cannot parse version: %s' % name)

    url = (config.get('DOWNLOAD_URL_UBUNTU') % {
        'vernum': vernum,
        'vername': vername
    })
    log = LOG.withField('url', url)

    # Retrieve check sum file
    checksum_url = UBUNTU_URL + '/' + vername + '/current/MD5SUMS'
    resp = requests.get(checksum_url,
                        headers={'User-Agent': util.get_user_agent()})
    if resp.status_code != 200:
        raise exceptions.HTTPError('Failed to fetch %s, status code %d' %
                                   (checksum_url, resp.status_code))

    sum_re = re.compile(r'^([0-9a-f]+) .*' + vername +
                        '-server-cloudimg-amd64.img')
    checksum = None
    for line in resp.text.split('\n'):
        m = sum_re.match(line)
        if m:
            checksum = m.group(1)
            break
    if checksum:
        checksum = checksum.strip()
    else:
        log.warning('Did not find checksum')

    log.withField('checksum', checksum).info('Checksum check')
    return (url, checksum)
Example no. 25
def _get_defaulted_disk_bus(disk):
    bus = disk.get('bus')
    if bus:
        return bus
    return config.get('DISK_BUS')
Example no. 26
    def instance_path(self):
        return os.path.join(config.get('STORAGE_PATH'), 'instances', self.uuid)
Example no. 27
    def __init__(self, id):
        super(Monitor, self).__init__(id)
        start_http_server(config.get('PROMETHEUS_METRICS_PORT'))
Example no. 28
    def _make_config_drive(self, disk_path):
        """Create a config drive"""

        # NOTE(mikal): with a big nod at https://gist.github.com/pshchelo/378f3c4e7d18441878b9652e9478233f
        iso = pycdlib.PyCdlib()
        iso.new(interchange_level=4,
                joliet=True,
                rock_ridge='1.09',
                vol_ident='config-2')

        # We're only going to pretend to do the most recent OpenStack version
        iso.add_directory('/openstack',
                          rr_name='openstack',
                          joliet_path='/openstack')
        iso.add_directory('/openstack/2017-02-22',
                          rr_name='2017-02-22',
                          joliet_path='/openstack/2017-02-22')
        iso.add_directory('/openstack/latest',
                          rr_name='latest',
                          joliet_path='/openstack/latest')

        # meta_data.json
        md = json.dumps({
            'random_seed': base64.b64encode(os.urandom(512)).decode('ascii'),
            'uuid': self.uuid,
            'availability_zone': config.get('ZONE'),
            'hostname': '%s.local' % self.name,
            'launch_index': 0,
            'devices': [],
            'project_id': None,
            'name': self.name,
            'public_keys': {
                'mykey': self.ssh_key
            }
        }).encode('ascii')
        iso.add_fp(io.BytesIO(md), len(md), '/openstack/latest/meta_data.json;1',
                   rr_name='meta_data.json',
                   joliet_path='/openstack/latest/meta_data.json')
        iso.add_fp(io.BytesIO(md), len(md), '/openstack/2017-02-22/meta_data.json;2',
                   rr_name='meta_data.json',
                   joliet_path='/openstack/2017-02-22/meta_data.json')

        # user_data
        if self.user_data:
            user_data = base64.b64decode(self.user_data)
            iso.add_fp(io.BytesIO(user_data), len(user_data), '/openstack/latest/user_data',
                       rr_name='user_data',
                       joliet_path='/openstack/latest/user_data')
            iso.add_fp(io.BytesIO(user_data), len(user_data), '/openstack/2017-02-22/user_data',
                       rr_name='user_data',
                       joliet_path='/openstack/2017-02-22/user_data')

        # network_data.json
        nd = {
            'links': [],
            'networks': [],
            'services': [
                {
                    'address': '8.8.8.8',
                    'type': 'dns'
                }
            ]
        }

        seen_networks = []
        for iface in db.get_instance_interfaces(self.uuid):
            devname = 'eth%d' % iface['order']
            nd['links'].append(
                {
                    'ethernet_mac_address': iface['macaddr'],
                    'id': devname,
                    'name': devname,
                    'mtu': 1450,
                    'type': 'vif',
                    'vif_id': iface['uuid']
                }
            )

            if not iface['network_uuid'] in seen_networks:
                n = net.from_db(iface['network_uuid'])
                nd['networks'].append(
                    {
                        'id': iface['network_uuid'],
                        'link': devname,
                        'type': 'ipv4',
                        'ip_address': iface['ipv4'],
                        'netmask': str(n.netmask),
                        'routes': [
                            {
                                'network': '0.0.0.0',
                                'netmask': '0.0.0.0',
                                'gateway': str(n.router)
                            }
                        ],
                        'network_id': iface['network_uuid']
                    }
                )
                seen_networks.append(iface['network_uuid'])

        nd_encoded = json.dumps(nd).encode('ascii')
        iso.add_fp(io.BytesIO(nd_encoded), len(nd_encoded),
                   '/openstack/latest/network_data.json;3',
                   rr_name='network_data.json',
                   joliet_path='/openstack/latest/network_data.json')
        iso.add_fp(io.BytesIO(nd_encoded), len(nd_encoded),
                   '/openstack/2017-02-22/network_data.json;4',
                   rr_name='network_data.json',
                   joliet_path='/openstack/2017-02-22/network_data.json')

        # empty vendor_data.json and vendor_data2.json
        vd = '{}'.encode('ascii')
        iso.add_fp(io.BytesIO(vd), len(vd),
                   '/openstack/latest/vendor_data.json;5',
                   rr_name='vendor_data.json',
                   joliet_path='/openstack/latest/vendor_data.json')
        iso.add_fp(io.BytesIO(vd), len(vd),
                   '/openstack/2017-02-22/vendor_data.json;6',
                   rr_name='vendor_data.json',
                   joliet_path='/openstack/2017-02-22/vendor_data.json')
        iso.add_fp(io.BytesIO(vd), len(vd),
                   '/openstack/latest/vendor_data2.json;7',
                   rr_name='vendor_data2.json',
                   joliet_path='/openstack/latest/vendor_data2.json')
        iso.add_fp(io.BytesIO(vd), len(vd),
                   '/openstack/2017-02-22/vendor_data2.json;8',
                   rr_name='vendor_data2.json',
                   joliet_path='/openstack/2017-02-22/vendor_data2.json')

        # Dump to disk
        iso.write(disk_path)
        iso.close()
Example no. 29
    def snapshot_path(self):
        return os.path.join(config.get('STORAGE_PATH'), 'snapshots')