Example #1
0
    def power_on(self):
        """Define the libvirt domain if needed and start it.

        Returns:
            True once the domain has been started (or libvirt reported it
            was already running), autostart has been enabled and the power
            state and 'poweron' event have been recorded. False if the
            domain XML file is missing or libvirt refused to start the
            domain.

        Raises:
            exceptions.NoDomainException: libvirt could not define the
                domain from the XML file.
        """
        instance_uuid = self.db_entry['uuid']

        if not os.path.exists(self.xml_file):
            db.enqueue_instance_error(instance_uuid,
                                      'missing domain file in power on')
            # Stop here: carrying on would only fail on open() below with an
            # unrelated FileNotFoundError after the error was already queued.
            return False

        libvirt = util.get_libvirt()

        instance = self._get_domain()
        if not instance:
            # libvirt does not know this domain yet, so define it from the
            # XML on disk. The file is only needed on this path.
            with open(self.xml_file) as f:
                xml = f.read()

            conn = libvirt.open(None)
            instance = conn.defineXML(xml)
            if not instance:
                db.enqueue_instance_error(instance_uuid,
                                          'power on failed to create domain')
                raise exceptions.NoDomainException()

        try:
            instance.create()
        except libvirt.libvirtError as e:
            # An already running domain is the state we want, so only other
            # errors count as a failed start.
            if not str(e).startswith(
                    'Requested operation is not valid: domain is already running'
            ):
                LOG.withObj(self).warning('Instance start error: %s' % e)
                return False

        instance.setAutostart(1)
        db.update_instance_power_state(
            instance_uuid, util.extract_power_state(libvirt, instance))
        db.add_event('instance', instance_uuid, 'poweron', 'complete',
                     None, None)
        return True
Example #2
0
def image_fetch(url, instance_uuid):
    """Fetch the image at ``url`` while holding the per-image lock.

    Args:
        url: Location of the image; also used as the event object name.
        instance_uuid: Instance the fetch is for, or a falsy value when the
            fetch is not tied to an instance.

    Raises:
        exceptions.ImageFetchTaskFailedException: the fetch raised an
            exceptions.HTTPError or a requests RequestException. The failure
            is recorded as an image event (and as an instance error when
            instance_uuid is set) before this is raised.
    """
    instance = virt.from_db(instance_uuid) if instance_uuid else None

    try:
        # TODO(andy): Wait up to 15 mins for another queue process to download
        # the required image. This will be changed to queue on a
        # "waiting_image_fetch" queue but this works now.
        with db.get_lock('image',
                         config.NODE_NAME,
                         Image.calc_unique_ref(url),
                         timeout=15 * 60,
                         op='Image fetch') as fetch_lock:
            Image.from_url(url).get([fetch_lock], instance)
            db.add_event('image', url, 'fetch', None, None, 'success')

    except (exceptions.HTTPError, requests.exceptions.RequestException) as e:
        LOG.withField('image', url).info('Failed to fetch image')
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'Image fetch failed: %s' % e)

        # Clean common problems to store in events: for connection errors
        # keep only the trailing reason text.
        reason = str(e)
        conn_err = re.match(
            r'.*NewConnectionError\(\'\<.*\>: (.*)\'', reason)
        if conn_err:
            reason = conn_err.group(1)
        db.add_event('image', url, 'fetch', None, None, 'Error: ' + reason)

        raise exceptions.ImageFetchTaskFailedException(
            'Failed to fetch image %s' % url)
Example #3
0
def restore_instances():
    """Recreate the networks and instances recorded for this node.

    Networks used by any of the node's instances are restored first, then
    each instance whose recorded power state is one of 'on',
    'transition-to-on', 'initial' or 'unknown' is created again. A failure
    on one network or instance is passed to util.ignore_exception and does
    not stop the rest; instance failures also enqueue an instance error.
    """
    # Ensure all instances for this node are defined
    node_name = config.parsed.get('NODE_NAME')
    network_uuids = []
    instance_uuids = []
    for inst in list(db.get_instances(only_node=node_name)):
        for iface in db.get_instance_interfaces(inst['uuid']):
            iface_network = iface['network_uuid']
            # Keep first-seen order while dropping duplicates.
            if iface_network not in network_uuids:
                network_uuids.append(iface_network)
        instance_uuids.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network_uuid in network_uuids:
            try:
                network_obj = net.from_db(network_uuid)
                LOG.withObj(network_obj).info('Restoring network')
                network_obj.create()
                network_obj.ensure_mesh()
                network_obj.update_dhcp()
            except Exception as e:
                util.ignore_exception(
                    'restore network %s' % network_uuid, e)

    # Power states for which the instance should be created again.
    restorable_states = ('on', 'transition-to-on', 'initial', 'unknown')

    with util.RecordedOperation('restore instances', None):
        for instance_uuid in instance_uuids:
            try:
                with db.get_lock('instance',
                                 None,
                                 instance_uuid,
                                 ttl=120,
                                 timeout=120,
                                 op='Instance restore'):
                    restored = virt.from_db(instance_uuid)
                    if restored and restored.db_entry.get(
                            'power_state', 'unknown') in restorable_states:
                        LOG.withObj(restored).info('Restoring instance')
                        restored.create()
            except Exception as e:
                util.ignore_exception(
                    'restore instance %s' % instance_uuid, e)
                db.enqueue_instance_error(
                    instance_uuid,
                    'exception while restoring instance on daemon restart')
Example #4
0
def instance_start(instance_uuid, network):
    """Bring up an instance's networks and then create the instance.

    The whole operation runs under the instance lock, which is also handed
    to instance.create().

    Args:
        instance_uuid: UUID of the instance to start.
        network: Iterable of network descriptions, each a mapping with a
            'network_uuid' key.

    Returns:
        None. The function returns early, after enqueueing an instance
        error, when a referenced network cannot be loaded or when libvirt
        rejects the instance with VIR_ERR_CONFIG_UNSUPPORTED or
        VIR_ERR_XML_ERROR. Otherwise every interface of the instance is
        marked 'created'.
    """
    with db.get_lock('instance',
                     None,
                     instance_uuid,
                     ttl=900,
                     timeout=120,
                     op='Instance start') as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks, loading each distinct network only once
        nets = {}
        for netdesc in network:
            network_uuid = netdesc['network_uuid']
            if network_uuid in nets:
                continue

            n = net.from_db(network_uuid)
            if not n:
                db.enqueue_instance_error(instance_uuid, 'missing network')
                return

            nets[network_uuid] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for n in nets.values():
                n.create()
                n.ensure_mesh()
                n.update_dhcp()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            code = e.get_error_code()
            if code in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                        libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_error(instance_uuid,
                                          'instance failed to start: %s' % e)
                return

            # Any other libvirt error used to vanish without a trace. The
            # flow is unchanged (we still carry on), but the error is now
            # logged so it can be diagnosed.
            # NOTE(review): confirm whether these errors should abort the
            # start instead of being tolerated.
            LOG.withField('instance', instance_uuid).warning(
                'Ignoring libvirt error during instance start: %s' % e)

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
Example #5
0
def handle(jobname, workitem):
    """Run each task of a dequeued work item, then resolve the work item.

    Args:
        jobname: Identifier of the work item. Used in log fields, the
            process title and event messages, and to resolve the item on
            this node's queue.
        workitem: Mapping whose optional 'tasks' entry lists the decoded
            task objects to run in order.

    Exceptions raised while running tasks are caught here: they are logged
    (or passed to util.ignore_exception) and, when an instance is known,
    turned into an enqueued instance error. The work item is resolved in
    all cases, including the early return after a preflight redirect.
    """
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    # Not reset between tasks: a task that carries no instance keeps the
    # uuid set by an earlier task of the same work item.
    instance_uuid = None
    try:
        for task in workitem.get('tasks', []):
            if not isinstance(task, QueueTask):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if isinstance(task, (InstanceTask, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            log_i = log.withInstance(instance_uuid) if instance_uuid else log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued', None,
                             'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    # Hand the whole work item to the chosen node and stop
                    # processing it here; the finally clause still resolves
                    # the local copy.
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                # Failures here are logged and ignored, so the remaining
                # tasks still run.
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')

                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')