def image_fetch(url, instance_uuid):
    """Fetch the image at ``url`` while holding this node's image lock.

    When ``instance_uuid`` is truthy the matching instance is loaded and
    handed to the image fetch. On an HTTP or requests failure the instance
    (if any) is queued for error, a cleaned-up message is recorded as an
    image event, and ImageFetchTaskFailedException is raised.
    """
    instance = virt.from_db(instance_uuid) if instance_uuid else None

    try:
        # TODO(andy): Wait up to 15 mins for another queue process to download
        # the required image. This will be changed to queue on a
        # "waiting_image_fetch" queue but this works now.
        image_lock = db.get_lock('image',
                                 config.NODE_NAME,
                                 Image.calc_unique_ref(url),
                                 timeout=15 * 60,
                                 op='Image fetch')
        with image_lock as lock:
            img = Image.from_url(url)
            img.get([lock], instance)
            db.add_event('image', url, 'fetch', None, None, 'success')

    except (exceptions.HTTPError, requests.exceptions.RequestException) as e:
        LOG.withField('image', url).info('Failed to fetch image')
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'Image fetch failed: %s' % e)

        # Clean common problems to store in events: keep only the inner
        # reason of a NewConnectionError if the message contains one.
        detail = str(e)
        conn_err = re.match(r'.*NewConnectionError\(\'\<.*\>: (.*)\'', detail)
        if conn_err:
            detail = conn_err.group(1)
        db.add_event('image', url, 'fetch', None, None, 'Error: ' + detail)

        raise exceptions.ImageFetchTaskFailedException(
            'Failed to fetch image %s' % url)
Exemple #2
0
    def post(self, interface_uuid=None):
        """Release the floating IP attached to a network interface.

        Looks up the interface, its network and its instance, applies the
        namespace ownership checks, then releases the address from the
        'floating' IP manager and removes it from the interface and the
        network.

        Returns an error() response on any failed check; falls off the end
        (returning None) on success.
        """
        ni = db.get_interface(interface_uuid)
        if not ni:
            return error(404, 'network interface not found')

        if not ni['floating']:
            return error(409, 'this interface does not have a floating ip')

        n = net.from_db(ni['network_uuid'])
        if not n:
            LOG.info('network(%s): network not found, genuinely missing' %
                     ni['network_uuid'])
            return error(404, 'network not found')

        if get_jwt_identity() not in [n.namespace, 'system']:
            LOG.info('%s: network not found, ownership test' % n)
            return error(404, 'network not found')

        i = virt.from_db(ni['instance_uuid'])
        if not i:
            # from_db() yields a falsy value for an unknown instance; without
            # this guard the ownership test below dies with AttributeError.
            LOG.info('instance(%s): instance not found, genuinely missing' %
                     ni['instance_uuid'])
            return error(404, 'instance not found')

        if get_jwt_identity() not in [i.db_entry['namespace'], 'system']:
            LOG.info('%s: instance not found, ownership test' % i)
            return error(404, 'instance not found')

        # Only the existence of the floating network is checked here.
        float_net = net.from_db('floating')
        if not float_net:
            return error(404, 'floating network not found')

        db.add_event('interface', interface_uuid, 'api', 'defloat', None, None)
        with db.get_lock('sf/ipmanager/floating', ttl=120):
            ipm = db.get_ipmanager('floating')
            ipm.release(ni['floating'])
            db.persist_ipmanager('floating', ipm.save())

        db.remove_floating_from_interface(ni['uuid'])
        n.remove_floating_ip(ni['floating'], ni['ipv4'])
Exemple #3
0
def restore_instances():
    """Recreate the networks and instances recorded for this node.

    Collects the instances placed on this node and the distinct networks
    their interfaces use, then restores networks first and instances second.
    A failure on one network or instance is logged and does not stop the
    others; a failed instance is also moved to the 'error' state.
    """
    # Ensure all instances for this node are defined
    networks = []
    instances = []
    for inst in list(
            db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if iface['network_uuid'] not in networks:
                networks.append(iface['network_uuid'])
        instances.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network in networks:
            try:
                n = net.from_db(network)
                LOG.info('%s Restoring network' % n)
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
            except Exception as e:
                # Report by uuid: the loaded object may not exist (from_db
                # raised) or may be left over from the previous iteration.
                LOG.error('network(%s) Failed to restore network: %s' %
                          (network, e))

    with util.RecordedOperation('restore instances', None):
        for instance in instances:
            try:
                i = virt.from_db(instance)
                LOG.info('%s Restoring instance' % i)
                i.create()
            except Exception as e:
                # Same reasoning as for networks: report by uuid.
                LOG.error('instance(%s) Failed to restore instance: %s' %
                          (instance, e))
                db.update_instance_state(instance, 'error')
def instance_preflight(instance_uuid, network):
    """Decide whether an instance can start here or must go elsewhere.

    Marks the instance as 'preflight' and first tries to place it on this
    node only.

    Returns None when this node can host the instance, otherwise the name of
    the first alternative candidate node.

    Raises AbortInstanceStartException when the instance has already had more
    than three placement attempts, or when no other node has the resources.
    """
    db.update_instance_state(instance_uuid, 'preflight')

    s = scheduler.Scheduler()
    instance = virt.from_db(instance_uuid)

    try:
        s.place_instance(instance, network, candidates=[config.NODE_NAME])
        return None

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'retry', None,
                     'insufficient resources: ' + str(e))

    # A missing or None attempt counter means "no attempts yet"; comparing
    # None to an int would raise TypeError.
    if (instance.db_entry.get('placement_attempts') or 0) > 3:
        raise exceptions.AbortInstanceStartException('Too many start attempts')

    try:
        requested = instance.db_entry.get('requested_placement')
        if requested:
            candidates = [requested]
        else:
            # Every node with metrics except this one, which was just ruled
            # out above.
            candidates = [
                node for node in s.metrics.keys() if node != config.NODE_NAME
            ]

        candidates = s.place_instance(instance, network, candidates=candidates)
        return candidates[0]

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'insufficient resources: ' + str(e))
        # This raise implies delete above
        raise exceptions.AbortInstanceStartException(
            'Unable to find suitable node')
Exemple #5
0
def _safe_get_network_interface(interface_uuid):
    """Load an interface and its network, enforcing namespace ownership.

    Returns a 3-tuple ``(interface, network, err)``. On success ``err`` is
    None; on any failure the first two items are None and ``err`` is the
    error() response to hand back to the caller.
    """
    ni = db.get_interface(interface_uuid)
    if not ni:
        return None, None, error(404, 'interface not found')

    n = net.from_db(ni['network_uuid'])
    if not n:
        logutil.info([
            net.ThinNetwork(ni['network_uuid']),
            net.ThinNetworkInterface(ni['uuid'])
        ], 'Network not found or deleted')
        return None, None, error(404, 'interface network not found')

    if get_jwt_identity() not in [n.namespace, 'system']:
        logutil.info([n, net.ThinNetworkInterface(ni['uuid'])],
                     'Interface not found, ownership test')
        return None, None, error(404, 'interface not found')

    i = virt.from_db(ni['instance_uuid'])
    if not i:
        # An unknown instance gives a falsy result; answer 404 instead of
        # failing with AttributeError in the ownership test below.
        logutil.info([n, net.ThinNetworkInterface(ni['uuid'])],
                     'Instance not found, genuinely missing')
        return None, None, error(404, 'interface not found')

    if get_jwt_identity() not in [i.db_entry['namespace'], 'system']:
        logutil.info([n, i, net.ThinNetworkInterface(ni['uuid'])],
                     'Instance not found, ownership test')
        return None, None, error(404, 'interface not found')

    return ni, n, None
def instance_start(instance_uuid, network):
    """Bring up an instance's networks and then create the instance.

    Runs entirely under the instance lock. A missing network, a dead
    network, or a libvirt configuration/XML error queues an instance error
    and returns early. Otherwise every interface of the instance is marked
    'created'.
    """
    log = LOG.withField('instance', instance_uuid)

    instance_lock = db.get_lock('instance',
                                None,
                                instance_uuid,
                                ttl=900,
                                timeout=120,
                                op='Instance start')
    with instance_lock as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks, loading each distinct one only once
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] in nets:
                continue

            n = net.from_db(netdesc['network_uuid'])
            if not n:
                db.enqueue_instance_error(instance_uuid, 'missing network')
                return
            nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for n in nets.values():
                try:
                    n.create()
                    n.ensure_mesh()
                    n.update_dhcp()
                except exceptions.DeadNetwork as e:
                    log.withField(
                        'network',
                        n).warning('Instance tried to use dead network')
                    db.enqueue_instance_error(
                        instance_uuid, 'tried to use dead network: %s' % e)
                    return

        # Allocate console and VDI ports
        instance.allocate_instance_ports()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            # NOTE(review): libvirt errors with any other code are swallowed
            # here and execution carries on to mark the interfaces created.
            # Confirm that is intended.
            if e.get_error_code() in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                                      libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_error(instance_uuid,
                                          'instance failed to start: %s' % e)
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
Exemple #7
0
    def wrapper(*args, **kwargs):
        # Resolve the instance from its uuid when the caller supplied one,
        # and pass it on to the wrapped function as a keyword argument.
        if 'instance_uuid' in kwargs:
            instance_uuid = kwargs['instance_uuid']
            kwargs['instance_from_db_virt'] = virt.from_db(instance_uuid)

        # Only call through when an instance is actually available.
        if kwargs.get('instance_from_db_virt'):
            return func(*args, **kwargs)

        return error(404, 'instance not found')
Exemple #8
0
    def wrapper(*args, **kwargs):
        # Resolve the instance from its uuid when the caller supplied one,
        # and pass it on to the wrapped function as a keyword argument.
        if 'instance_uuid' in kwargs:
            instance_uuid = kwargs['instance_uuid']
            kwargs['instance_from_db_virt'] = virt.from_db(instance_uuid)

        # Only call through when an instance is actually available.
        if kwargs.get('instance_from_db_virt'):
            return func(*args, **kwargs)

        LOG.info('instance(%s): instance not found, genuinely missing' %
                 kwargs.get('instance_uuid'))
        return error(404, 'instance not found')
Exemple #9
0
def image_fetch(url, instance_uuid):
    """Fetch the image at ``url``, optionally on behalf of an instance.

    A LockException raised while loading the instance or fetching the image
    is deliberately ignored; any other exception propagates.
    """
    try:
        instance = virt.from_db(instance_uuid) if instance_uuid else None
        images.Image(url).get([], instance)
    except exceptions.LockException:
        # NOTE(review): best-effort fetch; a lock failure is dropped without
        # logging. Confirm this is intentional.
        pass
Exemple #10
0
    def wrapper(*args, **kwargs):
        # Resolve the instance from its uuid when the caller supplied one,
        # and pass it on to the wrapped function as a keyword argument.
        if 'instance_uuid' in kwargs:
            kwargs['instance_from_db_virt'] = virt.from_db(
                kwargs['instance_uuid'])

        if not kwargs.get('instance_from_db_virt'):
            # The uuid may be absent altogether; indexing kwargs directly
            # raised KeyError here, so only log when there is a uuid.
            if 'instance_uuid' in kwargs:
                logutil.info([virt.ThinInstance(kwargs['instance_uuid'])],
                             'Instance not found, genuinely missing')
            return error(404, 'instance not found')

        return func(*args, **kwargs)
Exemple #11
0
def restore_instances():
    """Recreate this node's networks and running instances.

    Networks used by any local instance are restored first. Each instance is
    then recreated under its own lock, but only if it exists and its recorded
    power state is one of the "started" states. Failures are passed to
    util.ignore_exception; a failed instance is also queued for error.
    """
    node_name = config.parsed.get('NODE_NAME')

    # Ensure all instances for this node are defined
    network_uuids = []
    instance_uuids = []
    for inst in list(db.get_instances(only_node=node_name)):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if iface['network_uuid'] not in network_uuids:
                network_uuids.append(iface['network_uuid'])
        instance_uuids.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network_uuid in network_uuids:
            try:
                n = net.from_db(network_uuid)
                LOG.withObj(n).info('Restoring network')
                n.create()
                n.ensure_mesh()
                n.update_dhcp()
            except Exception as e:
                util.ignore_exception('restore network %s' % network_uuid, e)

    # Power states for which the instance should be brought back up.
    started = ('on', 'transition-to-on', 'initial', 'unknown')

    with util.RecordedOperation('restore instances', None):
        for instance_uuid in instance_uuids:
            try:
                with db.get_lock('instance',
                                 None,
                                 instance_uuid,
                                 ttl=120,
                                 timeout=120,
                                 op='Instance restore'):
                    i = virt.from_db(instance_uuid)
                    if not i:
                        continue
                    if i.db_entry.get('power_state', 'unknown') not in started:
                        continue

                    LOG.withObj(i).info('Restoring instance')
                    i.create()
            except Exception as e:
                util.ignore_exception(
                    'restore instance %s' % instance_uuid, e)
                db.enqueue_instance_error(
                    instance_uuid,
                    'exception while restoring instance on daemon restart')
Exemple #12
0
def instance_start(instance_uuid, network):
    """Bring up an instance's networks and then create the instance.

    Runs under the instance lock. A missing network or a libvirt
    configuration/XML error queues the instance for deletion in the 'error'
    state and returns early. Otherwise every interface of the instance is
    marked 'created'.
    """
    with db.get_lock('instance', None, instance_uuid, ttl=900) as lock:
        instance = virt.from_db(instance_uuid)

        # Collect the networks, loading each distinct one only once
        nets = {}
        for netdesc in network:
            if netdesc['network_uuid'] in nets:
                continue

            n = net.from_db(netdesc['network_uuid'])
            if not n:
                db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                           instance_uuid, 'error',
                                           'missing network')
                return
            nets[netdesc['network_uuid']] = n

        # Create the networks
        with util.RecordedOperation('ensure networks exist', instance):
            for n in nets.values():
                n.create()
                n.ensure_mesh()
                n.update_dhcp()

        # Now we can start the instance
        libvirt = util.get_libvirt()
        try:
            with util.RecordedOperation('instance creation', instance):
                instance.create(lock=lock)

        except libvirt.libvirtError as e:
            # NOTE(review): libvirt errors with any other code are swallowed
            # here and execution carries on to mark the interfaces created.
            # Confirm that is intended.
            if e.get_error_code() in (libvirt.VIR_ERR_CONFIG_UNSUPPORTED,
                                      libvirt.VIR_ERR_XML_ERROR):
                db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                           instance_uuid, 'error',
                                           'instance failed to start')
                return

        for iface in db.get_instance_interfaces(instance_uuid):
            db.update_network_interface_state(iface['uuid'], 'created')
Exemple #13
0
    def get(self, interface_uuid=None):
        """Return a network interface after namespace ownership checks.

        The caller must own (or be 'system' for) both the interface's
        network and its instance. Any failed lookup or check yields a 404
        error() response; otherwise the interface record is returned.
        """
        ni = db.get_interface(interface_uuid)
        if not ni:
            return error(404, 'interface not found')

        n = net.from_db(ni['network_uuid'])
        if not n:
            LOG.info('network(%s): network not found, genuinely missing' %
                     ni['network_uuid'])
            return error(404, 'interface network not found')

        if get_jwt_identity() not in [n.namespace, 'system']:
            LOG.info('%s: interface not found, ownership test' % n)
            return error(404, 'interface not found')

        i = virt.from_db(ni['instance_uuid'])
        if not i:
            # An unknown instance gives a falsy result; answer 404 instead of
            # failing with AttributeError in the ownership test below.
            LOG.info('instance(%s): instance not found, genuinely missing' %
                     ni['instance_uuid'])
            return error(404, 'interface not found')

        if get_jwt_identity() not in [i.db_entry['namespace'], 'system']:
            LOG.info('%s: instance not found, ownership test' % i)
            return error(404, 'interface not found')

        return ni
Exemple #14
0
def instance_delete(instance_uuid):
    """Delete an instance and tidy up the networks it was using.

    Under the instance lock: records the delete event, deletes the instance
    if it can be loaded, then for each network it used either refreshes DHCP
    (the network is still used by another instance on this node) or deletes
    the network (no other local instance uses it).
    """
    with db.get_lock('instance',
                     None,
                     instance_uuid,
                     timeout=120,
                     op='Instance delete'):
        db.add_event('instance', instance_uuid, 'queued', 'delete', None, None)

        # Networks used by the instance being deleted
        instance_networks = []
        for iface in list(db.get_instance_interfaces(instance_uuid)):
            if iface['network_uuid'] not in instance_networks:
                instance_networks.append(iface['network_uuid'])

        # Networks used by every other instance on this node
        host_networks = []
        for inst in list(
                db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
            if inst['uuid'] == instance_uuid:
                continue
            for iface in db.get_instance_interfaces(inst['uuid']):
                if iface['network_uuid'] not in host_networks:
                    host_networks.append(iface['network_uuid'])

        instance_from_db_virt = virt.from_db(instance_uuid)
        if instance_from_db_virt:
            instance_from_db_virt.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.from_db(network)
            if not n:
                continue

            if network in host_networks:
                # Still in use by another instance, so only update
                with util.RecordedOperation('deallocate ip address',
                                            instance_from_db_virt):
                    n.update_dhcp()
            else:
                # Not used by any other instance, therefore delete
                with util.RecordedOperation('remove network', n):
                    n.delete()
Exemple #15
0
def restore_instances():
    """Recreate the networks and instances that belong to this node.

    Gathers local instances and the distinct networks their interfaces use,
    restores every network, then recreates every instance. Nothing is caught
    here, so any exception propagates to the caller.
    """
    # Ensure all instances for this node are defined
    network_uuids = []
    instance_uuids = []
    for inst in list(db.get_instances(local_only=True)):
        for iface in db.get_instance_interfaces(inst['uuid']):
            if iface['network_uuid'] not in network_uuids:
                network_uuids.append(iface['network_uuid'])
        instance_uuids.append(inst['uuid'])

    with util.RecordedOperation('restore networks', None):
        for network_uuid in network_uuids:
            LOG.info('Restoring network %s' % network_uuid)
            n = net.from_db(network_uuid)
            n.create()
            n.ensure_mesh()
            n.update_dhcp()

    with util.RecordedOperation('restore instances', None):
        for instance_uuid in instance_uuids:
            LOG.info('Restoring instance %s' % instance_uuid)
            virt.from_db(instance_uuid).create()
Exemple #16
0
def handle(jobname, workitem):
    """Run every task of a dequeued work item, then resolve the job.

    Supported task types are 'image_fetch', 'instance_preflight',
    'instance_start' and 'instance_delete'. A preflight that redirects the
    instance re-queues the whole work item on the target node and stops.

    Any exception escaping a task is logged; when an instance uuid is known
    the instance is also queued for deletion in the 'error' state. The job is
    always resolved in the ``finally`` clause.
    """
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    try:
        for task in workitem.get('tasks', []):
            # Objects attached to log lines for this task
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if task.get('type').startswith('instance_') and not instance_uuid:
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '), 'dequeued',
                             None, 'Work item %s' % jobname)

            logutil.info(
                ro,
                'Executing task %s: %s' % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    util.log('info', ro,
                             'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task.get('type') == 'instance_delete':
                # Deletion is best effort: a failure is logged and the
                # remaining tasks still run.
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid,
                                             task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                               {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        # Always log the failure. It used to be logged only when an instance
        # uuid was set, so errors in instance-less tasks vanished silently.
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
Exemple #17
0
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None):
        """Create an instance, proxying to another node if placed elsewhere.

        Steps, in order: sanitise the name, resolve and authorise the
        namespace, load or define the instance, pick a node with the
        scheduler (or validate the requested ``placed_on``), forward the
        request when the chosen node is not this one, allocate an address and
        create an interface per requested network, then create the instance
        under its lock.

        Returns an error() response on failure, a flask.Response carrying the
        remote node's reply when proxied, or db.get_instance(instance_uuid)
        on success.
        """
        global SCHEDULER

        # We need to sanitise the name so its safe for DNS
        # NOTE(review): name defaults to None, and re.sub() raises TypeError
        # on None -- confirm callers always supply a name.
        name = re.sub(r'([^a-zA-Z0-9_\-])', '', name)

        if not namespace:
            namespace = get_jwt_identity()

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object
        instance = virt.from_db(instance_uuid)
        if instance:
            # An existing instance may only be touched by its owner or system
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                LOG.info('instance(%s): instance not found, ownership test' %
                         instance_uuid)
                return error(404, 'instance not found')

        if not instance:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace)

        # The scheduler is created lazily and kept in a module-level global
        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        # Have we been placed?
        if not placed_on:
            candidates = SCHEDULER.place_instance(instance, network)
            if len(candidates) == 0:
                db.add_event('instance', instance_uuid, 'schedule', 'failed',
                             None, 'insufficient resources')
                db.update_instance_state(instance_uuid, 'error')
                return error(507, 'insufficient capacity')

            placed_on = candidates[0]
            db.place_instance(instance_uuid, placed_on)
            db.add_event('instance', instance_uuid, 'placement', None, None,
                         placed_on)

        else:
            # A node was requested: check that it can take the instance
            try:
                candidates = SCHEDULER.place_instance(instance,
                                                      network,
                                                      candidates=[placed_on])
                if len(candidates) == 0:
                    db.add_event('instance', instance_uuid, 'schedule',
                                 'failed', None, 'insufficient resources')
                    db.update_instance_state(instance_uuid, 'error')
                    return error(507, 'insufficient capacity')
            except scheduler.CandidateNodeNotFoundException as e:
                return error(404, 'node not found: %s' % e)

        # Have we been placed on a different node?
        if not placed_on == config.parsed.get('NODE_NAME'):
            # Re-send the original request body to the chosen node, pinning
            # the placement, uuid and namespace decided here.
            body = flask_get_post_body()
            body['placed_on'] = placed_on
            body['instance_uuid'] = instance_uuid
            body['namespace'] = namespace

            token = util.get_api_token(
                'http://%s:%d' % (placed_on, config.parsed.get('API_PORT')),
                namespace=namespace)
            r = requests.request('POST',
                                 'http://%s:%d/instances' %
                                 (placed_on, config.parsed.get('API_PORT')),
                                 data=json.dumps(body),
                                 headers={
                                     'Authorization': token,
                                     'User-Agent': util.get_user_agent()
                                 })

            LOG.info('Returning proxied request: %d, %s' %
                     (r.status_code, r.text))
            resp = flask.Response(r.text, mimetype='application/json')
            resp.status_code = r.status_code
            return resp

        # Check we can get the required IPs
        nets = {}
        # Maps network uuid -> list of (address, order) tuples allocated so far
        allocations = {}

        def error_with_cleanup(status_code, message):
            # Release every address recorded in allocations, then build the
            # error response.
            # NOTE(review): this is also called while the current network's
            # ipmanager lock is held (address-in-use path below); if that
            # network already has allocations the same lock key is requested
            # again here -- confirm the lock tolerates that.
            for network_uuid in allocations:
                n = net.from_db(network_uuid)
                for addr, _ in allocations[network_uuid]:
                    with db.get_lock('sf/ipmanager/%s' % n.uuid, ttl=120) as _:
                        ipm = db.get_ipmanager(n.uuid)
                        ipm.release(addr)
                        db.persist_ipmanager(n.uuid, ipm.save())
            return error(status_code, message)

        # order is the interface's position in the request
        order = 0
        if network:
            for netdesc in network:
                if 'network_uuid' not in netdesc or not netdesc['network_uuid']:
                    return error_with_cleanup(404, 'network not specified')

                # Load and create each distinct network once
                if netdesc['network_uuid'] not in nets:
                    n = net.from_db(netdesc['network_uuid'])
                    if not n:
                        return error_with_cleanup(
                            404,
                            'network %s not found' % netdesc['network_uuid'])
                    nets[netdesc['network_uuid']] = n
                    n.create()

                with db.get_lock('sf/ipmanager/%s' % netdesc['network_uuid'],
                                 ttl=120) as _:
                    db.add_event('network', netdesc['network_uuid'],
                                 'allocate address', None, None, instance_uuid)
                    allocations.setdefault(netdesc['network_uuid'], [])
                    ipm = db.get_ipmanager(netdesc['network_uuid'])
                    # Use the requested address if there is one, otherwise
                    # ask the ipmanager for a random free address
                    if 'address' not in netdesc or not netdesc['address']:
                        netdesc['address'] = ipm.get_random_free_address()
                    else:
                        if not ipm.reserve(netdesc['address']):
                            return error_with_cleanup(
                                409, 'address %s in use' % netdesc['address'])
                    db.persist_ipmanager(netdesc['network_uuid'], ipm.save())
                    allocations[netdesc['network_uuid']].append(
                        (netdesc['address'], order))

                # Default the NIC model when none was requested
                if 'model' not in netdesc or not netdesc['model']:
                    netdesc['model'] = 'virtio'

                db.create_network_interface(str(uuid.uuid4()), netdesc,
                                            instance_uuid, order)

                order += 1

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Now we can start the instance
        with db.get_lock('sf/instance/%s' % instance.db_entry['uuid'],
                         ttl=900) as lock:
            with util.RecordedOperation('ensure networks exist',
                                        instance) as _:
                for network_uuid in nets:
                    n = nets[network_uuid]
                    n.ensure_mesh()
                    n.update_dhcp()

            with util.RecordedOperation('instance creation', instance) as _:
                instance.create(lock=lock)

            for iface in db.get_instance_interfaces(instance.db_entry['uuid']):
                db.update_network_interface_state(iface['uuid'], 'created')

            return db.get_instance(instance_uuid)
    def _update_power_states(self):
        """Reconcile libvirt's view of 'sf:' domains with the database.

        Active domains: unknown instances are destroyed, instances marked
        deleted for at least five minutes are deleted again, and all others
        have their power state recorded (a 'crashed' domain moves the
        instance to 'error').

        Inactive (defined but not running) domains not seen above: unknown or
        stale deleted instances are undefined; all others are recorded as
        powered off, or put into 'error' when their on-disk directory is
        missing.

        A libvirtError raised inside the try block is logged and swallowed.
        """
        libvirt = util.get_libvirt()
        # NOTE(review): the connection is opened outside the try block and is
        # not explicitly closed in this method -- confirm that is acceptable.
        conn = libvirt.open(None)
        try:
            # Names of active domains whose instance exists in the database
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                # Domain names have the form 'sf:<instance uuid>'
                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                db.place_instance(instance_uuid, config.NODE_NAME)
                seen.append(domain.name())

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    db.instance_enforced_deletes_increment(instance_uuid)
                    # NOTE(review): attempts is read from the record fetched
                    # before the increment above, so it may lag the stored
                    # counter by one -- confirm.
                    attempts = instance.get('enforced_deletes', 0)

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        db.add_event('instance', instance_uuid,
                                     'enforced delete', 'complete', None, None)
                    else:
                        # NOTE(review): the from_db() result is not checked
                        # for None before use -- confirm it cannot be missing
                        # at this point.
                        i = virt.from_db(instance_uuid)
                        i.delete()
                        i.update_instance_state('deleted')

                    log_ctx.withField(
                        'attempt', attempts).warning('Deleting stray instance')

                    continue

                state = util.extract_power_state(libvirt, domain)
                db.update_instance_power_state(instance_uuid, state)
                if state == 'crashed':
                    db.update_instance_state(instance_uuid, 'error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.withInstance(instance_uuid)
                    instance = db.get_instance(instance_uuid)

                    if not instance:
                        # Instance is SF but not in database. Kill because unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        continue

                    if instance.get('state') == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance['state_updated'] < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        log_ctx.info('Detected stray instance')
                        db.add_event('instance', instance_uuid,
                                     'deleted stray', 'complete', None, None)
                        continue

                    db.place_instance(instance_uuid, config.NODE_NAME)
                    instance_path = os.path.join(config.get('STORAGE_PATH'),
                                                 'instances', instance_uuid)

                    if not os.path.exists(instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        db.update_instance_state(instance_uuid, 'error')

                    elif instance.get('power_state') != 'off':
                        # Only record the transition when the stored power
                        # state is not already 'off'
                        log_ctx.info('Detected power off for instance')
                        db.update_instance_power_state(instance_uuid, 'off')
                        db.add_event('instance', instance_uuid,
                                     'detected poweroff', 'complete', None,
                                     None)

        except libvirt.libvirtError as e:
            LOG.error('Failed to lookup all domains: %s' % e)
Exemple #19
0
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None,
             video=None):
        """Validate an instance creation request, place it and enqueue its start.

        The request is validated, the instance is looked up (or defined if it
        does not exist yet), an address is allocated and an interface created
        for every requested network, a node is chosen via the scheduler, and
        the preflight / image fetch / start tasks are enqueued on that node.
        The call then polls the instance state for up to API_ASYNC_WAIT
        seconds.

        Args:
            name: instance name; must be a non-empty string of at most 63
                characters drawn from letters, digits, '_' and '-'.
            cpus: vCPU count, passed to virt.from_definition() as vcpus.
            memory: memory size, passed to virt.from_definition() as
                memory_mb.
            network: optional list of dicts, each with a 'network_uuid' key
                and optional 'address' and 'model' keys. Entries are updated
                in place with the allocated address and default model.
            disk: non-empty list of dicts describing the disks.
            ssh_key: passed through to virt.from_definition().
            user_data: passed through to virt.from_definition().
            placed_on: optional node name to force placement on.
            namespace: owning namespace; defaults to the caller's identity.
            instance_uuid: optional uuid, only accepted from 'system'.
            video: optional video device dict; defaults to a cirrus device
                with 16384 of memory.

        Returns:
            The result of error(...) when validation, authorisation, address
            allocation or scheduling fails, otherwise the last instance
            record read from the database.
        """
        global SCHEDULER

        # Check that the instance name is safe for use as a DNS host name.
        # A missing, empty or non-string name is rejected here as well,
        # instead of raising a TypeError out of re.sub().
        if (not isinstance(name, str) or not name or len(name) > 63
                or name != re.sub(r'([^a-zA-Z0-9_\-])', '', name)):
            return error(400,
                         'instance name must be useable as a DNS host name')

        # Sanity check
        if not disk:
            return error(400, 'instance must specify at least one disk')
        for d in disk:
            if not isinstance(d, dict):
                return error(400,
                             'disk specification should contain JSON objects')

        if network:
            for n in network:
                if not isinstance(n, dict):
                    return error(
                        400,
                        'network specification should contain JSON objects')

                if 'network_uuid' not in n:
                    return error(
                        400, 'network specification is missing network_uuid')

        if not video:
            video = {'model': 'cirrus', 'memory': 16384}

        if not namespace:
            namespace = get_jwt_identity()

        # Only system can specify a uuid
        if instance_uuid and get_jwt_identity() != 'system':
            return error(401, 'only system can specify an instance uuid')

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are
        # created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object, or reuse the existing one if the caller is
        # allowed to see it.
        instance = virt.from_db(instance_uuid)
        if instance:
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                logutil.info([virt.ThinInstance(instance_uuid)],
                             'Instance not found, ownership test')
                return error(404, 'instance not found')
        else:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace,
                                            video=video,
                                            requested_placement=placed_on)

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Allocate IP addresses. Each interface records its position in the
        # request so that interface ordering is preserved.
        for order, netdesc in enumerate(network or []):
            n = net.from_db(netdesc['network_uuid'])
            if not n:
                db.enqueue_instance_delete(
                    config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                    'missing network %s during IP allocation phase' %
                    netdesc['network_uuid'])
                return error(
                    404, 'network %s not found' % netdesc['network_uuid'])

            with db.get_lock('ipmanager',
                             None,
                             netdesc['network_uuid'],
                             ttl=120):
                db.add_event('network', netdesc['network_uuid'],
                             'allocate address', None, None, instance_uuid)
                ipm = db.get_ipmanager(netdesc['network_uuid'])
                if 'address' not in netdesc or not netdesc['address']:
                    netdesc['address'] = ipm.get_random_free_address()
                elif not ipm.reserve(netdesc['address']):
                    db.enqueue_instance_delete(
                        config.parsed.get('NODE_NAME'), instance_uuid,
                        'error',
                        'failed to reserve an IP on network %s' %
                        netdesc['network_uuid'])
                    return error(
                        409, 'address %s in use' % netdesc['address'])

                db.persist_ipmanager(netdesc['network_uuid'], ipm.save())

            if 'model' not in netdesc or not netdesc['model']:
                netdesc['model'] = 'virtio'

            db.create_network_interface(str(uuid.uuid4()), netdesc,
                                        instance_uuid, order)

        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        try:
            # Have we been placed?
            if not placed_on:
                candidates = SCHEDULER.place_instance(instance, network)
                placement = candidates[0]

            else:
                # The scheduler is still consulted so that it can reject the
                # requested node by raising one of the exceptions below.
                SCHEDULER.place_instance(instance,
                                         network,
                                         candidates=[placed_on])
                placement = placed_on

        except exceptions.LowResourceException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'insufficient resources: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(507, str(e))

        except exceptions.CandidateNodeNotFoundException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'candidate node not found: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(404, 'node not found: %s' % e)

        # Record placement
        db.place_instance(instance_uuid, placement)
        db.add_event('instance', instance_uuid, 'placement', None, None,
                     placement)

        # Create a queue entry for the instance start: preflight first, then
        # one image fetch per disk which has a base image, then the start.
        tasks = [{
            'type': 'instance_preflight',
            'instance_uuid': instance_uuid,
            'network': network
        }]
        for device in instance.db_entry['block_devices']['devices']:
            if device.get('base'):
                tasks.append({
                    'type': 'image_fetch',
                    'instance_uuid': instance_uuid,
                    'url': device['base']
                })
        tasks.append({
            'type': 'instance_start',
            'instance_uuid': instance_uuid,
            'network': network
        })

        # Enqueue creation tasks on desired node task queue
        db.enqueue(placement, {'tasks': tasks})
        db.add_event('instance', instance_uuid, 'create', 'enqueued', None,
                     None)

        # Watch for a while and return results if things are fast, give up
        # after a while and just return the current state. The record is read
        # once before waiting so there is always something to return, even
        # when API_ASYNC_WAIT is zero.
        start_time = time.time()
        max_wait = config.parsed.get('API_ASYNC_WAIT')
        i = db.get_instance(instance_uuid)
        while (i['state'] not in ['created', 'deleted', 'error']
               and time.time() - start_time < max_wait):
            time.sleep(0.5)
            i = db.get_instance(instance_uuid)
        return i