Example #1
0
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not isinstance(task, QueueTask):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if isinstance(task, (InstanceTask, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued', None,
                             'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')

                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by an external issue, not an application error
        log.info('Fetch image error: %s' % e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
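
The isinstance() dispatch above only works if every decoded work item belongs
to a shared task hierarchy. A minimal sketch of what that hierarchy could look
like, assuming only the class and accessor names visible in the example
(QueueTask, InstanceTask, FetchImageTask, name(), pretty_task_name(),
instance_uuid(), url()); the bodies are illustrative, not the project's actual
implementation:

# Minimal sketch of the task hierarchy the dispatch above relies on.
# Class and method names come from the example; bodies are illustrative.
class QueueTask:
    _name = 'queue_task'

    def name(self):
        return self._name

    def pretty_task_name(self):
        return self._name.replace('_', ' ')


class InstanceTask(QueueTask):
    def __init__(self, instance_uuid):
        self._instance_uuid = instance_uuid

    def instance_uuid(self):
        return self._instance_uuid


class FetchImageTask(QueueTask):
    # Not an InstanceTask, but still carries an optional instance uuid,
    # which is why the caller checks for it separately.
    _name = 'image_fetch'

    def __init__(self, url, instance_uuid=None):
        self._url = url
        self._instance_uuid = instance_uuid

    def instance_uuid(self):
        return self._instance_uuid

    def url(self):
        return self._url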
Example #2
0
    def _update_power_states(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                db.place_instance(instance_uuid, config.NODE_NAME)
                seen.append(domain.name())

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    db.instance_enforced_deletes_increment(instance_uuid)
                    attempts = instance.get('enforced_deletes', 0)

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the
                        # big hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        db.add_event('instance', instance_uuid,
                                     'enforced delete', 'complete', None, None)
                    else:
                        i = virt.from_db(instance_uuid)
                        i.delete()
                        i.update_instance_state('deleted')

                    log_ctx.withField(
                        'attempt', attempts).warning('Deleting stray instance')

                    continue

                state = util.extract_power_state(libvirt, domain)
                db.update_instance_power_state(instance_uuid, state)
                if state == 'crashed':
                    db.update_instance_state(instance_uuid, 'error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.withInstance(instance_uuid)
                    instance = db.get_instance(instance_uuid)

                    if not instance:
                        # Instance is SF but not in database. Kill because unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        continue

                    if instance.get('state') == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance['state_updated'] < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        log_ctx.info('Detected stray instance')
                        db.add_event('instance', instance_uuid,
                                     'deleted stray', 'complete', None, None)
                        continue

                    db.place_instance(instance_uuid, config.NODE_NAME)
                    instance_path = os.path.join(config.get('STORAGE_PATH'),
                                                 'instances', instance_uuid)

                    if not os.path.exists(instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        db.update_instance_state(instance_uuid, 'error')

                    elif instance.get('power_state') != 'off':
                        log_ctx.info('Detected power off for instance')
                        db.update_instance_power_state(instance_uuid, 'off')
                        db.add_event('instance', instance_uuid,
                                     'detected poweroff', 'complete', None,
                                     None)

        except libvirt.libvirtError as e:
            LOG.error('Failed to lookup all domains: %s' % e)

        finally:
            # Always release the libvirt connection opened above
            conn.close()
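
The loop above leans on util.extract_power_state(), which isn't shown. A
plausible sketch using libvirt's domain state constants; the exact mapping of
libvirt states to strings is an assumption inferred from how the caller treats
the result ('crashed' pushes the instance to the error state, 'off' marks it
powered off):

# A plausible sketch of util.extract_power_state(). The state-to-string
# mapping is an assumption inferred from the caller, not the real helper.
def extract_power_state(libvirt, domain):
    state, _reason = domain.state()
    if state == libvirt.VIR_DOMAIN_CRASHED:
        return 'crashed'
    if state in (libvirt.VIR_DOMAIN_PAUSED, libvirt.VIR_DOMAIN_PMSUSPENDED):
        return 'paused'
    if state in (libvirt.VIR_DOMAIN_SHUTDOWN, libvirt.VIR_DOMAIN_SHUTOFF):
        return 'off'
    return 'on'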
Example #3
0
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None):
        global SCHEDULER

        # We need to sanitise the name so it's safe for DNS
        name = re.sub(r'([^a-zA-Z0-9_\-])', '', name)

        if not namespace:
            namespace = get_jwt_identity()

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object
        instance = virt.from_db(instance_uuid)
        if instance:
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                LOG.info('instance(%s): instance not found, ownership test' %
                         instance_uuid)
                return error(404, 'instance not found')

        if not instance:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace)

        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        # Have we been placed?
        if not placed_on:
            candidates = SCHEDULER.place_instance(instance, network)
            if len(candidates) == 0:
                db.add_event('instance', instance_uuid, 'schedule', 'failed',
                             None, 'insufficient resources')
                db.update_instance_state(instance_uuid, 'error')
                return error(507, 'insufficient capacity')

            placed_on = candidates[0]
            db.place_instance(instance_uuid, placed_on)
            db.add_event('instance', instance_uuid, 'placement', None, None,
                         placed_on)

        else:
            try:
                candidates = SCHEDULER.place_instance(instance,
                                                      network,
                                                      candidates=[placed_on])
                if len(candidates) == 0:
                    db.add_event('instance', instance_uuid, 'schedule',
                                 'failed', None, 'insufficient resources')
                    db.update_instance_state(instance_uuid, 'error')
                    return error(507, 'insufficient capacity')
            except scheduler.CandidateNodeNotFoundException as e:
                return error(404, 'node not found: %s' % e)

        # Have we been placed on a different node?
        if placed_on != config.parsed.get('NODE_NAME'):
            body = flask_get_post_body()
            body['placed_on'] = placed_on
            body['instance_uuid'] = instance_uuid
            body['namespace'] = namespace

            token = util.get_api_token(
                'http://%s:%d' % (placed_on, config.parsed.get('API_PORT')),
                namespace=namespace)
            r = requests.request('POST',
                                 'http://%s:%d/instances' %
                                 (placed_on, config.parsed.get('API_PORT')),
                                 data=json.dumps(body),
                                 headers={
                                     'Authorization': token,
                                     'User-Agent': util.get_user_agent()
                                 })

            LOG.info('Returning proxied request: %d, %s' %
                     (r.status_code, r.text))
            resp = flask.Response(r.text, mimetype='application/json')
            resp.status_code = r.status_code
            return resp

        # Check we can get the required IPs
        nets = {}
        allocations = {}

        def error_with_cleanup(status_code, message):
            for network_uuid in allocations:
                n = net.from_db(network_uuid)
                for addr, _ in allocations[network_uuid]:
                    with db.get_lock('sf/ipmanager/%s' % n.uuid, ttl=120) as _:
                        ipm = db.get_ipmanager(n.uuid)
                        ipm.release(addr)
                        db.persist_ipmanager(n.uuid, ipm.save())
            return error(status_code, message)

        order = 0
        if network:
            for netdesc in network:
                if 'network_uuid' not in netdesc or not netdesc['network_uuid']:
                    return error_with_cleanup(404, 'network not specified')

                if netdesc['network_uuid'] not in nets:
                    n = net.from_db(netdesc['network_uuid'])
                    if not n:
                        return error_with_cleanup(
                            404,
                            'network %s not found' % netdesc['network_uuid'])
                    nets[netdesc['network_uuid']] = n
                    n.create()

                with db.get_lock('sf/ipmanager/%s' % netdesc['network_uuid'],
                                 ttl=120) as _:
                    db.add_event('network', netdesc['network_uuid'],
                                 'allocate address', None, None, instance_uuid)
                    allocations.setdefault(netdesc['network_uuid'], [])
                    ipm = db.get_ipmanager(netdesc['network_uuid'])
                    if 'address' not in netdesc or not netdesc['address']:
                        netdesc['address'] = ipm.get_random_free_address()
                    else:
                        if not ipm.reserve(netdesc['address']):
                            return error_with_cleanup(
                                409, 'address %s in use' % netdesc['address'])
                    db.persist_ipmanager(netdesc['network_uuid'], ipm.save())
                    allocations[netdesc['network_uuid']].append(
                        (netdesc['address'], order))

                if 'model' not in netdesc or not netdesc['model']:
                    netdesc['model'] = 'virtio'

                db.create_network_interface(str(uuid.uuid4()), netdesc,
                                            instance_uuid, order)

                order += 1

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Now we can start the instance
        with db.get_lock('sf/instance/%s' % instance.db_entry['uuid'],
                         ttl=900) as lock:
            with util.RecordedOperation('ensure networks exist',
                                        instance) as _:
                for network_uuid in nets:
                    n = nets[network_uuid]
                    n.ensure_mesh()
                    n.update_dhcp()

            with util.RecordedOperation('instance creation', instance) as _:
                instance.create(lock=lock)

            for iface in db.get_instance_interfaces(instance.db_entry['uuid']):
                db.update_network_interface_state(iface['uuid'], 'created')

            return db.get_instance(instance_uuid)
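
The allocate-under-lock pattern above assumes an IPManager that can reserve
and release addresses and serialise itself for persistence. A minimal sketch
of such an object, assuming only the method names visible in the example
(reserve, release, get_random_free_address, save); the constructor and
internal representation are invented for illustration:

import ipaddress
import random


class IPManager:
    # Sketch only: method names come from the example, internals do not.
    def __init__(self, network_cidr, in_use=None):
        self.network = ipaddress.ip_network(network_cidr)
        self.in_use = set(in_use or [])

    def reserve(self, address):
        # Refuse addresses outside the network or already handed out
        if ipaddress.ip_address(address) not in self.network:
            return False
        if address in self.in_use:
            return False
        self.in_use.add(address)
        return True

    def release(self, address):
        self.in_use.discard(address)

    def get_random_free_address(self):
        free = [str(h) for h in self.network.hosts()
                if str(h) not in self.in_use]
        if not free:
            raise ValueError('no free addresses in %s' % self.network)
        address = random.choice(free)
        self.in_use.add(address)
        return address

    def save(self):
        # Serialisable form suitable for db.persist_ipmanager()
        return {'network': str(self.network), 'in_use': sorted(self.in_use)}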
Example #4
0
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None,
             video=None):
        global SCHEDULER

        # Check that the instance name is safe for use as a DNS host name
        if name != re.sub(r'([^a-zA-Z0-9_\-])', '', name) or len(name) > 63:
            return error(400,
                         'instance name must be usable as a DNS host name')

        # Sanity check
        if not disk:
            return error(400, 'instance must specify at least one disk')
        for d in disk:
            if not isinstance(d, dict):
                return error(400,
                             'disk specification should contain JSON objects')

        if network:
            for n in network:
                if not isinstance(n, dict):
                    return error(
                        400,
                        'network specification should contain JSON objects')

                if 'network_uuid' not in n:
                    return error(
                        400, 'network specification is missing network_uuid')

        if not video:
            video = {'model': 'cirrus', 'memory': 16384}

        if not namespace:
            namespace = get_jwt_identity()

        # Only system can specify a uuid
        if instance_uuid and get_jwt_identity() != 'system':
            return error(401, 'only system can specify an instance uuid')

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object
        instance = virt.from_db(instance_uuid)
        if instance:
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                logutil.info([virt.ThinInstance(instance_uuid)],
                             'Instance not found, ownership test')
                return error(404, 'instance not found')

        if not instance:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace,
                                            video=video,
                                            requested_placement=placed_on)

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Allocate IP addresses
        order = 0
        if network:
            for netdesc in network:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_delete(
                        config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                        'missing network %s during IP allocation phase' %
                        netdesc['network_uuid'])
                    return error(
                        404, 'network %s not found' % netdesc['network_uuid'])

                with db.get_lock('ipmanager',
                                 None,
                                 netdesc['network_uuid'],
                                 ttl=120):
                    db.add_event('network', netdesc['network_uuid'],
                                 'allocate address', None, None, instance_uuid)
                    ipm = db.get_ipmanager(netdesc['network_uuid'])
                    if 'address' not in netdesc or not netdesc['address']:
                        netdesc['address'] = ipm.get_random_free_address()
                    else:
                        if not ipm.reserve(netdesc['address']):
                            db.enqueue_instance_delete(
                                config.parsed.get('NODE_NAME'), instance_uuid,
                                'error',
                                'failed to reserve an IP on network %s' %
                                netdesc['network_uuid'])
                            return error(
                                409, 'address %s in use' % netdesc['address'])

                    db.persist_ipmanager(netdesc['network_uuid'], ipm.save())

                if 'model' not in netdesc or not netdesc['model']:
                    netdesc['model'] = 'virtio'

                db.create_network_interface(str(uuid.uuid4()), netdesc,
                                            instance_uuid, order)
                order += 1

        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        try:
            # Have we been placed?
            if not placed_on:
                candidates = SCHEDULER.place_instance(instance, network)
                placement = candidates[0]

            else:
                SCHEDULER.place_instance(instance,
                                         network,
                                         candidates=[placed_on])
                placement = placed_on

        except exceptions.LowResourceException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'insufficient resources: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(507, str(e))

        except exceptions.CandidateNodeNotFoundException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'candidate node not found: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(404, 'node not found: %s' % e)

        # Record placement
        db.place_instance(instance_uuid, placement)
        db.add_event('instance', instance_uuid, 'placement', None, None,
                     placement)

        # Create a queue entry for the instance start
        tasks = [{
            'type': 'instance_preflight',
            'instance_uuid': instance_uuid,
            'network': network
        }]
        for d in instance.db_entry['block_devices']['devices']:
            if 'base' in d and d['base']:
                tasks.append({
                    'type': 'image_fetch',
                    'instance_uuid': instance_uuid,
                    'url': d['base']
                })
        tasks.append({
            'type': 'instance_start',
            'instance_uuid': instance_uuid,
            'network': network
        })

        # Enqueue creation tasks on desired node task queue
        db.enqueue(placement, {'tasks': tasks})
        db.add_event('instance', instance_uuid, 'create', 'enqueued', None,
                     None)

        # Watch for a while and return results if things are fast; give up
        # after a while and just return the current state
        start_time = time.time()
        i = db.get_instance(instance_uuid)
        while time.time() - start_time < config.parsed.get('API_ASYNC_WAIT'):
            if i['state'] in ['created', 'deleted', 'error']:
                return i
            time.sleep(0.5)
            i = db.get_instance(instance_uuid)
        return i
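
The tasks list assembled above is the work item that the queue handler in the
next example walks through. A worked example of the enqueued payload for an
instance with one base-image disk and one network; every identifier and URL
below is made up for illustration:

# Illustrative work-item shape for db.enqueue(placement, {'tasks': tasks})
example_workitem = {
    'tasks': [
        {'type': 'instance_preflight',
         'instance_uuid': '00000000-0000-0000-0000-000000000001',
         'network': [{'network_uuid': '00000000-0000-0000-0000-0000000000aa',
                      'address': '10.0.0.5', 'model': 'virtio'}]},
        {'type': 'image_fetch',
         'instance_uuid': '00000000-0000-0000-0000-000000000001',
         'url': 'https://example.com/cirros.qcow2'},
        {'type': 'instance_start',
         'instance_uuid': '00000000-0000-0000-0000-000000000001',
         'network': [{'network_uuid': '00000000-0000-0000-0000-0000000000aa',
                      'address': '10.0.0.5', 'model': 'virtio'}]},
    ]
}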
Example #5
0
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if (task.get('type', '').startswith('instance_')
                    and not instance_uuid):
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type', '').replace('_', ' '),
                             'dequeued',
                             None, 'Work item %s' % jobname)

            logutil.info(
                ro,
                'Executing task %s: %s' % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            elif task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(
                        ro, 'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            elif task.get('type') == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid,
                                             task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                               {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
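
The repeated task.get('type') comparisons above could equally be written as a
dispatch table, which keeps handle() flat as task types accumulate. A sketch
of that alternative shape, reusing handler functions from the example; this is
a refactoring suggestion, not the project's code:

# Table-driven alternative to the if/elif chain in handle(). The handler
# bodies reuse calls from the example; registration of the remaining task
# types would follow the same pattern.
def _do_image_fetch(task, instance_uuid):
    image_fetch(task.get('url'), instance_uuid)


def _do_instance_start(task, instance_uuid):
    instance_start(instance_uuid, task.get('network'))
    db.update_instance_state(instance_uuid, 'created')


TASK_HANDLERS = {
    'image_fetch': _do_image_fetch,
    'instance_start': _do_instance_start,
    # 'instance_preflight' and 'instance_delete' would register here too
}


def dispatch(task, instance_uuid):
    handler = TASK_HANDLERS.get(task.get('type'))
    if not handler:
        raise ValueError('unknown task type: %s' % task.get('type'))
    handler(task, instance_uuid)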