Example #1
def instance_preflight(instance_uuid, network):
    db.update_instance_state(instance_uuid, 'preflight')

    s = scheduler.Scheduler()
    instance = virt.from_db(instance_uuid)

    try:
        s.place_instance(instance,
                         network,
                         candidates=[config.parsed.get('NODE_NAME')])
        return None

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'retry', None,
                     'insufficient resources: ' + str(e))

    if instance.db_entry.get('placement_attempts', 0) > 3:
        raise exceptions.AbortInstanceStartException('Too many start attempts')

    try:
        if instance.db_entry.get('requested_placement'):
            candidates = [instance.db_entry.get('requested_placement')]
        else:
            candidates = []
            for node in s.metrics.keys():
                if node != config.parsed.get('NODE_NAME'):
                    candidates.append(node)

        candidates = s.place_instance(instance, network, candidates=candidates)
        return candidates[0]

    except exceptions.LowResourceException as e:
        db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                     'insufficient resources: ' + str(e))
        # This raise implies delete above
        raise exceptions.AbortInstanceStartException(
            'Unable to find suitable node')
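
The flow above tries the local node first, falls back to every other known node, and aborts after too many attempts. A self-contained sketch of that shape, with illustrative names rather than Shaken Fist's real API:

class LowResourceException(Exception):
    pass

def place_with_fallback(local, others, fits, attempts=0, max_attempts=3):
    try:
        if not fits(local):
            raise LowResourceException(local)
        return None  # placed locally, no redirect needed
    except LowResourceException:
        if attempts > max_attempts:
            raise RuntimeError('Too many start attempts')
        candidates = [n for n in others if fits(n)]
        if not candidates:
            raise RuntimeError('Unable to find suitable node')
        return candidates[0]  # redirect the start to this node

free_ram = {'node1': 2, 'node2': 8, 'node3': 4}
print(place_with_fallback('node1', ['node2', 'node3'],
                          lambda n: free_ram[n] >= 4))  # prints: node2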
Example #2
def instance_delete(instance_uuid):
    with db.get_lock('instance', None, instance_uuid):
        db.add_event('instance', instance_uuid, 'queued', 'delete', None, None)

        # Create list of networks used by instance
        instance_networks = []
        for iface in list(db.get_instance_interfaces(instance_uuid)):
            if iface['network_uuid'] not in instance_networks:
                instance_networks.append(iface['network_uuid'])

        # Create list of networks used by all other instances
        host_networks = []
        for inst in list(
                db.get_instances(only_node=config.parsed.get('NODE_NAME'))):
            if inst['uuid'] != instance_uuid:
                for iface in db.get_instance_interfaces(inst['uuid']):
                    if iface['network_uuid'] not in host_networks:
                        host_networks.append(iface['network_uuid'])

        instance_from_db_virt = virt.from_db(instance_uuid)
        if instance_from_db_virt:
            instance_from_db_virt.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.from_db(network)
            if n:
                # If network used by another instance, only update
                if network in host_networks:
                    with util.RecordedOperation('deallocate ip address',
                                                instance_from_db_virt):
                        n.update_dhcp()
                else:
                    # Network not used by any other instance therefore delete
                    with util.RecordedOperation('remove network', n):
                        n.delete()
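
The decision per network is binary: still used by another instance on this host (refresh DHCP only) or orphaned (delete it). A minimal sketch of that split, using made-up network names:

def split_networks(instance_networks, host_networks):
    """Return (networks to delete, networks to merely refresh)."""
    delete = [n for n in instance_networks if n not in host_networks]
    refresh = [n for n in instance_networks if n in host_networks]
    return delete, refresh

print(split_networks(['net-a', 'net-b'], ['net-b', 'net-c']))
# prints: (['net-a'], ['net-b'])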
Example #3
    def _get(self, locks, related_object):
        """Fetch image if not downloaded and return image path."""
        actual_image = self.version_image_path()

        with util.RecordedOperation('fetch image', related_object):
            resp = self._open_connection()

            diff_field = self._new_image_available(resp)
            if diff_field:
                self.log.withField(
                    'diff_field',
                    diff_field).info('Fetch required due to HTTP field change')
                if related_object:
                    t, u = related_object.unique_label()
                    msg = '%s: %s -> %s' % diff_field
                    db.add_event(t, u, 'image requires fetch', None, None, msg)

                actual_image = self._fetch(resp, locks)

                # Ensure checksum is correct
                if not self.correct_checksum(actual_image):
                    if isinstance(related_object, virt.Instance):
                        related_object.add_event('fetch image', 'bad checksum')
                    raise exceptions.BadCheckSum('url=%s' % self.url)

                # Only persist values after the file has been verified.
                # Otherwise diff_field will not trigger a new download in the
                # case of a checksum verification failure.
                self.fetched = email.utils.formatdate()
                self.modified = resp.headers.get('Last-Modified')
                self.size = resp.headers.get('Content-Length')
                self.persist()

        _transcode(locks, actual_image, related_object)

        return actual_image
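
Note the ordering: fetch metadata is persisted only after correct_checksum() passes, so a failed verification still triggers a re-download next time. A standalone SHA-256 helper in the same spirit (the project's actual checksum scheme may differ):

import hashlib

def checksum_ok(path, expected_sha256):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest() == expected_sha256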
Example #4
    def _process_network_node_workitems(self):
        jobname, workitem = db.dequeue('networknode')
        try:
            if not workitem:
                time.sleep(0.2)
                return

            log_ctx = LOG.withField('workitem', workitem)
            if not NetworkTask.__subclasscheck__(type(workitem)):
                raise exceptions.UnknownTaskException(
                    'Network workitem was not decoded: %s' % workitem)

            n = net.from_db(workitem.network_uuid())
            if not n:
                log_ctx.withNetwork(workitem.network_uuid()).warning(
                    'Received work item for non-existent network')
                return

            # NOTE(mikal): there's really nothing stopping us from processing
            # a bunch of these jobs in parallel with a pool of workers, but I
            # am not sure it's worth the complexity right now. Are we really
            # going to be changing networks that much?
            if isinstance(workitem, DeployNetworkTask):
                try:
                    n.create()
                    n.ensure_mesh()
                    db.add_event('network', workitem.network_uuid(),
                                 'network node', 'deploy', None, None)
                except exceptions.DeadNetwork as e:
                    log_ctx.withField('exception', e).warning(
                        'DeployNetworkTask on dead network')

            elif isinstance(workitem, UpdateDHCPNetworkTask):
                try:
                    n.create()
                    n.ensure_mesh()
                    n.update_dhcp()
                    db.add_event('network', workitem.network_uuid(),
                                 'network node', 'update dhcp', None, None)
                except exceptions.DeadNetwork as e:
                    log_ctx.withField('exception', e).warning(
                        'UpdateDHCPNetworkTask on dead network')

            elif isinstance(workitem, RemoveDHCPNetworkTask):
                n.remove_dhcp()
                db.add_event('network', workitem.network_uuid(),
                             'network node', 'remove dhcp', None, None)

        finally:
            if jobname:
                db.resolve('networknode', jobname)
Example #5
    def __enter__(self):
        start_time = time.time()
        slow_warned = False
        threshold = int(config.parsed.get('SLOW_LOCK_THRESHOLD'))

        try:
            while time.time() - start_time < self.timeout:
                res = self.acquire()
                if res:
                    return self

                duration = time.time() - start_time
                if (duration > threshold and not slow_warned):
                    db.add_event(self.objecttype, self.objectname, 'lock',
                                 'acquire', None,
                                 'Waiting for lock more than threshold')

                    node, pid = self.get_holder()
                    logutil.info(
                        self.relatedobjects,
                        'Waiting for lock on %s: %.02f seconds, threshold '
                        '%d seconds. Holder is pid %s on %s.' %
                        (self.path, duration, threshold, pid, node))
                    slow_warned = True

                time.sleep(1)

            duration = time.time() - start_time
            db.add_event(
                self.objecttype, self.objectname, 'lock', 'failed', None,
                'Failed to acquire lock after %.02f seconds' % duration)

            node, pid = self.get_holder()
            logutil.info(
                self.relatedobjects,
                'Failed to acquire lock %s after %.02f seconds. Holder is pid %s on %s.'
                % (self.path, duration, pid, node))
            raise exceptions.LockException(
                'Cannot acquire lock %s, timed out after %.02f seconds' %
                (self.name, duration))

        finally:
            duration = time.time() - start_time
            if duration > threshold:
                db.add_event(self.objecttype, self.objectname, 'lock',
                             'acquired', None,
                             'Waited %d seconds for lock' % duration)
                logutil.info(
                    self.relatedobjects,
                    'Acquiring a lock on %s was slow: %.02f seconds' %
                    (self.path, duration))
Example #6
    def __enter__(self):
        start_time = time.time()
        slow_warned = False
        threshold = int(config.SLOW_LOCK_THRESHOLD)

        while time.time() - start_time < self.timeout:
            res = self.acquire()
            if res:
                duration = time.time() - start_time
                if duration > threshold:
                    db.add_event(self.objecttype, self.objectname, 'lock',
                                 'acquired', None,
                                 'Waited %d seconds for lock' % duration)
                    self.log_ctx.with_field(
                        'duration', duration).info('Acquiring a lock was slow')
                return self

            duration = time.time() - start_time
            if (duration > threshold and not slow_warned):
                db.add_event(self.objecttype, self.objectname, 'lock',
                             'acquire', None,
                             'Waiting for lock more than threshold')

                node, pid = self.get_holder()
                self.log_ctx.with_fields({
                    'duration': duration,
                    'threshold': threshold,
                    'holder-pid': pid,
                    'holder-node': node,
                    'requesting-op': self.operation,
                }).info('Waiting for lock')
                slow_warned = True

            time.sleep(1)

        duration = time.time() - start_time
        db.add_event(self.objecttype, self.objectname, 'lock', 'failed', None,
                     'Failed to acquire lock after %.02f seconds' % duration)

        node, pid = self.get_holder()
        self.log_ctx.with_fields({
            'duration': duration,
            'holder-pid': pid,
            'holder-node': node,
            'requesting-op': self.operation,
        }).info('Failed to acquire lock')

        raise exceptions.LockException(
            'Cannot acquire lock %s, timed out after %.02f seconds' %
            (self.name, self.timeout))
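
Both lock variants poll acquire() in a loop, warn once when the wait crosses a threshold, and raise on timeout; the second version also moves the "slow acquire" event onto the success path so it cannot fire on failure. A self-contained sketch of that structure, with prints standing in for db.add_event:

import threading
import time

class PollingLock:
    def __init__(self, timeout=10, threshold=2):
        self._lock = threading.Lock()
        self.timeout = timeout
        self.threshold = threshold

    def __enter__(self):
        start = time.time()
        slow_warned = False
        while time.time() - start < self.timeout:
            if self._lock.acquire(blocking=False):
                if time.time() - start > self.threshold:
                    print('acquiring the lock was slow')
                return self
            if time.time() - start > self.threshold and not slow_warned:
                print('still waiting for the lock')  # warn exactly once
                slow_warned = True
            time.sleep(0.1)
        raise TimeoutError('timed out after %d seconds' % self.timeout)

    def __exit__(self, *exc):
        self._lock.release()

with PollingLock() as lock:
    pass  # critical section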
Example #7
    def post(self,
             netblock=None,
             provide_dhcp=None,
             provide_nat=None,
             name=None,
             namespace=None):
        try:
            ipaddress.ip_network(netblock)
        except ValueError as e:
            return error(400, 'cannot parse netblock: %s' % e)

        if not namespace:
            namespace = get_jwt_identity()

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        network = db.allocate_network(netblock, provide_dhcp, provide_nat,
                                      name, namespace)
        db.add_event('network', network['uuid'], 'api', 'create', None, None)

        # Networks should immediately appear on the network node
        db.enqueue('networknode', {
            'type': 'deploy',
            'network_uuid': network['uuid']
        })

        db.add_event('network', network['uuid'], 'deploy', 'enqueued', None,
                     None)
        db.add_event('network', network['uuid'], 'api', 'created', None, None)
        db.update_network_state(network['uuid'], 'created')

        # Initialise metadata
        db.persist_metadata('network', network['uuid'], {})

        return network
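
The netblock check at the top leans on ipaddress.ip_network() raising ValueError for anything unparseable, for example:

import ipaddress

for netblock in ('192.168.1.0/24', '192.168.1.0/33'):
    try:
        ipaddress.ip_network(netblock)
        print(netblock, 'ok')
    except ValueError as e:
        print('cannot parse netblock: %s' % e)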
Example #8
    def place_instance(self, instance, network, candidates=None):
        with util.RecordedOperation('schedule', instance):
            log_ctx = LOG.withObj(instance)

            diff = time.time() - self.metrics_updated
            if diff > config.parsed.get('SCHEDULER_CACHE_TIMEOUT'):
                self.refresh_metrics()

            if candidates:
                log_ctx.info('Scheduling %s forced as candidates' % candidates)
                db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                             'Forced candidates', None, str(candidates))
                for node in candidates:
                    if node not in self.metrics:
                        raise exceptions.CandidateNodeNotFoundException(node)
            else:
                candidates = []
                for node in self.metrics.keys():
                    candidates.append(node)
            log_ctx.info('Scheduling %s start as candidates' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Initial candidates', None, str(candidates))
            if not candidates:
                raise exceptions.LowResourceException('No nodes with metrics')

            # Can we host that many vCPUs?
            for node in copy.copy(candidates):
                max_cpu = self.metrics[node].get('cpu_max_per_instance', 0)
                if instance.db_entry['cpus'] > max_cpu:
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough actual CPU' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough actual CPU', None, str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'Requested vCPUs exceeds vCPU limit')

            # Do we have enough idle CPU?
            for node in copy.copy(candidates):
                if not self._has_sufficient_cpu(instance.db_entry['cpus'],
                                                node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle CPU' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle CPU', None, str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough idle CPU')

            # Do we have enough idle RAM?
            for node in copy.copy(candidates):
                if not self._has_sufficient_ram(instance.db_entry['memory'],
                                                node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle RAM' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle RAM', None, str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough idle RAM')

            # Do we have enough idle disk?
            for node in copy.copy(candidates):
                if not self._has_sufficient_disk(instance, node):
                    candidates.remove(node)
            log_ctx.info('Scheduling %s have enough idle disk' % candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle disk', None, str(candidates))
            if not candidates:
                raise exceptions.LowResourceException(
                    'No nodes with enough disk space')

            # What nodes have the highest number of networks already present?
            if network:
                requested_networks = []
                for net in network:
                    network_uuid = net['network_uuid']
                    if network_uuid not in requested_networks:
                        requested_networks.append(network_uuid)

                candidates = self._find_most_matching_networks(
                    requested_networks, candidates)
                log_ctx.info('Scheduling %s have most matching networks' %
                             candidates)
                db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                             'Have most matching networks', None,
                             str(candidates))

            # What nodes have the base image already?
            requested_images = []
            for disk in instance.db_entry['block_devices']['devices']:
                if disk.get('base'):
                    requested_images.append(disk.get('base'))

            candidates = self._find_most_matching_images(
                requested_images, candidates)
            log_ctx.info('Scheduling %s have most matching images' %
                         candidates)
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have most matching images', None, str(candidates))

            # Avoid allocating to network node if possible
            net_node = db.get_network_node()
            if len(candidates) > 1 and net_node['fqdn'] in candidates:
                candidates.remove(net_node['fqdn'])
                log_ctx.info('Scheduling %s are non-network nodes' %
                             candidates)
                db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                             'Are non-network nodes', None, str(candidates))

            # Return a shuffled list of options
            random.shuffle(candidates)
            return candidates
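
place_instance() repeatedly narrows the candidate list with resource filters and shuffles whatever survives. A minimal, runnable version of that filter chain (the metric names are invented for the sketch):

import random

def filter_candidates(candidates, metrics, cpus, memory):
    for node in list(candidates):  # iterate a copy; we mutate the list
        m = metrics[node]
        if cpus > m.get('cpu_max_per_instance', 0):
            candidates.remove(node)
        elif memory > m.get('memory_free', 0):
            candidates.remove(node)
    if not candidates:
        raise ValueError('no node has enough resources')
    random.shuffle(candidates)
    return candidates

metrics = {'n1': {'cpu_max_per_instance': 16, 'memory_free': 4096},
           'n2': {'cpu_max_per_instance': 2, 'memory_free': 8192}}
print(filter_candidates(['n1', 'n2'], metrics, cpus=4, memory=2048))
# prints: ['n1']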
Example #9
    def _update_power_states(self):
        libvirt = util.get_libvirt()
        conn = libvirt.open(None)
        try:
            seen = []

            # Active VMs have an ID. Active means running in libvirt
            # land.
            for domain_id in conn.listDomainsID():
                domain = conn.lookupByID(domain_id)
                if not domain.name().startswith('sf:'):
                    continue

                instance_uuid = domain.name().split(':')[1]
                log_ctx = LOG.withInstance(instance_uuid)

                instance = db.get_instance(instance_uuid)
                if not instance:
                    # Instance is SF but not in database. Kill to reduce load.
                    log_ctx.warning('Destroying unknown instance')
                    util.execute(None, 'virsh destroy "sf:%s"' % instance_uuid)
                    continue

                db.place_instance(instance_uuid, config.NODE_NAME)
                seen.append(domain.name())

                if instance.get('state') == 'deleted':
                    # NOTE(mikal): a delete might be in-flight in the queue.
                    # We only worry about instances which should have gone
                    # away five minutes ago.
                    if time.time() - instance['state_updated'] < 300:
                        continue

                    db.instance_enforced_deletes_increment(instance_uuid)
                    attempts = instance.get('enforced_deletes', 0)

                    if attempts > 5:
                        # Sometimes we just can't delete the VM. Try the big hammer instead.
                        log_ctx.warning(
                            'Attempting alternate delete method for instance')
                        util.execute(None,
                                     'virsh destroy "sf:%s"' % instance_uuid)

                        db.add_event('instance', instance_uuid,
                                     'enforced delete', 'complete', None, None)
                    else:
                        i = virt.from_db(instance_uuid)
                        i.delete()
                        i.update_instance_state('deleted')

                    log_ctx.withField(
                        'attempt', attempts).warning('Deleting stray instance')

                    continue

                state = util.extract_power_state(libvirt, domain)
                db.update_instance_power_state(instance_uuid, state)
                if state == 'crashed':
                    db.update_instance_state(instance_uuid, 'error')

            # Inactive VMs just have a name, and are powered off
            # in our state system.
            for domain_name in conn.listDefinedDomains():
                if not domain_name.startswith('sf:'):
                    continue

                if domain_name not in seen:
                    instance_uuid = domain_name.split(':')[1]
                    log_ctx = LOG.withInstance(instance_uuid)
                    instance = db.get_instance(instance_uuid)

                    if not instance:
                        # Instance is SF but not in database. Kill because unknown.
                        log_ctx.warning('Removing unknown inactive instance')
                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        continue

                    if instance.get('state') == 'deleted':
                        # NOTE(mikal): a delete might be in-flight in the queue.
                        # We only worry about instances which should have gone
                        # away five minutes ago.
                        if time.time() - instance['state_updated'] < 300:
                            continue

                        domain = conn.lookupByName(domain_name)
                        domain.undefine()
                        log_ctx.info('Detected stray instance')
                        db.add_event('instance', instance_uuid,
                                     'deleted stray', 'complete', None, None)
                        continue

                    db.place_instance(instance_uuid, config.NODE_NAME)
                    instance_path = os.path.join(config.get('STORAGE_PATH'),
                                                 'instances', instance_uuid)

                    if not os.path.exists(instance_path):
                        # If we're inactive and our files aren't on disk,
                        # we have a problem.
                        log_ctx.info('Detected error state for instance')
                        db.update_instance_state(instance_uuid, 'error')

                    elif instance.get('power_state') != 'off':
                        log_ctx.info('Detected power off for instance')
                        db.update_instance_power_state(instance_uuid, 'off')
                        db.add_event('instance', instance_uuid,
                                     'detected poweroff', 'complete', None,
                                     None)

        except libvirt.libvirtError as e:
            LOG.error('Failed to lookup all domains: %s' % e)
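
Both reconciliation passes key off the libvirt domain naming convention sf:<instance uuid>. A trivial helper makes the parsing explicit:

def parse_sf_domain(name):
    """Return the instance UUID for a Shaken Fist domain name, else None."""
    if not name.startswith('sf:'):
        return None
    return name.split(':', 1)[1]

print(parse_sf_domain('sf:6a4f2a2c'))  # prints: 6a4f2a2c
print(parse_sf_domain('other-vm'))     # prints: None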
Example #10
 def post(self,
          instance_uuid=None,
          instance_from_db=None,
          instance_from_db_virt=None):
     db.add_event('instance', instance_uuid, 'api', 'unpause', None, None)
     return instance_from_db_virt.unpause()
Example #11
    def create(self):
        subst = self.subst_dict()

        with db.get_lock('network', None, self.uuid, ttl=120):
            if not util.check_for_interface(subst['vx_interface']):
                with util.RecordedOperation('create vxlan interface', self):
                    util.execute(
                        None,
                        'ip link add %(vx_interface)s type vxlan id %(vx_id)s '
                        'dev %(physical_interface)s dstport 0' % subst)
                    util.execute(
                        None,
                        'sysctl -w net.ipv4.conf.%(vx_interface)s.arp_notify=1'
                        % subst)

            if not util.check_for_interface(subst['vx_bridge']):
                with util.RecordedOperation('create vxlan bridge', self):
                    util.execute(
                        None, 'ip link add %(vx_bridge)s type bridge' % subst)
                    util.execute(
                        None,
                        'ip link set %(vx_interface)s master %(vx_bridge)s' %
                        subst)
                    util.execute(None,
                                 'ip link set %(vx_interface)s up' % subst)
                    util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                    util.execute(
                        None,
                        'sysctl -w net.ipv4.conf.%(vx_bridge)s.arp_notify=1' %
                        subst)
                    util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                    util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                    util.execute(None,
                                 'brctl setageing %(vx_bridge)s 0' % subst)

        if util.is_network_node():
            if not os.path.exists('/var/run/netns/%(netns)s' % subst):
                with util.RecordedOperation('create netns', self):
                    util.execute(None, 'ip netns add %(netns)s' % subst)

            if not util.check_for_interface(subst['vx_veth_outer']):
                with util.RecordedOperation('create router veth', self):
                    util.execute(
                        None,
                        'ip link add %(vx_veth_outer)s type veth peer name %(vx_veth_inner)s'
                        % subst)
                    util.execute(
                        None, 'ip link set %(vx_veth_inner)s netns %(netns)s' %
                        subst)
                    util.execute(
                        None,
                        'brctl addif %(vx_bridge)s %(vx_veth_outer)s' % subst)
                    util.execute(None,
                                 'ip link set %(vx_veth_outer)s up' % subst)
                    util.execute(
                        None, '%(in_netns)s ip link set %(vx_veth_inner)s up' %
                        subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip addr add %(router)s/%(netmask)s dev %(vx_veth_inner)s'
                        % subst)

            if not util.check_for_interface(subst['physical_veth_outer']):
                with util.RecordedOperation('create physical veth', self):
                    util.execute(
                        None,
                        'ip link add %(physical_veth_outer)s type veth peer name '
                        '%(physical_veth_inner)s' % subst)
                    util.execute(
                        None,
                        'brctl addif %(physical_bridge)s %(physical_veth_outer)s'
                        % subst)
                    util.execute(
                        None, 'ip link set %(physical_veth_outer)s up' % subst)
                    util.execute(
                        None,
                        'ip link set %(physical_veth_inner)s netns %(netns)s' %
                        subst)

            self.deploy_nat()
            self.update_dhcp()
        else:
            db.enqueue('networknode', {
                'type': 'deploy',
                'network_uuid': self.uuid
            })
            db.add_event('network', self.uuid, 'deploy', 'enqueued', None,
                         None)
Example #12
    def create(self):
        subst = self.subst_dict()

        with db.get_object_lock(self, ttl=120, op='Network create'):
            # Ensure network was not deleted whilst waiting for the lock.
            if self.is_dead():
                raise DeadNetwork('network=%s' % self)

            if not util.check_for_interface(subst['vx_interface']):
                with util.RecordedOperation('create vxlan interface', self):
                    util.create_interface(
                        subst['vx_interface'], 'vxlan',
                        'id %(vx_id)s dev %(physical_interface)s dstport 0' %
                        subst)
                    util.execute(
                        None, 'sysctl -w net.ipv4.conf.'
                        '%(vx_interface)s.arp_notify=1' % subst)

            if not util.check_for_interface(subst['vx_bridge']):
                with util.RecordedOperation('create vxlan bridge', self):
                    util.create_interface(subst['vx_bridge'], 'bridge', '')
                    util.execute(
                        None, 'ip link set %(vx_interface)s '
                        'master %(vx_bridge)s' % subst)
                    util.execute(None,
                                 'ip link set %(vx_interface)s up' % subst)
                    util.execute(None, 'ip link set %(vx_bridge)s up' % subst)
                    util.execute(
                        None, 'sysctl -w net.ipv4.conf.'
                        '%(vx_bridge)s.arp_notify=1' % subst)
                    util.execute(None, 'brctl setfd %(vx_bridge)s 0' % subst)
                    util.execute(None, 'brctl stp %(vx_bridge)s off' % subst)
                    util.execute(None,
                                 'brctl setageing %(vx_bridge)s 0' % subst)

        if util.is_network_node():
            if not os.path.exists('/var/run/netns/%(netns)s' % subst):
                with util.RecordedOperation('create netns', self):
                    util.execute(None, 'ip netns add %(netns)s' % subst)

            if not util.check_for_interface(subst['vx_veth_outer']):
                with util.RecordedOperation('create router veth', self):
                    util.create_interface(
                        subst['vx_veth_outer'], 'veth',
                        'peer name %(vx_veth_inner)s' % subst)
                    util.execute(
                        None, 'ip link set %(vx_veth_inner)s netns %(netns)s' %
                        subst)
                    util.execute(
                        None,
                        'brctl addif %(vx_bridge)s %(vx_veth_outer)s' % subst)
                    util.execute(None,
                                 'ip link set %(vx_veth_outer)s up' % subst)
                    util.execute(
                        None, '%(in_netns)s ip link set %(vx_veth_inner)s up' %
                        subst)
                    util.execute(
                        None,
                        '%(in_netns)s ip addr add %(router)s/%(netmask)s '
                        'dev %(vx_veth_inner)s' % subst)

            if not util.check_for_interface(subst['physical_veth_outer']):
                with util.RecordedOperation('create physical veth', self):
                    util.create_interface(
                        subst['physical_veth_outer'], 'veth',
                        'peer name %(physical_veth_inner)s' % subst)
                    util.execute(
                        None, 'brctl addif %(physical_bridge)s '
                        '%(physical_veth_outer)s' % subst)
                    util.execute(
                        None, 'ip link set %(physical_veth_outer)s up' % subst)
                    util.execute(
                        None, 'ip link set %(physical_veth_inner)s '
                        'netns %(netns)s' % subst)

            self.deploy_nat()
            self.update_dhcp()
        else:
            db.enqueue('networknode', DeployNetworkTask(self.db_entry['uuid']))
            db.add_event('network', self.db_entry['uuid'], 'deploy',
                         'enqueued', None, None)
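
Both create() versions are idempotent: every interface-creating command is guarded by a check that the interface does not already exist. A generic sketch of that guard via subprocess; check_for_interface in the examples is the project's own helper, this stand-in just shells out to ip(8):

import subprocess

def interface_exists(name):
    # `ip link show <name>` exits non-zero if the interface is missing.
    return subprocess.run(['ip', 'link', 'show', name],
                          capture_output=True).returncode == 0

def ensure_interface(name, create_args):
    # Idempotent: only create the interface if it does not exist yet.
    if not interface_exists(name):
        subprocess.run(['ip', 'link', 'add', name] + create_args, check=True)

# Requires Linux and root, e.g.: ensure_interface('br-demo', ['type', 'bridge'])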
Example #13
    def run(self):
        LOG.info('Starting Monitor Daemon')
        observers = {}

        while True:
            # Cleanup terminated observers
            all_observers = list(observers.keys())
            for instance_uuid in all_observers:
                if not observers[instance_uuid].is_alive():
                    # Reap process
                    observers[instance_uuid].join(1)
                    LOG.withInstance(instance_uuid).info(
                        'Trigger observer has terminated')
                    db.add_event('instance', instance_uuid, 'trigger monitor',
                                 'crashed', None, None)
                    del observers[instance_uuid]

            # Start missing observers
            extra_instances = list(observers.keys())

            for inst in db.get_instances(
                    only_node=config.parsed.get('NODE_NAME')):
                if inst['uuid'] in extra_instances:
                    extra_instances.remove(inst['uuid'])

                if inst['state'] != 'created':
                    continue

                if inst['uuid'] not in observers:
                    console_path = os.path.join(
                        config.parsed.get('STORAGE_PATH'), 'instances',
                        inst['uuid'], 'console.log')
                    p = multiprocessing.Process(
                        target=observe,
                        args=(console_path, inst['uuid']),
                        name='%s-%s' %
                        (daemon.process_name('triggers'), inst['uuid']))
                    p.start()

                    observers[inst['uuid']] = p
                    LOG.withInstance(
                        inst['uuid']).info('Started trigger observer')
                    db.add_event('instance', inst['uuid'], 'trigger monitor',
                                 'started', None, None)

            # Cleanup extra observers
            for instance_uuid in extra_instances:
                p = observers[instance_uuid]
                try:
                    os.kill(p.pid, signal.SIGKILL)
                    observers[instance_uuid].join(1)
                except Exception:
                    pass

                del observers[instance_uuid]
                LOG.withInstance(instance_uuid).info(
                    'Finished trigger observer')
                db.add_event('instance', instance_uuid, 'trigger monitor',
                             'finished', None, None)

            time.sleep(1)
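
The observer bookkeeping pattern, start a process per instance and reap it once it exits, can be reduced to a few lines of multiprocessing; the observe() body here is a stand-in for tailing a console log:

import multiprocessing
import time

def observe(instance_uuid):
    time.sleep(0.5)  # stand-in for tailing a console log

if __name__ == '__main__':
    observers = {}
    for uuid in ('inst-1', 'inst-2'):
        p = multiprocessing.Process(target=observe, args=(uuid,), name=uuid)
        p.start()
        observers[uuid] = p

    time.sleep(1)
    for uuid in list(observers):
        if not observers[uuid].is_alive():
            observers[uuid].join(1)  # reap the exited process
            del observers[uuid]
    print(observers)  # prints: {}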
Example #14
def instance_delete(inst):
    with inst.get_lock(op='Instance delete'):
        # There are two delete state flows:
        #   - error transition states (preflight-error etc) to error
        #   - created to deleted
        #
        # We don't need delete_wait for the error states as they're already
        # in a transition state.
        if not inst.state.value.endswith('-error'):
            inst.state = dbo.STATE_DELETE_WAIT
        db.add_event('instance', inst.uuid, 'queued', 'delete', None, None)

        # Create list of networks used by instance. We cannot use the
        # interfaces cached in the instance here, because the instance
        # may have failed to get to the point where it populates that
        # field (an image fetch failure for example).
        instance_networks = []
        interfaces = []
        for ni in networkinterface.interfaces_for_instance(inst):
            if ni:
                interfaces.append(ni)
                if ni.network_uuid not in instance_networks:
                    instance_networks.append(ni.network_uuid)

        # Stop the instance
        inst.power_off()

        # Delete the instance's interfaces
        with util_general.RecordedOperation('release network addresses', inst):
            for ni in interfaces:
                ni.delete()

        # Create list of networks used by all other instances
        host_networks = []
        for i in instance.Instances(
            [instance.this_node_filter, instance.active_states_filter]):
            if i.uuid != inst.uuid:
                for iface_uuid in i.interfaces:
                    ni = networkinterface.NetworkInterface.from_db(iface_uuid)
                    if ni and ni.network_uuid not in host_networks:
                        host_networks.append(ni.network_uuid)

        inst.delete()

        # Check each network used by the deleted instance
        for network in instance_networks:
            n = net.Network.from_db(network)
            if n:
                # If network used by another instance, only update
                if network in host_networks:
                    if n.state.value == dbo.STATE_DELETE_WAIT:
                        # Do not update a network about to be deleted
                        continue
                    with util_general.RecordedOperation(
                            'deallocate ip address', inst):
                        n.update_dhcp()
                else:
                    # Network not used by any other instance therefore delete
                    with util_general.RecordedOperation(
                            'remove network from node', n):
                        n.delete_on_hypervisor()
Example #15
 def get(self, instance_uuid=None, instance_from_db=None):
     db.add_event('instance', instance_uuid, 'api', 'get interfaces', None,
                  None)
     return list(db.get_instance_interfaces(instance_uuid))
Example #16
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None):
        global SCHEDULER

        # We need to sanitise the name so it's safe for DNS
        name = re.sub(r'([^a-zA-Z0-9_\-])', '', name)

        if not namespace:
            namespace = get_jwt_identity()

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object
        instance = virt.from_db(instance_uuid)
        if instance:
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                LOG.info('instance(%s): instance not found, ownership test' %
                         instance_uuid)
                return error(404, 'instance not found')

        if not instance:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace)

        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        # Have we been placed?
        if not placed_on:
            candidates = SCHEDULER.place_instance(instance, network)
            if len(candidates) == 0:
                db.add_event('instance', instance_uuid, 'schedule', 'failed',
                             None, 'insufficient resources')
                db.update_instance_state(instance_uuid, 'error')
                return error(507, 'insufficient capacity')

            placed_on = candidates[0]
            db.place_instance(instance_uuid, placed_on)
            db.add_event('instance', instance_uuid, 'placement', None, None,
                         placed_on)

        else:
            try:
                candidates = SCHEDULER.place_instance(instance,
                                                      network,
                                                      candidates=[placed_on])
                if len(candidates) == 0:
                    db.add_event('instance', instance_uuid, 'schedule',
                                 'failed', None, 'insufficient resources')
                    db.update_instance_state(instance_uuid, 'error')
                    return error(507, 'insufficient capacity')
            except scheduler.CandidateNodeNotFoundException as e:
                return error(404, 'node not found: %s' % e)

        # Have we been placed on a different node?
        if placed_on != config.parsed.get('NODE_NAME'):
            body = flask_get_post_body()
            body['placed_on'] = placed_on
            body['instance_uuid'] = instance_uuid
            body['namespace'] = namespace

            token = util.get_api_token(
                'http://%s:%d' % (placed_on, config.parsed.get('API_PORT')),
                namespace=namespace)
            r = requests.request('POST',
                                 'http://%s:%d/instances' %
                                 (placed_on, config.parsed.get('API_PORT')),
                                 data=json.dumps(body),
                                 headers={
                                     'Authorization': token,
                                     'User-Agent': util.get_user_agent()
                                 })

            LOG.info('Returning proxied request: %d, %s' %
                     (r.status_code, r.text))
            resp = flask.Response(r.text, mimetype='application/json')
            resp.status_code = r.status_code
            return resp

        # Check we can get the required IPs
        nets = {}
        allocations = {}

        def error_with_cleanup(status_code, message):
            for network_uuid in allocations:
                n = net.from_db(network_uuid)
                for addr, _ in allocations[network_uuid]:
                    with db.get_lock('sf/ipmanager/%s' % n.uuid, ttl=120) as _:
                        ipm = db.get_ipmanager(n.uuid)
                        ipm.release(addr)
                        db.persist_ipmanager(n.uuid, ipm.save())
            return error(status_code, message)

        order = 0
        if network:
            for netdesc in network:
                if 'network_uuid' not in netdesc or not netdesc['network_uuid']:
                    return error_with_cleanup(404, 'network not specified')

                if netdesc['network_uuid'] not in nets:
                    n = net.from_db(netdesc['network_uuid'])
                    if not n:
                        return error_with_cleanup(
                            404,
                            'network %s not found' % netdesc['network_uuid'])
                    nets[netdesc['network_uuid']] = n
                    n.create()

                with db.get_lock('sf/ipmanager/%s' % netdesc['network_uuid'],
                                 ttl=120) as _:
                    db.add_event('network', netdesc['network_uuid'],
                                 'allocate address', None, None, instance_uuid)
                    allocations.setdefault(netdesc['network_uuid'], [])
                    ipm = db.get_ipmanager(netdesc['network_uuid'])
                    if 'address' not in netdesc or not netdesc['address']:
                        netdesc['address'] = ipm.get_random_free_address()
                    else:
                        if not ipm.reserve(netdesc['address']):
                            return error_with_cleanup(
                                409, 'address %s in use' % netdesc['address'])
                    db.persist_ipmanager(netdesc['network_uuid'], ipm.save())
                    allocations[netdesc['network_uuid']].append(
                        (netdesc['address'], order))

                if 'model' not in netdesc or not netdesc['model']:
                    netdesc['model'] = 'virtio'

                db.create_network_interface(str(uuid.uuid4()), netdesc,
                                            instance_uuid, order)

                order += 1

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Now we can start the instance
        with db.get_lock('sf/instance/%s' % instance.db_entry['uuid'],
                         ttl=900) as lock:
            with util.RecordedOperation('ensure networks exist',
                                        instance) as _:
                for network_uuid in nets:
                    n = nets[network_uuid]
                    n.ensure_mesh()
                    n.update_dhcp()

            with util.RecordedOperation('instance creation', instance) as _:
                instance.create(lock=lock)

            for iface in db.get_instance_interfaces(instance.db_entry['uuid']):
                db.update_network_interface_state(iface['uuid'], 'created')

            return db.get_instance(instance_uuid)
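
error_with_cleanup() exists because address reservations accumulate across interfaces and must all be released if any later step fails. A toy pool showing the same reserve-or-roll-back pattern (not the real IPManager):

class IPPool:
    """Toy address pool; reserve() fails rather than double-allocating."""
    def __init__(self, addresses):
        self.free = set(addresses)

    def reserve(self, addr):
        if addr not in self.free:
            return False
        self.free.remove(addr)
        return True

    def release(self, addr):
        self.free.add(addr)

def allocate_all(pool, wanted):
    taken = []
    for addr in wanted:
        if not pool.reserve(addr):
            for a in taken:  # roll back earlier reservations
                pool.release(a)
            raise ValueError('address %s in use' % addr)
        taken.append(addr)
    return taken

pool = IPPool(['10.0.0.5', '10.0.0.6'])
print(allocate_all(pool, ['10.0.0.5', '10.0.0.6']))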
Example #17
 def get(self, instance_uuid=None, instance_from_db=None):
     db.add_event('instance', instance_uuid, 'api', 'get', None, None)
     return instance_from_db
Example #18
 def get(self, network_uuid=None, network_from_db=None):
     db.add_event('network', network_uuid, 'api', 'get events', None, None)
     return list(db.get_events('network', network_uuid))
Example #19
 def get(self, network_uuid=None, network_from_db=None):
     db.add_event('network', network_uuid, 'api', 'get', None, None)
     if network_from_db is not None and 'ipmanager' in network_from_db:
         del network_from_db['ipmanager']
     return network_from_db
Example #20
    def run(self):
        LOG.info('Starting')
        observers = {}

        while not self.exit.is_set():
            # Cleanup terminated observers
            all_observers = list(observers.keys())
            for instance_uuid in all_observers:
                if not observers[instance_uuid].is_alive():
                    # Reap process
                    observers[instance_uuid].join(1)
                    LOG.with_instance(instance_uuid).info(
                        'Trigger observer has terminated')
                    db.add_event('instance', instance_uuid, 'trigger monitor',
                                 'crashed', None, None)
                    del observers[instance_uuid]

            # Audit desired observers
            extra_instances = list(observers.keys())
            missing_instances = []

            with etcd.ThreadLocalReadOnlyCache():
                for inst in instance.Instances([
                        instance.this_node_filter,
                        partial(baseobject.state_filter,
                                [instance.Instance.STATE_CREATED])
                ]):
                    if inst.uuid in extra_instances:
                        extra_instances.remove(inst.uuid)

                    if inst.uuid not in observers:
                        missing_instances.append(inst.uuid)

            # Start missing observers
            for instance_uuid in missing_instances:
                console_path = os.path.join(config.STORAGE_PATH, 'instances',
                                            instance_uuid, 'console.log')
                p = multiprocessing.Process(
                    target=observe,
                    args=(console_path, instance_uuid),
                    name='%s-%s' %
                    (daemon.process_name('triggers'), instance_uuid))
                p.start()

                observers[instance_uuid] = p
                LOG.with_instance(instance_uuid).info(
                    'Started trigger observer')
                db.add_event('instance', instance_uuid, 'trigger monitor',
                             'started', None, None)

            # Cleanup extra observers
            for instance_uuid in extra_instances:
                p = observers[instance_uuid]
                try:
                    os.kill(p.pid, signal.SIGKILL)
                    observers[instance_uuid].join(1)
                except Exception:
                    pass

                del observers[instance_uuid]
                LOG.with_instance(instance_uuid).info(
                    'Finished trigger observer')
                db.add_event('instance', instance_uuid, 'trigger monitor',
                             'finished', None, None)

            self.exit.wait(1)

        # No longer running, clean up all trigger daemons
        for instance_uuid in observers:
            os.kill(observers[instance_uuid].pid, signal.SIGKILL)
Example #21
def observe(path, instance_uuid):
    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('triggers'), instance_uuid))
    regexps = {
        'login prompt':
        re.compile('.* login: .*'),
        'user-data script start':
        re.compile('.*Starting.*Execute cloud user/final scripts.*'),
        'user-data script end':
        re.compile('.*Finished.*Execute cloud user/final scripts.*'),
        'cloud-init complete':
        re.compile('.*Reached target.*Cloud-init target.*')
    }

    while not os.path.exists(path):
        time.sleep(1)
    fd = os.open(path, os.O_RDONLY | os.O_NONBLOCK)

    log_ctx = LOG.with_instance(instance_uuid)
    log_ctx.with_field('path', path).info('Monitoring path for triggers')
    db.add_event('instance', instance_uuid, 'trigger monitor',
                 'detected console log', None, None)

    # Sometimes the trigger process is slow to start, so rewind 4KB to ensure
    # that the last few log lines are not missed. (4KB since Cloud-Init can be
    # noisy after the login prompt.)
    os.lseek(fd, max(0, os.fstat(fd).st_size - 4096), os.SEEK_SET)

    # Record how long the file is, because we need to detect truncations and
    # re-open.
    previous_size = os.stat(path).st_size

    buffer = ''
    while True:
        # Detect file truncations, and die if we see one. We will be restarted
        # by the monitor process.
        if not os.path.exists(path):
            return
        size = os.stat(path).st_size
        if size < previous_size:
            return
        previous_size = size

        # Read data; os.read() is non-blocking because the fd was opened
        # with O_NONBLOCK.
        d = os.read(fd, 1024).decode('utf-8', errors='ignore')
        if d:
            buffer += d
            lines = buffer.split('\n')
            buffer = lines[-1]

            for line in lines:
                if line:
                    for trigger in regexps:
                        m = regexps[trigger].match(line)
                        if m:
                            log_ctx.with_field(
                                'trigger',
                                trigger,
                            ).info('Trigger matched')
                            db.add_event('instance', instance_uuid, 'trigger',
                                         None, None, trigger)
        else:
            # Only pause if there was no data to read
            time.sleep(0.2)
Example #22
 def add_event(self, operation, phase, duration=None, msg=None):
     db.add_event('instance', self.uuid, operation, phase, duration, msg)
Example #23
def handle(jobname, workitem):
    libvirt = util_libvirt.get_libvirt()

    log = LOG.with_field('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    inst = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not QueueTask.__subclasscheck__(type(task)):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if InstanceTask.__subclasscheck__(type(task)):
                inst = instance.Instance.from_db(task.instance_uuid())
                if not inst:
                    raise exceptions.InstanceNotInDBException(
                        task.instance_uuid())

            if isinstance(task, FetchImageTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if isinstance(task, SnapshotTask):
                inst = instance.Instance.from_db(task.instance_uuid())

            if inst:
                log_i = log.with_instance(inst)
            else:
                log_i = log

            log_i.with_field('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if inst:
                # TODO(andy) move to QueueTask
                db.add_event('instance', inst.uuid, task.pretty_task_name(),
                             'dequeued', None, 'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), inst)

            elif isinstance(task, PreflightInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot preflight an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                redirect_to = instance_preflight(inst, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    etcd.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                if (inst.state.value == dbo.STATE_DELETED
                        or inst.state.value.endswith('-error')):
                    log_i.warning(
                        'You cannot start an instance in state %s, skipping task'
                        % inst.state.value)
                    continue

                instance_start(inst, task.network())
                etcd.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(inst)
                    etcd.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util_general.ignore_exception(
                        'instance %s delete task' % inst, e)

            elif isinstance(task, FloatNetworkInterfaceTask):
                # Just punt it to the network node now that the interface is ready
                etcd.enqueue('networknode', task)

            elif isinstance(task, SnapshotTask):
                snapshot(inst, task.disk(), task.artifact_uuid(),
                         task.blob_uuid())

            elif isinstance(task, DeleteNetworkWhenClean):
                # Check if any interfaces remain on network
                task_network = net.Network.from_db(task.network_uuid())
                ifaces = networkinterface.interfaces_for_network(task_network)
                cur_interfaces = {i.uuid: i for i in ifaces}

                if cur_interfaces:
                    LOG.with_network(task_network).error(
                        'During DeleteNetworkWhenClean new interfaces have '
                        'connected to network: %s', cur_interfaces)

                # Only check those present at delete task initiation time.
                remain_interfaces = list(
                    set(task.wait_interfaces()) & set(cur_interfaces))
                if remain_interfaces:
                    # Queue task on a node with a remaining instance
                    first_iface = cur_interfaces[remain_interfaces[0]]
                    inst = instance.Instance.from_db(first_iface.instance_uuid)
                    etcd.enqueue(inst.placement['node'], {
                        'tasks': [
                            DeleteNetworkWhenClean(task.network_uuid(),
                                                   remain_interfaces)
                        ]
                    },
                                 delay=60)

                else:
                    # All original instances deleted, safe to delete network
                    etcd.enqueue('networknode',
                                 DestroyNetworkTask(task.network_uuid()))

            elif isinstance(task, HypervisorDestroyNetworkTask):
                n = net.Network.from_db(task.network_uuid())
                n.delete_on_hypervisor()

            elif isinstance(task, FetchBlobTask):
                metrics = etcd.get('metrics', config.NODE_NAME, None)
                if metrics:
                    metrics = metrics.get('metrics', {})
                else:
                    metrics = {}

                b = blob.Blob.from_db(task.blob_uuid())
                if not b:
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, not found')

                elif (int(metrics.get('disk_free_blobs', 0)) - int(b.size) <
                      config.MINIMUM_FREE_DISK):
                    log.with_fields({
                        'blob': task.blob_uuid()
                    }).info('Cannot replicate blob, insufficient space')

                else:
                    log.with_object(b).info('Replicating blob')
                    size = b.ensure_local([])
                    log.with_object(b).with_fields({
                        'transferred': size,
                        'expected': b.size
                    }).info('Replicating blob complete')

            else:
                log_i.with_field('task',
                                 task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image fetch failed: %s' % e)

    except exceptions.ImagesCannotShrinkException as e:
        log.info('Fetch Resize Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Image resize failed: %s' % e)

    except libvirt.libvirtError as e:
        log.info('Libvirt Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except exceptions.InstanceException as e:
        log.info('Instance Error: %s', e)
        if inst:
            inst.enqueue_delete_due_error('Instance task failed: %s' % e)

    except Exception as e:
        # Logging ignored exception - this should be investigated
        util_general.ignore_exception('queue worker', e)
        if inst:
            inst.enqueue_delete_due_error('Failed queue task: %s' % e)

    finally:
        etcd.resolve(config.NODE_NAME, jobname)
        if inst:
            inst.add_event('tasks complete',
                           'dequeued',
                           msg='Work item %s' % jobname)
        log.info('Completed workitem')
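The long isinstance chain in the handler above is effectively a hand-rolled dispatch table. For comparison, the same routing could be expressed as an explicit mapping; this is only an illustrative sketch reusing names from the example (FetchImageTask, SnapshotTask, image_fetch, snapshot), not code from the project:

# Illustrative dispatch-table alternative to the isinstance chain
# above. Exact-type lookup, so subclasses would need extra handling.
TASK_HANDLERS = {
    FetchImageTask: lambda task, inst: image_fetch(task.url(), inst),
    SnapshotTask: lambda task, inst: snapshot(
        inst, task.disk(), task.artifact_uuid(), task.blob_uuid()),
}

def dispatch(task, inst, log_i):
    handler = TASK_HANDLERS.get(type(task))
    if handler is None:
        log_i.with_field('task', task).error('Unhandled task - dropped')
        return
    handler(task, inst)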
Beispiel #24
0
    def add_event(self, operation, phase, duration=None, msg=None):
        if not self.__in_memory_only:
            db.add_event(self.object_type, self.__uuid, operation, phase,
                         duration, msg)
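A hypothetical call site for the method above; the snap object and the event values are invented for illustration:

# Hypothetical usage of add_event() above; object and values invented.
snap.add_event('snapshot', 'started')
snap.add_event('snapshot', 'finished', duration=12, msg='3 disks')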
Beispiel #25
0
def handle(jobname, workitem):
    log = LOG.withField('workitem', jobname)
    log.info('Processing workitem')

    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            if not isinstance(task, QueueTask):
                raise exceptions.UnknownTaskException(
                    'Task was not decoded: %s' % task)

            if isinstance(task, (InstanceTask, FetchImageTask)):
                instance_uuid = task.instance_uuid()

            if instance_uuid:
                log_i = log.withInstance(instance_uuid)
            else:
                log_i = log

            log_i.withField('task_name', task.name()).info('Starting task')

            # TODO(andy) Should network events also come through here eventually?
            # Then this can be generalised to record events on networks/instances

            # TODO(andy) This event should be recorded when it is recorded as
            # dequeued in the DB. Currently it's reporting action on the item
            # and calling it 'dequeue'.

            if instance_uuid:
                # TODO(andy) move to QueueTask
                db.add_event('instance', instance_uuid,
                             task.pretty_task_name(), 'dequeued', None,
                             'Work item %s' % jobname)

            if isinstance(task, FetchImageTask):
                image_fetch(task.url(), instance_uuid)

            elif isinstance(task, PreflightInstanceTask):
                redirect_to = instance_preflight(instance_uuid, task.network())
                if redirect_to:
                    log_i.info('Redirecting instance start to %s' %
                               redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            elif isinstance(task, StartInstanceTask):
                instance_start(instance_uuid, task.network())
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.NODE_NAME, {})

            elif isinstance(task, DeleteInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'deleted')
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            elif isinstance(task, ErrorInstanceTask):
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid, 'error')

                    if task.error_msg():
                        db.update_instance_error_message(
                            instance_uuid, task.error_msg())
                    db.enqueue('%s-metrics' % config.NODE_NAME, {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

            else:
                log_i.withField('task', task).error('Unhandled task - dropped')

            log_i.info('Task complete')

    except exceptions.ImageFetchTaskFailedException as e:
        # Usually caused by external issue and not an application error
        log.info('Fetch Image Error: %s', e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_error(instance_uuid,
                                      'failed queue task: %s' % e)

    finally:
        db.resolve(config.NODE_NAME, jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        log.info('Completed workitem')
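For context, the workitem consumed by handle() above carries a list of already-decoded task objects. The constructor arguments below are assumptions for illustration; the real task classes may be built differently:

# Illustrative workitem for the handler above; constructor arguments
# are assumed, not the project's actual signatures.
workitem = {'tasks': [PreflightInstanceTask(instance_uuid, network),
                      StartInstanceTask(instance_uuid, network)]}
handle('job-0001', workitem)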
Beispiel #26
0
    def post(self,
             name=None,
             cpus=None,
             memory=None,
             network=None,
             disk=None,
             ssh_key=None,
             user_data=None,
             placed_on=None,
             namespace=None,
             instance_uuid=None,
             video=None):
        global SCHEDULER

        # Check that the instance name is safe for use as a DNS host name
        if name != re.sub(r'([^a-zA-Z0-9_\-])', '', name) or len(name) > 63:
            return error(400,
                         'instance name must be useable as a DNS host name')

        # Sanity check
        if not disk:
            return error(400, 'instance must specify at least one disk')
        for d in disk:
            if not isinstance(d, dict):
                return error(400,
                             'disk specification should contain JSON objects')

        if network:
            for n in network:
                if not isinstance(n, dict):
                    return error(
                        400,
                        'network specification should contain JSON objects')

                if 'network_uuid' not in n:
                    return error(
                        400, 'network specification is missing network_uuid')

        if not video:
            video = {'model': 'cirrus', 'memory': 16384}

        if not namespace:
            namespace = get_jwt_identity()

        # Only system can specify a uuid
        if instance_uuid and get_jwt_identity() != 'system':
            return error(401, 'only system can specify an instance uuid')

        # If accessing a foreign namespace, we need to be an admin
        if get_jwt_identity() not in [namespace, 'system']:
            return error(
                401,
                'only admins can create resources in a different namespace')

        # The instance needs to exist in the DB before network interfaces are created
        if not instance_uuid:
            instance_uuid = str(uuid.uuid4())
            db.add_event('instance', instance_uuid, 'uuid allocated', None,
                         None, None)

        # Create instance object
        instance = virt.from_db(instance_uuid)
        if instance:
            if get_jwt_identity() not in [
                    instance.db_entry['namespace'], 'system'
            ]:
                logutil.info([virt.ThinInstance(instance_uuid)],
                             'Instance not found, ownership test')
                return error(404, 'instance not found')

        if not instance:
            instance = virt.from_definition(uuid=instance_uuid,
                                            name=name,
                                            disks=disk,
                                            memory_mb=memory,
                                            vcpus=cpus,
                                            ssh_key=ssh_key,
                                            user_data=user_data,
                                            owner=namespace,
                                            video=video,
                                            requested_placement=placed_on)

        # Initialise metadata
        db.persist_metadata('instance', instance_uuid, {})

        # Allocate IP addresses
        order = 0
        if network:
            for netdesc in network:
                n = net.from_db(netdesc['network_uuid'])
                if not n:
                    db.enqueue_instance_delete(
                        config.parsed.get('NODE_NAME'), instance_uuid, 'error',
                        'missing network %s during IP allocation phase' %
                        netdesc['network_uuid'])
                    return error(
                        404, 'network %s not found' % netdesc['network_uuid'])

                with db.get_lock('ipmanager',
                                 None,
                                 netdesc['network_uuid'],
                                 ttl=120):
                    db.add_event('network', netdesc['network_uuid'],
                                 'allocate address', None, None, instance_uuid)
                    ipm = db.get_ipmanager(netdesc['network_uuid'])
                    if 'address' not in netdesc or not netdesc['address']:
                        netdesc['address'] = ipm.get_random_free_address()
                    else:
                        if not ipm.reserve(netdesc['address']):
                            db.enqueue_instance_delete(
                                config.parsed.get('NODE_NAME'), instance_uuid,
                                'error',
                                'failed to reserve an IP on network %s' %
                                netdesc['network_uuid'])
                            return error(
                                409, 'address %s in use' % netdesc['address'])

                    db.persist_ipmanager(netdesc['network_uuid'], ipm.save())

                if 'model' not in netdesc or not netdesc['model']:
                    netdesc['model'] = 'virtio'

                db.create_network_interface(str(uuid.uuid4()), netdesc,
                                            instance_uuid, order)

        if not SCHEDULER:
            SCHEDULER = scheduler.Scheduler()

        try:
            # Have we been placed?
            if not placed_on:
                candidates = SCHEDULER.place_instance(instance, network)
                placement = candidates[0]

            else:
                SCHEDULER.place_instance(instance,
                                         network,
                                         candidates=[placed_on])
                placement = placed_on

        except exceptions.LowResourceException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'insufficient resources: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(507, str(e))

        except exceptions.CandidateNodeNotFoundException as e:
            db.add_event('instance', instance_uuid, 'schedule', 'failed', None,
                         'candidate node not found: ' + str(e))
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'scheduling failed')
            return error(404, 'node not found: %s' % e)

        # Record placement
        db.place_instance(instance_uuid, placement)
        db.add_event('instance', instance_uuid, 'placement', None, None,
                     placement)

        # Create a queue entry for the instance start
        tasks = [{
            'type': 'instance_preflight',
            'instance_uuid': instance_uuid,
            'network': network
        }]
        for disk in instance.db_entry['block_devices']['devices']:
            if 'base' in disk and disk['base']:
                tasks.append({
                    'type': 'image_fetch',
                    'instance_uuid': instance_uuid,
                    'url': disk['base']
                })
        tasks.append({
            'type': 'instance_start',
            'instance_uuid': instance_uuid,
            'network': network
        })

        # Enqueue creation tasks on desired node task queue
        db.enqueue(placement, {'tasks': tasks})
        db.add_event('instance', instance_uuid, 'create', 'enqueued', None,
                     None)

        # Watch for a while and return results if things are fast, give up
        # after a while and just return the current state
        start_time = time.time()
        while time.time() - start_time < config.parsed.get('API_ASYNC_WAIT'):
            i = db.get_instance(instance_uuid)
            if i['state'] in ['created', 'deleted', 'error']:
                return i
            time.sleep(0.5)
        return db.get_instance(instance_uuid)
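For reference, a minimal request body that would pass the validation at the top of post() might look like this; every value is illustrative:

# Illustrative instance-create payload for the post() handler above.
body = {
    'name': 'web-1',  # must be DNS-safe and at most 63 characters
    'cpus': 2,
    'memory': 2048,
    'disk': [{'size': 20, 'base': 'https://example.com/ubuntu.qcow2'}],
    'network': [{'network_uuid': 'aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee'}],
}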
Beispiel #27
0
    def place_instance(self, instance, network, candidates=None):
        with util.RecordedOperation('schedule', instance) as _:
            if time.time() - self.metrics_updated > config.parsed.get(
                    'SCHEDULER_CACHE_TIMEOUT'):
                self.refresh_metrics()

            if candidates:
                LOG.info('Scheduling %s, %s forced as candidates' %
                         (instance, candidates))
                db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                             'Forced candidates', None, str(candidates))
                for node in candidates:
                    if node not in self.metrics:
                        raise exceptions.CandidateNodeNotFoundException(node)
            else:
                candidates = list(self.metrics.keys())
            LOG.info('Scheduling %s, %s start as candidates' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Initial candidates', None, str(candidates))

            # Can we host that many vCPUs?
            for node in copy.copy(candidates):
                if instance.db_entry['cpus'] > self.metrics[node].get(
                        'cpu_max_per_instance', 0):
                    candidates.remove(node)
            LOG.info('Scheduling %s, %s have enough actual CPU' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough actual CPU', None, str(candidates))

            # Do we have enough idle CPU?
            for node in copy.copy(candidates):
                if not self._has_sufficient_cpu(instance.db_entry['cpus'],
                                                node):
                    candidates.remove(node)
            LOG.info('Scheduling %s, %s have enough idle CPU' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle CPU', None, str(candidates))

            # Do we have enough idle RAM?
            for node in copy.copy(candidates):
                if not self._has_sufficient_ram(instance.db_entry['memory'],
                                                node):
                    candidates.remove(node)
            LOG.info('Scheduling %s, %s have enough idle RAM' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle RAM', None, str(candidates))

            # Do we have enough idle disk?
            for node in copy.copy(candidates):
                if not self._has_sufficient_disk(instance, node):
                    candidates.remove(node)
            LOG.info('Scheduling %s, %s have enough idle disk' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have enough idle disk', None, str(candidates))

            # What nodes have the highest number of networks already present?
            if network:
                requested_networks = []
                for net in network:
                    network_uuid = net['network_uuid']
                    if network_uuid not in requested_networks:
                        requested_networks.append(network_uuid)

                candidates = self._find_most_matching_networks(
                    requested_networks, candidates)
                LOG.info('Scheduling %s, %s have most matching networks' %
                         (instance, candidates))
                db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                             'Have most matching networks', None,
                             str(candidates))

            # What nodes have the base image already?
            requested_images = []
            for disk in instance.db_entry['block_devices']['devices']:
                if disk.get('base'):
                    requested_images.append(disk.get('base'))

            candidates = self._find_most_matching_images(
                requested_images, candidates)
            LOG.info('Scheduling %s, %s have most matching images' %
                     (instance, candidates))
            db.add_event('instance', instance.db_entry['uuid'], 'schedule',
                         'Have most matching images', None, str(candidates))

            # Return a shuffled list of options
            random.shuffle(candidates)
            return candidates
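The filtering steps above lean on helper predicates that are not shown. A plausible shape for one of them, assuming the per-node metrics dict exposes a free-memory figure in MB (the metric name is an assumption, not the project's actual key):

# Sketch of a helper predicate like those used above; the
# 'memory_available' metric name is an assumption.
def _has_sufficient_ram(self, requested_mb, node):
    available_mb = int(self.metrics[node].get('memory_available', 0))
    return requested_mb <= available_mb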
Beispiel #28
0
    def post(self,
             instance_uuid=None,
             instance_from_db=None,
             instance_from_db_virt=None):
        db.add_event('instance', instance_uuid, 'api', 'poweron', None, None)
        return instance_from_db_virt.power_on()
Beispiel #29
0
    def get(self, instance_uuid=None, instance_from_db=None):
        db.add_event('instance', instance_uuid, 'api', 'get events', None,
                     None)
        return list(db.get_events('instance', instance_uuid))
Beispiel #30
0
def handle(jobname, workitem):
    j = JobName(jobname)
    logutil.info([j], 'Processing workitem')
    setproctitle.setproctitle('%s-%s' %
                              (daemon.process_name('queues'), jobname))

    instance_uuid = None
    task = None
    try:
        for task in workitem.get('tasks', []):
            ro = [j]
            instance_uuid = task.get('instance_uuid')
            if instance_uuid:
                i = virt.from_db(instance_uuid)
                ro.append(i)

            if (task.get('type', '').startswith('instance_')
                    and not instance_uuid):
                logutil.error(ro, 'Instance task lacks instance uuid')
                return

            if instance_uuid:
                db.add_event('instance', instance_uuid,
                             task.get('type').replace('_', ' '), 'dequeued',
                             None, 'Work item %s' % jobname)

            logutil.info(
                ro,
                'Executing task %s: %s' % (task.get('type', 'unknown'), task))
            if task.get('type') == 'image_fetch':
                image_fetch(task.get('url'), instance_uuid)

            if task.get('type') == 'instance_preflight':
                redirect_to = instance_preflight(instance_uuid,
                                                 task.get('network'))
                if redirect_to:
                    logutil.info(
                        ro, 'Redirecting instance start to %s' % redirect_to)
                    db.place_instance(instance_uuid, redirect_to)
                    db.enqueue(redirect_to, workitem)
                    return

            if task.get('type') == 'instance_start':
                instance_start(instance_uuid, task.get('network'))
                db.update_instance_state(instance_uuid, 'created')
                db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'), {})

            if task.get('type') == 'instance_delete':
                try:
                    instance_delete(instance_uuid)
                    db.update_instance_state(instance_uuid,
                                             task.get('next_state', 'unknown'))
                    if task.get('next_state_message'):
                        db.update_instance_error_message(
                            instance_uuid, task.get('next_state_message'))
                    db.enqueue('%s-metrics' % config.parsed.get('NODE_NAME'),
                               {})
                except Exception as e:
                    util.ignore_exception(daemon.process_name('queues'), e)

    except Exception as e:
        util.ignore_exception(daemon.process_name('queues'), e)
        if instance_uuid:
            db.enqueue_instance_delete(config.parsed.get('NODE_NAME'),
                                       instance_uuid, 'error',
                                       'failed queue task: %s' % e)

    finally:
        db.resolve(config.parsed.get('NODE_NAME'), jobname)
        if instance_uuid:
            db.add_event('instance', instance_uuid, 'tasks complete',
                         'dequeued', None, 'Work item %s' % jobname)
        logutil.info([j], 'Completed workitem')
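Unlike the class-based tasks in the later handlers, this older handler consumes plain dicts. An illustrative workitem mirroring the task types it checks for (values are placeholders):

# Illustrative dict-based workitem for the older handler above.
workitem = {'tasks': [
    {'type': 'image_fetch', 'instance_uuid': instance_uuid,
     'url': 'https://example.com/ubuntu.qcow2'},
    {'type': 'instance_preflight', 'instance_uuid': instance_uuid,
     'network': network},
    {'type': 'instance_start', 'instance_uuid': instance_uuid,
     'network': network},
]}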