def test_claim_resources(self, mock_is_rebuild, mock_client):
    """Verify that claim_resources() delegates to the placement client.

    The first claim must use the user_id from the request context; the
    second claim verifies that a user_id set directly on the RequestSpec
    takes precedence over the context's user_id.
    """
    mock_is_rebuild.return_value = False
    mock_client.claim_resources.return_value = True
    context = nova_context.RequestContext(user_id=uuids.user_id)
    request_spec = objects.RequestSpec(project_id=uuids.project_id)

    claimed = utils.claim_resources(
        context, mock_client, request_spec, uuids.instance,
        mock.sentinel.alloc_req)

    self.assertTrue(claimed)
    mock_client.claim_resources.assert_called_once_with(
        context, uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.user_id, allocation_request_version=None,
        consumer_generation=None)

    # Now do it again but with RequestSpec.user_id set.
    request_spec.user_id = uuids.spec_user_id
    mock_client.reset_mock()

    utils.claim_resources(
        context, mock_client, request_spec, uuids.instance,
        mock.sentinel.alloc_req)

    mock_client.claim_resources.assert_called_once_with(
        context, uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.spec_user_id, allocation_request_version=None,
        consumer_generation=None)
def test_claim_resources(self, mock_is_rebuild, mock_client):
    """Tests that when claim_resources() is called, that we appropriately
    call the placement client to claim resources for the instance.
    """
    mock_is_rebuild.return_value = False
    ctx = nova_context.RequestContext(user_id=uuids.user_id)
    spec_obj = objects.RequestSpec(project_id=uuids.project_id)
    instance_uuid = uuids.instance
    alloc_req = mock.sentinel.alloc_req
    mock_client.claim_resources.return_value = True

    res = utils.claim_resources(ctx, mock_client, spec_obj, instance_uuid,
        alloc_req)

    # With no user_id on the RequestSpec, the context's user_id is used
    # for the claim.
    mock_client.claim_resources.assert_called_once_with(
        ctx, uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.user_id, allocation_request_version=None,
        consumer_generation=None)
    self.assertTrue(res)

    # Now do it again but with RequestSpec.user_id set.
    spec_obj.user_id = uuids.spec_user_id
    mock_client.reset_mock()
    utils.claim_resources(ctx, mock_client, spec_obj, instance_uuid,
        alloc_req)
    # RequestSpec.user_id takes precedence over the context's user_id.
    mock_client.claim_resources.assert_called_once_with(
        ctx, uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.spec_user_id, allocation_request_version=None,
        consumer_generation=None)
def test_claim_resouces_for_policy_check(self, mock_is_rebuild, mock_client):
    """A rebuild "claim" is a policy check only: it must report success
    without ever calling the placement client.
    """
    mock_is_rebuild.return_value = True
    fake_context = mock.Mock(user_id=uuids.user_id)

    result = utils.claim_resources(
        fake_context, None, mock.sentinel.spec_obj,
        mock.sentinel.instance_uuid, [])

    self.assertTrue(result)
    mock_is_rebuild.assert_called_once_with(mock.sentinel.spec_obj)
    mock_client.claim_resources.assert_not_called()
def test_claim_resouces_for_policy_check(self, mock_is_rebuild, mock_client):
    """For a rebuild, claim_resources() is only a policy check and must
    succeed without contacting placement.
    """
    mock_is_rebuild.return_value = True
    ctx = mock.Mock(user_id=uuids.user_id)
    # Passing report_client=None shows this code path never needs the
    # placement client.
    res = utils.claim_resources(ctx, None, mock.sentinel.spec_obj,
        mock.sentinel.instance_uuid, [])
    self.assertTrue(res)
    mock_is_rebuild.assert_called_once_with(mock.sentinel.spec_obj)
    self.assertFalse(mock_client.claim_resources.called)
def _reschedule(self):
    """Pick the next usable alternate host and claim resources on it.

    Pops hosts off self.host_list until one both supports the resource
    request and accepts the placement claim, then returns that Selection.

    :raises MaxRetriesExceeded: when every alternate host has been
        exhausted without a successful claim.
    """
    # Since the resources on these alternates may have been consumed and
    # might not be able to support the migrated instance, we need to first
    # claim the resources to verify the host still has sufficient
    # available resources.
    elevated_context = self.context.elevated()
    candidate = None
    claimed = False
    while self.host_list and not claimed:
        candidate = self.host_list.pop(0)
        too_old = (self.request_spec.requested_resources and
                   not self._support_resource_request(candidate))
        if too_old:
            LOG.debug(
                'Scheduler returned alternate host %(host)s as a possible '
                'migration target for re-schedule but that host is not '
                'new enough to support the migration with resource '
                'request %(request)s. Trying another alternate.',
                {'host': candidate.service_host,
                 'request': self.request_spec.requested_resources},
                instance=self.instance)
            continue
        alloc_req = (jsonutils.loads(candidate.allocation_request)
                     if candidate.allocation_request else None)
        if not alloc_req:
            # Some deployments use different schedulers that do not
            # use Placement, so they will not have an
            # allocation_request to claim with. For those cases,
            # there is no concept of claiming, so just assume that
            # the host is valid.
            claimed = True
            continue
        # If this call succeeds, the resources on the destination
        # host will be claimed by the instance.
        claimed = scheduler_utils.claim_resources(
            elevated_context, self.reportclient, self.request_spec,
            self.instance.uuid, alloc_req,
            candidate.allocation_request_version)
        if claimed:
            scheduler_utils.fill_provider_mapping(
                self.context, self.reportclient, self.request_spec,
                candidate)

    # There are no more available hosts. Raise a MaxRetriesExceeded
    # exception in that case.
    if not claimed:
        reason = ("Exhausted all hosts available for retrying build "
                  "failures for instance %(instance_uuid)s." %
                  {"instance_uuid": self.instance.uuid})
        raise exception.MaxRetriesExceeded(reason=reason)

    return candidate
def test_claim_resources(self, mock_is_rebuild, mock_client):
    """Verify that a non-rebuild claim is forwarded to the placement
    client with the spec's project_id and the context's user_id.
    """
    mock_is_rebuild.return_value = False
    mock_client.claim_resources.return_value = True
    fake_context = mock.Mock(user_id=uuids.user_id)
    fake_spec = mock.Mock(project_id=uuids.project_id)

    result = utils.claim_resources(
        fake_context, mock_client, fake_spec, uuids.instance,
        mock.sentinel.alloc_req)

    self.assertTrue(result)
    mock_client.claim_resources.assert_called_once_with(
        uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.user_id, allocation_request_version=None)
def test_claim_resources(self, mock_is_rebuild, mock_client):
    """Tests that when claim_resources() is called, that we appropriately
    call the placement client to claim resources for the instance.
    """
    mock_is_rebuild.return_value = False
    ctx = mock.Mock(user_id=uuids.user_id)
    spec_obj = mock.Mock(project_id=uuids.project_id)
    instance_uuid = uuids.instance
    alloc_req = mock.sentinel.alloc_req
    mock_client.claim_resources.return_value = True
    res = utils.claim_resources(ctx, mock_client, spec_obj, instance_uuid,
        alloc_req)
    # The claim is made on behalf of the spec's project and the context's
    # user, against the allocation request chosen by the scheduler.
    mock_client.claim_resources.assert_called_once_with(
        ctx, uuids.instance, mock.sentinel.alloc_req, uuids.project_id,
        uuids.user_id, allocation_request_version=None)
    self.assertTrue(res)
def _schedule(self, context, spec_obj, instance_uuids,
        alloc_reqs_by_rp_uuid, provider_summaries,
        allocation_request_version=None, return_alternates=False):
    """Returns a list of lists of Selection objects.

    :param context: The RequestContext object
    :param spec_obj: The RequestSpec object
    :param instance_uuids: List of instance UUIDs to place or move.
    :param alloc_reqs_by_rp_uuid: Optional dict, keyed by resource
        provider UUID, of the allocation_requests that may be used to
        claim resources against matched hosts. If None, indicates either
        the placement API wasn't reachable or that there were no
        allocation_requests returned by the placement API. If the latter,
        the provider_summaries will be an empty dict, not None.
    :param provider_summaries: Optional dict, keyed by resource provider
        UUID, of information that will be used by the filters/weighers in
        selecting matching hosts for a request. If None, indicates that
        the scheduler driver should grab all compute node information
        locally and that the Placement API is not used. If an empty dict,
        indicates the Placement API returned no potential matches for the
        requested resources.
    :param allocation_request_version: The microversion used to request
        the allocations.
    :param return_alternates: When True, zero or more alternate hosts are
        returned with each selected host. The number of alternates is
        determined by the configuration option
        `CONF.scheduler.max_attempts`.
    """
    elevated = context.elevated()

    # Find our local list of acceptable hosts by repeatedly
    # filtering and weighing our options. Each time we choose a
    # host, we virtually consume resources on it so subsequent
    # selections can adjust accordingly.

    # Note: remember, we are using a generator-iterator here. So only
    # traverse this list once. This can bite you if the hosts
    # are being scanned in a filter or weighing function.
    hosts = self._get_all_host_states(elevated, spec_obj,
        provider_summaries)

    # NOTE(sbauza): The RequestSpec.num_instances field contains the number
    # of instances created when the RequestSpec was used to first boot some
    # instances. This is incorrect when doing a move or resize operation,
    # so prefer the length of instance_uuids unless it is None.
    num_instances = (len(instance_uuids) if instance_uuids
                     else spec_obj.num_instances)

    # For each requested instance, we want to return a host whose resources
    # for the instance have been claimed, along with zero or more
    # alternates. These alternates will be passed to the cell that the
    # selected host is in, so that if for some reason the build fails, the
    # cell conductor can retry building the instance on one of these
    # alternates instead of having to simply fail. The number of alternates
    # is based on CONF.scheduler.max_attempts; note that if there are not
    # enough filtered hosts to provide the full number of alternates, the
    # list of hosts may be shorter than this amount.
    num_alts = (CONF.scheduler.max_attempts - 1
                if return_alternates else 0)

    if (instance_uuids is None or
            not self.USES_ALLOCATION_CANDIDATES or
            alloc_reqs_by_rp_uuid is None):
        # We need to support the caching scheduler, which doesn't use the
        # placement API (and has USES_ALLOCATION_CANDIDATE = False) and
        # therefore we skip all the claiming logic for that scheduler
        # driver. Also, if there was a problem communicating with the
        # placement API, alloc_reqs_by_rp_uuid will be None, so we skip
        # claiming in that case as well. In the case where instance_uuids
        # is None, that indicates an older conductor, so we need to return
        # the objects without alternates. They will be converted back to
        # the older dict format representing HostState objects.
        return self._legacy_find_hosts(context, num_instances, spec_obj,
                                       hosts, num_alts,
                                       instance_uuids=instance_uuids)

    # A list of the instance UUIDs that were successfully claimed against
    # in the placement API. If we are not able to successfully claim for
    # all involved instances, we use this list to remove those allocations
    # before returning
    claimed_instance_uuids = []

    # The list of hosts that have been selected (and claimed).
    claimed_hosts = []

    for num, instance_uuid in enumerate(instance_uuids):
        # In a multi-create request, the first request spec from the list
        # is passed to the scheduler and that request spec's instance_uuid
        # might not be the same as the instance we're processing, so we
        # update the instance_uuid in that case before passing the request
        # spec to filters since at least one filter
        # (ServerGroupAntiAffinityFilter) depends on that information being
        # accurate.
        spec_obj.instance_uuid = instance_uuid
        # Reset the field so it's not persisted accidentally.
        spec_obj.obj_reset_changes(['instance_uuid'])

        hosts = self._get_sorted_hosts(spec_obj, hosts, num)
        if not hosts:
            # NOTE(jaypipes): If we get here, that means not all instances
            # in instance_uuids were able to be matched to a selected host.
            # Any allocations will be cleaned up in the
            # _ensure_sufficient_hosts() call.
            break

        # Attempt to claim the resources against one or more resource
        # providers, looping over the sorted list of possible hosts
        # looking for an allocation_request that contains that host's
        # resource provider UUID
        claimed_host = None
        for host in hosts:
            cn_uuid = host.uuid
            if cn_uuid not in alloc_reqs_by_rp_uuid:
                msg = ("A host state with uuid = '%s' that did not have a "
                       "matching allocation_request was encountered while "
                       "scheduling. This host was skipped.")
                LOG.debug(msg, cn_uuid)
                continue

            alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
            # TODO(jaypipes): Loop through all allocation_requests instead
            # of just trying the first one. For now, since we'll likely
            # want to order the allocation_requests in the future based on
            # information in the provider summaries, we'll just try to
            # claim resources using the first allocation_request
            alloc_req = alloc_reqs[0]
            if utils.claim_resources(
                    elevated, self.placement_client, spec_obj,
                    instance_uuid, alloc_req,
                    allocation_request_version=allocation_request_version):
                claimed_host = host
                break

        if claimed_host is None:
            # We weren't able to claim resources in the placement API
            # for any of the sorted hosts identified. So, clean up any
            # successfully-claimed resources for prior instances in
            # this request and return an empty list which will cause
            # select_destinations() to raise NoValidHost
            LOG.debug("Unable to successfully claim against any host.")
            break

        claimed_instance_uuids.append(instance_uuid)
        claimed_hosts.append(claimed_host)

        # Now consume the resources so the filter/weights will change for
        # the next instance.
        self._consume_selected_host(claimed_host, spec_obj,
                                    instance_uuid=instance_uuid)

    # Check if we were able to fulfill the request. If not, this call will
    # raise a NoValidHost exception.
    self._ensure_sufficient_hosts(context, claimed_hosts, num_instances,
            claimed_instance_uuids)

    # We have selected and claimed hosts for each instance. Now we need to
    # find alternates for each host.
    # NOTE(review): `num` here is the loop index of the last instance
    # processed above — confirm _get_alternate_hosts expects that value.
    selections_to_return = self._get_alternate_hosts(
        claimed_hosts, spec_obj, hosts, num, num_alts,
        alloc_reqs_by_rp_uuid, allocation_request_version)

    return selections_to_return
def _execute(self):
    """Select a destination for the migration and cast prep_resize to it.

    On an initial call, asks the scheduler for a selected host plus
    alternates; on a reschedule, claims resources against the supplied
    alternates instead.

    :raises MaxRetriesExceeded: if rescheduling and no alternate host can
        be claimed.
    """
    # TODO(sbauza): Remove that once prep_resize() accepts a RequestSpec
    # object in the signature and all the scheduler.utils methods too
    legacy_spec = self.request_spec.to_legacy_request_spec_dict()
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)

    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
                self.request_spec.requested_destination and
                    'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # NOTE(danms): Right now we only support migrate to the same
    # cell as the current instance, so request that the scheduler
    # limit thusly.
    instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
        self.context, self.instance.uuid)
    LOG.debug('Requesting cell %(cell)s while migrating',
              {'cell': instance_mapping.cell_mapping.identity},
              instance=self.instance)
    if ('requested_destination' in self.request_spec and
            self.request_spec.requested_destination):
        self.request_spec.requested_destination.cell = (
            instance_mapping.cell_mapping)
        # NOTE(takashin): In the case that the target host is specified,
        # if the migration is failed, it is not necessary to retry
        # the cold migration to the same host. So make sure that
        # reschedule will not occur.
        if 'host' in self.request_spec.requested_destination:
            legacy_props.pop('retry', None)
            self.request_spec.retry = None
    else:
        self.request_spec.requested_destination = objects.Destination(
            cell=instance_mapping.cell_mapping)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
    # the rollback() method should revert the allocation swaparoo and move
    # the source node allocation from the migration record back to the
    # instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates will
    # be passed to this object and stored in 'self.host_list', so we can
    # pop the first alternate from the list to use for the destination, and
    # pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.scheduler_client.select_destinations(
                self.context, self.request_spec, [self.instance.uuid],
                return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient available
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(
            self.context, host))

    # FIXME(sbauza): Serialize/Unserialize the legacy dict because of
    # oslo.messaging #1529084 to transform datetime values into strings.
    # tl;dr: datetimes in dicts are not accepted as correct values by the
    # rpc fake driver.
    legacy_spec = jsonutils.loads(jsonutils.dumps(legacy_spec))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(
        self.context, self.instance, legacy_spec['image'],
        self.flavor, host, migration,
        request_spec=legacy_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown,
        host_list=self.host_list)
def _get_host_supporting_request(self, selection_list):
    """Return the first compute selection from the selection_list where
    the service is new enough to support resource request during migration
    and the resources claimed successfully.

    :param selection_list: a list of Selection objects returned by the
        scheduler
    :return: A two tuple. The first item is a Selection object
        representing the host that supports the request. The second item
        is a list of Selection objects representing the remaining
        alternate hosts.
    :raises MaxRetriesExceeded: if none of the hosts in the
        selection_list is new enough to support the request or we cannot
        claim resource on any of the hosts that are new enough.
    """
    # Fast path: with no resource request there is nothing to verify.
    if not self.request_spec.requested_resources:
        return selection_list[0], selection_list[1:]

    # Scheduler allocated resources on the first host. So check if the
    # first host is new enough
    if self._support_resource_request(selection_list[0]):
        return selection_list[0], selection_list[1:]

    # First host is old, so we need to use an alternate. Therefore we have
    # to remove the allocation from the first host.
    self.reportclient.delete_allocation_for_instance(
        self.context, self.instance.uuid)
    LOG.debug(
        'Scheduler returned host %(host)s as a possible migration target '
        'but that host is not new enough to support the migration with '
        'resource request %(request)s or the compute RPC is pinned to '
        'less than 5.2. Trying alternate hosts.',
        {'host': selection_list[0].service_host,
         'request': self.request_spec.requested_resources},
        instance=self.instance)

    alternates = selection_list[1:]

    for i, selection in enumerate(alternates):
        if self._support_resource_request(selection):
            # this host is new enough so we need to try to claim resources
            # on it
            if selection.allocation_request:
                alloc_req = jsonutils.loads(
                    selection.allocation_request)
                resource_claimed = scheduler_utils.claim_resources(
                    self.context, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)

                if not resource_claimed:
                    LOG.debug(
                        'Scheduler returned alternate host %(host)s as a '
                        'possible migration target but resource claim '
                        'failed on that host. Trying another alternate.',
                        {'host': selection.service_host},
                        instance=self.instance)
                else:
                    # Claim succeeded; the remaining alternates after this
                    # one are returned for any later reschedule.
                    return selection, alternates[i + 1:]

            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the resources are available.
                return selection, alternates[i + 1:]

        else:
            LOG.debug(
                'Scheduler returned alternate host %(host)s as a possible '
                'migration target but that host is not new enough to '
                'support the migration with resource request %(request)s '
                'or the compute RPC is pinned to less than 5.2. '
                'Trying another alternate.',
                {'host': selection.service_host,
                 'request': self.request_spec.requested_resources},
                instance=self.instance)

    # if we reach this point then none of the hosts was new enough for the
    # request or we failed to claim resources on every alternate
    reason = ("Exhausted all hosts available during compute service level "
              "check for instance %(instance_uuid)s." %
              {"instance_uuid": self.instance.uuid})
    raise exception.MaxRetriesExceeded(reason=reason)
def _execute(self):
    """Select a destination for the migration and cast prep_resize to it.

    On an initial call, asks the scheduler (query_client) for a selected
    host plus alternates; on a reschedule, claims resources against the
    supplied alternates instead.

    :raises MaxRetriesExceeded: if rescheduling and no alternate host can
        be claimed.
    """
    # TODO(sbauza): Remove once all the scheduler.utils methods accept a
    # RequestSpec object in the signature.
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)

    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
                self.request_spec.requested_destination and
                    'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # TODO(gibi): We need to make sure that the requested_resources field
    # is re calculated based on neutron ports.

    self._restrict_request_spec_to_cell(legacy_props)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
    # the rollback() method should revert the allocation swaparoo and move
    # the source node allocation from the migration record back to the
    # instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    self.request_spec.ensure_network_metadata(self.instance)
    compute_utils.heal_reqspec_is_bfv(
        self.context, self.request_spec, self.instance)
    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates will
    # be passed to this object and stored in 'self.host_list', so we can
    # pop the first alternate from the list to use for the destination, and
    # pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.query_client.select_destinations(
            self.context, self.request_spec, [self.instance.uuid],
            return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient available
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(
            self.context, host))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(
        # NOTE(mriedem): Using request_spec.image here is potentially
        # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
        # seems like the sane thing to do would be to pass the current
        # instance.image_meta since that is what MoveClaim will use for
        # any NUMA topology claims on the destination host...
        self.context, self.instance, self.request_spec.image,
        self.flavor, host, migration,
        request_spec=self.request_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown,
        host_list=self.host_list)
def _execute(self):
    """Select a destination for the migration and cast prep_resize to it.

    On an initial call, asks the scheduler (scheduler_client) for a
    selected host plus alternates; on a reschedule, claims resources
    against the supplied alternates instead.

    :raises MaxRetriesExceeded: if rescheduling and no alternate host can
        be claimed.
    """
    # TODO(sbauza): Remove that once prep_resize() accepts a RequestSpec
    # object in the signature and all the scheduler.utils methods too
    legacy_spec = self.request_spec.to_legacy_request_spec_dict()
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)

    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
                self.request_spec.requested_destination and
                    'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # NOTE(danms): Right now we only support migrate to the same
    # cell as the current instance, so request that the scheduler
    # limit thusly.
    instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
        self.context, self.instance.uuid)
    LOG.debug('Requesting cell %(cell)s while migrating',
              {'cell': instance_mapping.cell_mapping.identity},
              instance=self.instance)
    if ('requested_destination' in self.request_spec and
            self.request_spec.requested_destination):
        self.request_spec.requested_destination.cell = (
            instance_mapping.cell_mapping)
        # NOTE(takashin): In the case that the target host is specified,
        # if the migration is failed, it is not necessary to retry
        # the cold migration to the same host. So make sure that
        # reschedule will not occur.
        if 'host' in self.request_spec.requested_destination:
            legacy_props.pop('retry', None)
            self.request_spec.retry = None
    else:
        self.request_spec.requested_destination = objects.Destination(
            cell=instance_mapping.cell_mapping)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
    # the rollback() method should revert the allocation swaparoo and move
    # the source node allocation from the migration record back to the
    # instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    compute_utils.heal_reqspec_is_bfv(self.context, self.request_spec,
                                      self.instance)
    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates will
    # be passed to this object and stored in 'self.host_list', so we can
    # pop the first alternate from the list to use for the destination, and
    # pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.scheduler_client.select_destinations(
                self.context, self.request_spec, [self.instance.uuid],
                return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient available
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(self.context, host))

    # FIXME(sbauza): Serialize/Unserialize the legacy dict because of
    # oslo.messaging #1529084 to transform datetime values into strings.
    # tl;dr: datetimes in dicts are not accepted as correct values by the
    # rpc fake driver.
    legacy_spec = jsonutils.loads(jsonutils.dumps(legacy_spec))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(self.context, self.instance,
                                    legacy_spec['image'],
                                    self.flavor, host,
                                    migration,
                                    request_spec=legacy_spec,
                                    filter_properties=legacy_props,
                                    node=node,
                                    clean_shutdown=self.clean_shutdown,
                                    host_list=self.host_list)
def _schedule(self, context, spec_obj, instance_uuids,
        alloc_reqs_by_rp_uuid, provider_summaries,
        allocation_request_version=None, return_alternates=False):
    """Returns a list of lists of Selection objects.

    :param context: The RequestContext object
    :param spec_obj: The RequestSpec object
    :param instance_uuids: List of instance UUIDs to place or move.
    :param alloc_reqs_by_rp_uuid: Optional dict, keyed by resource provider
        UUID, of the allocation_requests that may be used to claim
        resources against matched hosts. If None, indicates either the
        placement API wasn't reachable or that there were no
        allocation_requests returned by the placement API. If the latter,
        the provider_summaries will be an empty dict, not None.
    :param provider_summaries: Optional dict, keyed by resource provider
        UUID, of information that will be used by the filters/weighers in
        selecting matching hosts for a request. If None, indicates that
        the scheduler driver should grab all compute node information
        locally and that the Placement API is not used. If an empty dict,
        indicates the Placement API returned no potential matches for the
        requested resources.
    :param allocation_request_version: The microversion used to request the
        allocations.
    :param return_alternates: When True, zero or more alternate hosts are
        returned with each selected host. The number of alternates is
        determined by the configuration option
        `CONF.scheduler.max_attempts`.
    """
    elevated = context.elevated()

    # Find our local list of acceptable hosts by repeatedly
    # filtering and weighing our options. Each time we choose a
    # host, we virtually consume resources on it so subsequent
    # selections can adjust accordingly.

    # Note: remember, we are using a generator-iterator here. So only
    # traverse this list once. This can bite you if the hosts
    # are being scanned in a filter or weighing function.
    hosts = self._get_all_host_states(elevated, spec_obj,
        provider_summaries)

    # NOTE(sbauza): The RequestSpec.num_instances field contains the number
    # of instances created when the RequestSpec was used to first boot some
    # instances. This is incorrect when doing a move or resize operation,
    # so prefer the length of instance_uuids unless it is None.
    num_instances = (len(instance_uuids) if instance_uuids
                     else spec_obj.num_instances)

    # For each requested instance, we want to return a host whose resources
    # for the instance have been claimed, along with zero or more
    # alternates. These alternates will be passed to the cell that the
    # selected host is in, so that if for some reason the build fails, the
    # cell conductor can retry building the instance on one of these
    # alternates instead of having to simply fail. The number of alternates
    # is based on CONF.scheduler.max_attempts; note that if there are not
    # enough filtered hosts to provide the full number of alternates, the
    # list of hosts may be shorter than this amount.
    num_alts = (CONF.scheduler.max_attempts - 1
                if return_alternates else 0)

    if (instance_uuids is None or
            not self.USES_ALLOCATION_CANDIDATES or
            alloc_reqs_by_rp_uuid is None):
        # We need to support the caching scheduler, which doesn't use the
        # placement API (and has USES_ALLOCATION_CANDIDATE = False) and
        # therefore we skip all the claiming logic for that scheduler
        # driver. Also, if there was a problem communicating with the
        # placement API, alloc_reqs_by_rp_uuid will be None, so we skip
        # claiming in that case as well. In the case where instance_uuids
        # is None, that indicates an older conductor, so we need to return
        # the objects without alternates. They will be converted back to
        # the older dict format representing HostState objects.
        return self._legacy_find_hosts(context, num_instances, spec_obj,
                                       hosts, num_alts)

    # A list of the instance UUIDs that were successfully claimed against
    # in the placement API. If we are not able to successfully claim for
    # all involved instances, we use this list to remove those allocations
    # before returning
    claimed_instance_uuids = []

    # The list of hosts that have been selected (and claimed).
    claimed_hosts = []

    # NOTE(review): the loop variable `num` is intentionally read after
    # this loop ends (passed to _get_alternate_hosts below), so it must
    # stay a plain `for` over range(num_instances).
    for num in range(num_instances):
        # Re-sort every iteration: _consume_selected_host() at the bottom
        # of the loop mutates the HostState weights/resources.
        hosts = self._get_sorted_hosts(spec_obj, hosts, num)
        if not hosts:
            # NOTE(jaypipes): If we get here, that means not all instances
            # in instance_uuids were able to be matched to a selected host.
            # Any allocations will be cleaned up in the
            # _ensure_sufficient_hosts() call.
            break

        instance_uuid = instance_uuids[num]

        # Attempt to claim the resources against one or more resource
        # providers, looping over the sorted list of possible hosts
        # looking for an allocation_request that contains that host's
        # resource provider UUID
        claimed_host = None
        for host in hosts:
            cn_uuid = host.uuid
            if cn_uuid not in alloc_reqs_by_rp_uuid:
                msg = ("A host state with uuid = '%s' that did not have a "
                       "matching allocation_request was encountered while "
                       "scheduling. This host was skipped.")
                LOG.debug(msg, cn_uuid)
                continue

            alloc_reqs = alloc_reqs_by_rp_uuid[cn_uuid]
            # TODO(jaypipes): Loop through all allocation_requests instead
            # of just trying the first one. For now, since we'll likely
            # want to order the allocation_requests in the future based on
            # information in the provider summaries, we'll just try to
            # claim resources using the first allocation_request
            alloc_req = alloc_reqs[0]
            if utils.claim_resources(elevated, self.placement_client,
                    spec_obj, instance_uuid, alloc_req,
                    allocation_request_version=allocation_request_version):
                claimed_host = host
                break

        if claimed_host is None:
            # We weren't able to claim resources in the placement API
            # for any of the sorted hosts identified. So, clean up any
            # successfully-claimed resources for prior instances in
            # this request and return an empty list which will cause
            # select_destinations() to raise NoValidHost
            LOG.debug("Unable to successfully claim against any host.")
            break

        claimed_instance_uuids.append(instance_uuid)
        claimed_hosts.append(claimed_host)

        # Now consume the resources so the filter/weights will change for
        # the next instance.
        self._consume_selected_host(claimed_host, spec_obj)

    # Check if we were able to fulfill the request. If not, this call will
    # raise a NoValidHost exception.
    self._ensure_sufficient_hosts(context, claimed_hosts, num_instances,
            claimed_instance_uuids)

    # We have selected and claimed hosts for each instance. Now we need to
    # find alternates for each host. `num` here is the index of the last
    # loop iteration that ran (see NOTE above).
    selections_to_return = self._get_alternate_hosts(
        claimed_hosts, spec_obj, hosts, num, num_alts,
        alloc_reqs_by_rp_uuid, allocation_request_version)
    return selections_to_return
def _execute(self):
    """Select a destination host (or validate a supplied alternate on a
    reschedule) and cast prep_resize to it to start the migration.

    On the first pass self.host_list is None: the scheduler is asked for a
    selected host plus alternates. On a reschedule, self.host_list holds
    the remaining alternates and we claim resources against them in order
    until one succeeds, raising MaxRetriesExceeded when they are exhausted.
    """
    # TODO(sbauza): Remove once all the scheduler.utils methods accept a
    # RequestSpec object in the signature.
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)
    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
                self.request_spec.requested_destination and
                    'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # TODO(gibi): We need to make sure that the requested_resources field
    # is re calculated based on neutron ports.

    self._restrict_request_spec_to_cell(legacy_props)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
    # the rollback() method should revert the allocation swaparoo and move
    # the source node allocation from the migration record back to the
    # instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    self.request_spec.ensure_network_metadata(self.instance)
    compute_utils.heal_reqspec_is_bfv(
        self.context, self.request_spec, self.instance)

    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates will
    # be passed to this object and stored in 'self.host_list', so we can
    # pop the first alternate from the list to use for the destination, and
    # pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.query_client.select_destinations(
            self.context, self.request_spec, [self.instance.uuid],
            return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient availabile
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    # Keep the instance's AZ in sync with the chosen destination host.
    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(
            self.context, host))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(
        # NOTE(mriedem): Using request_spec.image here is potentially
        # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
        # seems like the sane thing to do would be to pass the current
        # instance.image_meta since that is what MoveClaim will use for
        # any NUMA topology claims on the destination host...
        self.context, self.instance, self.request_spec.image,
        self.flavor, host, migration,
        request_spec=self.request_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown,
        host_list=self.host_list)