def populate_retry(filter_properties, instance_uuid):
    max_attempts = CONF.scheduler.max_attempts
    force_hosts = filter_properties.get('force_hosts', [])
    force_nodes = filter_properties.get('force_nodes', [])

    # In the case of multiple force hosts/nodes, scheduler should not
    # disable retry filter but traverse all force hosts/nodes one by
    # one till scheduler gets a valid target host.
    if (max_attempts == 1 or len(force_hosts) == 1
            or len(force_nodes) == 1):
        # re-scheduling is disabled.
        return

    # retry is enabled, update attempt count:
    retry = filter_properties.setdefault(
        'retry', {
            'num_attempts': 0,
            'hosts': []  # list of compute hosts tried
        })
    retry['num_attempts'] += 1

    _log_compute_error(instance_uuid, retry)
    exc_reason = retry.pop('exc_reason', None)
    if retry['num_attempts'] > max_attempts:
        msg = (_('Exceeded max scheduling attempts %(max_attempts)d '
                 'for instance %(instance_uuid)s. '
                 'Last exception: %(exc_reason)s')
               % {'max_attempts': max_attempts,
                  'instance_uuid': instance_uuid,
                  'exc_reason': exc_reason})
        raise exception.MaxRetriesExceeded(reason=msg)
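
# Illustrative sketch (not Nova source): populate_retry mutates the
# caller's filter_properties dict in place and only raises once the
# attempt counter passes the configured cap. The helper below stubs out
# the Nova plumbing (CONF, _log_compute_error, MaxRetriesExceeded) with
# hypothetical stand-ins.
def _sketch_populate_retry(filter_properties, max_attempts=3):
    # Same bookkeeping as populate_retry above, minus the Nova plumbing.
    retry = filter_properties.setdefault(
        'retry', {'num_attempts': 0, 'hosts': []})
    retry['num_attempts'] += 1
    if retry['num_attempts'] > max_attempts:
        raise RuntimeError(
            'Exceeded max scheduling attempts %d' % max_attempts)

props = {}
for _ in range(3):
    _sketch_populate_retry(props)  # attempts 1..3 are within the cap
assert props['retry']['num_attempts'] == 3
# A fourth call would raise, mirroring MaxRetriesExceeded.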
def _check_not_over_max_retries(self, attempted_hosts):
    if CONF.migrate_max_retries == -1:
        return

    retries = len(attempted_hosts) - 1
    if retries > CONF.migrate_max_retries:
        msg = (_('Exceeded max scheduling retries %(max_retries)d for '
                 'instance %(instance_uuid)s during live migration')
               % {'max_retries': CONF.migrate_max_retries,
                  'instance_uuid': self.instance.uuid})
        raise exception.MaxRetriesExceeded(reason=msg)
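
# Two details above are easy to miss: migrate_max_retries == -1 disables
# the cap entirely, and the first entry in attempted_hosts is the
# original attempt rather than a retry, hence the len(...) - 1. A small
# worked example with hypothetical values:
migrate_max_retries = 2                     # hypothetical config value
attempted_hosts = ['src', 'alt1', 'alt2']   # original attempt + 2 retries
retries = len(attempted_hosts) - 1          # == 2: at the cap, no exception
assert retries <= migrate_max_retries
# A fourth attempted host would make retries == 3 > 2 and trigger
# MaxRetriesExceeded on the next check.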
def _reschedule(self):
    # Since the resources on these alternates may have been consumed and
    # might not be able to support the migrated instance, we need to first
    # claim the resources to verify the host still has sufficient
    # available resources.
    elevated = self.context.elevated()
    host_available = False
    selection = None
    while self.host_list and not host_available:
        selection = self.host_list.pop(0)
        if (self.request_spec.requested_resources and
                not self._support_resource_request(selection)):
            LOG.debug(
                'Scheduler returned alternate host %(host)s as a possible '
                'migration target for re-schedule but that host is not '
                'new enough to support the migration with resource '
                'request %(request)s. Trying another alternate.',
                {'host': selection.service_host,
                 'request': self.request_spec.requested_resources},
                instance=self.instance)
            continue
        if selection.allocation_request:
            alloc_req = jsonutils.loads(selection.allocation_request)
        else:
            alloc_req = None
        if alloc_req:
            # If this call succeeds, the resources on the destination
            # host will be claimed by the instance.
            host_available = scheduler_utils.claim_resources(
                elevated, self.reportclient, self.request_spec,
                self.instance.uuid, alloc_req,
                selection.allocation_request_version)
            if host_available:
                scheduler_utils.fill_provider_mapping(
                    self.context, self.reportclient, self.request_spec,
                    selection)
        else:
            # Some deployments use different schedulers that do not
            # use Placement, so they will not have an
            # allocation_request to claim with. For those cases,
            # there is no concept of claiming, so just assume that
            # the host is valid.
            host_available = True

    # There are no more available hosts. Raise a MaxRetriesExceeded
    # exception in that case.
    if not host_available:
        reason = ("Exhausted all hosts available for retrying build "
                  "failures for instance %(instance_uuid)s." %
                  {"instance_uuid": self.instance.uuid})
        raise exception.MaxRetriesExceeded(reason=reason)

    return selection
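
# The loop above is a pop-and-claim pattern: consume alternates from the
# front of self.host_list until one claim succeeds or the list is empty,
# leaving the untried tail available for later reschedules. A
# stripped-down sketch, where claim_one is a hypothetical stand-in for
# scheduler_utils.claim_resources():
def _sketch_pick_first_claimable(host_list, claim_one):
    while host_list:
        selection = host_list.pop(0)
        if claim_one(selection):
            return selection
    # Nova raises MaxRetriesExceeded at this point.
    raise RuntimeError('Exhausted all hosts')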
def _get_host_supporting_request(self, selection_list):
    """Return the first compute selection from the selection_list where
    the service is new enough to support resource request during migration
    and the resources claimed successfully.

    :param selection_list: a list of Selection objects returned by the
        scheduler
    :return: A two tuple. The first item is a Selection object
        representing the host that supports the request. The second item
        is a list of Selection objects representing the remaining
        alternate hosts.
    :raises MaxRetriesExceeded: if none of the hosts in the
        selection_list is new enough to support the request or we cannot
        claim resources on any of the hosts that are new enough.
    """
    if not self.request_spec.requested_resources:
        return selection_list[0], selection_list[1:]

    # Scheduler allocated resources on the first host. So check if the
    # first host is new enough
    if self._support_resource_request(selection_list[0]):
        return selection_list[0], selection_list[1:]

    # First host is old, so we need to use an alternate. Therefore we have
    # to remove the allocation from the first host.
    self.reportclient.delete_allocation_for_instance(
        self.context, self.instance.uuid)
    LOG.debug(
        'Scheduler returned host %(host)s as a possible migration target '
        'but that host is not new enough to support the migration with '
        'resource request %(request)s or the compute RPC is pinned to '
        'less than 5.2. Trying alternate hosts.',
        {'host': selection_list[0].service_host,
         'request': self.request_spec.requested_resources},
        instance=self.instance)

    alternates = selection_list[1:]

    for i, selection in enumerate(alternates):
        if self._support_resource_request(selection):
            # this host is new enough so we need to try to claim resources
            # on it
            if selection.allocation_request:
                alloc_req = jsonutils.loads(
                    selection.allocation_request)
                resource_claimed = scheduler_utils.claim_resources(
                    self.context, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)

                if not resource_claimed:
                    LOG.debug(
                        'Scheduler returned alternate host %(host)s as a '
                        'possible migration target but resource claim '
                        'failed on that host. Trying another alternate.',
                        {'host': selection.service_host},
                        instance=self.instance)
                else:
                    return selection, alternates[i + 1:]
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the resources are available.
                return selection, alternates[i + 1:]
        else:
            LOG.debug(
                'Scheduler returned alternate host %(host)s as a possible '
                'migration target but that host is not new enough to '
                'support the migration with resource request %(request)s '
                'or the compute RPC is pinned to less than 5.2. '
                'Trying another alternate.',
                {'host': selection.service_host,
                 'request': self.request_spec.requested_resources},
                instance=self.instance)

    # if we reach this point then none of the hosts was new enough for the
    # request or we failed to claim resources on every alternate
    reason = ("Exhausted all hosts available during compute service level "
              "check for instance %(instance_uuid)s." %
              {"instance_uuid": self.instance.uuid})
    raise exception.MaxRetriesExceeded(reason=reason)
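
# Each successful return above hands back the chosen Selection plus the
# untried tail of the list, so the caller can pass the remainder to the
# compute as further reschedule candidates. A schematic of that slicing,
# with hypothetical host names:
alternates = ['alt1', 'alt2', 'alt3']
for i, host in enumerate(alternates):
    if host == 'alt2':  # pretend alt2 passes the checks and the claim
        chosen, remaining = host, alternates[i + 1:]
        break
assert (chosen, remaining) == ('alt2', ['alt3'])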
def _execute(self):
    # TODO(sbauza): Remove that once prep_resize() accepts a RequestSpec
    # object in the signature and all the scheduler.utils methods too
    legacy_spec = self.request_spec.to_legacy_request_spec_dict()
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)
    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
            self.request_spec.requested_destination and
            'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # NOTE(danms): Right now we only support migrate to the same
    # cell as the current instance, so request that the scheduler
    # limit thusly.
    instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
        self.context, self.instance.uuid)
    LOG.debug('Requesting cell %(cell)s while migrating',
              {'cell': instance_mapping.cell_mapping.identity},
              instance=self.instance)
    if ('requested_destination' in self.request_spec and
            self.request_spec.requested_destination):
        self.request_spec.requested_destination.cell = (
            instance_mapping.cell_mapping)
        # NOTE(takashin): In the case that the target host is specified,
        # if the migration is failed, it is not necessary to retry
        # the cold migration to the same host. So make sure that
        # reschedule will not occur.
        if 'host' in self.request_spec.requested_destination:
            legacy_props.pop('retry', None)
            self.request_spec.retry = None
    else:
        self.request_spec.requested_destination = objects.Destination(
            cell=instance_mapping.cell_mapping)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise
    # MaxRetriesExceeded, the rollback() method should revert the
    # allocation swaparoo and move the source node allocation from the
    # migration record back to the instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    compute_utils.heal_reqspec_is_bfv(self.context, self.request_spec,
                                      self.instance)
    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates
    # will be passed to this object and stored in 'self.host_list', so we
    # can pop the first alternate from the list to use for the
    # destination, and pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.scheduler_client.select_destinations(
            self.context, self.request_spec, [self.instance.uuid],
            return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient available
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(
            self.context, host))

    # FIXME(sbauza): Serialize/Unserialize the legacy dict because of
    # oslo.messaging #1529084 to transform datetime values into strings.
    # tl;dr: datetimes in dicts are not accepted as correct values by the
    # rpc fake driver.
    legacy_spec = jsonutils.loads(jsonutils.dumps(legacy_spec))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(
        self.context, self.instance, legacy_spec['image'],
        self.flavor, host, migration,
        request_spec=legacy_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown,
        host_list=self.host_list)
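
# The jsonutils round-trip in the FIXME above is a blunt but effective
# way to coerce every datetime buried in a nested dict into a string,
# relying on oslo.serialization's default to_primitive handler. A
# minimal illustration with a hypothetical spec dict:
import datetime
from oslo_serialization import jsonutils

spec = {'image': {'created_at': datetime.datetime(2019, 1, 1)}}
spec = jsonutils.loads(jsonutils.dumps(spec))
assert isinstance(spec['image']['created_at'], str)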
def _execute(self):
    # TODO(sbauza): Remove once all the scheduler.utils methods accept a
    # RequestSpec object in the signature.
    legacy_props = self.request_spec.to_legacy_filter_properties_dict()
    scheduler_utils.setup_instance_group(self.context, self.request_spec)
    # If a target host is set in a requested destination,
    # 'populate_retry' need not be executed.
    if not ('requested_destination' in self.request_spec and
            self.request_spec.requested_destination and
            'host' in self.request_spec.requested_destination):
        scheduler_utils.populate_retry(legacy_props,
                                       self.instance.uuid)

    # NOTE(sbauza): Force_hosts/nodes needs to be reset
    # if we want to make sure that the next destination
    # is not forced to be the original host
    self.request_spec.reset_forced_destinations()

    # TODO(gibi): We need to make sure that the requested_resources field
    # is re-calculated based on neutron ports.

    self._restrict_request_spec_to_cell(legacy_props)

    # Once _preallocate_migration() is done, the source node allocation is
    # moved from the instance consumer to the migration record consumer,
    # and the instance consumer doesn't have any allocations. If this is
    # the first time through here (not a reschedule), select_destinations
    # below will allocate resources on the selected destination node for
    # the instance consumer. If we're rescheduling, host_list is not None
    # and we'll call claim_resources for the instance and the selected
    # alternate. If we exhaust our alternates and raise
    # MaxRetriesExceeded, the rollback() method should revert the
    # allocation swaparoo and move the source node allocation from the
    # migration record back to the instance record.
    migration = self._preallocate_migration()

    self.request_spec.ensure_project_and_user_id(self.instance)
    self.request_spec.ensure_network_metadata(self.instance)
    compute_utils.heal_reqspec_is_bfv(self.context, self.request_spec,
                                      self.instance)
    # On an initial call to migrate, 'self.host_list' will be None, so we
    # have to call the scheduler to get a list of acceptable hosts to
    # migrate to. That list will consist of a selected host, along with
    # zero or more alternates. On a reschedule, though, the alternates
    # will be passed to this object and stored in 'self.host_list', so we
    # can pop the first alternate from the list to use for the
    # destination, and pass the remaining alternates to the compute.
    if self.host_list is None:
        selection_lists = self.query_client.select_destinations(
            self.context, self.request_spec, [self.instance.uuid],
            return_objects=True, return_alternates=True)
        # Since there is only ever one instance to migrate per call, we
        # just need the first returned element.
        selection_list = selection_lists[0]
        # The selected host is the first item in the list, with the
        # alternates being the remainder of the list.
        selection, self.host_list = selection_list[0], selection_list[1:]
    else:
        # This is a reschedule that will use the supplied alternate hosts
        # in the host_list as destinations. Since the resources on these
        # alternates may have been consumed and might not be able to
        # support the migrated instance, we need to first claim the
        # resources to verify the host still has sufficient available
        # resources.
        elevated = self.context.elevated()
        host_available = False
        while self.host_list and not host_available:
            selection = self.host_list.pop(0)
            if selection.allocation_request:
                alloc_req = jsonutils.loads(selection.allocation_request)
            else:
                alloc_req = None
            if alloc_req:
                # If this call succeeds, the resources on the destination
                # host will be claimed by the instance.
                host_available = scheduler_utils.claim_resources(
                    elevated, self.reportclient, self.request_spec,
                    self.instance.uuid, alloc_req,
                    selection.allocation_request_version)
            else:
                # Some deployments use different schedulers that do not
                # use Placement, so they will not have an
                # allocation_request to claim with. For those cases,
                # there is no concept of claiming, so just assume that
                # the host is valid.
                host_available = True
        # There are no more available hosts. Raise a MaxRetriesExceeded
        # exception in that case.
        if not host_available:
            reason = ("Exhausted all hosts available for retrying build "
                      "failures for instance %(instance_uuid)s." %
                      {"instance_uuid": self.instance.uuid})
            raise exception.MaxRetriesExceeded(reason=reason)

    scheduler_utils.populate_filter_properties(legacy_props, selection)
    # context is not serializable
    legacy_props.pop('context', None)

    (host, node) = (selection.service_host, selection.nodename)

    self.instance.availability_zone = (
        availability_zones.get_host_availability_zone(
            self.context, host))

    LOG.debug("Calling prep_resize with selected host: %s; "
              "Selected node: %s; Alternates: %s", host, node,
              self.host_list, instance=self.instance)
    # RPC cast to the destination host to start the migration process.
    self.compute_rpcapi.prep_resize(
        # NOTE(mriedem): Using request_spec.image here is potentially
        # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
        # seems like the sane thing to do would be to pass the current
        # instance.image_meta since that is what MoveClaim will use for
        # any NUMA topology claims on the destination host...
        self.context, self.instance, self.request_spec.image,
        self.flavor, host, migration,
        request_spec=self.request_spec, filter_properties=legacy_props,
        node=node, clean_shutdown=self.clean_shutdown,
        host_list=self.host_list)
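
# The 'self.host_list is None' check above is what separates the initial
# scheduling pass from a reschedule. A compact sketch of that dispatch,
# where schedule and claim_alternates are hypothetical stand-ins for
# select_destinations() and the claim loop in _execute():
def _sketch_pick_destination(host_list, schedule, claim_alternates):
    if host_list is None:
        selected, *alternates = schedule()  # initial pass: ask the scheduler
        return selected, alternates
    return claim_alternates(host_list)      # reschedule: reuse alternates

selection, remaining = _sketch_pick_destination(
    None, lambda: ['host1', 'host2', 'host3'], None)
assert (selection, remaining) == ('host1', ['host2', 'host3'])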