def _poll_and_check(self, osc, bay):
    """Start a fixed-interval looping call around a ``HeatPoller``.

    :param osc: passed through unchanged to ``HeatPoller``.
    :param bay: passed through unchanged to ``HeatPoller``.
    """
    heat_poller = HeatPoller(osc, bay)
    polling_loop = loopingcall.FixedIntervalLoopingCall(
        f=heat_poller.poll_and_check)
    # NOTE(review): both arguments are positional; the second one is
    # presumably ``initial_delay`` -- confirm against the loopingcall API.
    wait_interval = cfg.CONF.bay_heat.wait_interval
    polling_loop.start(wait_interval, True)
def test_repeat(self):
    """Loop ``_wait_for_zero`` with two runs configured; expect falsy."""
    self.useFixture(fixture.SleepFixture())
    self.num_runs = 2

    looper = loopingcall.FixedIntervalLoopingCall(self._wait_for_zero)
    outcome = looper.start(interval=0.5).wait()
    self.assertFalse(outcome)
def start_shellinabox_console(node_uuid, port, console_cmd):
    """Open the serial console for a node.

    :param node_uuid: the uuid for the node.
    :param port: the terminal port for the node.
    :param console_cmd: the shell command that gets the console.
    :raises: ConsoleError if the directory for the PID file cannot be created.
    :raises: ConsoleSubprocessFailed when invoking the subprocess failed.
    """
    # make sure that the old console for this node is stopped
    # and the files are cleared
    try:
        _stop_console(node_uuid)
    except exception.NoConsolePid:
        # No previous console was running; nothing to clean up.
        pass
    except processutils.ProcessExecutionError as exc:
        LOG.warning(
            _LW("Failed to kill the old console process "
                "before starting a new shellinabox console "
                "for node %(node)s. Reason: %(err)s"),
            {'node': node_uuid, 'err': exc})

    _ensure_console_pid_dir_exists()
    pid_file = _get_console_pid_file(node_uuid)

    # put together the command and arguments for invoking the console
    args = [CONF.console.terminal]
    if CONF.console.terminal_cert_dir:
        args.extend(["-c", CONF.console.terminal_cert_dir])
    else:
        args.append("-t")
    args.extend(["-p", str(port),
                 "--background=%s" % pid_file,
                 "-s", console_cmd])

    # run the command as a subprocess
    try:
        LOG.debug('Running subprocess: %s', ' '.join(args))
        # use pipe here to catch the error in case shellinaboxd
        # failed to start.
        obj = subprocess.Popen(args,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    except (OSError, ValueError) as e:
        error = _("%(exec_error)s\n"
                  "Command: %(command)s") % {'exec_error': str(e),
                                             'command': ' '.join(args)}
        LOG.warning(error)
        raise exception.ConsoleSubprocessFailed(error=error)

    def _wait(node_uuid, popen_obj):
        """Poll the subprocess once; end the loop on success/error/timeout."""
        wait_state['returncode'] = popen_obj.poll()

        # check if the console pid is created.
        # if it is, then the shellinaboxd is invoked successfully as a daemon.
        # otherwise check the error.
        if wait_state['returncode'] is not None:
            if wait_state['returncode'] == 0 and os.path.exists(pid_file):
                raise loopingcall.LoopingCallDone()
            else:
                (stdout, stderr) = popen_obj.communicate()
                wait_state['errstr'] = _(
                    "Command: %(command)s.\n"
                    "Exit code: %(return_code)s.\n"
                    "Stdout: %(stdout)r\n"
                    "Stderr: %(stderr)r") % {
                        'command': ' '.join(args),
                        'return_code': wait_state['returncode'],
                        'stdout': stdout,
                        'stderr': stderr}
                LOG.warning(wait_state['errstr'])
                raise loopingcall.LoopingCallDone()

        if time.time() > expiration:
            # The separating space before "to start" was missing, which
            # produced "subprocessto start" in the message.
            wait_state['errstr'] = _("Timeout while waiting for console "
                                     "subprocess to start for node "
                                     "%s.") % node_uuid
            LOG.warning(wait_state['errstr'])
            raise loopingcall.LoopingCallDone()

    # Shared with _wait() through the closure; a non-empty 'errstr' after
    # the loop ends means the console failed to start.
    wait_state = {'returncode': None, 'errstr': ''}
    expiration = time.time() + CONF.console.subprocess_timeout

    timer = loopingcall.FixedIntervalLoopingCall(_wait, node_uuid, obj)
    timer.start(interval=CONF.console.subprocess_checking_interval).wait()

    if wait_state['errstr']:
        raise exception.ConsoleSubprocessFailed(error=wait_state['errstr'])
def add_task(self, call_function, interval, initial_delay=0):
    """Wrap ``call_function`` in a looping call and record it as a task.

    :param call_function: callable handed to ``FixedIntervalLoopingCall``.
    :param interval: forwarded to ``LoopingTask``.
    :param initial_delay: forwarded to ``LoopingTask`` (default 0).
    """
    new_task = LoopingTask(
        loopingcall.FixedIntervalLoopingCall(call_function),
        interval,
        initial_delay)
    self.tasks.append(new_task)
def backup(self, backup, volume_file, backup_metadata=True):
    """Backup the given volume.

    If backup['parent_id'] is given, then an incremental backup
    is performed.

    :param backup: the backup object being written; its ``parent_id``,
                   ``size``, ``status`` and ``id`` are read here.
    :param volume_file: file-like object the volume data is read from
                        (``read``, ``tell`` and, on win32, ``name``).
    :param backup_metadata: when true, ``_backup_metadata`` is also called.
    :raises: InvalidBackup if the chunk size is not a multiple of the hash
             block size, if the hash block size differs from the parent's,
             or if the volume grew since the parent backup.
    """
    if self.chunk_size_bytes % self.sha_block_size_bytes:
        err = _('Chunk size is not multiple of '
                'block size for creating hash.')
        raise exception.InvalidBackup(reason=err)

    # Read the shafile of the parent backup if backup['parent_id']
    # is given.
    parent_backup_shafile = None
    parent_backup = None
    if backup.parent_id:
        parent_backup = objects.Backup.get_by_id(self.context,
                                                 backup.parent_id)
        parent_backup_shafile = self._read_sha256file(parent_backup)
        parent_backup_shalist = parent_backup_shafile['sha256s']
        if (parent_backup_shafile['chunk_size'] !=
                self.sha_block_size_bytes):
            err = (_('Hash block size has changed since the last '
                     'backup. New hash block size: %(new)s. Old hash '
                     'block size: %(old)s. Do a full backup.')
                   % {'old': parent_backup_shafile['chunk_size'],
                      'new': self.sha_block_size_bytes})
            raise exception.InvalidBackup(reason=err)
        # If the volume size increased since the last backup, fail
        # the incremental backup and ask user to do a full backup.
        if backup.size > parent_backup.size:
            err = _('Volume size increased since the last '
                    'backup. Do a full backup.')
            raise exception.InvalidBackup(reason=err)

    if sys.platform == 'win32':
        # When dealing with Windows physical disks, we need the exact
        # size of the disk. Attempting to read past this boundary will
        # lead to an IOError exception. At the same time, we cannot
        # seek to the end of file.
        win32_disk_size = self._get_win32_phys_disk_size(volume_file.name)

    (object_meta, object_sha256, extra_metadata, container,
     volume_size_bytes) = self._prepare_backup(backup)

    counter = 0
    total_block_sent_num = 0

    # There are two mechanisms to send the progress notification.
    # 1. The notifications are periodically sent in a certain interval.
    # 2. The notifications are sent after a certain number of chunks.
    # Both of them are working simultaneously during the volume backup,
    # when "chunked" backup drivers are deployed.
    def _notify_progress():
        # Reads total_block_sent_num through the closure, so each timer
        # tick reports the value current at that moment.
        self._send_progress_notification(self.context, backup,
                                         object_meta,
                                         total_block_sent_num,
                                         volume_size_bytes)

    timer = loopingcall.FixedIntervalLoopingCall(_notify_progress)
    if self.enable_progress_timer:
        timer.start(interval=self.backup_timer_interval)

    is_backup_canceled = False
    # Everything between starting and stopping the timer is guarded so
    # that an exception while reading/uploading chunks cannot leave the
    # periodic notification running.
    try:
        sha256_list = object_sha256['sha256s']
        shaindex = 0
        while True:
            # First of all, we check the status of this backup. If it
            # has been changed to delete or has been deleted, we cancel the
            # backup process to do forcing delete.
            with backup.as_read_deleted():
                backup.refresh()
            if backup.status in (fields.BackupStatus.DELETING,
                                 fields.BackupStatus.DELETED):
                is_backup_canceled = True
                # To avoid the chunk left when deletion complete, need to
                # clean up the object of chunk again.
                self.delete_backup(backup)
                LOG.debug('Cancel the backup process of %s.', backup.id)
                break

            data_offset = volume_file.tell()
            if sys.platform == 'win32':
                read_bytes = min(self.chunk_size_bytes,
                                 win32_disk_size - data_offset)
            else:
                read_bytes = self.chunk_size_bytes
            data = volume_file.read(read_bytes)

            if data == b'':
                break

            # Calculate new shas with the datablock.
            shalist = eventlet.tpool.execute(self._calculate_sha, data)
            sha256_list.extend(shalist)

            # If parent_backup is not None, that means an incremental
            # backup will be performed.
            if parent_backup:
                # Find the extent that needs to be backed up.
                extent_off = -1
                for idx, sha in enumerate(shalist):
                    if sha != parent_backup_shalist[shaindex]:
                        if extent_off == -1:
                            # Start of new extent.
                            extent_off = idx * self.sha_block_size_bytes
                    else:
                        if extent_off != -1:
                            # We've reached the end of extent.
                            extent_end = idx * self.sha_block_size_bytes
                            segment = data[extent_off:extent_end]
                            self._backup_chunk(backup, container, segment,
                                               data_offset + extent_off,
                                               object_meta,
                                               extra_metadata)
                            extent_off = -1
                    shaindex += 1

                # The last extent extends to the end of data buffer.
                if extent_off != -1:
                    extent_end = len(data)
                    segment = data[extent_off:extent_end]
                    self._backup_chunk(backup, container, segment,
                                       data_offset + extent_off,
                                       object_meta, extra_metadata)
                    extent_off = -1
            else:
                # Do a full backup.
                self._backup_chunk(backup, container, data, data_offset,
                                   object_meta, extra_metadata)

            # Notifications
            total_block_sent_num += self.data_block_num
            counter += 1
            if counter == self.data_block_num:
                # Send the notification to Ceilometer when the chunk
                # number reaches the data_block_num. The backup percentage
                # is put in the metadata as the extra information.
                self._send_progress_notification(self.context, backup,
                                                 object_meta,
                                                 total_block_sent_num,
                                                 volume_size_bytes)
                # Reset the counter
                counter = 0
    finally:
        # Stop the timer.
        timer.stop()

    # If backup has been cancelled we have nothing more to do
    # but timer.stop().
    if is_backup_canceled:
        return

    # All the data have been sent, the backup_percent reaches 100.
    self._send_progress_end(self.context, backup, object_meta)

    object_sha256['sha256s'] = sha256_list
    if backup_metadata:
        try:
            self._backup_metadata(backup, object_meta)
        # Whatever goes wrong, we want to log, cleanup, and re-raise.
        except Exception:
            with excutils.save_and_reraise_exception():
                LOG.exception("Backup volume metadata failed.")
                self.delete_backup(backup)

    self._finalize_backup(backup, container, object_meta, object_sha256)
def _unlink_volume(self, array, source_device_id, target_device_id,
                   snap_name, extra_specs, snap_id=None,
                   list_volume_pairs=None, loop=True):
    """Unlink a target volume from its source volume.

    :param array: the array serial number
    :param source_device_id: the source device id
    :param target_device_id: the target device id
    :param snap_name: the snap name
    :param extra_specs: extra specifications
    :param snap_id: the unique snap id of the SnapVX
    :param list_volume_pairs: list of volume pairs, optional
    :param loop: if looping call is required for handling retries
    :returns: return code of the looping call when ``loop`` is true;
              None otherwise
    """
    def _request_unlink():
        """Issue the single REST call that performs the unlink."""
        self.rest.modify_volume_snap(
            array, source_device_id, target_device_id, snap_name,
            extra_specs, snap_id=snap_id, unlink=True,
            list_volume_pairs=list_volume_pairs)

    if not loop:
        # Single attempt; any backend exception propagates to the caller.
        _request_unlink()
        return None

    state = {'retries': 0, 'modify_vol_success': False}

    def _unlink_vol():
        """Called at an interval until the synchronization is finished.

        :raises: loopingcall.LoopingCallDone
        """
        attempts_before = state['retries']
        state['retries'] = attempts_before + 1
        if not state['modify_vol_success']:
            try:
                _request_unlink()
            except exception.VolumeBackendAPIException:
                # Swallowed on purpose: the next tick retries.
                pass
            else:
                state['modify_vol_success'] = True

        if state['retries'] > UNLINK_RETRIES:
            LOG.error("_unlink_volume failed after %(retries)d "
                      "tries.", {'retries': attempts_before})
            raise loopingcall.LoopingCallDone(retvalue=30)
        if state['modify_vol_success']:
            raise loopingcall.LoopingCallDone()

    unlink_loop = loopingcall.FixedIntervalLoopingCall(_unlink_vol)
    return unlink_loop.start(interval=UNLINK_INTERVAL).wait()
def _wait_vdisk_copy_completed(self, vdisk_name):
    """Block until the ``_is_vdisk_copy_in_progress`` loop finishes.

    :param vdisk_name: forwarded to ``_is_vdisk_copy_in_progress`` on
                       every tick.
    """
    copy_watcher = loopingcall.FixedIntervalLoopingCall(
        self._is_vdisk_copy_in_progress, vdisk_name)
    running = copy_watcher.start(interval=self._check_lock_interval)
    running.wait()
def _init_periodic_resync(self):
    """Create and start the looping call that runs ``_periodic_resync``."""
    resync_loop = loopingcall.FixedIntervalLoopingCall(
        self._periodic_resync)
    # Keep a handle on the instance before starting the loop.
    self.resync_thread = resync_loop
    resync_loop.start(interval=self.RESYNC_TRY_INTERVAL)
def create_cloned_volume(self, volume, src_vref):
    """Create a clone of the specified volume.

    :param volume: the new volume; ``id`` and ``size`` are read.
    :param src_vref: the source volume; ``id`` and ``provider_location``
                     are read.
    :returns: dict with the ``provider_location`` of the cloned volume.
    :raises: VolumeBackendAPIException if the backend reports a non-zero
             status for the clone request or the volume lookup.
    """
    LOG.debug("Creating clone of volume: %s.", src_vref['id'])

    name = self.configuration.volume_name_prefix, volume['id']
    vol_name = ''.join(name)
    vol_size = volume['size'] * units.Ki
    src_vol_id = src_vref['provider_location']

    LOG.debug(
        "Clone volume : "
        "[name] %(name)s - [source] %(source)s - [size] %(size)s.",
        {'name': vol_name,
         'source': src_vol_id,
         'size': six.text_type(vol_size)})

    reply = self.client.service.volumeClone(src_vol_id, vol_name)
    status = reply['status']
    result = reply['result']
    LOG.debug("Clone volume : [status] %(stat)s - [result] %(res)s.",
              {'stat': six.text_type(status), 'res': result})

    if status != 0:
        msg = (_("Error while creating volume "
                 "[status] %(stat)s - [result] %(res)s.") %
               {'stat': six.text_type(status), 'res': result})
        LOG.error(msg)
        raise exception.VolumeBackendAPIException(data=msg)

    # Monitor the status until it becomes
    # either success, fail or timeout
    params = {'clone_id': int(result), 'vol_name': vol_name}
    start_time = int(time.time())

    timer = loopingcall.FixedIntervalLoopingCall(
        self._retry_get_detail,
        start_time,
        self.configuration.clone_check_timeout,
        'clone_detail',
        params)
    # The loop's return value is not used; we only wait for it to end.
    timer.start(interval=self.configuration.retry_interval).wait()

    reply = self.client.service.volumeDetailByName(vol_name)
    status = reply['status']

    # Check the status before touching the payload so that a failed
    # lookup surfaces as a backend error rather than a KeyError/TypeError.
    if status != 0:
        # The message must be interpolated with '%'; a ',' here would
        # build a (format, args) tuple instead of a string.
        msg = (_("Error[%(stat)s - %(res)s] "
                 "while getting volume id.") %
               {'stat': six.text_type(status), 'res': result})
        LOG.error(msg)
        raise exception.VolumeBackendAPIException(data=msg)

    new_vol_id = reply['volumeInfoResult']['volumeId']

    LOG.debug(
        "clone done : "
        "[status] %(stat)s - [volume id] %(vol_id)s.",
        {'stat': status, 'vol_id': six.text_type(new_vol_id)})

    return {'provider_location': new_vol_id}
def _set_and_wait(task, target_state):
    """Set the power state and poll AMT until it is reached.

    Runs a fixed-interval looping call that re-issues the power request
    on each tick until the node reports ``target_state`` or the
    configured number of attempts is used up.

    :param task: a TaskManager instance contains the target node.
    :param target_state: desired power state.
    :returns: one of ironic.common.states.
    :raises: PowerStateFailure if cannot set the node to target_state.
    :raises: AMTFailure.
    :raises: AMTConnectFailure
    :raises: InvalidParameterValue
    """
    node = task.node
    driver = task.driver

    if target_state not in (states.POWER_ON, states.POWER_OFF):
        raise exception.InvalidParameterValue(
            _('Unsupported target_state: %s') % target_state)

    if target_state == states.POWER_ON:
        boot_device = node.driver_internal_info.get('amt_boot_device')
        if boot_device and boot_device != amt_common.DEFAULT_BOOT_DEVICE:
            driver.management.ensure_next_boot_device(node, boot_device)

    def _wait(progress):
        """One polling tick: check the state, then retry the request."""
        current_power = _power_status(node)
        progress['power'] = current_power
        if current_power == target_state:
            raise loopingcall.LoopingCallDone()

        if progress['iter'] >= CONF.amt.max_attempts:
            progress['power'] = states.ERROR
            LOG.warning(
                _LW("AMT failed to set power state %(state)s after "
                    "%(tries)s retries on node %(node_id)s."),
                {'state': target_state,
                 'tries': progress['iter'],
                 'node_id': node.uuid})
            raise loopingcall.LoopingCallDone()

        try:
            _set_power_state(node, target_state)
        except Exception:
            # Log failures but keep trying
            LOG.warning(
                _LW("AMT set power state %(state)s for node %(node)s "
                    "- Attempt %(attempt)s times of %(max_attempt)s "
                    "failed."),
                {'state': target_state,
                 'node': node.uuid,
                 'attempt': progress['iter'] + 1,
                 'max_attempt': CONF.amt.max_attempts})
        progress['iter'] += 1

    poll_state = {'power': None, 'iter': 0}
    poller = loopingcall.FixedIntervalLoopingCall(_wait, poll_state)
    poller.start(interval=CONF.amt.action_wait).wait()

    final_power = poll_state['power']
    if final_power != target_state:
        raise exception.PowerStateFailure(pstate=target_state)
    return final_power
def create_volume_from_snapshot(self, volume, snapshot):
    """Create a volume from a snapshot.

    :param volume: the new volume; ``id`` is read.
    :param snapshot: the source snapshot; ``provider_location`` is read.
    :returns: dict with the ``provider_location`` of the restored volume.
    :raises: VolumeBackendAPIException if the backend reports a non-zero
             status for the restore request or the volume lookup.
    """
    name = self.configuration.volume_name_prefix, volume['id']
    snap_id = snapshot['provider_location']
    vol_name = ''.join(name)

    # Trigger an asynchronous restore operation
    LOG.debug(
        "[start] Create volume from snapshot : "
        "%(snap_id)s - name : %(vol_name)s.",
        {'snap_id': snap_id, 'vol_name': vol_name})

    reply = self.client.service.restoreFromSnapshot(snap_id, vol_name)
    status = reply['status']
    result = reply['result']
    LOG.debug(
        "Restore volume from snapshot "
        "[status] %(stat)s - [result] %(res)s.",
        {'stat': six.text_type(status), 'res': result})

    if status != 0:
        msg = (_("Error[%(stat)s - %(res)s] while restoring snapshot "
                 "[%(snap_id)s] into volume [%(vol)s].") %
               {'stat': six.text_type(status),
                'res': result,
                'snap_id': snap_id,
                'vol': vol_name})
        LOG.error(msg)
        raise exception.VolumeBackendAPIException(data=msg)

    # Monitor the status until it becomes
    # either success, fail or timeout
    params = {'restore_id': int(result)}
    start_time = int(time.time())

    timer = loopingcall.FixedIntervalLoopingCall(
        self._retry_get_detail,
        start_time,
        self.configuration.restore_check_timeout,
        'restore_detail',
        params)
    # The loop's return value is not used; we only wait for it to end.
    timer.start(interval=self.configuration.retry_interval).wait()

    reply = self.client.service.volumeDetailByName(vol_name)
    status = reply['status']

    # Check the status before touching the payload so that a failed
    # lookup surfaces as a backend error rather than a KeyError/TypeError.
    if status != 0:
        msg = (_("Error[status] %(stat)s - [result] %(res)s] "
                 "while getting volume id.") %
               {'stat': six.text_type(status), 'res': result})
        LOG.error(msg)
        raise exception.VolumeBackendAPIException(data=msg)

    new_vol_id = reply['volumeInfoResult']['volumeId']

    LOG.debug(
        "Restore done [status] %(stat)s - "
        "[volume id] %(vol_id)s.",
        {'stat': status, 'vol_id': six.text_type(new_vol_id)})

    return {'provider_location': new_vol_id}
def __init__(self):
    """Prepare the maintenance timer, its interval and the ops list.

    The timer is only created here; this method does not start it.
    """
    self.maintenance_ops = []
    self.maintenance_interval = cfg.CONF.ml2_odl.maintenance_interval
    self.timer = loopingcall.FixedIntervalLoopingCall(self.execute_ops)
def _start_periodic_tasks(self):
    """Start the looping call that invokes ``process_services``."""
    service_loop = loopingcall.FixedIntervalLoopingCall(
        self.process_services)
    self.loop = service_loop
    loop_interval = self.conf.cfg_agent.rpc_loop_interval
    service_loop.start(interval=loop_interval)
def test_return_false(self):
    """``LoopingCallDone(False)`` makes ``wait()`` return a falsy value."""
    def _finish_with_false():
        raise loopingcall.LoopingCallDone(False)

    looper = loopingcall.FixedIntervalLoopingCall(_finish_with_false)
    outcome = looper.start(interval=0.5).wait()
    self.assertFalse(outcome)
def execute(self, instance_uuid):
    """Stop the instance for recovery.

    :param instance_uuid: uuid of the instance to stop.
    :raises: SkipInstanceRecoveryException if the instance is not HA
             enabled and ``process_all_instances`` is disabled.
    :raises: IgnoreInstanceRecoveryException if the instance is paused
             or rescued.
    :raises: InstanceRecoveryFailureException if the instance does not
             reach the 'stopped' state before the timeout.
    """
    instance = self.novaclient.get_server(self.context, instance_uuid)

    ha_enabled_key = CONF.instance_failure.ha_enabled_instance_metadata_key

    # If an instance is not HA_Enabled and "process_all_instances" config
    # option is also disabled, then there is no need to take any recovery
    # action.
    if not (CONF.instance_failure.process_all_instances or
            strutils.bool_from_string(
                instance.metadata.get(ha_enabled_key, False))):
        skip_msg = ("Skipping recovery for instance: %(instance_uuid)s as it is"
                    " not Ha_Enabled") % {'instance_uuid': instance_uuid}
        LOG.info(skip_msg)
        self.update_details(skip_msg, 1.0)
        raise exception.SkipInstanceRecoveryException()

    vm_state = getattr(instance, 'OS-EXT-STS:vm_state')
    if vm_state in ('paused', 'rescued'):
        ignore_msg = ("Recovery of instance '%(instance_uuid)s' is ignored as it "
                      "is in '%(vm_state)s' state.") % {
                          'instance_uuid': instance_uuid,
                          'vm_state': vm_state}
        LOG.warning(ignore_msg)
        self.update_details(ignore_msg, 1.0)
        raise exception.IgnoreInstanceRecoveryException(ignore_msg)

    if vm_state != 'stopped':
        if vm_state == 'resized':
            self.novaclient.reset_instance_state(
                self.context, instance.id, 'active')

        self.update_details("Stopping instance: %s" % instance_uuid)
        self.novaclient.stop_server(self.context, instance.id)

    def _wait_for_power_off():
        """End the loop once the server reports the 'stopped' state."""
        refreshed = self.novaclient.get_server(self.context, instance_uuid)
        if getattr(refreshed, 'OS-EXT-STS:vm_state') == 'stopped':
            raise loopingcall.LoopingCallDone()

    periodic_call = loopingcall.FixedIntervalLoopingCall(
        _wait_for_power_off)

    try:
        # add a timeout to the periodic call.
        periodic_call.start(interval=CONF.verify_interval)
        etimeout.with_timeout(CONF.wait_period_after_power_off,
                              periodic_call.wait)
        self.update_details("Stopped instance: '%s'" % instance_uuid, 1.0)
    except etimeout.Timeout:
        failure_msg = "Failed to stop instance %(instance)s" % {
            'instance': instance.id}
        self.update_details(failure_msg, 1.0)
        raise exception.InstanceRecoveryFailureException(
            message=failure_msg)
    finally:
        # stop the periodic call, in case of exceptions or Timeout.
        periodic_call.stop()
def test_terminate_on_exception(self):
    """An exception raised by the looped callable surfaces from wait()."""
    def _always_fails():
        raise RuntimeError()

    looper = loopingcall.FixedIntervalLoopingCall(_always_fails)
    pending = looper.start(interval=0.5)
    self.assertRaises(RuntimeError, pending.wait)
def _setup_backlog_handling(self):
    """Log activation and start the periodic backlog processing loop."""
    LOG.debug('Activating periodic backlog processor')
    backlog_loop = loopingcall.FixedIntervalLoopingCall(
        self._process_backlogged_routers)
    self._heartbeat = backlog_loop
    processing_interval = cfg.CONF.general.backlog_processing_interval
    backlog_loop.start(interval=processing_interval)
def _setup_backlog_handling(self):
    """Start the periodic loop around ``_process_backlogged_routers``."""
    backlog_loop = loopingcall.FixedIntervalLoopingCall(
        self._process_backlogged_routers)
    self._heartbeat = backlog_loop
    processing_interval = cfg.CONF.general.backlog_processing_interval
    backlog_loop.start(interval=processing_interval)
def start(self):
    """Bring the service up: RPC servers first, then periodic timers.

    Starts the coordinator when enabled, initialises the manager, starts
    the main RPC server (plus the backend and cluster servers when they
    apply) and finally schedules the state-report and periodic-task
    looping calls, which are recorded in ``self.timers``.
    """
    service_version = version.version_string()
    LOG.info('Starting %(topic)s node (version %(version_string)s)',
             {'topic': self.topic, 'version_string': service_version})
    self.model_disconnected = False

    if self.coordination:
        coordination.COORDINATOR.start()

    self.manager.init_host(added_to_cluster=self.added_to_cluster,
                           service_id=Service.service_id)

    LOG.debug("Creating RPC server for service %s", self.topic)

    admin_ctxt = context.get_admin_context()
    rpc_endpoints = [self.manager]
    rpc_endpoints.extend(self.manager.additional_endpoints)
    version_cap = objects.Service.get_minimum_obj_version(admin_ctxt)
    LOG.debug("Pinning object versions for RPC server serializer to %s",
              version_cap)
    obj_serializer = objects_base.CinderObjectSerializer(version_cap)

    main_target = messaging.Target(topic=self.topic, server=self.host)
    self.rpcserver = rpc.get_server(main_target, rpc_endpoints,
                                    obj_serializer)
    self.rpcserver.start()

    # NOTE(dulek): Kids, don't do that at home. We're relying here on
    # oslo.messaging implementation details to keep backward compatibility
    # with pre-Ocata services. This will not matter once we drop
    # compatibility with them.
    if self.topic == constants.VOLUME_TOPIC:
        backend_topic = '%(topic)s.%(host)s' % {'topic': self.topic,
                                                'host': self.host}
        backend_target = messaging.Target(
            topic=backend_topic,
            server=vol_utils.extract_host(self.host, 'host'))
        self.backend_rpcserver = rpc.get_server(
            backend_target, rpc_endpoints, obj_serializer)
        self.backend_rpcserver.start()

    if self.cluster:
        LOG.info(
            'Starting %(topic)s cluster %(cluster)s (version '
            '%(version)s)',
            {'topic': self.topic,
             'version': service_version,
             'cluster': self.cluster})
        cluster_target = messaging.Target(
            topic='%s.%s' % (self.topic, self.cluster),
            server=vol_utils.extract_host(self.cluster, 'host'))
        # The cluster server gets its own serializer instance.
        cluster_serializer = objects_base.CinderObjectSerializer(
            version_cap)
        self.cluster_rpcserver = rpc.get_server(
            cluster_target, rpc_endpoints, cluster_serializer)
        self.cluster_rpcserver.start()

    self.manager.init_host_with_rpc()

    if self.report_interval:
        state_reporter = loopingcall.FixedIntervalLoopingCall(
            self.report_state)
        state_reporter.start(interval=self.report_interval,
                             initial_delay=self.report_interval)
        self.timers.append(state_reporter)

    if self.periodic_interval:
        # A random initial delay is only drawn when a fuzzy delay is set.
        first_run_delay = (random.randint(0, self.periodic_fuzzy_delay)
                           if self.periodic_fuzzy_delay else None)
        task_runner = loopingcall.FixedIntervalLoopingCall(
            self.periodic_tasks)
        task_runner.start(interval=self.periodic_interval,
                          initial_delay=first_run_delay)
        self.timers.append(task_runner)
def execute(self, stack_id, heat_client):
    """Run ``_sync_status`` in a loop and block until the loop ends.

    :param stack_id: forwarded to ``_sync_status`` on every tick.
    :param heat_client: forwarded to ``_sync_status`` on every tick.
    """
    LOG.info(_LI('Syncing Heat stack status, stack_id: %s'), stack_id)
    status_poller = loopingcall.FixedIntervalLoopingCall(
        self._sync_status, heat_client, stack_id)
    poll_interval = CONF.sync_status_interval
    status_poller.start(interval=poll_interval)
    status_poller.wait()
def rebuild(self, context, instance, image_meta, injected_files,
            admin_password, bdms, detach_block_devices,
            attach_block_devices, network_info=None, recreate=False,
            block_device_info=None, preserve_ephemeral=False):
    """Rebuild/redeploy an instance.

    Unlike the generic rebuild, this variant can preserve the ephemeral
    partition. spawn() cannot be reused from here: it would try to set
    the instance_uuid again, which the Ironic API rejects, and it needs
    the instance to be out of the 'active' provision state, which cannot
    be changed safely. Only the parts of spawn() that rebuild needs are
    therefore implemented here.

    :param context: The security context.
    :param instance: The instance object.
    :param image_meta: Image object returned by nova.image.glance
        that defines the image from which to boot this instance. Ignored
        by this driver.
    :param injected_files: User files to inject into instance. Ignored
        by this driver.
    :param admin_password: Administrator password to set in
        instance. Ignored by this driver.
    :param bdms: block-device-mappings to use for rebuild. Ignored
        by this driver.
    :param detach_block_devices: function to detach block devices. See
        nova.compute.manager.ComputeManager:_rebuild_default_impl for
        usage. Ignored by this driver.
    :param attach_block_devices: function to attach block devices. See
        nova.compute.manager.ComputeManager:_rebuild_default_impl for
        usage. Ignored by this driver.
    :param network_info: Instance network information. Ignored by
        this driver.
    :param recreate: Boolean value; if True the instance is
        recreated on a new hypervisor - all the cleanup of old state is
        skipped. Ignored by this driver.
    :param block_device_info: Instance block device
        information. Ignored by this driver.
    :param preserve_ephemeral: Boolean value; if True the ephemeral
        must be preserved on rebuild.
    :raises: InstanceDeployFailure if Ironic rejects the rebuild request.
    """
    LOG.debug('Rebuild called for instance', instance=instance)

    instance.task_state = task_states.REBUILD_SPAWNING
    instance.save(expected_task_state=[task_states.REBUILDING])

    node_uuid = instance.node
    node = self._get_node(node_uuid)

    self._add_driver_fields(node, instance, image_meta, instance.flavor,
                            preserve_ephemeral)

    # Trigger the node rebuild/redeploy.
    try:
        self.ironicclient.call("node.set_provision_state",
                               node_uuid, ironic_states.REBUILD)
    except (exception.NovaException,         # Retry failed
            ironic.exc.InternalServerError,  # Validations
            ironic.exc.BadRequest) as err:   # Maintenance
        failure_details = {'inst': instance.uuid,
                           'reason': six.text_type(err)}
        reason = (_("Failed to request Ironic to rebuild instance "
                    "%(inst)s: %(reason)s") % failure_details)
        raise exception.InstanceDeployFailure(reason)

    # Although the target provision state is REBUILD, it will actually go
    # to ACTIVE once the redeploy is finished.
    active_poller = loopingcall.FixedIntervalLoopingCall(
        self._wait_for_active, instance)
    active_poller.start(interval=CONF.ironic.api_retry_interval).wait()
    LOG.info(_LI('Instance was successfully rebuilt'), instance=instance)
def start_socat_console(node_uuid, port, console_cmd):
    """Open the serial console for a node.

    :param node_uuid: the uuid of the node
    :param port: the terminal port for the node
    :param console_cmd: the shell command that will be executed by socat to
        establish console to the node
    :raises ConsoleError: if the directory for the PID file or the PID file
        cannot be created
    :raises ConsoleSubprocessFailed: when invoking the subprocess failed
    """
    # Make sure that the old console for this node is stopped.
    # If no console is running, we may get exception NoConsolePid.
    try:
        _stop_console(node_uuid)
    except exception.NoConsolePid:
        pass

    _ensure_console_pid_dir_exists()
    pid_file = _get_console_pid_file(node_uuid)

    # put together the command and arguments for invoking the console
    args = ['socat']
    # set timeout check for user's connection. If the timeout value
    # is not 0, after timeout seconds of inactivity on the client side,
    # the connection will be closed.
    if CONF.console.terminal_timeout > 0:
        args.append('-T%d' % CONF.console.terminal_timeout)
    args.append('-L%s' % pid_file)

    console_host = CONF.console.socat_address
    listen_template = (
        'TCP6-LISTEN:%(port)s,bind=[%(host)s],reuseaddr'
        if netutils.is_valid_ipv6(console_host)
        else 'TCP4-LISTEN:%(port)s,bind=%(host)s,reuseaddr')
    args.append(listen_template % {'host': console_host, 'port': port})

    args.append('EXEC:"%s",pty,stderr' % console_cmd)

    # run the command as a subprocess
    try:
        LOG.debug('Running subprocess: %s', ' '.join(args))
        # Use pipe here to catch the error in case socat
        # fails to start. Note that socat uses stdout as transferring
        # data, so we only capture stderr for checking if it fails.
        obj = subprocess.Popen(args, stderr=subprocess.PIPE)
    except (OSError, ValueError) as e:
        error = _("%(exec_error)s\n"
                  "Command: %(command)s") % {'exec_error': str(e),
                                             'command': ' '.join(args)}
        LOG.exception('Unable to start socat console')
        raise exception.ConsoleSubprocessFailed(error=error)

    # NOTE: we need to check if socat fails to start here.
    # If it starts successfully, it will run in non-daemon mode and
    # will not return until the console session is stopped.

    def _wait(node_uuid, popen_obj):
        """One polling tick for the freshly started socat process."""
        exit_code = popen_obj.poll()
        wait_state['returncode'] = exit_code

        if exit_code is not None:
            # socat returned, it failed to start.
            # We get the error (out should be None in this case).
            (_out, err) = popen_obj.communicate()
            wait_state['errstr'] = _(
                "Command: %(command)s.\n"
                "Exit code: %(return_code)s.\n"
                "Stderr: %(error)r") % {
                    'command': ' '.join(args),
                    'return_code': wait_state['returncode'],
                    'error': err}
            LOG.error(wait_state['errstr'])
            raise loopingcall.LoopingCallDone()

        # socat runs in non-daemon mode, so it should not return now.
        # If the pid file is created and the process is running,
        # we stop checking it periodically.
        if (os.path.exists(pid_file) and
                psutil.pid_exists(_get_console_pid(node_uuid))):
            raise loopingcall.LoopingCallDone()

        if time.time() > expiration:
            wait_state['errstr'] = (_("Timeout while waiting for console "
                                      "subprocess to start for node %s.") %
                                    node_uuid)
            LOG.error(wait_state['errstr'])
            raise loopingcall.LoopingCallDone()

    wait_state = {'returncode': None, 'errstr': ''}
    expiration = time.time() + CONF.console.subprocess_timeout

    startup_poller = loopingcall.FixedIntervalLoopingCall(
        _wait, node_uuid, obj)
    check_interval = CONF.console.subprocess_checking_interval
    startup_poller.start(interval=check_interval).wait()

    failure = wait_state['errstr']
    if failure:
        raise exception.ConsoleSubprocessFailed(error=failure)
def spawn(self, context, instance, image_meta, injected_files,
          admin_password, network_info=None, block_device_info=None):
    """Deploy an instance.

    :param context: The security context.
    :param instance: The instance object.
    :param image_meta: Image dict returned by nova.image.glance
        that defines the image from which to boot this instance.
    :param injected_files: User files to inject into instance.
    :param admin_password: Administrator password to set in
        instance.
    :param network_info: Instance network information.
    :param block_device_info: Instance block device
        information. Ignored by this driver.
    :raises: ironic.exc.BadRequest if the instance carries no node uuid.
    :raises: ValidationError if the node fails deploy/power validation.
    """
    LOG.debug('Spawn called for instance', instance=instance)

    # The compute manager is meant to know the node uuid, so missing uuid
    # is a significant issue. It may mean we've been passed the wrong data.
    node_uuid = instance.get('node')
    if not node_uuid:
        raise ironic.exc.BadRequest(
            _("Ironic node uuid not supplied to "
              "driver for instance %s.") % instance.uuid)

    node = self._get_node(node_uuid)
    flavor = instance.flavor

    self._add_driver_fields(node, instance, image_meta, flavor)

    # NOTE(Shrews): The default ephemeral device needs to be set for
    # services (like cloud-init) that depend on it being returned by the
    # metadata server. Addresses bug https://launchpad.net/bugs/1324286.
    if flavor.ephemeral_gb:
        instance.default_ephemeral_device = '/dev/sda1'
        instance.save()

    # validate we are ready to do the deploy
    validate_chk = self.ironicclient.call("node.validate", node_uuid)
    if (not validate_chk.deploy.get('result')
            or not validate_chk.power.get('result')):
        # something is wrong. undo what we have done
        self._cleanup_deploy(node, instance, network_info)
        raise exception.ValidationError(_(
            "Ironic node: %(id)s failed to validate."
            " (deploy: %(deploy)s, power: %(power)s)")
            % {'id': node.uuid,
               'deploy': validate_chk.deploy,
               'power': validate_chk.power})

    # prepare for the deploy
    try:
        self._plug_vifs(node, instance, network_info)
        self._start_firewall(instance, network_info)
    except Exception:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Error preparing deploy for instance "
                          "%(instance)s on baremetal node %(node)s."),
                      {'instance': instance.uuid,
                       'node': node_uuid})
            self._cleanup_deploy(node, instance, network_info)

    # Config drive
    configdrive_value = None
    if configdrive.required_by(instance):
        extra_md = {}
        if admin_password:
            extra_md['admin_pass'] = admin_password

        try:
            configdrive_value = self._generate_configdrive(
                instance, node, network_info, extra_md=extra_md,
                files=injected_files)
        except Exception as e:
            with excutils.save_and_reraise_exception():
                msg = (_LE("Failed to build configdrive: %s") %
                       six.text_type(e))
                LOG.error(msg, instance=instance)
                self._cleanup_deploy(node, instance, network_info)

        LOG.info(_LI("Config drive for instance %(instance)s on "
                     "baremetal node %(node)s created."),
                 {'instance': instance['uuid'], 'node': node_uuid})

    # trigger the node deploy
    try:
        self.ironicclient.call("node.set_provision_state", node_uuid,
                               ironic_states.ACTIVE,
                               configdrive=configdrive_value)
    except Exception as e:
        with excutils.save_and_reraise_exception():
            # Pass the format string and its arguments separately so the
            # logger interpolates them; bundling both into one tuple
            # would log the tuple itself instead of the message.
            LOG.error(_LE("Failed to request Ironic to provision instance "
                          "%(inst)s: %(reason)s"),
                      {'inst': instance.uuid,
                       'reason': six.text_type(e)})
            self._cleanup_deploy(node, instance, network_info)

    timer = loopingcall.FixedIntervalLoopingCall(self._wait_for_active,
                                                 instance)
    try:
        timer.start(interval=CONF.ironic.api_retry_interval).wait()
        LOG.info(_LI('Successfully provisioned Ironic node %s'),
                 node.uuid, instance=instance)
    except Exception:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE("Error deploying instance %(instance)s on "
                          "baremetal node %(node)s."),
                      {'instance': instance.uuid,
                       'node': node_uuid})
def execute(self, stack_id):
    """Begin periodically syncing the status of a stack.

    Logs the stack id, then starts a fixed-interval looping call that
    invokes ``self._sync_status(self._checkpoint, stack_id)`` at the
    interval given by ``CONF.sync_status_interval``. The looping call is
    started but not waited on, and nothing is returned.

    :param stack_id: id of the stack; logged and forwarded to
        ``self._sync_status``.
    """
    LOG.info(_("syncing stack status, stack_id:%s"), stack_id)

    status_syncer = loopingcall.FixedIntervalLoopingCall(
        self._sync_status,
        self._checkpoint,
        stack_id,
    )
    sync_interval = CONF.sync_status_interval
    status_syncer.start(interval=sync_interval)
def start(self):
    """Start the worker and, the first time only, its periodic check loop.

    After delegating to the parent ``start()``, a fixed-interval looping
    call around ``self._check_func`` is created and started with
    ``self._interval`` and ``self._initial_delay``. If ``self._loop`` is
    already set, nothing further is done.
    """
    super(AgentStatusCheckWorker, self).start()

    if self._loop is not None:
        # A looping call was already set up by an earlier start().
        return

    check_loop = loopingcall.FixedIntervalLoopingCall(self._check_func)
    self._loop = check_loop
    check_loop.start(interval=self._interval,
                     initial_delay=self._initial_delay)
def connect_volume(self, connection_properties):
    """Attach a Fibre Channel volume and describe the resulting device.

    Waits until one of the candidate host device paths exists and is
    valid, rescanning the FC hosts between attempts, then looks up the
    device's SCSI WWN and, when multipath is enabled, the multipath
    device.

    connection_properties for Fibre Channel must include:
    target_wwn - World Wide Name
    target_lun - LUN id of the volume

    :param connection_properties: The dictionary that describes all
                                  of the target volume attributes.
    :type connection_properties: dict
    :returns: dict with 'type', 'scsi_wwn' and 'path' keys, plus
              'multipath_id' when a multipath id was discovered.
    :raises: exception.NoFibreChannelHostsFound when no FC HBAs are
             reported.
    :raises: exception.NoFibreChannelVolumeDeviceFound when no device
             appears within ``self.device_scan_attempts`` rescans.
    """
    connection_properties = self._add_targets_to_connection_properties(
        connection_properties)

    fc_hbas = self._linuxfc.get_fc_hbas_info()
    if not fc_hbas:
        LOG.warning("We are unable to locate any Fibre Channel devices.")
        raise exception.NoFibreChannelHostsFound()

    candidate_paths = self._get_possible_volume_paths(
        connection_properties, fc_hbas)

    # The /dev/disk/by-path/... node is not always present immediately.
    # Finding the first device is enough: once it is visible, multipath
    # will have any others.
    def _poll_for_device(paths):
        for path in paths:
            LOG.debug("Looking for Fibre Channel dev %(device)s",
                      {'device': path})
            if not os.path.exists(path) or not self.check_valid_device(path):
                continue
            self.host_device = path
            # Resolve to the /dev/sdX device, which is what the multipath
            # lookup below works from.
            self.device_name = os.path.realpath(path)
            raise loopingcall.LoopingCallDone()

        if self.tries >= self.device_scan_attempts:
            LOG.error("Fibre Channel volume device not found.")
            raise exception.NoFibreChannelVolumeDeviceFound()

        LOG.info("Fibre Channel volume device not yet found. "
                 "Will rescan & retry. Try number: %(tries)s.",
                 {'tries': self.tries})

        self._linuxfc.rescan_hosts(fc_hbas, connection_properties)
        self.tries += 1

    # Reset the per-call scan state kept on the connector.
    self.host_device = None
    self.device_name = None
    self.tries = 0

    discovery_loop = loopingcall.FixedIntervalLoopingCall(
        _poll_for_device, candidate_paths)
    discovery_loop.start(interval=2).wait()

    LOG.debug("Found Fibre Channel volume %(name)s "
              "(after %(tries)s rescans.)",
              {'name': self.device_name, 'tries': self.tries})

    # find out the WWN of the device
    wwn = self._linuxscsi.get_scsi_wwn(self.host_device)
    LOG.debug("Device WWN = '%(wwn)s'", {'wwn': wwn})

    device_info = {'type': 'block'}
    device_info['scsi_wwn'] = wwn

    # If the new drive is part of a multipath device, report the
    # multipath device instead of the single path.
    if not self.use_multipath:
        device_path = self.host_device
    else:
        parent = super(FibreChannelConnector, self)
        device_path, multipath_id = parent._discover_mpath_device(
            wwn, connection_properties, self.device_name)
        if multipath_id:
            # only set the multipath_id if we found one
            device_info['multipath_id'] = multipath_id

    device_info['path'] = device_path
    return device_info
def _wait_for_export_state(self, volume_name, snapshot_name=None,
                           state=False):
    """Poll the backend until a lun export reaches the wanted state.

    XG sets/queries issued right after a request to create or delete a
    lun export may fail on the backend while vshared is still processing
    the export action (or times out). Completion is detected by polling
    the lun's export binding on both gateway nodes every 5 seconds until
    both report the wanted state. There is no timeout in this function.

    Arguments:
        volume_name   -- name of volume
        snapshot_name -- name of volume's snapshot
        state         -- True to poll for existence, False for lack of

    Returns:
        True once both gateways report the export as added or removed
        (depending on 'state' param)
    """
    if snapshot_name:
        bn = ("/vshare/state/snapshot/container/%s/lun/%s/snap/%s/usn_id"
              % (self.container, volume_name, snapshot_name))
    else:
        bn = ("/vshare/state/local/container/%s/lun/%s/usn_id"
              % (self.container, volume_name))

    def _poll_gateways(want_exported):
        gateways = [self.mga, self.mgb]

        LOG.debug("Entering _wait_for_export_state loop: state=%s.",
                  want_exported)

        # TODO(rlucio): May need to handle situations where export
        # fails, i.e., HBAs go offline and the array is in
        # degraded mode.
        def _in_wanted_state(conn):
            resp = conn.basic.get_node_values(bn)
            if want_exported:
                # Export was added: the usn_id is no longer the default
                # binding string.
                return resp[bn] != "(not exported)"
            # Export was removed: the usn_id is back to the default
            # binding string.
            return resp[bn] == "(not exported)"

        # A list (not a generator) so both gateways are always queried.
        status = [_in_wanted_state(conn) for conn in gateways]

        if all(status):
            LOG.debug("_wait_for_export_state loopingcall complete.")
            raise loopingcall.LoopingCallDone(retvalue=True)

    poller = loopingcall.FixedIntervalLoopingCall(_poll_gateways, state)
    return poller.start(interval=5).wait()