def _on_periodic_tasks_stop(self, fut):
    try:
        fut.result()
    except Exception as exc:
        LOG.critical(_LC('Periodic tasks worker has failed: %s'), exc)
    else:
        LOG.info(_LI('Successfully shut down periodic tasks'))
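# A minimal, self-contained sketch (an assumed illustration, not Ironic
# code) of the futurist done-callback pattern that _on_periodic_tasks_stop
# above relies on: fut.result() re-raises the exception from a failed
# worker, while a clean shutdown takes the else branch. The _on_stop name
# and the lambda task are hypothetical.
import futurist


def _on_stop(fut):
    try:
        fut.result()  # re-raises the worker's exception, if any
    except Exception as exc:
        print('worker failed: %s' % exc)
    else:
        print('worker finished cleanly')


executor = futurist.ThreadPoolExecutor(max_workers=1)
fut = executor.submit(lambda: None)  # stand-in for the periodic-tasks loop
fut.add_done_callback(_on_stop)
executor.shutdown()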
def init_host(self, admin_context=None):
    """Initialize the conductor host.

    :param admin_context: the admin context to pass to periodic tasks.
    :raises: RuntimeError when conductor is already running.
    :raises: NoDriversLoaded when no drivers are enabled on the conductor.
    :raises: DriverNotFound if a driver is enabled that does not exist.
    :raises: DriverLoadError if an enabled driver cannot be loaded.
    """
    if self._started:
        raise RuntimeError(_('Attempt to start an already running '
                             'conductor manager'))

    self.dbapi = dbapi.get_instance()

    self._keepalive_evt = threading.Event()
    """Event for the keepalive thread."""

    # TODO(dtantsur): make the threshold configurable?
    rejection_func = rejection.reject_when_reached(
        CONF.conductor.workers_pool_size)
    self._executor = futurist.GreenThreadPoolExecutor(
        max_workers=CONF.conductor.workers_pool_size,
        check_and_reject=rejection_func)
    """Executor for performing tasks async."""

    self.ring_manager = hash.HashRingManager()
    """Consistent hash ring which maps drivers to conductors."""

    # NOTE(deva): this call may raise DriverLoadError or DriverNotFound
    drivers = driver_factory.drivers()
    if not drivers:
        msg = _LE("Conductor %s cannot be started because no drivers "
                  "were loaded. This could be because no drivers were "
                  "specified in 'enabled_drivers' config option.")
        LOG.error(msg, self.host)
        raise exception.NoDriversLoaded(conductor=self.host)

    # NOTE(jroll) this is passed to the dbapi, which requires a list, not
    # a generator (which keys() returns in py3)
    driver_names = list(drivers)

    # Collect driver-specific periodic tasks.
    # Conductor periodic tasks accept a context argument; driver periodic
    # tasks accept this manager and a context. We have to ensure that the
    # same driver interface class is not traversed twice, otherwise
    # we'll have several instances of the same task.
    LOG.debug('Collecting periodic tasks')
    self._periodic_task_callables = []
    periodic_task_classes = set()
    self._collect_periodic_tasks(self, (admin_context,))
    for driver_obj in drivers.values():
        self._collect_periodic_tasks(driver_obj, (self, admin_context))
        for iface_name in driver_obj.all_interfaces:
            iface = getattr(driver_obj, iface_name, None)
            if iface and iface.__class__ not in periodic_task_classes:
                self._collect_periodic_tasks(iface, (self, admin_context))
                periodic_task_classes.add(iface.__class__)

    if (len(self._periodic_task_callables) >
            CONF.conductor.workers_pool_size):
        LOG.warning(_LW('This conductor has %(tasks)d periodic tasks '
                        'enabled, but only %(workers)d task workers '
                        'allowed by [conductor]workers_pool_size option'),
                    {'tasks': len(self._periodic_task_callables),
                     'workers': CONF.conductor.workers_pool_size})

    self._periodic_tasks = periodics.PeriodicWorker(
        self._periodic_task_callables,
        executor_factory=periodics.ExistingExecutor(self._executor))

    # clear all locks held by this conductor before registering
    self.dbapi.clear_node_reservations_for_conductor(self.host)
    try:
        # Register this conductor with the cluster
        cdr = self.dbapi.register_conductor({'hostname': self.host,
                                             'drivers': driver_names})
    except exception.ConductorAlreadyRegistered:
        # This conductor was already registered and did not shut down
        # properly, so log a warning and update the record.
        LOG.warning(_LW("A conductor with hostname %(hostname)s was "
                        "previously registered. Updating registration"),
                    {'hostname': self.host})
        cdr = self.dbapi.register_conductor({'hostname': self.host,
                                             'drivers': driver_names},
                                            update_existing=True)
    self.conductor = cdr

    # Start periodic tasks
    self._periodic_tasks_worker = self._executor.submit(
        self._periodic_tasks.start, allow_empty=True)
    self._periodic_tasks_worker.add_done_callback(
        self._on_periodic_tasks_stop)

    # NOTE(lucasagomes): If the conductor server dies abruptly
    # mid-deployment (OOM killer, power outage, etc.) we cannot resume
    # the deployment even if the conductor comes back online. Clearing
    # the nodes' reservations
    # (dbapi.clear_node_reservations_for_conductor) is not enough to
    # unstick them, so let's gracefully fail the deployment so the node
    # can go through the steps (deleting & cleaning) to make itself
    # available again.
    filters = {'reserved': False, 'provision_state': states.DEPLOYING}
    last_error = (_("The deployment can't be resumed by conductor "
                    "%s. Moving to fail state.") % self.host)
    self._fail_if_in_state(ironic_context.get_admin_context(), filters,
                           states.DEPLOYING, 'provision_updated_at',
                           last_error=last_error)

    # Spawn a dedicated greenthread for the keepalive
    try:
        self._spawn_worker(self._conductor_service_record_keepalive)
        LOG.info(_LI('Successfully started conductor with hostname '
                     '%(hostname)s.'),
                 {'hostname': self.host})
    except exception.NoFreeConductorWorker:
        with excutils.save_and_reraise_exception():
            LOG.critical(_LC('Failed to start keepalive'))
            self.del_host()

    self._started = True
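# A hedged sketch of what _collect_periodic_tasks gathers in the version
# above: futurist marks periodic tasks with the @periodics.periodic
# decorator, and driver-interface tasks are invoked with this manager plus
# the admin context, matching the (self, admin_context) args tuple passed
# above. MyVendorInterface and sync_power_metrics are hypothetical names,
# not real Ironic classes.
from futurist import periodics


class MyVendorInterface(object):
    @periodics.periodic(spacing=60)
    def sync_power_metrics(self, manager, context):
        # runs every 60 seconds on the conductor's shared executor
        pass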
def init_host(self):
    self.dbapi = dbapi.get_instance()

    self._keepalive_evt = threading.Event()
    """Event for the keepalive thread."""

    self._worker_pool = greenpool.GreenPool(
        size=CONF.conductor.workers_pool_size)
    """GreenPool of background workers for performing tasks async."""

    self.ring_manager = hash.HashRingManager()
    """Consistent hash ring which maps drivers to conductors."""

    # NOTE(deva): instantiating DriverFactory may raise DriverLoadError
    # or DriverNotFound
    self._driver_factory = driver_factory.DriverFactory()
    """Driver factory loads all enabled drivers."""

    self.drivers = self._driver_factory.names
    """List of driver names which this conductor supports."""

    if not self.drivers:
        msg = _LE("Conductor %s cannot be started because no drivers "
                  "were loaded. This could be because no drivers were "
                  "specified in 'enabled_drivers' config option.")
        LOG.error(msg, self.host)
        raise exception.NoDriversLoaded(conductor=self.host)

    # Collect driver-specific periodic tasks
    for driver_obj in driver_factory.drivers().values():
        self._collect_periodic_tasks(driver_obj)
        for iface_name in (driver_obj.core_interfaces +
                           driver_obj.standard_interfaces +
                           ['vendor']):
            iface = getattr(driver_obj, iface_name, None)
            if iface:
                self._collect_periodic_tasks(iface)

    # clear all locks held by this conductor before registering
    self.dbapi.clear_node_reservations_for_conductor(self.host)
    try:
        # Register this conductor with the cluster
        cdr = self.dbapi.register_conductor({'hostname': self.host,
                                             'drivers': self.drivers})
    except exception.ConductorAlreadyRegistered:
        # This conductor was already registered and did not shut down
        # properly, so log a warning and update the record.
        LOG.warning(_LW("A conductor with hostname %(hostname)s was "
                        "previously registered. Updating registration"),
                    {'hostname': self.host})
        cdr = self.dbapi.register_conductor({'hostname': self.host,
                                             'drivers': self.drivers},
                                            update_existing=True)
    self.conductor = cdr

    # NOTE(lucasagomes): If the conductor server dies abruptly
    # mid-deployment (OOM killer, power outage, etc.) we cannot resume
    # the deployment even if the conductor comes back online. Clearing
    # the nodes' reservations
    # (dbapi.clear_node_reservations_for_conductor) is not enough to
    # unstick them, so let's gracefully fail the deployment so the node
    # can go through the steps (deleting & cleaning) to make itself
    # available again.
    filters = {'reserved': False, 'provision_state': states.DEPLOYING}
    last_error = (_("The deployment can't be resumed by conductor "
                    "%s. Moving to fail state.") % self.host)
    self._fail_if_in_state(ironic_context.get_admin_context(), filters,
                           states.DEPLOYING, 'provision_updated_at',
                           last_error=last_error)

    # Spawn a dedicated greenthread for the keepalive
    try:
        self._spawn_worker(self._conductor_service_record_keepalive)
        LOG.info(_LI('Successfully started conductor with hostname '
                     '%(hostname)s.'),
                 {'hostname': self.host})
    except exception.NoFreeConductorWorker:
        with excutils.save_and_reraise_exception():
            LOG.critical(_LC('Failed to start keepalive'))
            self.del_host()
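# A hedged sketch (an assumed illustration, not the Ironic helper verbatim)
# of how the GreenPool above backs _spawn_worker in this older version:
# GreenPool.spawn() blocks when the pool is exhausted, so checking free()
# first turns "no capacity" into an exception (NoFreeConductorWorker in
# Ironic) instead of silently blocking the caller. The names
# _spawn_worker_sketch and do_keepalive are hypothetical.
from eventlet import greenpool


def _spawn_worker_sketch(pool, func, *args, **kwargs):
    if pool.free():
        # spawn() returns a GreenThread; callers may wait() on it
        return pool.spawn(func, *args, **kwargs)
    raise RuntimeError('no free conductor worker')


def do_keepalive():
    return 'keepalive'


pool = greenpool.GreenPool(size=4)
gt = _spawn_worker_sketch(pool, do_keepalive)
print(gt.wait())  # -> 'keepalive'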
def init_host(self, admin_context=None):
    """Initialize the conductor host.

    :param admin_context: the admin context to pass to periodic tasks.
    :raises: RuntimeError when conductor is already running.
    :raises: NoDriversLoaded when no drivers are enabled on the conductor.
    :raises: DriverNotFound if a driver is enabled that does not exist.
    :raises: DriverLoadError if an enabled driver cannot be loaded.
    :raises: DriverNameConflict if a classic driver and a dynamic driver
             are both enabled and have the same name.
    """
    if self._started:
        raise RuntimeError(_('Attempt to start an already running '
                             'conductor manager'))

    self.dbapi = dbapi.get_instance()

    self._keepalive_evt = threading.Event()
    """Event for the keepalive thread."""

    # TODO(dtantsur): make the threshold configurable?
    rejection_func = rejection.reject_when_reached(
        CONF.conductor.workers_pool_size)
    self._executor = futurist.GreenThreadPoolExecutor(
        max_workers=CONF.conductor.workers_pool_size,
        check_and_reject=rejection_func)
    """Executor for performing tasks async."""

    self.ring_manager = hash_ring.HashRingManager()
    """Consistent hash ring which maps drivers to conductors."""

    _check_enabled_interfaces()

    # NOTE(deva): these calls may raise DriverLoadError or DriverNotFound
    # NOTE(vdrok): Instantiate the network and storage interface factories
    # on startup so that all the interfaces are loaded at the very
    # beginning, and failures prevent the conductor from starting.
    drivers = driver_factory.drivers()
    hardware_types = driver_factory.hardware_types()
    driver_factory.NetworkInterfaceFactory()
    driver_factory.StorageInterfaceFactory()

    # NOTE(jroll) this is passed to the dbapi, which requires a list, not
    # a generator (which keys() returns in py3)
    driver_names = list(drivers)
    hardware_type_names = list(hardware_types)

    # check that at least one driver is loaded, whether classic or dynamic
    if not driver_names and not hardware_type_names:
        msg = _LE("Conductor %s cannot be started because no drivers "
                  "were loaded. This could be because no classic drivers "
                  "were specified in the 'enabled_drivers' config option "
                  "and no dynamic drivers were specified in the "
                  "'enabled_hardware_types' config option.")
        LOG.error(msg, self.host)
        raise exception.NoDriversLoaded(conductor=self.host)

    # check for name clashes between classic and dynamic drivers
    name_clashes = set(driver_names).intersection(hardware_type_names)
    if name_clashes:
        name_clashes = ', '.join(name_clashes)
        msg = _LE("Conductor %(host)s cannot be started because there "
                  "are one or more name conflicts between classic "
                  "drivers and dynamic drivers (%(names)s). Check any "
                  "external driver plugins and the 'enabled_drivers' and "
                  "'enabled_hardware_types' config options.")
        LOG.error(msg, {'host': self.host, 'names': name_clashes})
        raise exception.DriverNameConflict(names=name_clashes)

    # Collect driver-specific periodic tasks.
    # Conductor periodic tasks accept a context argument; driver periodic
    # tasks accept this manager and a context. We have to ensure that the
    # same driver interface class is not traversed twice, otherwise
    # we'll have several instances of the same task.
    LOG.debug('Collecting periodic tasks')
    self._periodic_task_callables = []
    periodic_task_classes = set()
    self._collect_periodic_tasks(self, (admin_context,))
    for driver_obj in drivers.values():
        for iface_name in driver_obj.all_interfaces:
            iface = getattr(driver_obj, iface_name, None)
            if iface and iface.__class__ not in periodic_task_classes:
                self._collect_periodic_tasks(iface, (self, admin_context))
                periodic_task_classes.add(iface.__class__)

    if (len(self._periodic_task_callables) >
            CONF.conductor.workers_pool_size):
        LOG.warning(_LW('This conductor has %(tasks)d periodic tasks '
                        'enabled, but only %(workers)d task workers '
                        'allowed by [conductor]workers_pool_size option'),
                    {'tasks': len(self._periodic_task_callables),
                     'workers': CONF.conductor.workers_pool_size})

    self._periodic_tasks = periodics.PeriodicWorker(
        self._periodic_task_callables,
        executor_factory=periodics.ExistingExecutor(self._executor))

    # clear target_power_state on nodes locked by this conductor
    self.dbapi.clear_node_target_power_state(self.host)
    # clear all locks held by this conductor before registering
    self.dbapi.clear_node_reservations_for_conductor(self.host)
    try:
        # Register this conductor with the cluster
        self.conductor = objects.Conductor.register(
            admin_context, self.host, driver_names)
    except exception.ConductorAlreadyRegistered:
        # This conductor was already registered and did not shut down
        # properly, so log a warning and update the record.
        LOG.warning(_LW("A conductor with hostname %(hostname)s was "
                        "previously registered. Updating registration"),
                    {'hostname': self.host})
        self.conductor = objects.Conductor.register(
            admin_context, self.host, driver_names, update_existing=True)

    # register hardware types and interfaces supported by this conductor
    # and validate them against other conductors
    try:
        self._register_and_validate_hardware_interfaces(hardware_types)
    except (exception.DriverLoadError, exception.DriverNotFound,
            exception.ConductorHardwareInterfacesAlreadyRegistered,
            exception.InterfaceNotFoundInEntrypoint,
            exception.NoValidDefaultForInterface) as e:
        with excutils.save_and_reraise_exception():
            LOG.error(_LE('Failed to register hardware types. %s'), e)
            self.del_host()

    # Start periodic tasks
    self._periodic_tasks_worker = self._executor.submit(
        self._periodic_tasks.start, allow_empty=True)
    self._periodic_tasks_worker.add_done_callback(
        self._on_periodic_tasks_stop)

    # NOTE(lucasagomes): If the conductor server dies abruptly
    # mid-deployment (OOM killer, power outage, etc.) we cannot resume
    # the deployment even if the conductor comes back online. Clearing
    # the nodes' reservations
    # (dbapi.clear_node_reservations_for_conductor) is not enough to
    # unstick them, so let's gracefully fail the deployment so the node
    # can go through the steps (deleting & cleaning) to make itself
    # available again.
    filters = {'reserved': False, 'provision_state': states.DEPLOYING}
    last_error = (_("The deployment can't be resumed by conductor "
                    "%s. Moving to fail state.") % self.host)
    self._fail_if_in_state(ironic_context.get_admin_context(), filters,
                           states.DEPLOYING, 'provision_updated_at',
                           last_error=last_error)

    # Start consoles, if enabled, in a greenthread.
    try:
        self._spawn_worker(self._start_consoles,
                           ironic_context.get_admin_context())
    except exception.NoFreeConductorWorker:
        LOG.warning(_LW('Failed to start worker for restarting '
                        'consoles.'))

    # Spawn a dedicated greenthread for the keepalive
    try:
        self._spawn_worker(self._conductor_service_record_keepalive)
        LOG.info(_LI('Successfully started conductor with hostname '
                     '%(hostname)s.'),
                 {'hostname': self.host})
    except exception.NoFreeConductorWorker:
        with excutils.save_and_reraise_exception():
            LOG.critical(_LC('Failed to start keepalive'))
            self.del_host()

    self._started = True
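# A hedged sketch (an assumed illustration, not Ironic code) of the
# check_and_reject wiring used above: futurist's reject_when_reached()
# returns a callable that makes submit() raise RejectedSubmission once the
# executor's backlog hits the limit; Ironic's _spawn_worker translates that
# into NoFreeConductorWorker. The pool and backlog sizes below are
# illustrative.
import time

import futurist
from futurist import rejection

executor = futurist.GreenThreadPoolExecutor(
    max_workers=1,
    check_and_reject=rejection.reject_when_reached(1))
try:
    for _ in range(5):
        executor.submit(time.sleep, 0.1)
except futurist.RejectedSubmission:
    print('backlog full; the conductor would raise NoFreeConductorWorker')
executor.shutdown()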