def node_lock_acquire(context, node_id, action_id, engine=None, forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations.
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the node - if the returned owner_id is the
    #         action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Lazy formatting: let the logging layer interpolate only when
        # the debug level is enabled.
        LOG.debug('Acquire lock for node %s again', node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # Step 4: if this node is locked by an action whose owner engine is
    # dead, mark that action failed and steal the lock.
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The node %(n)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'n': node_id, 'a': owner})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        # BUG FIX: verify the steal actually succeeded instead of
        # returning True unconditionally, mirroring the check done by
        # cluster_lock_acquire after a steal.
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    LOG.error(_LE('Node is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': owner, 'new': action_id})
    return False
def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
                         scope=CLUSTER_SCOPE, forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of lock, could be cluster wide lock, or node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # First attempt: success is signalled by our action id appearing in
    # the returned owner list.
    holders = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in holders:
        return True

    # Retry a configurable number of times, sleeping between attempts.
    max_tries = cfg.CONF.lock_retry_times
    interval = cfg.CONF.lock_retry_interval
    for _ in range(max_tries):
        scheduler.sleep(interval)
        LOG.debug('Acquire lock for cluster %s again' % cluster_id)
        holders = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in holders:
            return True

    # Forced locking is the last resort once retries are exhausted.
    if forced:
        holders = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in holders

    # Will reach here only because scope == CLUSTER_SCOPE.
    # NOTE(review): this assumes `holders` is non-empty when the lock was
    # not acquired — confirm against db_api.cluster_lock_acquire semantics.
    current = db_api.action_get(context, holders[0])
    owner_is_dead = (current and current.owner and
                     current.owner != engine and
                     is_engine_dead(context, current.owner))
    if owner_is_dead:
        LOG.info(_LI('The cluster %(c)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'c': cluster_id, 'a': holders[0]})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, current.id, time.time(),
                                  reason=reason)
        holders = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in holders

    LOG.error(_LE('Cluster is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': str(holders), 'new': action_id})
    return False
def node_lock_acquire(context, node_id, action_id, engine=None, forced=False):
    """Try to lock the specified node.

    :param context: the context used for DB operations.
    :param node_id: ID of the node to be locked.
    :param action_id: ID of the action that attempts to lock the node.
    :param engine: ID of the engine that attempts to lock the node.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: try lock the node - if the returned owner_id is the
    #         action id, it was a success
    owner = db_api.node_lock_acquire(node_id, action_id)
    if action_id == owner:
        return True

    # Step 2: retry using global configuration options
    retries = cfg.CONF.lock_retry_times
    retry_interval = cfg.CONF.lock_retry_interval

    while retries > 0:
        scheduler.sleep(retry_interval)
        # Lazy logging args instead of eager '%' interpolation.
        LOG.debug('Acquire lock for node %s again', node_id)
        owner = db_api.node_lock_acquire(node_id, action_id)
        if action_id == owner:
            return True
        retries = retries - 1

    # Step 3: Last resort is 'forced locking', only needed when retry failed
    if forced:
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    # Step 4: handle the case where the node is locked by an action owned
    # by a dead engine: fail that action and steal the lock.
    action = db_api.action_get(context, owner)
    if (action and action.owner and action.owner != engine and
            is_engine_dead(context, action.owner)):
        LOG.info(_LI('The node %(n)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'n': node_id, 'a': owner})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, action.id, time.time(),
                                  reason=reason)
        # BUG FIX: check the steal result rather than assuming success,
        # consistent with the cluster lock counterpart.
        owner = db_api.node_lock_steal(node_id, action_id)
        return action_id == owner

    LOG.error(_LE('Node is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': owner, 'new': action_id})
    return False
def set_status(self, result, reason=None):
    """Set action status based on return value from execute."""
    now = wallclock()

    if result == self.RES_OK:
        status = self.SUCCEEDED
        db_api.action_mark_succeeded(self.context, self.id, now)
    elif result in (self.RES_ERROR, self.RES_TIMEOUT):
        # Both ERROR and TIMEOUT map to FAILED; only the default
        # reason string differs.
        status = self.FAILED
        fallback = 'ERROR' if result == self.RES_ERROR else 'TIMEOUT'
        db_api.action_mark_failed(self.context, self.id, now,
                                  reason=reason or fallback)
    elif result == self.RES_CANCEL:
        status = self.CANCELLED
        db_api.action_mark_cancelled(self.context, self.id, now)
    else:  # result == self.RES_RETRY
        status = self.READY
        # Action failed at the moment, but can be retried.
        # We abandon it and then notify other dispatchers to execute it.
        db_api.action_abandon(self.context, self.id)

    # Emit an event whose severity reflects the outcome.
    if status == self.SUCCEEDED:
        EVENT.info(self.context, self, self.action, status, reason)
    elif status == self.READY:
        EVENT.warning(self.context, self, self.action, status, reason)
    else:
        EVENT.error(self.context, self, self.action, status, reason)

    self.status = status
    self.status_reason = reason
def set_status(self, result, reason=None):
    """Set action status based on return value from execute."""
    ts = wallclock()

    if result == self.RES_OK:
        status = self.SUCCEEDED
        msg = _LI('Action %(name)s [%(id)s] completed with SUCCESS.')
        db_api.action_mark_succeeded(self.context, self.id, ts)
    elif result == self.RES_ERROR:
        status = self.FAILED
        msg = _LI('Action %(name)s [%(id)s] failed with ERROR.')
        db_api.action_mark_failed(self.context, self.id, ts,
                                  reason=reason or 'ERROR')
    elif result == self.RES_TIMEOUT:
        status = self.FAILED
        msg = _LI('Action %(name)s [%(id)s] failed with TIMEOUT.')
        db_api.action_mark_failed(self.context, self.id, ts,
                                  reason=reason or 'TIMEOUT')
    elif result == self.RES_CANCEL:
        status = self.CANCELLED
        msg = _LI('Action %(name)s [%(id)s] was cancelled.')
        db_api.action_mark_cancelled(self.context, self.id, ts)
    else:  # result == self.RES_RETRY
        status = self.READY
        msg = _LI('Action %(name)s [%(id)s] aborted with RETRY.')
        # The action can be retried: abandon it so that other
        # dispatchers may pick it up and execute it.
        db_api.action_abandon(self.context, self.id)

    LOG.info(msg, {'name': self.action, 'id': self.id, 'status': status})
    self.status = status
    self.status_reason = reason
def mark_failed(cls, context, action_id, timestamp, reason=None):
    """Mark the specified action as FAILED in the database.

    :param context: request context for the DB operation.
    :param action_id: ID of the action to be marked failed.
    :param timestamp: wallclock time at which the failure occurred.
    :param reason: optional string describing why the action failed.
    :returns: result of the underlying DB API call.
    """
    # Pass `reason` as a keyword argument for consistency with the other
    # db_api.action_mark_failed call sites in this module.
    return db_api.action_mark_failed(context, action_id, timestamp,
                                     reason=reason)
def mark_failed(cls, context, action_id, timestamp, reason=None):
    """Mark the specified action as FAILED in the database.

    :param context: request context for the DB operation.
    :param action_id: ID of the action to be marked failed.
    :param timestamp: wallclock time at which the failure occurred.
    :param reason: optional string describing why the action failed;
                   defaults to None so existing callers are unaffected.
    :returns: result of the underlying DB API call.
    """
    # Generalized: forward an optional failure reason, matching the
    # richer mark_failed variant and db_api.action_mark_failed usage
    # elsewhere, instead of silently dropping it.
    return db_api.action_mark_failed(context, action_id, timestamp,
                                     reason=reason)
def cluster_lock_acquire(context, cluster_id, action_id, engine=None,
                         scope=CLUSTER_SCOPE, forced=False):
    """Try to lock the specified cluster.

    :param cluster_id: ID of the cluster to be locked.
    :param action_id: ID of the action which wants to lock the cluster.
    :param engine: ID of the engine which wants to lock the cluster.
    :param scope: scope of lock, could be cluster wide lock, or node-wide
                  lock.
    :param forced: set to True to cancel current action that owns the lock,
                   if any.
    :returns: True if lock is acquired, or False otherwise.
    """
    # Step 1: a single attempt; our action id in the returned owner list
    # means the lock is ours.
    owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
    if action_id in owners:
        return True

    # Step 2: keep retrying as configured, pausing between attempts.
    remaining = cfg.CONF.lock_retry_times
    pause = cfg.CONF.lock_retry_interval
    while remaining > 0:
        remaining -= 1
        scheduler.sleep(pause)
        LOG.debug('Acquire lock for cluster %s again' % cluster_id)
        owners = db_api.cluster_lock_acquire(cluster_id, action_id, scope)
        if action_id in owners:
            return True

    # Step 3: when asked to force the issue, steal the lock outright.
    if forced:
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    # Will reach here only because scope == CLUSTER_SCOPE
    blocking = db_api.action_get(context, owners[0])
    if (blocking and blocking.owner and blocking.owner != engine and
            is_engine_dead(context, blocking.owner)):
        # The holder's engine died: fail that action, then steal.
        LOG.info(_LI('The cluster %(c)s is locked by dead action %(a)s, '
                     'try to steal the lock.'),
                 {'c': cluster_id, 'a': owners[0]})
        reason = _('Engine died when executing this action.')
        db_api.action_mark_failed(context, blocking.id, time.time(),
                                  reason=reason)
        owners = db_api.cluster_lock_steal(cluster_id, action_id)
        return action_id in owners

    LOG.error(_LE('Cluster is already locked by action %(old)s, '
                  'action %(new)s failed grabbing the lock'),
              {'old': str(owners), 'new': action_id})
    return False