class ResourceRef(object):
    """
    A reference to a resource. Can be used to conveniently modify an
    owned resource.

    This object will auto release the referenced resource unless autorelease
    is set to `False`.
    """
    _log = logging.getLogger("Storage.ResourceManager.ResourceRef")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))

    # States whether this reference is pointing to an owned reference
    isValid = property(lambda self: self._isValid)

    def __init__(self, namespace, name, wrappedObject=None, resRefID=None):
        # BUGFIX: the default used to be ``resRefID=str(uuid4())``, which is
        # evaluated only once at function-definition time, so every reference
        # created without an explicit ID shared the same UUID.  Use a None
        # sentinel and generate a fresh UUID per instance instead.
        if resRefID is None:
            resRefID = str(uuid4())
        self._namespace = namespace
        self._name = name
        self._log = SimpleLogAdapter(self._log, {
            "ResName": self.fullName,
            "ResRefID": resRefID
        })

        self.__wrappedObject = wrappedObject
        if wrappedObject is not None:
            self.__wrapObj()

        self.autoRelease = True
        self._isValid = True
        self._syncRoot = misc.RWLock()

    def __wrapObj(self):
        # Expose the wrapped resource's methods through proxies that check
        # validity first.  'close' and 'switchLockType' are deliberately not
        # proxied, and existing attributes are never shadowed.
        for attr in dir(self.__wrappedObject):
            if hasattr(self, attr) or attr in ('close', 'switchLockType'):
                continue

            setattr(self, attr, partial(self.__methodProxy, attr))

    def __methodProxy(self, attr, *args, **kwargs):
        # Shared lock: many proxied calls may run concurrently, but none may
        # overlap with release() which takes the lock exclusively.
        with self._syncRoot.shared:
            if not self.isValid:
                raise se.ResourceReferenceInvalid

            return getattr(self.__wrappedObject, attr)(*args, **kwargs)

    def release(self):
        """Release the underlying resource and invalidate this reference."""
        with self._syncRoot.exclusive:
            self.__wrappedObject = None
            if not self._isValid:
                self._log.warn("Tried to re-release a resource. Request "
                               "ignored.")
                return

            ResourceManager.getInstance().releaseResource(
                self.namespace, self.name)
            self._isValid = False

    def getStatus(self):
        return ResourceManager.getInstance().getResourceStatus(
            self.namespace, self.name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()

    def __del__(self):
        if self._isValid and self.autoRelease:
            def release(log, namespace, name):
                log.warn("Resource reference was not properly released. "
                         "Autoreleasing.")
                # In Python, objects are refcounted and are deleted
                # immediately when the last reference is freed. This means
                # the __del__ method can be called inside of any context. The
                # releaseResource method we use tries to acquire locks. So we
                # might try to acquire the lock in a locked context and reach
                # a deadlock. This is why the release is deferred to a
                # separate thread, which runs in a different context.
                ResourceManager.getInstance().releaseResource(namespace, name)

            threading.Thread(target=release, args=(self._log, self.namespace,
                                                   self.name)).start()
            self._isValid = False

    def __repr__(self):
        return "< ResourceRef '%s', isValid: '%s' obj: '%s'>" % (
            self.fullName, self.isValid,
            repr(self.__wrappedObject) if self.isValid else None)
class Request(object):
    """
    Internal request object, don't use directly.

    Represents one pending lock request on a resource.  A request is
    "processed" once it has been either granted or canceled; after that,
    cancel() and grant() raise RequestAlreadyProcessedError.
    """
    _log = logging.getLogger("Storage.ResourceManager.Request")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))
    lockType = property(lambda self: self._lockType)
    syncRoot = property(lambda self: self._syncRoot)

    def __init__(self, namespace, name, lockType, callback):
        # RLock: state transitions (cancel/grant) and status queries must be
        # mutually exclusive; reentrant so a callback may query the request.
        self._syncRoot = threading.RLock()
        self._namespace = namespace
        self._name = name
        self._lockType = lockType
        self._isActive = True
        self._isCanceled = False
        self._doneEvent = threading.Event()
        self._callback = callback
        self.reqID = str(uuid4())
        self._log = SimpleLogAdapter(self._log, {
            "ResName": self.fullName,
            "ReqID": self.reqID
        })
        # Because findCaller is expensive. We make sure it will be printed
        # before calculating it
        if logging.getLogger("Storage.ResourceManager.ResourceRef").\
                isEnabledFor(logging.WARN):
            createdAt = misc.findCaller(ignoreSourceFiles=[__file__],
                                        logSkipName="ResourceManager")
            self._log.debug("Request was made in '%s' line '%d' at '%s'",
                            *createdAt)

    def cancel(self):
        """Cancel a still-pending request and notify the callback.

        Raises RequestAlreadyProcessedError if the request was already
        granted or canceled.
        """
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to cancel a processed request")
                raise RequestAlreadyProcessedError("Cannot cancel a processed "
                                                   "request")

            self._isActive = False
            self._isCanceled = True

            self._log.debug("Canceled request")
            try:
                # Callback is user code: shield the manager from its errors.
                self._callback(RequestRef(self), None)
            except Exception:
                self._log.warn("Request callback threw an exception",
                               exc_info=True)
            self._callback = None
            self._doneEvent.set()

    def _status(self):
        # Human-readable state used by __str__.
        with self._syncRoot:
            if self._isCanceled:
                return "canceled"
            if self._doneEvent.isSet():
                return "granted"
        return "waiting"

    def canceled(self):
        return self._isCanceled

    def grant(self):
        """Mark a still-pending request as granted.

        Raises RequestAlreadyProcessedError if it was already processed.
        Note: the callback is invoked separately via emit().
        """
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to grant a processed request")
                raise RequestAlreadyProcessedError("Cannot grant a processed "
                                                   "request")

            self._isActive = False

            self._log.debug("Granted request")
            self._doneEvent.set()

    def emit(self, resource):
        # Hand the acquired resource to the requester's callback.
        try:
            ref = RequestRef(self)
            self._callback(ref, resource)
        except Exception:
            self._log.warn("Request callback threw an exception",
                           exc_info=True)

    def wait(self, timeout=None):
        # Block until the request is processed (granted or canceled).
        return self._doneEvent.wait(timeout)

    def granted(self):
        with self._syncRoot:
            return (not self._isCanceled) and self._doneEvent.isSet()

    def __str__(self):
        return "Request for %s - %s: %s" % (self.fullName, self.lockType,
                                            self._status())
class ResourceRef(object):
    """
    A reference to a resource. Can be used to conveniently modify an
    owned resource.

    This object will auto release the referenced resource unless autorelease
    is set to `False`.
    """
    _log = logging.getLogger("ResourceManager.ResourceRef")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))

    # States whether this reference is pointing to an owned reference
    isValid = property(lambda self: self._isValid)

    def __init__(self, namespace, name, wrappedObject=None, resRefID=None):
        # BUGFIX: the default used to be ``resRefID=str(uuid4())``, which is
        # evaluated only once at function-definition time, so every reference
        # created without an explicit ID shared the same UUID.  Use a None
        # sentinel and generate a fresh UUID per instance instead.
        if resRefID is None:
            resRefID = str(uuid4())
        self._namespace = namespace
        self._name = name
        self._log = SimpleLogAdapter(self._log, {"ResName": self.fullName,
                                                 "ResRefID": resRefID})

        self.__wrappedObject = wrappedObject
        if wrappedObject is not None:
            self.__wrapObj()

        self.autoRelease = True
        self._isValid = True
        self._syncRoot = misc.RWLock()

    def __wrapObj(self):
        # Proxy the wrapped resource's methods, except lifecycle methods and
        # anything that would shadow an existing attribute.
        for attr in dir(self.__wrappedObject):
            if hasattr(self, attr) or attr in ('close', 'switchLockType'):
                continue

            setattr(self, attr, partial(self.__methodProxy, attr))

    def __methodProxy(self, attr, *args, **kwargs):
        # Shared lock: concurrent proxied calls are fine, but none may
        # overlap with release(), which takes the lock exclusively.
        with self._syncRoot.shared:
            if not self.isValid:
                raise se.ResourceReferenceInvalid

            return getattr(self.__wrappedObject, attr)(*args, **kwargs)

    def release(self):
        """Release the underlying resource and invalidate this reference."""
        with self._syncRoot.exclusive:
            self.__wrappedObject = None
            if not self._isValid:
                self._log.warn("Tried to re-release a resource. Request ignored.")
                return

            ResourceManager.getInstance().releaseResource(self.namespace,
                                                          self.name)
            self._isValid = False

    def getStatus(self):
        return ResourceManager.getInstance().getResourceStatus(self.namespace,
                                                               self.name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()

    def __del__(self):
        if self._isValid and self.autoRelease:
            def release(log, namespace, name):
                # Typo fixed in the log message ("referece").
                log.warn("Resource reference was not properly released. "
                         "Autoreleasing.")
                # In Python, objects are refcounted and are deleted
                # immediately when the last reference is freed. This means
                # the __del__ method can be called inside of any context.
                # The releaseResource method we use tries to acquire locks.
                # So we might try to acquire the lock in a locked context and
                # reach a deadlock. This is why the release is deferred to a
                # separate thread, which runs in a different context.
                ResourceManager.getInstance().releaseResource(namespace, name)

            threading.Thread(target=release, args=(self._log, self.namespace,
                                                   self.name)).start()
            self._isValid = False

    def __repr__(self):
        return "< ResourceRef '%s', isValid: '%s' obj: '%s'>" % (
            self.fullName, self.isValid,
            repr(self.__wrappedObject) if self.isValid else None)
class Request(object):
    """
    Internal request object, don't use directly.

    Represents one pending lock request on a resource.  Once granted or
    canceled the request is "processed" and cancel()/grant() raise
    RequestAlreadyProcessedError.
    """
    _log = logging.getLogger("ResourceManager.Request")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))
    lockType = property(lambda self: self._lockType)
    syncRoot = property(lambda self: self._syncRoot)

    def __init__(self, namespace, name, lockType, callback):
        # RLock: state transitions and status queries must be mutually
        # exclusive; reentrant so a callback may query the request.
        self._syncRoot = threading.RLock()
        self._namespace = namespace
        self._name = name
        self._lockType = lockType
        self._isActive = True
        self._isCanceled = False
        self._doneEvent = threading.Event()
        self._callback = callback
        self.reqID = str(uuid4())
        self._log = SimpleLogAdapter(self._log, {"ResName": self.fullName,
                                                 "ReqID": self.reqID})
        # Because findCaller is expensive. We make sure it will be printed
        # before calculating it
        if logging.getLogger("ResourceManager.ResourceRef").isEnabledFor(logging.WARN):
            createdAt = misc.findCaller(ignoreSourceFiles=[__file__],
                                        logSkipName="ResourceManager")
            self._log.debug("Request was made in '%s' line '%d' at '%s'",
                            *createdAt)

    def cancel(self):
        """Cancel a still-pending request and notify the callback.

        Raises RequestAlreadyProcessedError if already processed.
        """
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to cancel a processed request")
                raise RequestAlreadyProcessedError("Cannot cancel a processed request")

            self._isActive = False
            self._isCanceled = True

            self._log.debug("Canceled request")
            try:
                # Callback is user code: shield the manager from its errors.
                self._callback(RequestRef(self), None)
            except Exception:
                self._log.warn("Request callback threw an exception",
                               exc_info=True)
            self._callback = None
            self._doneEvent.set()

    def _status(self):
        # Human-readable state used by __str__.
        with self._syncRoot:
            if self._isCanceled:
                return "canceled"
            if self._doneEvent.isSet():
                return "granted"
        return "waiting"

    def canceled(self):
        return self._isCanceled

    def grant(self):
        """Mark a still-pending request as granted.

        Raises RequestAlreadyProcessedError if already processed.
        Note: the callback is invoked separately via emit().
        """
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to grant a processed request")
                raise RequestAlreadyProcessedError("Cannot grant a processed request")

            self._isActive = False

            self._log.debug("Granted request")
            self._doneEvent.set()

    def emit(self, resource):
        # Hand the acquired resource to the requester's callback.
        try:
            ref = RequestRef(self)
            self._callback(ref, resource)
        except Exception:
            self._log.warn("Request callback threw an exception",
                           exc_info=True)

    def wait(self, timeout=None):
        # Block until the request is processed (granted or canceled).
        return self._doneEvent.wait(timeout)

    def granted(self):
        with self._syncRoot:
            return (not self._isCanceled) and self._doneEvent.isSet()

    def __str__(self):
        return "Request for %s - %s: %s" % (self.fullName, self.lockType,
                                            self._status())
class Task:
    """
    An asynchronous storage task: a sequence of jobs with optional recovery
    steps, persisted to disk so it can be recovered after a restart.

    The task moves through a state machine (see the __state_* handlers);
    _updateState() dispatches to the handler of the state entered.
    """
    # External Task info
    fields = {
        # field_name: type
        "id": str,
        "name": unicode,
        "tag": unicode,
        "store": unicode,
        "recoveryPolicy": TaskRecoveryType,
        "persistPolicy": TaskPersistType,
        "cleanPolicy": TaskCleanType,
        "priority": TaskPriority,
        "state": State,
        "njobs": int,
        "nrecoveries": int,
        "metadataVersion": int
    }

    log = logging.getLogger('Storage.TaskManager.Task')

    def __init__(self, id, name="", tag="",
                 recovery=TaskRecoveryType.none,
                 priority=TaskPriority.low):
        """
        id - Unique ID
        name - human readable name
        persist - persistency type: auto-clean/manual-clean/not-persistent
        """
        if not id:
            id = str(uuid.uuid4())
        self.metadataVersion = TASK_METADATA_VERSION
        self.validateID(id)
        self.lock = threading.Lock()
        self.callbackLock = threading.Lock()
        self.id = str(id)
        self.name = name
        self.tag = tag
        self.priority = priority
        self.recoveryPolicy = recovery
        self.persistPolicy = TaskPersistType.none
        self.cleanPolicy = TaskCleanType.auto
        self.store = None
        self.defaultException = None

        self.state = State(State.init)
        self.result = TaskResult(0, "Task is initializing", "")

        # proxy() avoids a reference cycle between the task and its owner
        self.resOwner = resourceManager.Owner(proxy(self), raiseonfailure=True)
        self.error = se.TaskAborted("Unknown error encountered")
        self.mng = None
        self._aborting = False
        self._forceAbort = False
        self.ref = 0

        self.recoveries = []
        self.jobs = []
        self.nrecoveries = 0  # just utility count - used by save/load
        self.njobs = 0  # just utility count - used by save/load

        self.log = SimpleLogAdapter(self.log, {"Task": self.id})

    def __del__(self):
        def finalize(log, owner, taskDir):
            log.warn("Task was autocleaned")
            owner.releaseAll()
            if taskDir is not None:
                getProcPool().fileUtils.cleanupdir(taskDir)

        if not self.state.isDone():
            taskDir = None
            if (self.cleanPolicy == TaskCleanType.auto and
                    self.store is not None):
                taskDir = os.path.join(self.store, self.id)
            # Cleanup is deferred to a thread: __del__ may run in any
            # context and releaseAll() takes locks.
            threading.Thread(target=finalize,
                             args=(self.log, self.resOwner, taskDir)).start()

    def _done(self):
        # Terminal-state housekeeping: drop resources, optionally clean.
        self.resOwner.releaseAll()
        if self.cleanPolicy == TaskCleanType.auto:
            self.clean()

    # --- State handlers, dispatched by _updateState() --------------------

    def __state_preparing(self, fromState):
        pass

    def __state_blocked(self, fromState):
        pass

    def __state_acquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.queued)

    def __state_queued(self, fromState):
        try:
            self.mng.queue(self)
        except Exception as e:
            self._setError(e)
            self.stop()

    def __state_running(self, fromState):
        self._runJobs()

    def __state_finished(self, fromState):
        self._done()

    def __state_aborting(self, fromState):
        if self.ref > 1:
            return
        self.log.debug("_aborting: recover policy %s", self.recoveryPolicy)
        if self.recoveryPolicy == TaskRecoveryType.auto:
            self._updateState(State.racquiring)
        elif self.recoveryPolicy == TaskRecoveryType.none:
            self._updateState(State.failed)
        else:
            self._updateState(State.waitrecover)

    def __state_waitrecover(self, fromState):
        pass

    def __state_racquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.recovering)

    def __state_recovering(self, fromState):
        self._recover()

    def __state_raborting(self, fromState):
        if self.ref == 1:
            self._updateState(State.failed)
        else:
            self.log.warn("State was change to 'raborting' "
                          "when ref was not 1.")

    def __state_recovered(self, fromState):
        self._done()

    def __state_failed(self, fromState):
        self._done()

    def __state_cleaning(self, fromState):
        pass

    def _updateState(self, state, force=False):
        """Move to *state* (possibly redirected by a pending abort),
        persist if configured, then run the new state's handler."""
        fromState = self.state
        requestedState = state
        if self._aborting:
            if self.state.canAbort():
                state = State.aborting
            elif self.state.canAbortRecovery() and state != State.recovered:
                state = State.raborting
            self._aborting = False
        if requestedState == state:
            self.log.debug("moving from state %s -> state %s",
                           fromState, state)
        else:
            self.log.debug("moving from state %s -> state %s instead of %s",
                           fromState, state, requestedState)

        self.state.moveto(state, force)
        if self.persistPolicy == TaskPersistType.auto:
            try:
                self.persist()
            except Exception:
                self.log.warning("Task._updateState: failed persisting task"
                                 " %s", self.id, exc_info=True)

        # Dispatch to the (name-mangled) state handler.
        fn = getattr(self, "_Task__state_%s" % state)
        fn(fromState)

    def _updateResult(self, code, message, result):
        self.result.result = result
        self.result.code = code
        self.result.message = message

    @classmethod
    def validateID(cls, taskID):
        # "." is forbidden: task IDs become file names with "." separators.
        if not taskID or "." in taskID:
            raise se.InvalidParameterException("taskID", taskID)

    @classmethod
    def _loadMetaFile(cls, filename, obj, fields):
        """Load "field = value" lines from *filename* into *obj*,
        coercing each value with the type from *fields*."""
        try:
            for line in getProcPool().readLines(filename):
                # process current line
                line = line.encode('utf8')
                if line.find(KEY_SEPARATOR) < 0:
                    continue
                parts = line.split(KEY_SEPARATOR)
                if len(parts) != 2:
                    cls.log.warning("Task._loadMetaFile: %s - ignoring line"
                                    " '%s'", filename, line)
                    continue

                field = _eq_decode(parts[0].strip())
                value = _eq_decode(parts[1].strip())
                if field not in fields:
                    cls.log.warning("Task._loadMetaFile: %s - ignoring field"
                                    " %s in line '%s'", filename, field, line)
                    continue

                ftype = fields[field]
                setattr(obj, field, ftype(value))
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataLoadError(filename)

    @classmethod
    def _dump(cls, obj, fields):
        """Serialize *obj*'s *fields* into "field = value" lines,
        skipping fields that are missing or cannot be encoded."""
        lines = []
        for field in fields:
            try:
                value = unicode(getattr(obj, field))
            except AttributeError:
                cls.log.warning("object %s field %s not found" %
                                (obj, field), exc_info=True)
            else:
                try:
                    field = _eq_encode(field)
                    value = _eq_encode(value)
                except ValueError as e:
                    cls.log.warning("Object %s: Cannot encode field %s or "
                                    "value %s. Skipping field. %s",
                                    obj, field, value, e)
                else:
                    lines.append("%s %s %s" % (field, KEY_SEPARATOR, value))
        return lines

    @classmethod
    def _saveMetaFile(cls, filename, obj, fields):
        try:
            getProcPool().writeLines(filename,
                                     [l.encode('utf8') + "\n"
                                      for l in cls._dump(obj, fields)])
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataSaveError(filename)

    def _loadTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._loadMetaFile(taskFile, self, Task.fields)

    def _saveTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._saveMetaFile(taskFile, self, Task.fields)

    def _loadJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.jobs[n], Job.fields)

    def _saveJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.jobs[n], Job.fields)

    def _loadRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _saveRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _loadTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._loadMetaFile(taskFile, self.result, TaskResult.fields)

    def _saveTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._saveMetaFile(taskFile, self.result, TaskResult.fields)

    def _getResourcesKeyList(self, taskDir):
        keys = []
        for path in getProcPool().glob.glob(os.path.join(taskDir,
                                                         "*" + RESOURCE_EXT)):
            filename = os.path.basename(path)
            keys.append(filename[:filename.rfind(RESOURCE_EXT)])
        return keys

    def _load(self, storPath, ext=""):
        """Load task metadata, jobs and recoveries from its directory
        under *storPath* (must be in init state)."""
        self.log.debug("%s: load from %s, ext '%s'", self, storPath, ext)
        if self.state != State.init:
            raise se.TaskMetaDataLoadError("task %s - can't load self: "
                                           "not in init state" % self)
        taskDir = os.path.join(storPath, str(self.id) + str(ext))
        if not getProcPool().os.path.exists(taskDir):
            raise se.TaskDirError("load: no such task dir '%s'" % taskDir)
        oldid = self.id
        self._loadTaskMetaFile(taskDir)
        if self.id != oldid:
            raise se.TaskMetaDataLoadError("task %s: loaded file do not match"
                                           " id (%s != %s)" %
                                           (self, self.id, oldid))
        if self.state == State.finished:
            self._loadTaskResultMetaFile(taskDir)
        for jn in range(self.njobs):
            self.jobs.append(Job("load", None))
            self._loadJobMetaFile(taskDir, jn)
            self.jobs[jn].setOwnerTask(self)
        for rn in range(self.nrecoveries):
            self.recoveries.append(Recovery("load", "load", "load", "load",
                                            ""))
            self._loadRecoveryMetaFile(taskDir, rn)
            self.recoveries[rn].setOwnerTask(self)

    def _save(self, storPath):
        """Atomically persist the task: write into a temp dir, then swap
        it with the original via a backup dir."""
        origTaskDir = os.path.join(storPath, self.id)
        if not getProcPool().os.path.exists(origTaskDir):
            raise se.TaskDirError("_save: no such task dir '%s'" %
                                  origTaskDir)
        taskDir = os.path.join(storPath, self.id + TEMP_EXT)
        self.log.debug("_save: orig %s temp %s", origTaskDir, taskDir)
        if getProcPool().os.path.exists(taskDir):
            getProcPool().fileUtils.cleanupdir(taskDir)
        getProcPool().os.mkdir(taskDir)
        try:
            self.njobs = len(self.jobs)
            self.nrecoveries = len(self.recoveries)
            self._saveTaskMetaFile(taskDir)
            if self.state == State.finished:
                self._saveTaskResultMetaFile(taskDir)
            for jn in range(self.njobs):
                self._saveJobMetaFile(taskDir, jn)
            for rn in range(self.nrecoveries):
                self._saveRecoveryMetaFile(taskDir, rn)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            try:
                getProcPool().fileUtils.cleanupdir(taskDir)
            except Exception:
                # BUGFIX: was a bare "except:" which would also swallow
                # SystemExit/KeyboardInterrupt.
                self.log.warning("can't remove temp taskdir %s" % taskDir)
            raise se.TaskPersistError("%s persist failed: %s" % (self, e))
        # Make sure backup dir doesn't exist
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(origTaskDir, origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(taskDir, origTaskDir)
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().fileUtils.fsyncPath(origTaskDir)

    def _clean(self, storPath):
        taskDir = os.path.join(storPath, self.id)
        getProcPool().fileUtils.cleanupdir(taskDir)

    def _recoverDone(self):
        # protect against races with stop/abort
        self.log.debug("Recover Done: state %s", self.state)
        while True:
            try:
                if self.state == State.recovering:
                    self._updateState(State.recovered)
                elif self.state == State.raborting:
                    self._updateState(State.failed)
                return
            except se.TaskStateTransitionError:
                self.log.error("Unexpected error", exc_info=True)

    def _recover(self):
        """Run recovery steps (LIFO) until done or the task is aborted."""
        self.log.debug("_recover")
        if not self.state == State.recovering:
            raise se.TaskStateError("%s: _recover in state %s" %
                                    (self, self.state))
        try:
            while self.state == State.recovering:
                rec = self.popRecovery()
                self.log.debug("running recovery %s", rec)
                if not rec:
                    break
                self._run(rec.run)
        except Exception as e:
            self.log.warning("task %s: recovery failed: %s",
                             self, e, exc_info=True)
            # protect against races with stop/abort
            try:
                if self.state == State.recovering:
                    self._updateState(State.raborting)
            except se.TaskStateTransitionError:
                pass
        self._recoverDone()

    def resourceAcquired(self, namespace, resource, locktype):
        # Callback from resourceManager.Owner. May be called by another
        # thread.
        self._incref()
        try:
            with self.callbackLock:
                self.log.debug("_resourcesAcquired: %s.%s (%s)",
                               namespace, resource, locktype)
                if self.state == State.preparing:
                    return
                if self.state == State.acquiring:
                    # Re-enter acquiring: re-check whether all requests are
                    # now granted.
                    self._updateState(State.acquiring)
                elif self.state == State.racquiring:
                    self._updateState(State.racquiring)
                elif self.state == State.blocked:
                    self._updateState(State.preparing)
                elif (self.state == State.aborting or
                      self.state == State.raborting):
                    self.log.debug("resource %s.%s acquired while in state %s",
                                   namespace, resource, self.state)
                else:
                    raise se.TaskStateError("acquire is not allowed in state"
                                            " %s" % self.state)
        finally:
            self._decref()

    def resourceRegistered(self, namespace, resource, locktype):
        self._incref()
        try:
            with self.callbackLock:
                # Callback from resourceManager.Owner. May be called
                # by another thread.
                self.log.debug("_resourcesAcquired: %s.%s (%s)",
                               namespace, resource, locktype)
                # Protect against races with stop/abort
                if self.state == State.preparing:
                    self._updateState(State.blocked)
        finally:
            self._decref()

    def _setError(self, e=se.TaskAborted("Unknown error encountered")):
        self.log.error("Unexpected error", exc_info=True)
        self.error = e

    def _run(self, fn, *args, **kargs):
        """Run *fn*; on any failure record the error, stop the task and
        re-raise as se.TaskAborted."""
        code = 100
        message = "Unknown Error"
        try:
            return fn(*args, **kargs)
        except se.StorageException as e:
            code = e.code
            message = e.message
            self._setError(e)
        except Exception as e:
            message = unicode(e)
            self._setError(e)
        except:
            self._setError()

        self.log.debug("Task._run: %s %s %s failed - stopping task",
                       self, args, kargs)
        self.stop()
        raise se.TaskAborted(message, code)

    def _runJobs(self):
        """Run the task's jobs sequentially; the task result is the last
        job's result."""
        result = ""
        code = 100
        message = "Unknown Error"
        i = 0
        j = None
        try:
            if self.aborting():
                raise se.TaskAborted("shutting down")
            if not self.state == State.running:
                raise se.TaskStateError("%s: can't run Jobs in state %s" %
                                        (self, self.state))
            # for now: result is the last job result, jobs are run
            # sequentially
            for j in self.jobs:
                if self.aborting():
                    raise se.TaskAborted("shutting down")
                self.log.debug("Task.run: running job %s: %s" % (i, j))
                # BUGFIX: format string used '{0} of {0}', repeating the job
                # index and never showing the total job count.
                self._updateResult(
                    0, 'running job {0} of {1}'.format(i + 1, len(self.jobs)),
                    '')
                result = self._run(j.run)
                if result is None:
                    result = ""
                i += 1
            j = None
            self._updateResult(0, "%s jobs completed successfully" % i,
                               result)
            self._updateState(State.finished)
            self.log.debug('Task.run: exit - success: result %s' % result)
            return result
        except se.TaskAborted as e:
            self.log.debug("aborting: %s", e)
            message = e.value
            code = e.abortedcode
            if not self.aborting():
                self.log.error("Aborted exception but not in aborting state")
                raise
        self._updateResult(code, message, "")

    def _doAbort(self, force=False):
        self.log.debug("Task._doAbort: force %s" % force)
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            return
        self.ref += 1
        self.lock.release()
        try:
            try:
                # NOTE(review): this condition only skips the abort when
                # force is set and recovery-abort is impossible; verify
                # whether "force and" should be "not force or".
                if (not self.state.canAbort() and
                        (force and not self.state.canAbortRecovery())):
                    self.log.warning("Task._doAbort %s: ignoring - "
                                     "at state %s", self, self.state)
                    return
                self.resOwner.cancelAll()
                if self.state.canAbort():
                    self._updateState(State.aborting)
                else:
                    self._updateState(State.raborting)
            except se.TaskAborted:
                self._updateState(State.failed)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()
            # If something horrible went wrong. Just fail the task.
            if not self.state.isDone():
                self.log.warn("Task exited in non terminal state. "
                              "Setting tasks as failed.")
                self._updateState(State.failed)

    def _doRecover(self):
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            raise se.TaskHasRefs(unicode(self))
        self.ref += 1
        self.lock.release()
        try:
            self._updateState(State.racquiring)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()

    def _incref(self, force=False):
        self.lock.acquire()
        try:
            if self.aborting() and (self._forceAbort or not force):
                raise se.TaskAborted(unicode(self))

            self.ref += 1
            ref = self.ref
            return ref
        finally:
            self.lock.release()

    def _decref(self, force=False):
        self.lock.acquire()
        self.ref -= 1
        ref = self.ref
        self.lock.release()

        self.log.debug("ref %d aborting %s", ref, self.aborting())
        if ref == 0 and self.aborting():
            # Last reference gone while an abort is pending: perform it.
            self._doAbort(force)
        return ref

    ##########################################################################
    # Public Interface                                                       #
    ##########################################################################

    def setDefaultException(self, exceptionObj):
        # defaultException must have response method
        if exceptionObj and not hasattr(exceptionObj, "response"):
            raise se.InvalidDefaultExceptionException(unicode(exceptionObj))
        self.defaultException = exceptionObj

    def setTag(self, tag):
        if KEY_SEPARATOR in tag:
            raise ValueError("tag cannot include %s character" %
                             KEY_SEPARATOR)
        self.tag = unicode(tag)

    def isDone(self):
        return self.state.isDone()

    def addJob(self, job):
        """
        Add async job to the task. Assumes all resources are acquired
        or registered.
        """
        if not self.mng:
            raise se.UnmanagedTask(unicode(self))
        if not isinstance(job, Job):
            raise TypeError("Job param %s(%s) must be Job object" %
                            (repr(job), type(job)))
        if self.state != State.preparing:
            raise Exception("Task.addJob: can't add job in non preparing"
                            " state (%s)" % self.state)
        if not job.name:
            raise ValueError("Task.addJob: name is required")
        name = job.name
        for j in self.jobs:
            if name == j.name:
                raise ValueError("addJob: name '%s' must be unique" % (name))
        job.setOwnerTask(self)
        self.jobs.append(job)
        self.njobs = len(self.jobs)

    def clean(self):
        if not self.store:
            return
        if not self.isDone():
            raise se.TaskStateError("can't clean in state %s" % self.state)
        self._clean(self.store)

    def pushRecovery(self, recovery):
        """
        Add recovery "job" to the task. Recoveries are committed in
        FILO order. Assumes that all required resources are acquired
        or registered.
        """
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("pushRecovery: name is required")
        name = recovery.name
        for r in self.recoveries:
            if name == r.name:
                raise ValueError("pushRecovery: name '%s' must be unique" %
                                 (name))
        recovery.setOwnerTask(self)
        self.recoveries.append(recovery)
        self.persist()

    def replaceRecoveries(self, recovery):
        """Drop recoveries down to the rollback sentinel and push
        *recovery* in their place."""
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("replaceRecoveries: name is required")
        recovery.setOwnerTask(self)
        rec = Recovery('stubName', 'stubMod', 'stubObj', 'stubFunc', [])
        while (rec and (rec.name != ROLLBACK_SENTINEL)):
            rec = self.popRecovery()
        self.recoveries.append(recovery)
        self.persist()

    def popRecovery(self):
        if self.recoveries:
            return self.recoveries.pop()

    def clearRecoveries(self):
        self.recoveries = []
        self.persist()

    def setManager(self, manager):
        # If need be, refactor out to "validateManager" method
        if not hasattr(manager, "queue"):
            raise se.InvalidTaskMng(unicode(manager))
        self.mng = manager

    def setCleanPolicy(self, clean):
        self.cleanPolicy = TaskCleanType(clean)

    def setPersistence(self, store,
                       persistPolicy=TaskPersistType.auto,
                       cleanPolicy=TaskCleanType.auto):
        """Configure where and how the task is persisted; creates the
        task directory and persists immediately if appropriate."""
        self.persistPolicy = TaskPersistType(persistPolicy)
        self.store = store
        self.setCleanPolicy(cleanPolicy)
        if self.persistPolicy != TaskPersistType.none and not self.store:
            raise se.TaskPersistError("no store defined")
        taskDir = os.path.join(self.store, self.id)
        try:
            getProcPool().fileUtils.createdir(taskDir)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            raise se.TaskPersistError("%s: cannot access/create taskdir"
                                      " %s: %s" % (self, taskDir, e))
        if (self.persistPolicy == TaskPersistType.auto and
                self.state != State.init):
            self.persist()

    def setRecoveryPolicy(self, clean):
        self.recoveryPolicy = TaskRecoveryType(clean)

    def rollback(self):
        self.log.debug('(rollback): enter')
        if self.recoveryPolicy == TaskRecoveryType.none:
            self.log.debug("rollback is skipped")
            return
        if not self.isDone():
            raise se.TaskNotFinished("can't rollback in state %s" %
                                     self.state)
        self._doRecover()
        self.log.debug('(rollback): exit')

    def persist(self):
        if self.persistPolicy == TaskPersistType.none:
            return
        if not self.store:
            raise se.TaskPersistError("no store defined")
        if self.state == State.init:
            raise se.TaskStateError("can't persist in state %s" % self.state)
        self._save(self.store)

    @classmethod
    def loadTask(cls, store, taskid):
        """Load a task from *store*, falling back to its temp/backup
        directories when the primary one is missing."""
        t = Task(taskid)
        if getProcPool().os.path.exists(os.path.join(store, taskid)):
            ext = ""
        # TBD: is this the correct order (temp < backup) + should temp
        # be considered at all?
        elif getProcPool().os.path.exists(os.path.join(store,
                                                       taskid + TEMP_EXT)):
            ext = TEMP_EXT
        elif getProcPool().os.path.exists(os.path.join(store,
                                                       taskid + BACKUP_EXT)):
            ext = BACKUP_EXT
        else:
            raise se.TaskDirError("loadTask: no such task dir '%s/%s'" %
                                  (store, taskid))
        t._load(store, ext)
        return t

    @threadlocal_task
    def prepare(self, func, *args, **kwargs):
        """Run *func* synchronously as the task's preparation step; if
        jobs were added, move to acquiring and return the task id."""
        message = self.error
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.preparing)
            result = None
            code = 0
            try:
                if func:
                    result = self._run(func, *args, **kwargs)
            except se.TaskAborted as e:
                self.log.info("aborting: %s", e)
                code = e.abortedcode
                message = e.value

            if self.aborting():
                self.log.debug("Prepare: aborted: %s", message)
                self._updateResult(code, "Task prepare failed: %s" %
                                   (message,), "")
                raise self.error

            if self.jobs:
                self.log.debug("Prepare: %s jobs exist, move to acquiring",
                               self.njobs)
                self._updateState(State.acquiring)
                if self.aborting():
                    self.log.error('failed to acquire task %s', self.id)
                    raise self.error
                self.log.debug("returning")
                return dict(uuid=str(self.id))

            self.log.debug("finished: %s", result)
            self._updateResult(0, "OK", result)
            self._updateState(State.finished)
            return result
        finally:
            self._decref()

    @threadlocal_task
    def commit(self, args=None):
        self.log.debug("committing task: %s", self.id)
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.running)
        finally:
            self._decref()

    def aborting(self):
        return (self._aborting or
                self.state == State.aborting or
                self.state == State.raborting)

    def stop(self, force=False):
        """Request an abort; the actual abort happens when the last
        reference is dropped (see _decref)."""
        self.log.debug("stopping in state %s (force %s)", self.state, force)
        self._incref(force)
        try:
            if self.state.isDone():
                self.log.debug("Task already stopped (%s), ignoring",
                               self.state)
                return
            elif (self.state.isRecovering() and not force and
                    (self.cleanPolicy == TaskCleanType.auto)):
                self.log.debug("Task (%s) in recovery and force is false, "
                               "ignoring", self.state)
                return

            self._aborting = True
            self._forceAbort = force
        finally:
            self._decref(force)

    @threadlocal_task
    def recover(self, args=None):
        '''
        Do not call this function while the task is actually running. this
        method should only be used to recover tasks state after
        (vdsmd) restart.
        '''
        self.log.debug('(recover): recovering: state %s', self.state)
        try:
            self._incref(force=True)
        except se.TaskAborted:
            self._doAbort(True)
            return
        try:
            if self.isDone():
                self.log.debug('(recover): task is done: state %s',
                               self.state)
                return
            # if we are not during recover, just abort
            if self.state.canAbort():
                self.stop()
            # if we waited for recovery - keep waiting
            elif self.state == State.waitrecover:
                pass
            # if we started the recovery - restart it
            elif (self.state == State.racquiring or
                    self.state == State.recovering):
                self._updateState(State.racquiring, force=True)
            # else we were during failed recovery - abort it
            else:
                self.stop(force=True)
        finally:
            self._decref(force=True)
        self.log.debug('(recover): recovered: state %s', self.state)

    def getState(self):
        return str(self.state)

    def getInfo(self):
        return dict(id=self.id, verb=self.name)

    def deprecated_getStatus(self):
        oReturn = {}
        oReturn["taskID"] = self.id
        oReturn["taskState"] = self.state.DEPRECATED_STATE[self.state.state]
        oReturn["taskResult"] = self.state.DEPRECATED_RESULT[self.state.state]
        oReturn["code"] = self.result.code
        oReturn["message"] = self.result.message
        return oReturn

    def getStatus(self):
        oReturn = {}
        oReturn["state"] = {'code': self.result.code,
                            'message': self.result.message}
        oReturn["task"] = {'id': self.id, 'state': str(self.state)}
        oReturn["result"] = self.result.result
        return oReturn

    def getDetails(self):
        return {
            "id": self.id,
            "verb": self.name,
            "state": str(self.state),
            "code": self.result.code,
            "message": self.result.message,
            "result": self.result.result,
            "tag": self.tag
        }

    def getID(self):
        return self.id

    def getTags(self):
        return self.tag

    def __str__(self):
        return str(self.id)

    # FIXME : Use StringIO and enumerate()
    def dumpTask(self):
        s = "Task: %s" % self._dump(self, Task.fields)
        i = 0
        for r in self.recoveries:
            s += " Recovery%d: %s" % (i, self._dump(r, Recovery.fields))
            i += 1
        i = 0
        for j in self.jobs:
            s += " Job%d: %s" % (i, self._dump(j, Job.fields))
            i += 1
        return s

    @misc.logskip("ResourceManager")
    def getExclusiveLock(
            self,
            namespace,
            resName,
            timeout=config.getint('irs', 'task_resource_default_timeout')):
        self.resOwner.acquire(namespace, resName,
                              resourceManager.LockType.exclusive, timeout)

    @misc.logskip("ResourceManager")
    def getSharedLock(self, namespace, resName,
                      timeout=config.getint('irs',
                                            'task_resource_default_timeout')):
        self.resOwner.acquire(namespace, resName,
                              resourceManager.LockType.shared, timeout)
class Vm(object): """ Used for abstracting cummunication between various parts of the system and Qemu. Runs Qemu in a subprocess and communicates with it, and monitors its behaviour. """ log = logging.getLogger("vm.Vm") _ongoingCreations = threading.BoundedSemaphore(caps.CpuInfo().cores()) MigrationSourceThreadClass = MigrationSourceThread def __init__(self, cif, params): """ Initialize a new VM instance. :param cif: The client interface that creates this VM. :type cif: :class:`clientIF.clientIF` :param params: The VM parameters. :type params: dict """ self.conf = {'pid': '0'} self.conf.update(params) self.cif = cif self.log = SimpleLogAdapter(self.log, {"vmId" : self.conf['vmId']}) self.destroyed = False self._recoveryFile = constants.P_VDSM_RUN + str( self.conf['vmId']) + '.recovery' self.user_destroy = False self._monitorResponse = 0 self.conf['clientIp'] = '' self.memCommitted = 0 self._creationThread = threading.Thread(target=self._startUnderlyingVm) if 'migrationDest' in self.conf: self._lastStatus = 'Migration Destination' elif 'restoreState' in self.conf: self._lastStatus = 'Restoring state' else: self._lastStatus = 'WaitForLaunch' self._nice = '' self._migrationSourceThread = self.MigrationSourceThreadClass(self) self._kvmEnable = self.conf.get('kvmEnable', 'true') self._guestSocektFile = constants.P_VDSM_RUN + self.conf['vmId'] + \ '.guest.socket' self._incomingMigrationFinished = threading.Event() self.id = self.conf['vmId'] self._volPrepareLock = threading.Lock() self._initTimePauseCode = None self.guestAgent = None self._guestEvent = 'Powering up' self._guestEventTime = 0 self._vmStats = None self._guestCpuRunning = False self._guestCpuLock = threading.Lock() self._startTime = time.time() - float( self.conf.pop('elapsedTimeOffset', 0)) self._usedIndices = {} #{'ide': [], 'virtio' = []} self._volumesPrepared = False self._pathsPreparedEvent = threading.Event() self._devices = {DISK_DEVICES: [], NIC_DEVICES: [], SOUND_DEVICES: [], VIDEO_DEVICES: [], 
CONTROLLER_DEVICES: [], GENERAL_DEVICES: [], BALLOON_DEVICES: []} def _get_lastStatus(self): SHOW_PAUSED_STATES = ('Powering down', 'RebootInProgress', 'Up') if not self._guestCpuRunning and self._lastStatus in SHOW_PAUSED_STATES: return 'Paused' return self._lastStatus def _set_lastStatus(self, value): if self._lastStatus == 'Down': self.log.warning('trying to set state to %s when already Down', value) if value == 'Down': raise DoubleDownError else: return if value not in VALID_STATES: self.log.error('setting state to %s', value) if self._lastStatus != value: self.saveState() self._lastStatus = value lastStatus = property(_get_lastStatus, _set_lastStatus) def __getNextIndex(self, used): for n in xrange(max(used or [0]) + 2): if n not in used: idx = n break return str(idx) def _normalizeVdsmImg(self, drv): drv['needExtend'] = False drv['reqsize'] = drv.get('reqsize', '0') if not drv.has_key('device'): drv['device'] = 'disk' if drv['device'] == 'disk': res = self.cif.irs.getVolumeSize(drv['domainID'], drv['poolID'], drv['imageID'], drv['volumeID']) drv['truesize'] = res['truesize'] drv['apparentsize'] = res['apparentsize'] else: drv['truesize'] = 0 drv['apparentsize'] = 0 def __legacyDrives(self): """ Backward compatibility for qa scripts that specify direct paths. 
""" legacies = [] for index, linuxName in ((0, 'hda'), (1, 'hdb'), (2, 'hdc'), (3, 'hdd')): path = self.conf.get(linuxName) if path: legacies.append({'path': path, 'iface': 'ide', 'index': index, 'truesize': 0}) return legacies def __removableDrives(self): removables = [{'type': DISK_DEVICES, 'device': 'cdrom', 'iface': 'ide', 'path': self.conf.get('cdrom', ''), 'index': 2, 'truesize': 0}] floppyPath = self.conf.get('floppy') if floppyPath: removables.append({'type': DISK_DEVICES, 'device': 'floppy', 'path': floppyPath, 'iface': 'fdc', 'index': 0, 'truesize': 0}) return removables def getConfDevices(self): devices = {DISK_DEVICES: [], NIC_DEVICES: [], SOUND_DEVICES: [], VIDEO_DEVICES: [], CONTROLLER_DEVICES: [], GENERAL_DEVICES: [], BALLOON_DEVICES: []} for dev in self.conf.get('devices'): try: devices[dev['type']].append(dev) except KeyError: # Unknown type device found self.log.warn("Unknown type found, device: '%s' found", dev) devices[GENERAL_DEVICES].append(dev) # Update indecies for drives devices self.normalizeDrivesIndices(devices[DISK_DEVICES]) return devices def buildConfDevices(self): """ Return the "devices" section of this Vm's conf. If missing, create it according to old API. """ # For BC we need to save previous behaviour for old type parameters. # The new/old type parameter will be distinguished # by existence/absence of the 'devices' key devices = {} # Build devices structure if self.conf.get('devices') == None: self.conf['devices'] = [] devices[DISK_DEVICES] = self.getConfDrives() devices[NIC_DEVICES] = self.getConfNetworkInterfaces() devices[SOUND_DEVICES] = self.getConfSound() devices[VIDEO_DEVICES] = self.getConfVideo() devices[CONTROLLER_DEVICES] = self.getConfController() devices[GENERAL_DEVICES] = [] devices[BALLOON_DEVICES] = [] else: devices = self.getConfDevices() # Normalize vdsm images for drv in devices[DISK_DEVICES]: if isVdsmImage(drv): self._normalizeVdsmImg(drv) # Preserve old behavior. 
Since libvirt add a memory balloon device # to all guests, we need to specifically request not to add it. if len(devices[BALLOON_DEVICES]) == 0: devices[BALLOON_DEVICES].append({'type': BALLOON_DEVICES, 'device': 'memballoon', 'model': 'none'}) return devices def getConfController(self): """ Normalize controller device. """ controllers = [] # For now we create by default only 'virtio-serial' controller controllers.append({'type': CONTROLLER_DEVICES, 'device': 'virtio-serial'}) return controllers def getConfVideo(self): """ Normalize video device provided by conf. """ vcards =[] if self.conf.get('display') == 'vnc': devType = 'cirrus' elif self.conf.get('display') == 'qxl': devType = 'qxl' monitors = int(self.conf.get('spiceMonitors', '1')) vram = '65536' if (monitors <= 2) else '32768' for idx in range(monitors): vcards.append({'type': VIDEO_DEVICES, 'specParams': {'vram': vram}, 'device': devType}) return vcards def getConfSound(self): """ Normalize sound device provided by conf. """ scards = [] if self.conf.get('soundDevice'): scards.append({'type': SOUND_DEVICES, 'device': self.conf.get('soundDevice')}) return scards def getConfNetworkInterfaces(self): """ Normalize networks interfaces provided by conf. 
""" nics = [] macs = self.conf.get('macAddr', '').split(',') models = self.conf.get('nicModel', '').split(',') bridges = self.conf.get('bridge', DEFAULT_BRIDGE).split(',') if macs == ['']: macs = [] if models == ['']: models = [] if bridges == ['']: bridges = [] if len(models) < len(macs) or len(models) < len(bridges): raise ValueError('Bad nic specification') if models and not (macs or bridges): raise ValueError('Bad nic specification') if not macs or not models or not bridges: return '' macs = macs + [macs[-1]] * (len(models) - len(macs)) bridges = bridges + [bridges[-1]] * (len(models) - len(bridges)) for mac, model, bridge in zip(macs, models, bridges): if model == 'pv': model = 'virtio' nics.append({'type': NIC_DEVICES, 'macAddr': mac, 'nicModel': model, 'network': bridge, 'device': 'bridge'}) return nics def getConfDrives(self): """ Normalize drives provided by conf. """ # FIXME # Will be better to change the self.conf but this implies an API change. # Remove this when the API parameters will be consistent. confDrives = self.conf['drives'] if self.conf.get('drives') else [] if not confDrives: confDrives.extend(self.__legacyDrives()) confDrives.extend(self.__removableDrives()) for drv in confDrives: drv['type'] = DISK_DEVICES drv['format'] = drv.get('format') or 'raw' drv['propagateErrors'] = drv.get('propagateErrors') or 'off' drv['readonly'] = False # FIXME: For BC we have now two identical keys: iface = if # Till the day that conf will not returned as a status anymore. 
drv['iface'] = drv.get('iface') or drv.get('if', 'ide') # Update indecies for drives devices self.normalizeDrivesIndices(confDrives) return confDrives def updateDriveIndex(self, drv): drv['index'] = self.__getNextIndex(self._usedIndices[drv['iface']]) self._usedIndices[drv['iface']].append(int(drv['index'])) def normalizeDrivesIndices(self, confDrives): drives = [(order, drv) for order, drv in enumerate(confDrives)] indexed = [] for order, drv in drives: if not self._usedIndices.has_key(drv['iface']): self._usedIndices[drv['iface']] = [] idx = drv.get('index') if idx is not None: self._usedIndices[drv['iface']].append(int(idx)) indexed.append(order) for order, drv in drives: if order not in indexed: self.updateDriveIndex(drv) return [drv for order, drv in drives] def run(self): self._creationThread.start() def memCommit(self): """ Reserve the required memory for this VM. """ self.memCommitted = 2**20 * (int(self.conf['memSize']) + config.getint('vars', 'guest_ram_overhead')) def _startUnderlyingVm(self): self.log.debug("Start") try: self.memCommit() self._ongoingCreations.acquire() self.log.debug("_ongoingCreations acquired") try: self._run() if self.lastStatus != 'Down' and 'recover' not in self.conf: self.cif.ksmMonitor.adjust() except Exception: if 'recover' not in self.conf: raise else: self.log.info("Skipping errors on recovery", exc_info=True) finally: self._ongoingCreations.release() self.log.debug("_ongoingCreations released") if ('migrationDest' in self.conf or 'restoreState' in self.conf ) and self.lastStatus != 'Down': self._waitForIncomingMigrationFinish() self.lastStatus = 'Up' if self._initTimePauseCode: self.conf['pauseCode'] = self._initTimePauseCode if self._initTimePauseCode == 'ENOSPC': self.cont() else: try: del self.conf['pauseCode'] except: pass if 'recover' in self.conf: del self.conf['recover'] self.saveState() except Exception, e: if 'recover' in self.conf: self.log.info("Skipping errors on recovery", exc_info=True) else: 
self.log.error("The vm start process failed", exc_info=True) self.setDownStatus(ERROR, str(e))