Example #1
0
    def __init__(self, namespace, name, wrappedObject=None, resRefID=str(uuid4())):
        self._namespace = namespace
        self._name = name
        self._log = SimpleLogAdapter(self._log, {"ResName" : self.fullName, "ResRefID" : resRefID})

        self.__wrappedObject = wrappedObject
        if wrappedObject is not None:
            self.__wrapObj()

        self.autoRelease = True
        self._isValid = True
        self._syncRoot = misc.RWLock()
Example #2
0
File: vm.py Project: ekohl/vdsm
    def __init__(self, cif, params):
        """
        Initialize a new VM instance.

        :param cif: The client interface that creates this VM.
        :type cif: :class:`clientIF.clientIF`
        :param params: The VM parameters.
        :type params: dict
        """
        self.conf = {'pid': '0'}
        self.conf.update(params)
        self.cif = cif
        self.log = SimpleLogAdapter(self.log, {"vmId" : self.conf['vmId']})
        self.destroyed = False
        self._recoveryFile = constants.P_VDSM_RUN + str(
                                    self.conf['vmId']) + '.recovery'
        self.user_destroy = False
        self._monitorResponse = 0
        self.conf['clientIp'] = ''
        self.memCommitted = 0
        self._creationThread = threading.Thread(target=self._startUnderlyingVm)
        if 'migrationDest' in self.conf:
            self._lastStatus = 'Migration Destination'
        elif 'restoreState' in self.conf:
            self._lastStatus = 'Restoring state'
        else:
            self._lastStatus = 'WaitForLaunch'
        self._nice = ''
        self._migrationSourceThread = self.MigrationSourceThreadClass(self)
        self._kvmEnable = self.conf.get('kvmEnable', 'true')
        self._guestSocektFile = constants.P_VDSM_RUN + self.conf['vmId'] + \
                                '.guest.socket'
        self._incomingMigrationFinished = threading.Event()
        self.id = self.conf['vmId']
        self._volPrepareLock = threading.Lock()
        self._initTimePauseCode = None
        self.guestAgent = None
        self._guestEvent = 'Powering up'
        self._guestEventTime = 0
        self._vmStats = None
        self._guestCpuRunning = False
        self._guestCpuLock = threading.Lock()
        self._startTime = time.time() - float(
                                self.conf.pop('elapsedTimeOffset', 0))

        self._usedIndices = {} #{'ide': [], 'virtio' = []}
        self._volumesPrepared = False
        self._pathsPreparedEvent = threading.Event()
        self._devices = {DISK_DEVICES: [], NIC_DEVICES: [],
                         SOUND_DEVICES: [], VIDEO_DEVICES: [],
                         CONTROLLER_DEVICES: [], GENERAL_DEVICES: [],
                         BALLOON_DEVICES: []}
Example #3
0
File: vm.py Project: openSUSE/vdsm
    def __init__(self, cif, params):
        """
        Initialize a new VM instance.

        :param cif: The client interface that creates this VM.
        :type cif: :class:`clientIF.clientIF`
        :param params: The VM parameters.
        :type params: dict
        """
        self.conf = {"pid": "0"}
        self.conf.update(params)
        self.cif = cif
        self.log = SimpleLogAdapter(self.log, {"vmId": self.conf["vmId"]})
        self.destroyed = False
        self._recoveryFile = constants.P_VDSM_RUN + str(self.conf["vmId"]) + ".recovery"
        self.user_destroy = False
        self._monitorResponse = 0
        self.conf["clientIp"] = ""
        self.memCommitted = 0
        self._creationThread = threading.Thread(target=self._startUnderlyingVm)
        if "migrationDest" in self.conf:
            self._lastStatus = "Migration Destination"
        elif "restoreState" in self.conf:
            self._lastStatus = "Restoring state"
        else:
            self._lastStatus = "WaitForLaunch"
        self._nice = ""
        self._migrationSourceThread = self.MigrationSourceThreadClass(self)
        self._kvmEnable = self.conf.get("kvmEnable", "true")
        self._guestSocektFile = constants.P_VDSM_RUN + self.conf["vmId"] + ".guest.socket"
        self._drives = []
        self._incomingMigrationFinished = threading.Event()
        self.id = self.conf["vmId"]
        self._volPrepareLock = threading.Lock()
        self._preparedDrives = {}
        self._initTimePauseCode = None
        self.guestAgent = None
        self._guestEvent = "Powering up"
        self._guestEventTime = 0
        self._vmStats = None
        self._guestCpuRunning = False
        self._guestCpuLock = threading.Lock()
        self._startTime = time.time() - float(self.conf.pop("elapsedTimeOffset", 0))
        self._cdromPreparedPath = ""
        self._floppyPreparedPath = ""
        self._volumesPrepared = False
        self._pathsPreparedEvent = threading.Event()
        self.saveState()
Example #4
0
    def __init__(self, namespace, name, lockType, callback):
        self._syncRoot = threading.RLock()
        self._namespace = namespace
        self._name = name
        self._lockType = lockType
        self._isActive = True
        self._isCanceled = False
        self._doneEvent = threading.Event()
        self._callback = callback
        self.reqID = str(uuid4())
        self._log = SimpleLogAdapter(self._log, {"ResName" : self.fullName, "ReqID" : self.reqID})

        # Becuase findCaller is expensive. We make sure it wll be printed before
        # calculating it
        if logging.getLogger("ResourceManager.ResourceRef").isEnabledFor(logging.WARN):
            createdAt = misc.findCaller(ignoreSourceFiles=[__file__], logSkipName="ResourceManager")
            self._log.debug("Request was made in '%s' line '%d' at '%s'", *createdAt)
Example #5
0
    def __init__(self, id, name="", tag="",
                 recovery=TaskRecoveryType.none,
                 priority=TaskPriority.low):
        """
        id - Unique ID
        name - human readable name
        persist - persistency type: auto-clean/manual-clean/not-persistent
        """

        if not id:
            id = str(uuid.uuid4())
        self.metadataVersion = TASK_METADATA_VERSION
        self.validateID(id)
        self.lock = threading.Lock()
        self.callbackLock = threading.Lock()
        self.id = str(id)
        self.name = name
        self.tag = tag
        self.priority = priority
        self.recoveryPolicy = recovery
        self.persistPolicy = TaskPersistType.none
        self.cleanPolicy = TaskCleanType.auto
        self.store = None
        self.defaultException = None

        self.state = State(State.init)
        self.result = TaskResult(0, "Task is initializing", "")

        self.resOwner = resourceManager.Owner(proxy(self), raiseonfailure=True)
        self.error = se.TaskAborted("Unknown error encountered")

        self.mng = None
        self._abort_lock = threading.Lock()
        self._abort_callbacks = set()
        self._aborting = False
        self._forceAbort = False
        self.ref = 0

        self.recoveries = []
        self.jobs = []
        self.nrecoveries = 0    # just utility count - used by save/load
        self.njobs = 0          # just utility count - used by save/load

        self.log = SimpleLogAdapter(self.log, {"Task": self.id})
Example #6
0
class ResourceRef(object):
    """
    A reference to a resource. Can be used to conveniently modify an owned resource.

    This object will auto release the referenced resource unless autorelease is set to `False`
    """
    _log = logging.getLogger("ResourceManager.ResourceRef")
    namespace = property(lambda self : self._namespace)
    name = property(lambda self : self._name)
    fullName = property(lambda self : "%s.%s" % (self._namespace, self._name))

    # States whether this reference is pointing to an owned reference
    isValid = property(lambda self : self._isValid)

    def __init__(self, namespace, name, wrappedObject=None, resRefID=str(uuid4())):
        self._namespace = namespace
        self._name = name
        self._log = SimpleLogAdapter(self._log, {"ResName" : self.fullName, "ResRefID" : resRefID})

        self.__wrappedObject = wrappedObject
        if wrappedObject is not None:
            self.__wrapObj()

        self.autoRelease = True
        self._isValid = True
        self._syncRoot = misc.RWLock()

    def __wrapObj(self):
        for attr in dir(self.__wrappedObject):
            if hasattr(self,attr) or attr in ('close', 'switchLockType'):
                continue

            setattr(self, attr, partial(self.__methodProxy, attr))

    def __methodProxy(self, attr, *args, **kwargs):
        with self._syncRoot.shared:
            if not self.isValid:
                raise se.ResourceReferenceInvalid

            return getattr(self.__wrappedObject, attr)(*args, **kwargs)

    def release(self):
        with self._syncRoot.exclusive:
            self.__wrappedObject = None
            if not self._isValid:
                self._log.warn("Tried to re-release a resource. Request ignored.")
                return

            ResourceManager.getInstance().releaseResource(self.namespace, self.name)
            self._isValid = False

    def getStatus(self):
        return ResourceManager.getInstance().getResourceStatus(self.namespace, self.name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()

    def __del__(self):
        if self._isValid and self.autoRelease:
            def release(log, namespace, name):
                log.warn("Resource referece was not properly released. Autoreleasing.")
                # In Python, objects are refcounted and are deleted immediately when
                # the last reference is freed. This means the __del__ method can be called
                # inside of any context. The releaseResource method we use tries to acquire
                # locks. So we might try to acquire the lock in a locked context and reach
                # a deadlock. This is why I need to use a timer. It will defer the operation
                # and use a different context.
                ResourceManager.getInstance().releaseResource(namespace, name)
            threading.Thread(target = release, args=(self._log, self.namespace, self.name)).start()
            self._isValid = False

    def __repr__(self):
        return "< ResourceRef '%s', isValid: '%s' obj: '%s'>" % (self.fullName, self.isValid, repr(self.__wrappedObject) if self.isValid else None)
Example #7
0
class Request(object):
    """
    Internal request object, don't use directly
    """
    _log = logging.getLogger("ResourceManager.Request")
    namespace = property(lambda self : self._namespace)
    name = property(lambda self : self._name)
    fullName = property(lambda self : "%s.%s" % (self._namespace, self._name))
    lockType = property(lambda self : self._lockType)
    syncRoot = property(lambda self : self._syncRoot)

    def __init__(self, namespace, name, lockType, callback):
        self._syncRoot = threading.RLock()
        self._namespace = namespace
        self._name = name
        self._lockType = lockType
        self._isActive = True
        self._isCanceled = False
        self._doneEvent = threading.Event()
        self._callback = callback
        self.reqID = str(uuid4())
        self._log = SimpleLogAdapter(self._log, {"ResName" : self.fullName, "ReqID" : self.reqID})

        # Becuase findCaller is expensive. We make sure it wll be printed before
        # calculating it
        if logging.getLogger("ResourceManager.ResourceRef").isEnabledFor(logging.WARN):
            createdAt = misc.findCaller(ignoreSourceFiles=[__file__], logSkipName="ResourceManager")
            self._log.debug("Request was made in '%s' line '%d' at '%s'", *createdAt)

    def cancel(self):
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to cancel a processed request")
                raise RequestAlreadyProcessedError("Cannot cancel a processed request")

            self._isActive = False
            self._isCanceled = True

            self._log.debug("Canceled request")
            try:
                self._callback(RequestRef(self), None)
            except Exception:
                self._log.warn("Request callback threw an exception", exc_info=True)
            self._callback = None
            self._doneEvent.set()

    def _status(self):
        with self._syncRoot:
            if self._isCanceled:
                return "canceled"
            if self._doneEvent.isSet():
                return "granted"
            return "waiting"

    def canceled(self):
        return self._isCanceled

    def grant(self):
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to grant a processed request")
                raise RequestAlreadyProcessedError("Cannot grant a processed request")

            self._isActive = False
            self._log.debug("Granted request")
            self._doneEvent.set()

    def emit(self, resource):
        try:
            ref = RequestRef(self)
            self._callback(ref, resource)
        except Exception:
            self._log.warn("Request callback threw an exception", exc_info=True)


    def wait(self, timeout = None):
        return self._doneEvent.wait(timeout)

    def granted(self):
        with self._syncRoot:
            return (not self._isCanceled) and self._doneEvent.isSet()

    def __str__(self):
        return "Request for %s - %s: %s" % (self.fullName, self.lockType, self._status())
Example #8
0
File: task.py Project: Caez83/vdsm
class Task:
    # External Task info
    fields = {
        # field_name: type
        "id": str,
        "name": unicode,
        "tag": unicode,
        "store": unicode,
        "recoveryPolicy": TaskRecoveryType,
        "persistPolicy": TaskPersistType,
        "cleanPolicy": TaskCleanType,
        "priority": TaskPriority,
        "state": State,
        "njobs": int,
        "nrecoveries": int,
        "metadataVersion": int
    }

    log = logging.getLogger('Storage.TaskManager.Task')

    def __init__(self, id, name="", tag="",
                 recovery=TaskRecoveryType.none,
                 priority=TaskPriority.low):
        """
        id - Unique ID
        name - human readable name
        persist - persistency type: auto-clean/manual-clean/not-persistent
        """

        if not id:
            id = str(uuid.uuid4())
        self.metadataVersion = TASK_METADATA_VERSION
        self.validateID(id)
        self.lock = threading.Lock()
        self.callbackLock = threading.Lock()
        self.id = str(id)
        self.name = name
        self.tag = tag
        self.priority = priority
        self.recoveryPolicy = recovery
        self.persistPolicy = TaskPersistType.none
        self.cleanPolicy = TaskCleanType.auto
        self.store = None
        self.defaultException = None

        self.state = State(State.init)
        self.result = TaskResult(0, "Task is initializing", "")

        self.resOwner = resourceManager.Owner(proxy(self), raiseonfailure=True)
        self.error = se.TaskAborted("Unknown error encountered")

        self.mng = None
        self._aborting = False
        self._forceAbort = False
        self.ref = 0

        self.recoveries = []
        self.jobs = []
        self.nrecoveries = 0    # just utility count - used by save/load
        self.njobs = 0          # just utility count - used by save/load

        self.log = SimpleLogAdapter(self.log, {"Task": self.id})

    def __del__(self):
        def finalize(log, owner, taskDir):
            log.warn("Task was autocleaned")
            owner.releaseAll()
            if taskDir is not None:
                getProcPool().fileUtils.cleanupdir(taskDir)

        if not self.state.isDone():
            taskDir = None
            if (self.cleanPolicy == TaskCleanType.auto and
                    self.store is not None):
                taskDir = os.path.join(self.store, self.id)
            threading.Thread(target=finalize,
                             args=(self.log, self.resOwner, taskDir)).start()

    def _done(self):
        self.resOwner.releaseAll()
        if self.cleanPolicy == TaskCleanType.auto:
            self.clean()

    def __state_preparing(self, fromState):
        pass

    def __state_blocked(self, fromState):
        pass

    def __state_acquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.queued)

    def __state_queued(self, fromState):
        try:
            self.mng.queue(self)
        except Exception as e:
            self._setError(e)
            self.stop()

    def __state_running(self, fromState):
        self._runJobs()

    def __state_finished(self, fromState):
        self._done()

    def __state_aborting(self, fromState):
        if self.ref > 1:
            return
        self.log.debug("_aborting: recover policy %s", self.recoveryPolicy)
        if self.recoveryPolicy == TaskRecoveryType.auto:
            self._updateState(State.racquiring)
        elif self.recoveryPolicy == TaskRecoveryType.none:
            self._updateState(State.failed)
        else:
            self._updateState(State.waitrecover)

    def __state_waitrecover(self, fromState):
        pass

    def __state_racquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.recovering)

    def __state_recovering(self, fromState):
        self._recover()

    def __state_raborting(self, fromState):
        if self.ref == 1:
            self._updateState(State.failed)
        else:
            self.log.warn("State was change to 'raborting' "
                          "when ref was not 1.")

    def __state_recovered(self, fromState):
        self._done()

    def __state_failed(self, fromState):
        self._done()

    def __state_cleaning(self, fromState):
        pass

    def _updateState(self, state, force=False):
        fromState = self.state
        requestedState = state
        if self._aborting:
            if self.state.canAbort():
                state = State.aborting
            elif self.state.canAbortRecovery() and state != State.recovered:
                state = State.raborting
        self._aborting = False
        if requestedState == state:
            self.log.debug("moving from state %s -> state %s",
                           fromState, state)
        else:
            self.log.debug("moving from state %s -> state %s instead of %s",
                           fromState, state, requestedState)

        self.state.moveto(state, force)
        if self.persistPolicy == TaskPersistType.auto:
            try:
                self.persist()
            except Exception:
                self.log.warning("Task._updateState: failed persisting task"
                                 " %s", self.id, exc_info=True)

        fn = getattr(self, "_Task__state_%s" % state)
        fn(fromState)

    def _updateResult(self, code, message, result):
        self.result.result = result
        self.result.code = code
        self.result.message = message

    @classmethod
    def validateID(cls, taskID):
        if not taskID or "." in taskID:
            raise se.InvalidParameterException("taskID", taskID)

    @classmethod
    def _loadMetaFile(cls, filename, obj, fields):
        try:
            for line in getProcPool().readLines(filename):
                # process current line
                line = line.encode('utf8')
                if line.find(KEY_SEPARATOR) < 0:
                    continue
                parts = line.split(KEY_SEPARATOR)
                if len(parts) != 2:
                    cls.log.warning("Task._loadMetaFile: %s - ignoring line"
                                    " '%s'", filename, line)
                    continue

                field = _eq_decode(parts[0].strip())
                value = _eq_decode(parts[1].strip())
                if field not in fields:
                    cls.log.warning("Task._loadMetaFile: %s - ignoring field"
                                    " %s in line '%s'", filename, field, line)
                    continue

                ftype = fields[field]
                setattr(obj, field, ftype(value))
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataLoadError(filename)

    @classmethod
    def _dump(cls, obj, fields):
        lines = []
        for field in fields:
            try:
                value = unicode(getattr(obj, field))
            except AttributeError:
                cls.log.warning("object %s field %s not found" %
                                (obj, field), exc_info=True)
            else:
                try:
                    field = _eq_encode(field)
                    value = _eq_encode(value)
                except ValueError as e:
                    cls.log.warning("Object %s: Cannot encode field %s or "
                                    "value %s. Skipping field. %s",
                                    obj, field, value, e)
                else:
                    lines.append("%s %s %s" % (field, KEY_SEPARATOR, value))
        return lines

    @classmethod
    def _saveMetaFile(cls, filename, obj, fields):
        try:
            getProcPool().writeLines(filename,
                                     [l.encode('utf8') + "\n"
                                      for l in cls._dump(obj, fields)])
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataSaveError(filename)

    def _loadTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._loadMetaFile(taskFile, self, Task.fields)

    def _saveTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._saveMetaFile(taskFile, self, Task.fields)

    def _loadJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.jobs[n], Job.fields)

    def _saveJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.jobs[n], Job.fields)

    def _loadRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _saveRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _loadTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._loadMetaFile(taskFile, self.result, TaskResult.fields)

    def _saveTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._saveMetaFile(taskFile, self.result, TaskResult.fields)

    def _getResourcesKeyList(self, taskDir):
        keys = []
        for path in getProcPool().glob.glob(os.path.join(taskDir,
                                                         "*" + RESOURCE_EXT)):
            filename = os.path.basename(path)
            keys.append(filename[:filename.rfind(RESOURCE_EXT)])
        return keys

    def _load(self, storPath, ext=""):
        self.log.debug("%s: load from %s, ext '%s'", self, storPath, ext)
        if self.state != State.init:
            raise se.TaskMetaDataLoadError("task %s - can't load self: "
                                           "not in init state" % self)
        taskDir = os.path.join(storPath, str(self.id) + str(ext))
        if not getProcPool().os.path.exists(taskDir):
            raise se.TaskDirError("load: no such task dir '%s'" % taskDir)
        oldid = self.id
        self._loadTaskMetaFile(taskDir)
        if self.id != oldid:
            raise se.TaskMetaDataLoadError("task %s: loaded file do not match"
                                           " id (%s != %s)" %
                                           (self, self.id, oldid))
        if self.state == State.finished:
            self._loadTaskResultMetaFile(taskDir)
        for jn in range(self.njobs):
            self.jobs.append(Job("load", None))
            self._loadJobMetaFile(taskDir, jn)
            self.jobs[jn].setOwnerTask(self)
        for rn in range(self.nrecoveries):
            self.recoveries.append(Recovery("load", "load",
                                            "load", "load", ""))
            self._loadRecoveryMetaFile(taskDir, rn)
            self.recoveries[rn].setOwnerTask(self)

    def _save(self, storPath):
        origTaskDir = os.path.join(storPath, self.id)
        if not getProcPool().os.path.exists(origTaskDir):
            raise se.TaskDirError("_save: no such task dir '%s'" % origTaskDir)
        taskDir = os.path.join(storPath, self.id + TEMP_EXT)
        self.log.debug("_save: orig %s temp %s", origTaskDir, taskDir)
        if getProcPool().os.path.exists(taskDir):
            getProcPool().fileUtils.cleanupdir(taskDir)
        getProcPool().os.mkdir(taskDir)
        try:
            self.njobs = len(self.jobs)
            self.nrecoveries = len(self.recoveries)
            self._saveTaskMetaFile(taskDir)
            if self.state == State.finished:
                self._saveTaskResultMetaFile(taskDir)
            for jn in range(self.njobs):
                self._saveJobMetaFile(taskDir, jn)
            for rn in range(self.nrecoveries):
                self._saveRecoveryMetaFile(taskDir, rn)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            try:
                getProcPool().fileUtils.cleanupdir(taskDir)
            except:
                self.log.warning("can't remove temp taskdir %s" % taskDir)
            raise se.TaskPersistError("%s persist failed: %s" % (self, e))
        # Make sure backup dir doesn't exist
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(origTaskDir, origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(taskDir, origTaskDir)
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().fileUtils.fsyncPath(origTaskDir)

    def _clean(self, storPath):
        taskDir = os.path.join(storPath, self.id)
        getProcPool().fileUtils.cleanupdir(taskDir)

    def _recoverDone(self):
        # protect agains races with stop/abort
        self.log.debug("Recover Done: state %s", self.state)
        while True:
            try:
                if self.state == State.recovering:
                    self._updateState(State.recovered)
                elif self.state == State.raborting:
                    self._updateState(State.failed)
                return
            except se.TaskStateTransitionError:
                self.log.error("Unexpected error", exc_info=True)

    def _recover(self):
        self.log.debug("_recover")
        if not self.state == State.recovering:
            raise se.TaskStateError("%s: _recover in state %s" %
                                    (self, self.state))
        try:
            while self.state == State.recovering:
                rec = self.popRecovery()
                self.log.debug("running recovery %s", rec)
                if not rec:
                    break
                self._run(rec.run)
        except Exception as e:
            self.log.warning("task %s: recovery failed: %s",
                             self, e, exc_info=True)
            # protect agains races with stop/abort
            try:
                if self.state == State.recovering:
                    self._updateState(State.raborting)
            except se.TaskStateTransitionError:
                pass
        self._recoverDone()

    def resourceAcquired(self, namespace, resource, locktype):
        # Callback from resourceManager.Owner. May be called by another thread.
        self._incref()
        try:
            self.callbackLock.acquire()
            try:
                self.log.debug("_resourcesAcquired: %s.%s (%s)",
                               namespace, resource, locktype)
                if self.state == State.preparing:
                    return
                if self.state == State.acquiring:
                    self._updateState(State.acquiring)
                elif self.state == State.racquiring:
                    self._updateState(State.racquiring)
                elif self.state == State.blocked:
                    self._updateState(State.preparing)
                elif (self.state == State.aborting or
                      self.state == State.raborting):
                    self.log.debug("resource %s.%s acquired while in state %s",
                                   namespace, resource, self.state)
                else:
                    raise se.TaskStateError("acquire is not allowed in state"
                                            " %s" % self.state)
            finally:
                self.callbackLock.release()
        finally:
            self._decref()

    def resourceRegistered(self, namespace, resource, locktype):
        self._incref()
        try:
            self.callbackLock.acquire()
            try:
                # Callback from resourceManager.Owner. May be called
                # by another thread.
                self.log.debug("_resourcesAcquired: %s.%s (%s)",
                               namespace, resource, locktype)
                # Protect against races with stop/abort
                if self.state == State.preparing:
                    self._updateState(State.blocked)
            finally:
                self.callbackLock.release()
        finally:
            self._decref()

    def _setError(self, e=se.TaskAborted("Unknown error encountered")):
        self.log.error("Unexpected error", exc_info=True)
        self.error = e

    def _run(self, fn, *args, **kargs):
        code = 100
        message = "Unknown Error"
        try:
            return fn(*args, **kargs)
        except se.StorageException as e:
            code = e.code
            message = e.message
            self._setError(e)
        except Exception as e:
            message = unicode(e)
            self._setError(e)
        except:
            self._setError()

        self.log.debug("Task._run: %s %s %s failed - stopping task",
                       self, args, kargs)
        self.stop()
        raise se.TaskAborted(message, code)

    def _runJobs(self):
        result = ""
        code = 100
        message = "Unknown Error"
        i = 0
        j = None
        try:
            if self.aborting():
                raise se.TaskAborted("shutting down")
            if not self.state == State.running:
                raise se.TaskStateError("%s: can't run Jobs in state %s" %
                                        (self, self.state))
            # for now: result is the last job result, jobs are run sequentially
            for j in self.jobs:
                if self.aborting():
                    raise se.TaskAborted("shutting down")
                self.log.debug("Task.run: running job %s: %s" % (i, j))
                self._updateResult(
                    0, 'running job {0} of {0}'.format(i + 1, len(self.jobs)),
                    '')
                result = self._run(j.run)
                if result is None:
                    result = ""
                i += 1
            j = None
            self._updateResult(0, "%s jobs completed successfully" % i, result)
            self._updateState(State.finished)
            self.log.debug('Task.run: exit - success: result %s' % result)
            return result
        except se.TaskAborted as e:
            self.log.debug("aborting: %s", e)
            message = e.value
            code = e.abortedcode
            if not self.aborting():
                self.log.error("Aborted exception but not in aborting state")
                raise
        self._updateResult(code, message, "")

    def _doAbort(self, force=False):
        self.log.debug("Task._doAbort: force %s" % force)
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            return
        self.ref += 1
        self.lock.release()
        try:
            try:
                if (not self.state.canAbort() and
                        (force and not self.state.canAbortRecovery())):
                    self.log.warning("Task._doAbort %s: ignoring - "
                                     "at state %s", self, self.state)
                    return
                self.resOwner.cancelAll()
                if self.state.canAbort():
                    self._updateState(State.aborting)
                else:
                    self._updateState(State.raborting)
            except se.TaskAborted:
                self._updateState(State.failed)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()
            # If something horrible went wrong. Just fail the task.
            if not self.state.isDone():
                self.log.warn("Task exited in non terminal state. "
                              "Setting tasks as failed.")
                self._updateState(State.failed)

    def _doRecover(self):
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            raise se.TaskHasRefs(unicode(self))
        self.ref += 1
        self.lock.release()
        try:
            self._updateState(State.racquiring)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()

    def _incref(self, force=False):
        self.lock.acquire()
        try:
            if self.aborting() and (self._forceAbort or not force):
                raise se.TaskAborted(unicode(self))

            self.ref += 1
            ref = self.ref
            return ref
        finally:
            self.lock.release()

    def _decref(self, force=False):
        self.lock.acquire()
        self.ref -= 1
        ref = self.ref
        self.lock.release()

        self.log.debug("ref %d aborting %s", ref, self.aborting())
        if ref == 0 and self.aborting():
            self._doAbort(force)
        return ref

    ##########################################################################
    # Public Interface                                                       #
    ##########################################################################

    def setDefaultException(self, exceptionObj):
        # defaultException must have response method
        if exceptionObj and not hasattr(exceptionObj, "response"):
            raise se.InvalidDefaultExceptionException(unicode(exceptionObj))
        self.defaultException = exceptionObj

    def setTag(self, tag):
        if KEY_SEPARATOR in tag:
            raise ValueError("tag cannot include %s character" % KEY_SEPARATOR)
        self.tag = unicode(tag)

    def isDone(self):
        return self.state.isDone()

    def addJob(self, job):
        """
        Add async job to the task. Assumes all resources are acquired
        or registered.
        """
        if not self.mng:
            raise se.UnmanagedTask(unicode(self))
        if not isinstance(job, Job):
            raise TypeError("Job param %s(%s) must be Job object" %
                            (repr(job), type(job)))
        if self.state != State.preparing:
            raise Exception("Task.addJob: can't add job in non preparing state"
                            " (%s)" % self.state)
        if not job.name:
            raise ValueError("Task.addJob: name is required")
        name = job.name
        for j in self.jobs:
            if name == j.name:
                raise ValueError("addJob: name '%s' must be unique" % (name))
        job.setOwnerTask(self)
        self.jobs.append(job)
        self.njobs = len(self.jobs)

    def clean(self):
        if not self.store:
            return
        if not self.isDone():
            raise se.TaskStateError("can't clean in state %s" % self.state)
        self._clean(self.store)

    def pushRecovery(self, recovery):
        """
        Add recovery "job" to the task. Recoveries are committed in FILO order.
        Assumes that all required resources are acquired or registered.
        """
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("pushRecovery: name is required")
        name = recovery.name
        for r in self.recoveries:
            if name == r.name:
                raise ValueError("pushRecovery: name '%s' must be unique" %
                                 (name))
        recovery.setOwnerTask(self)
        self.recoveries.append(recovery)
        self.persist()

    def replaceRecoveries(self, recovery):
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("replaceRecoveries: name is required")
        recovery.setOwnerTask(self)
        rec = Recovery('stubName', 'stubMod', 'stubObj', 'stubFunc', [])
        while (rec and (rec.name != ROLLBACK_SENTINEL)):
            rec = self.popRecovery()
        self.recoveries.append(recovery)
        self.persist()

    def popRecovery(self):
        if self.recoveries:
            return self.recoveries.pop()

    def clearRecoveries(self):
        self.recoveries = []
        self.persist()

    def setManager(self, manager):
        # If need be, refactor out to "validateManager" method
        if not hasattr(manager, "queue"):
            raise se.InvalidTaskMng(unicode(manager))
        self.mng = manager

    def setCleanPolicy(self, clean):
        self.cleanPolicy = TaskCleanType(clean)

    def setPersistence(self, store,
                       persistPolicy=TaskPersistType.auto,
                       cleanPolicy=TaskCleanType.auto):
        self.persistPolicy = TaskPersistType(persistPolicy)
        self.store = store
        self.setCleanPolicy(cleanPolicy)
        if self.persistPolicy != TaskPersistType.none and not self.store:
            raise se.TaskPersistError("no store defined")
        taskDir = os.path.join(self.store, self.id)
        try:
            getProcPool().fileUtils.createdir(taskDir)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            raise se.TaskPersistError("%s: cannot access/create taskdir"
                                      " %s: %s" % (self, taskDir, e))
        if (self.persistPolicy == TaskPersistType.auto and
                self.state != State.init):
            self.persist()

    def setRecoveryPolicy(self, clean):
        self.recoveryPolicy = TaskRecoveryType(clean)

    def rollback(self):
        self.log.debug('(rollback): enter')
        if self.recoveryPolicy == TaskRecoveryType.none:
            self.log.debug("rollback is skipped")
            return
        if not self.isDone():
            raise se.TaskNotFinished("can't rollback in state %s" % self.state)
        self._doRecover()
        self.log.debug('(rollback): exit')

    def persist(self):
        if self.persistPolicy == TaskPersistType.none:
            return
        if not self.store:
            raise se.TaskPersistError("no store defined")
        if self.state == State.init:
            raise se.TaskStateError("can't persist in state %s" % self.state)
        self._save(self.store)

    @classmethod
    def loadTask(cls, store, taskid):
        t = Task(taskid)
        if getProcPool().os.path.exists(os.path.join(store, taskid)):
            ext = ""
        # TBD: is this the correct order (temp < backup) + should temp
        # be considered at all?
        elif getProcPool().os.path.exists(os.path.join(store,
                                                       taskid + TEMP_EXT)):
            ext = TEMP_EXT
        elif getProcPool().os.path.exists(os.path.join(store,
                                                       taskid + BACKUP_EXT)):
            ext = BACKUP_EXT
        else:
            raise se.TaskDirError("loadTask: no such task dir '%s/%s'" %
                                  (store, taskid))
        t._load(store, ext)
        return t

    @threadlocal_task
    def prepare(self, func, *args, **kwargs):
        message = self.error
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.preparing)
            result = None
            code = 0
            try:
                if func:
                    result = self._run(func, *args, **kwargs)
            except se.TaskAborted as e:
                self.log.info("aborting: %s", e)
                code = e.abortedcode
                message = e.value

            if self.aborting():
                self.log.debug("Prepare: aborted: %s", message)
                self._updateResult(code, "Task prepare failed: %s" %
                                   (message,), "")
                raise self.error

            if self.jobs:
                self.log.debug("Prepare: %s jobs exist, move to acquiring",
                               self.njobs)
                self._updateState(State.acquiring)
                if self.aborting():
                    self.log.error('failed to acquire task %s', self.id)
                    raise self.error
                self.log.debug("returning")
                return dict(uuid=str(self.id))

            self.log.debug("finished: %s", result)
            self._updateResult(0, "OK", result)
            self._updateState(State.finished)
            return result
        finally:
            self._decref()

    @threadlocal_task
    def commit(self, args=None):
        self.log.debug("committing task: %s", self.id)
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.running)
        finally:
            self._decref()

    def aborting(self):
        return (self._aborting or
                self.state == State.aborting or
                self.state == State.raborting)

    def stop(self, force=False):
        self.log.debug("stopping in state %s (force %s)", self.state, force)
        self._incref(force)
        try:
            if self.state.isDone():
                self.log.debug("Task already stopped (%s), ignoring",
                               self.state)
                return
            elif (self.state.isRecovering() and
                  not force and
                  (self.cleanPolicy == TaskCleanType.auto)):
                self.log.debug("Task (%s) in recovery and force is false, "
                               "ignoring", self.state)
                return

            self._aborting = True
            self._forceAbort = force
        finally:
            self._decref(force)

    @threadlocal_task
    def recover(self, args=None):
        ''' Do not call this function while the task is actually running. this
            method should only be used to recover tasks state after
            (vdsmd) restart.
        '''
        self.log.debug('(recover): recovering: state %s', self.state)
        try:
            self._incref(force=True)
        except se.TaskAborted:
            self._doAbort(True)
            return
        try:
            if self.isDone():
                self.log.debug('(recover): task is done: state %s', self.state)
                return
            # if we are not during recover, just abort
            if self.state.canAbort():
                self.stop()
            # if we waited for recovery - keep waiting
            elif self.state == State.waitrecover:
                pass
            # if we started the recovery - restart it
            elif (self.state == State.racquiring or
                  self.state == State.recovering):
                self._updateState(State.racquiring, force=True)
            # else we were during failed recovery - abort it
            else:
                self.stop(force=True)
        finally:
            self._decref(force=True)
        self.log.debug('(recover): recovered: state %s', self.state)

    def getState(self):
        return str(self.state)

    def getInfo(self):
        return dict(id=self.id, verb=self.name)

    def deprecated_getStatus(self):
        oReturn = {}
        oReturn["taskID"] = self.id
        oReturn["taskState"] = self.state.DEPRECATED_STATE[self.state.state]
        oReturn["taskResult"] = self.state.DEPRECATED_RESULT[self.state.state]
        oReturn["code"] = self.result.code
        oReturn["message"] = self.result.message
        return oReturn

    def getStatus(self):
        oReturn = {}
        oReturn["state"] = {'code': self.result.code,
                            'message': self.result.message}
        oReturn["task"] = {'id': self.id, 'state': str(self.state)}
        oReturn["result"] = self.result.result
        return oReturn

    def getDetails(self):
        return {
            "id": self.id,
            "verb": self.name,
            "state": str(self.state),
            "code": self.result.code,
            "message": self.result.message,
            "result": self.result.result,
            "tag": self.tag
        }

    def getID(self):
        return self.id

    def getTags(self):
        return self.tag

    def __str__(self):
        return str(self.id)

    # FIXME : Use StringIO and enumerate()
    def dumpTask(self):
        s = "Task: %s" % self._dump(self, Task.fields)
        i = 0
        for r in self.recoveries:
            s += " Recovery%d: %s" % (i, self._dump(r, Recovery.fields))
            i += 1
        i = 0
        for j in self.jobs:
            s += " Job%d: %s" % (i, self._dump(j, Job.fields))
            i += 1
        return s

    @misc.logskip("ResourceManager")
    def getExclusiveLock(
        self,
        namespace,
        resName,
        timeout=config.getint('irs',
                              'task_resource_default_timeout')):
        self.resOwner.acquire(namespace,
                              resName,
                              resourceManager.LockType.exclusive,
                              timeout)

    @misc.logskip("ResourceManager")
    def getSharedLock(self,
                      namespace,
                      resName,
                      timeout=config.getint('irs',
                                            'task_resource_default_timeout')):
        self.resOwner.acquire(namespace,
                              resName,
                              resourceManager.LockType.shared,
                              timeout)
Example #9
0
File: vm.py Project: openSUSE/vdsm
class Vm(object):
    """
    Used for abstracting cummunication between various parts of the
    system and Qemu.

    Runs Qemu in a subprocess and communicates with it, and monitors
    its behaviour.
    """

    log = logging.getLogger("vm.Vm")
    _ongoingCreations = threading.BoundedSemaphore(1)
    MigrationSourceThreadClass = MigrationSourceThread

    def __init__(self, cif, params):
        """
        Initialize a new VM instance.

        :param cif: The client interface that creates this VM.
        :type cif: :class:`clientIF.clientIF`
        :param params: The VM parameters.
        :type params: dict
        """
        self.conf = {"pid": "0"}
        self.conf.update(params)
        self.cif = cif
        self.log = SimpleLogAdapter(self.log, {"vmId": self.conf["vmId"]})
        self.destroyed = False
        self._recoveryFile = constants.P_VDSM_RUN + str(self.conf["vmId"]) + ".recovery"
        self.user_destroy = False
        self._monitorResponse = 0
        self.conf["clientIp"] = ""
        self.memCommitted = 0
        self._creationThread = threading.Thread(target=self._startUnderlyingVm)
        if "migrationDest" in self.conf:
            self._lastStatus = "Migration Destination"
        elif "restoreState" in self.conf:
            self._lastStatus = "Restoring state"
        else:
            self._lastStatus = "WaitForLaunch"
        self._nice = ""
        self._migrationSourceThread = self.MigrationSourceThreadClass(self)
        self._kvmEnable = self.conf.get("kvmEnable", "true")
        self._guestSocektFile = constants.P_VDSM_RUN + self.conf["vmId"] + ".guest.socket"
        self._drives = []
        self._incomingMigrationFinished = threading.Event()
        self.id = self.conf["vmId"]
        self._volPrepareLock = threading.Lock()
        self._preparedDrives = {}
        self._initTimePauseCode = None
        self.guestAgent = None
        self._guestEvent = "Powering up"
        self._guestEventTime = 0
        self._vmStats = None
        self._guestCpuRunning = False
        self._guestCpuLock = threading.Lock()
        self._startTime = time.time() - float(self.conf.pop("elapsedTimeOffset", 0))
        self._cdromPreparedPath = ""
        self._floppyPreparedPath = ""
        self._volumesPrepared = False
        self._pathsPreparedEvent = threading.Event()
        self.saveState()

    def _get_lastStatus(self):
        SHOW_PAUSED_STATES = ("Powering down", "RebootInProgress", "Up")
        if not self._guestCpuRunning and self._lastStatus in SHOW_PAUSED_STATES:
            return "Paused"
        return self._lastStatus

    def _set_lastStatus(self, value):
        if self._lastStatus == "Down":
            self.log.warning("trying to set state to %s when already Down", value)
            if value == "Down":
                raise DoubleDownError
            else:
                return
        if value not in VALID_STATES:
            self.log.error("setting state to %s", value)
        if self._lastStatus != value:
            self.saveState()
            self._lastStatus = value

    lastStatus = property(_get_lastStatus, _set_lastStatus)

    def run(self):
        self._creationThread.start()

    def memCommit(self):
        """
        Reserve the required memory for this VM.
        """
        self.memCommitted = 2 ** 20 * (int(self.conf["memSize"]) + config.getint("vars", "guest_ram_overhead"))

    def _startUnderlyingVm(self):
        self.log.debug("Start")
        try:
            self.memCommit()
            self._ongoingCreations.acquire()
            self.log.debug("_ongoingCreations acquired")
            try:
                self._run()
                if self.lastStatus != "Down" and "recover" not in self.conf:
                    self.cif.ksmMonitor.adjust()
            except Exception:
                if "recover" not in self.conf:
                    raise
                else:
                    self.log.info("Skipping errors on recovery", exc_info=True)
            finally:
                self._ongoingCreations.release()
                self.log.debug("_ongoingCreations released")

            if ("migrationDest" in self.conf or "restoreState" in self.conf) and self.lastStatus != "Down":
                self._waitForIncomingMigrationFinish()

            self.lastStatus = "Up"
            if self._initTimePauseCode:
                self.conf["pauseCode"] = self._initTimePauseCode
                if self._initTimePauseCode == "ENOSPC":
                    self.cont()
            else:
                try:
                    del self.conf["pauseCode"]
                except:
                    pass

            if "recover" in self.conf:
                del self.conf["recover"]
            self.saveState()
        except Exception, e:
            if "recover" in self.conf:
                self.log.info("Skipping errors on recovery", exc_info=True)
            else:
                self.log.error("The vm start process failed", exc_info=True)
                self.setDownStatus(ERROR, str(e))
Example #10
0
File: vm.py Project: ekohl/vdsm
class Vm(object):
    """
    Used for abstracting cummunication between various parts of the
    system and Qemu.

    Runs Qemu in a subprocess and communicates with it, and monitors
    its behaviour.
    """
    log = logging.getLogger("vm.Vm")
    _ongoingCreations = threading.BoundedSemaphore(caps.CpuInfo().cores())
    MigrationSourceThreadClass = MigrationSourceThread

    def __init__(self, cif, params):
        """
        Initialize a new VM instance.

        :param cif: The client interface that creates this VM.
        :type cif: :class:`clientIF.clientIF`
        :param params: The VM parameters.
        :type params: dict
        """
        self.conf = {'pid': '0'}
        self.conf.update(params)
        self.cif = cif
        self.log = SimpleLogAdapter(self.log, {"vmId" : self.conf['vmId']})
        self.destroyed = False
        self._recoveryFile = constants.P_VDSM_RUN + str(
                                    self.conf['vmId']) + '.recovery'
        self.user_destroy = False
        self._monitorResponse = 0
        self.conf['clientIp'] = ''
        self.memCommitted = 0
        self._creationThread = threading.Thread(target=self._startUnderlyingVm)
        if 'migrationDest' in self.conf:
            self._lastStatus = 'Migration Destination'
        elif 'restoreState' in self.conf:
            self._lastStatus = 'Restoring state'
        else:
            self._lastStatus = 'WaitForLaunch'
        self._nice = ''
        self._migrationSourceThread = self.MigrationSourceThreadClass(self)
        self._kvmEnable = self.conf.get('kvmEnable', 'true')
        self._guestSocektFile = constants.P_VDSM_RUN + self.conf['vmId'] + \
                                '.guest.socket'
        self._incomingMigrationFinished = threading.Event()
        self.id = self.conf['vmId']
        self._volPrepareLock = threading.Lock()
        self._initTimePauseCode = None
        self.guestAgent = None
        self._guestEvent = 'Powering up'
        self._guestEventTime = 0
        self._vmStats = None
        self._guestCpuRunning = False
        self._guestCpuLock = threading.Lock()
        self._startTime = time.time() - float(
                                self.conf.pop('elapsedTimeOffset', 0))

        self._usedIndices = {} #{'ide': [], 'virtio' = []}
        self._volumesPrepared = False
        self._pathsPreparedEvent = threading.Event()
        self._devices = {DISK_DEVICES: [], NIC_DEVICES: [],
                         SOUND_DEVICES: [], VIDEO_DEVICES: [],
                         CONTROLLER_DEVICES: [], GENERAL_DEVICES: [],
                         BALLOON_DEVICES: []}

    def _get_lastStatus(self):
        SHOW_PAUSED_STATES = ('Powering down', 'RebootInProgress', 'Up')
        if not self._guestCpuRunning and self._lastStatus in SHOW_PAUSED_STATES:
            return 'Paused'
        return self._lastStatus

    def _set_lastStatus(self, value):
        if self._lastStatus == 'Down':
            self.log.warning('trying to set state to %s when already Down',
                             value)
            if value == 'Down':
                raise DoubleDownError
            else:
                return
        if value not in VALID_STATES:
            self.log.error('setting state to %s', value)
        if self._lastStatus != value:
            self.saveState()
            self._lastStatus = value

    lastStatus = property(_get_lastStatus, _set_lastStatus)

    def __getNextIndex(self, used):
        for n in xrange(max(used or [0]) + 2):
            if n not in used:
                idx = n
                break
        return str(idx)

    def _normalizeVdsmImg(self, drv):
        drv['needExtend'] = False
        drv['reqsize'] = drv.get('reqsize', '0')
        if not drv.has_key('device'):
            drv['device'] = 'disk'

        if drv['device'] == 'disk':
            res = self.cif.irs.getVolumeSize(drv['domainID'],
                             drv['poolID'], drv['imageID'],
                             drv['volumeID'])
            drv['truesize'] = res['truesize']
            drv['apparentsize'] = res['apparentsize']
        else:
            drv['truesize'] = 0
            drv['apparentsize'] = 0

    def __legacyDrives(self):
         """
         Backward compatibility for qa scripts that specify direct paths.
         """
         legacies = []
         for index, linuxName in ((0, 'hda'), (1, 'hdb'), (2, 'hdc'), (3, 'hdd')):
             path = self.conf.get(linuxName)
             if path:
                 legacies.append({'path': path, 'iface': 'ide', 'index': index,
                                  'truesize': 0})
         return legacies

    def __removableDrives(self):
        removables =  [{'type': DISK_DEVICES, 'device': 'cdrom', 'iface': 'ide',
                        'path': self.conf.get('cdrom', ''), 'index': 2,
                        'truesize': 0}]
        floppyPath = self.conf.get('floppy')
        if floppyPath:
            removables.append({'type': DISK_DEVICES, 'device': 'floppy',
                               'path': floppyPath, 'iface': 'fdc',
                               'index': 0, 'truesize': 0})
        return removables

    def getConfDevices(self):
        devices = {DISK_DEVICES: [], NIC_DEVICES: [],
                   SOUND_DEVICES: [], VIDEO_DEVICES: [],
                   CONTROLLER_DEVICES: [], GENERAL_DEVICES: [],
                   BALLOON_DEVICES: []}
        for dev in self.conf.get('devices'):
            try:
                devices[dev['type']].append(dev)
            except KeyError:
                # Unknown type device found
                self.log.warn("Unknown type found, device: '%s' found", dev)
                devices[GENERAL_DEVICES].append(dev)

        # Update indecies for drives devices
        self.normalizeDrivesIndices(devices[DISK_DEVICES])
        return devices

    def buildConfDevices(self):
        """
        Return the "devices" section of this Vm's conf.
        If missing, create it according to old API.
        """
        # For BC we need to save previous behaviour for old type parameters.
        # The new/old type parameter will be distinguished
        # by existence/absence of the 'devices' key
        devices = {}
        # Build devices structure
        if self.conf.get('devices') == None:
            self.conf['devices'] = []
            devices[DISK_DEVICES] = self.getConfDrives()
            devices[NIC_DEVICES] = self.getConfNetworkInterfaces()
            devices[SOUND_DEVICES] = self.getConfSound()
            devices[VIDEO_DEVICES] = self.getConfVideo()
            devices[CONTROLLER_DEVICES] = self.getConfController()
            devices[GENERAL_DEVICES] = []
            devices[BALLOON_DEVICES] = []
        else:
            devices = self.getConfDevices()

        # Normalize vdsm images
        for drv in devices[DISK_DEVICES]:
            if isVdsmImage(drv):
                self._normalizeVdsmImg(drv)

        # Preserve old behavior. Since libvirt add a memory balloon device
        # to all guests, we need to specifically request not to add it.
        if len(devices[BALLOON_DEVICES]) == 0:
            devices[BALLOON_DEVICES].append({'type': BALLOON_DEVICES,
                'device': 'memballoon', 'model': 'none'})

        return devices

    def getConfController(self):
        """
        Normalize controller device.
        """
        controllers = []
        # For now we create by default only 'virtio-serial' controller
        controllers.append({'type': CONTROLLER_DEVICES, 'device': 'virtio-serial'})
        return controllers

    def getConfVideo(self):
        """
        Normalize video device provided by conf.
        """
        vcards =[]
        if self.conf.get('display') == 'vnc':
            devType = 'cirrus'
        elif self.conf.get('display') == 'qxl':
            devType = 'qxl'

        monitors = int(self.conf.get('spiceMonitors', '1'))
        vram = '65536' if (monitors <= 2) else '32768'
        for idx in range(monitors):
            vcards.append({'type': VIDEO_DEVICES, 'specParams': {'vram': vram},
                           'device': devType})

        return vcards

    def getConfSound(self):
        """
        Normalize sound device provided by conf.
        """
        scards = []
        if self.conf.get('soundDevice'):
            scards.append({'type': SOUND_DEVICES,
                           'device': self.conf.get('soundDevice')})

        return scards

    def getConfNetworkInterfaces(self):
        """
        Normalize networks interfaces provided by conf.
        """
        nics = []
        macs = self.conf.get('macAddr', '').split(',')
        models = self.conf.get('nicModel', '').split(',')
        bridges = self.conf.get('bridge', DEFAULT_BRIDGE).split(',')
        if macs == ['']: macs = []
        if models == ['']: models = []
        if bridges == ['']: bridges = []
        if len(models) < len(macs) or len(models) < len(bridges):
            raise ValueError('Bad nic specification')
        if models and not (macs or bridges):
            raise ValueError('Bad nic specification')
        if not macs or not models or not bridges:
            return ''
        macs = macs + [macs[-1]] * (len(models) - len(macs))
        bridges = bridges + [bridges[-1]] * (len(models) - len(bridges))

        for mac, model, bridge in zip(macs, models, bridges):
            if model == 'pv':
                model = 'virtio'
            nics.append({'type': NIC_DEVICES, 'macAddr': mac, 'nicModel': model,
                         'network': bridge, 'device': 'bridge'})
        return nics

    def getConfDrives(self):
        """
        Normalize drives provided by conf.
        """
        # FIXME
        # Will be better to change the self.conf but this implies an API change.
        # Remove this when the API parameters will be consistent.
        confDrives = self.conf['drives'] if self.conf.get('drives') else []
        if not confDrives:
            confDrives.extend(self.__legacyDrives())
        confDrives.extend(self.__removableDrives())

        for drv in confDrives:
            drv['type'] = DISK_DEVICES
            drv['format'] = drv.get('format') or 'raw'
            drv['propagateErrors'] = drv.get('propagateErrors') or 'off'
            drv['readonly'] = False
            # FIXME: For BC we have now two identical keys: iface = if
            # Till the day that conf will not returned as a status anymore.
            drv['iface'] = drv.get('iface') or drv.get('if', 'ide')

        # Update indecies for drives devices
        self.normalizeDrivesIndices(confDrives)

        return confDrives

    def updateDriveIndex(self, drv):
        drv['index'] = self.__getNextIndex(self._usedIndices[drv['iface']])
        self._usedIndices[drv['iface']].append(int(drv['index']))

    def normalizeDrivesIndices(self, confDrives):
        drives = [(order, drv) for order, drv in enumerate(confDrives)]
        indexed = []
        for order, drv in drives:
            if not self._usedIndices.has_key(drv['iface']):
                self._usedIndices[drv['iface']] = []
            idx = drv.get('index')
            if idx is not None:
                self._usedIndices[drv['iface']].append(int(idx))
                indexed.append(order)

        for order, drv in drives:
            if order not in indexed:
                self.updateDriveIndex(drv)

        return [drv for order, drv in drives]

    def run(self):
        self._creationThread.start()

    def memCommit(self):
        """
        Reserve the required memory for this VM.
        """
        self.memCommitted = 2**20 * (int(self.conf['memSize']) +
                                config.getint('vars', 'guest_ram_overhead'))

    def _startUnderlyingVm(self):
        self.log.debug("Start")
        try:
            self.memCommit()
            self._ongoingCreations.acquire()
            self.log.debug("_ongoingCreations acquired")
            try:
                self._run()
                if self.lastStatus != 'Down' and 'recover' not in self.conf:
                    self.cif.ksmMonitor.adjust()
            except Exception:
                if 'recover' not in self.conf:
                    raise
                else:
                    self.log.info("Skipping errors on recovery", exc_info=True)
            finally:
                self._ongoingCreations.release()
                self.log.debug("_ongoingCreations released")

            if ('migrationDest' in self.conf or 'restoreState' in self.conf
                                               ) and self.lastStatus != 'Down':
                self._waitForIncomingMigrationFinish()

            self.lastStatus = 'Up'
            if self._initTimePauseCode:
                self.conf['pauseCode'] = self._initTimePauseCode
                if self._initTimePauseCode == 'ENOSPC':
                    self.cont()
            else:
                try:
                    del self.conf['pauseCode']
                except:
                    pass

            if 'recover' in self.conf:
                del self.conf['recover']
            self.saveState()
        except Exception, e:
            if 'recover' in self.conf:
                self.log.info("Skipping errors on recovery", exc_info=True)
            else:
                self.log.error("The vm start process failed", exc_info=True)
                self.setDownStatus(ERROR, str(e))
Example #11
0
class ResourceRef(object):
    """
    A reference to a resource. Can be used to conveniently modify an owned
    resource.

    This object will auto release the referenced resource unless autorelease
    is set to `False`
    """
    _log = logging.getLogger("Storage.ResourceManager.ResourceRef")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))

    # States whether this reference is pointing to an owned reference
    isValid = property(lambda self: self._isValid)

    def __init__(self,
                 namespace,
                 name,
                 wrappedObject=None,
                 resRefID=str(uuid4())):
        self._namespace = namespace
        self._name = name
        self._log = SimpleLogAdapter(self._log, {
            "ResName": self.fullName,
            "ResRefID": resRefID
        })

        self.__wrappedObject = wrappedObject
        if wrappedObject is not None:
            self.__wrapObj()

        self.autoRelease = True
        self._isValid = True
        self._syncRoot = misc.RWLock()

    def __wrapObj(self):
        for attr in dir(self.__wrappedObject):
            if hasattr(self, attr) or attr in ('close', 'switchLockType'):
                continue

            setattr(self, attr, partial(self.__methodProxy, attr))

    def __methodProxy(self, attr, *args, **kwargs):
        with self._syncRoot.shared:
            if not self.isValid:
                raise se.ResourceReferenceInvalid

            return getattr(self.__wrappedObject, attr)(*args, **kwargs)

    def release(self):
        with self._syncRoot.exclusive:
            self.__wrappedObject = None
            if not self._isValid:
                self._log.warn("Tried to re-release a resource. Request "
                               "ignored.")
                return

            ResourceManager.getInstance().releaseResource(
                self.namespace, self.name)
            self._isValid = False

    def getStatus(self):
        return ResourceManager.getInstance().getResourceStatus(
            self.namespace, self.name)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()

    def __del__(self):
        if self._isValid and self.autoRelease:

            def release(log, namespace, name):
                log.warn("Resource reference was not properly released. "
                         "Autoreleasing.")
                # In Python, objects are refcounted and are deleted immediately
                # when the last reference is freed. This means the __del__
                # method can be called inside of any context. The
                # releaseResource method we use tries to acquire locks. So we
                # might try to acquire the lock in a locked context and reach a
                # deadlock. This is why I need to use a timer. It will defer
                # the operation and use a different context.
                ResourceManager.getInstance().releaseResource(namespace, name)

            concurrent.thread(release,
                              args=(self._log, self.namespace,
                                    self.name)).start()
            self._isValid = False

    def __repr__(self):
        return "< ResourceRef '%s', isValid: '%s' obj: '%s'>" % (
            self.fullName, self.isValid,
            repr(self.__wrappedObject) if self.isValid else None)
Example #12
0
class Request(object):
    """
    Internal request object, don't use directly
    """
    _log = logging.getLogger("Storage.ResourceManager.Request")
    namespace = property(lambda self: self._namespace)
    name = property(lambda self: self._name)
    fullName = property(lambda self: "%s.%s" % (self._namespace, self._name))
    lockType = property(lambda self: self._lockType)
    syncRoot = property(lambda self: self._syncRoot)

    def __init__(self, namespace, name, lockType, callback):
        self._syncRoot = threading.RLock()
        self._namespace = namespace
        self._name = name
        self._lockType = lockType
        self._isActive = True
        self._isCanceled = False
        self._doneEvent = threading.Event()
        self._callback = callback
        self.reqID = str(uuid4())
        self._log = SimpleLogAdapter(self._log, {
            "ResName": self.fullName,
            "ReqID": self.reqID
        })

        # Because findCaller is expensive. We make sure it wll be printed
        # before calculating it
        if logging.getLogger("Storage.ResourceManager.ResourceRef").\
                isEnabledFor(logging.WARN):
            createdAt = misc.findCaller(ignoreSourceFiles=[__file__],
                                        logSkipName="ResourceManager")
            self._log.debug("Request was made in '%s' line '%d' at '%s'",
                            *createdAt)

    def cancel(self):
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to cancel a processed request")
                raise RequestAlreadyProcessedError("Cannot cancel a processed "
                                                   "request")

            self._isActive = False
            self._isCanceled = True

            self._log.debug("Canceled request")
            try:
                self._callback(RequestRef(self), None)
            except Exception:
                self._log.warn("Request callback threw an exception",
                               exc_info=True)
            self._callback = None
            self._doneEvent.set()

    def _status(self):
        with self._syncRoot:
            if self._isCanceled:
                return "canceled"
            if self._doneEvent.isSet():
                return "granted"
            return "waiting"

    def canceled(self):
        return self._isCanceled

    def grant(self):
        with self._syncRoot:
            if not self._isActive:
                self._log.warn("Tried to grant a processed request")
                raise RequestAlreadyProcessedError("Cannot grant a processed "
                                                   "request")

            self._isActive = False
            self._log.debug("Granted request")
            self._doneEvent.set()

    def emit(self, resource):
        try:
            ref = RequestRef(self)
            self._callback(ref, resource)
        except Exception:
            self._log.warn("Request callback threw an exception",
                           exc_info=True)

    def wait(self, timeout=None):
        return self._doneEvent.wait(timeout)

    def granted(self):
        with self._syncRoot:
            return (not self._isCanceled) and self._doneEvent.isSet()

    def __str__(self):
        return "Request for %s - %s: %s" % (self.fullName, self.lockType,
                                            self._status())
Example #13
0
class Task:
    # External Task info
    fields = {
        # field_name: type
        "id": str,
        "name": unicode,
        "tag": unicode,
        "store": unicode,
        "recoveryPolicy": TaskRecoveryType,
        "persistPolicy": TaskPersistType,
        "cleanPolicy": TaskCleanType,
        "priority": TaskPriority,
        "state": State,
        "njobs": int,
        "nrecoveries": int,
        "metadataVersion": int
    }

    log = logging.getLogger('Storage.TaskManager.Task')

    def __init__(self,
                 id,
                 name="",
                 tag="",
                 recovery=TaskRecoveryType.none,
                 priority=TaskPriority.low):
        """
        id - Unique ID
        name - human readable name
        persist - persistency type: auto-clean/manual-clean/not-persistent
        """

        if not id:
            id = str(uuid.uuid4())
        self.metadataVersion = TASK_METADATA_VERSION
        self.validateID(id)
        self.lock = threading.Lock()
        self.callbackLock = threading.Lock()
        self.id = str(id)
        self.name = name
        self.tag = tag
        self.priority = priority
        self.recoveryPolicy = recovery
        self.persistPolicy = TaskPersistType.none
        self.cleanPolicy = TaskCleanType.auto
        self.store = None
        self.defaultException = None

        self.state = State(State.init)
        self.result = TaskResult(0, "Task is initializing", "")

        self.resOwner = resourceManager.Owner(proxy(self), raiseonfailure=True)
        self.error = se.TaskAborted("Unknown error encountered")

        self.mng = None
        self._abort_lock = threading.Lock()
        self._abort_callbacks = set()
        self._aborting = False
        self._forceAbort = False
        self.ref = 0

        self.recoveries = []
        self.jobs = []
        self.nrecoveries = 0  # just utility count - used by save/load
        self.njobs = 0  # just utility count - used by save/load

        self.log = SimpleLogAdapter(self.log, {"Task": self.id})

    def __del__(self):
        def finalize(log, owner, taskDir):
            log.warn("Task was autocleaned")
            owner.releaseAll()
            if taskDir is not None:
                getProcPool().fileUtils.cleanupdir(taskDir)

        if not self.state.isDone():
            taskDir = None
            if (self.cleanPolicy == TaskCleanType.auto
                    and self.store is not None):
                taskDir = os.path.join(self.store, self.id)
            concurrent.thread(finalize,
                              args=(self.log, self.resOwner, taskDir)).start()

    def _done(self):
        self.resOwner.releaseAll()
        if self.cleanPolicy == TaskCleanType.auto:
            self.clean()

    def __state_preparing(self, fromState):
        pass

    def __state_blocked(self, fromState):
        pass

    def __state_acquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.queued)

    def __state_queued(self, fromState):
        try:
            self.mng.queue(self)
        except Exception as e:
            self._setError(e)
            self.stop()

    def __state_running(self, fromState):
        self._runJobs()

    def __state_finished(self, fromState):
        self._done()

    def __state_aborting(self, fromState):
        if self.ref > 1:
            return
        self.log.debug("_aborting: recover policy %s", self.recoveryPolicy)
        if self.recoveryPolicy == TaskRecoveryType.auto:
            self._updateState(State.racquiring)
        elif self.recoveryPolicy == TaskRecoveryType.none:
            self._updateState(State.failed)
        else:
            self._updateState(State.waitrecover)

    def __state_waitrecover(self, fromState):
        pass

    def __state_racquiring(self, fromState):
        if self.resOwner.requestsGranted():
            self._updateState(State.recovering)

    def __state_recovering(self, fromState):
        self._recover()

    def __state_raborting(self, fromState):
        if self.ref == 1:
            self._updateState(State.failed)
        else:
            self.log.warn("State was change to 'raborting' "
                          "when ref was not 1.")

    def __state_recovered(self, fromState):
        self._done()

    def __state_failed(self, fromState):
        self._done()

    def __state_cleaning(self, fromState):
        pass

    def _updateState(self, state, force=False):
        fromState = self.state
        requestedState = state
        if self._aborting:
            if self.state.canAbort():
                state = State.aborting
            elif self.state.canAbortRecovery() and state != State.recovered:
                state = State.raborting
        self._aborting = False
        if requestedState == state:
            self.log.debug("moving from state %s -> state %s", fromState,
                           state)
        else:
            self.log.debug("moving from state %s -> state %s instead of %s",
                           fromState, state, requestedState)

        self.state.moveto(state, force)
        if self.persistPolicy == TaskPersistType.auto:
            try:
                self.persist()
            except Exception:
                self.log.warning(
                    "Task._updateState: failed persisting task"
                    " %s",
                    self.id,
                    exc_info=True)

        fn = getattr(self, "_Task__state_%s" % state)
        fn(fromState)

    def _updateResult(self, code, message, result):
        self.result.result = result
        self.result.code = code
        self.result.message = message

    @classmethod
    def validateID(cls, taskID):
        if not taskID or "." in taskID:
            raise se.InvalidParameterException("taskID", taskID)

    @classmethod
    def _loadMetaFile(cls, filename, obj, fields):
        try:
            for line in getProcPool().readLines(filename):
                # process current line
                line = line.encode('utf8')
                if line.find(KEY_SEPARATOR) < 0:
                    continue
                parts = line.split(KEY_SEPARATOR)
                if len(parts) != 2:
                    cls.log.warning(
                        "Task._loadMetaFile: %s - ignoring line"
                        " '%s'", filename, line)
                    continue

                field = _eq_decode(parts[0].strip())
                value = _eq_decode(parts[1].strip())
                if field not in fields:
                    cls.log.warning(
                        "Task._loadMetaFile: %s - ignoring field"
                        " %s in line '%s'", filename, field, line)
                    continue

                ftype = fields[field]
                setattr(obj, field, ftype(value))
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataLoadError(filename)

    @classmethod
    def _dump(cls, obj, fields):
        lines = []
        for field in fields:
            try:
                value = unicode(getattr(obj, field))
            except AttributeError:
                cls.log.warning("object %s field %s not found" % (obj, field),
                                exc_info=True)
            else:
                try:
                    field = _eq_encode(field)
                    value = _eq_encode(value)
                except ValueError as e:
                    cls.log.warning(
                        "Object %s: Cannot encode field %s or "
                        "value %s. Skipping field. %s", obj, field, value, e)
                else:
                    lines.append("%s %s %s" % (field, KEY_SEPARATOR, value))
        return lines

    @classmethod
    def _saveMetaFile(cls, filename, obj, fields):
        try:
            getProcPool().writeLines(
                filename,
                [l.encode('utf8') + "\n" for l in cls._dump(obj, fields)])
        except Exception:
            cls.log.error("Unexpected error", exc_info=True)
            raise se.TaskMetaDataSaveError(filename)

    def _loadTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._loadMetaFile(taskFile, self, Task.fields)

    def _saveTaskMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + TASK_EXT)
        self._saveMetaFile(taskFile, self, Task.fields)

    def _loadJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.jobs[n], Job.fields)

    def _saveJobMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir, self.id + JOB_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.jobs[n], Job.fields)

    def _loadRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._loadMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _saveRecoveryMetaFile(self, taskDir, n):
        taskFile = os.path.join(taskDir,
                                self.id + RECOVER_EXT + NUM_SEP + str(n))
        self._saveMetaFile(taskFile, self.recoveries[n], Recovery.fields)

    def _loadTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._loadMetaFile(taskFile, self.result, TaskResult.fields)

    def _saveTaskResultMetaFile(self, taskDir):
        taskFile = os.path.join(taskDir, self.id + RESULT_EXT)
        self._saveMetaFile(taskFile, self.result, TaskResult.fields)

    def _getResourcesKeyList(self, taskDir):
        keys = []
        for path in getProcPool().glob.glob(
                os.path.join(taskDir, "*" + RESOURCE_EXT)):
            filename = os.path.basename(path)
            keys.append(filename[:filename.rfind(RESOURCE_EXT)])
        return keys

    def _load(self, storPath, ext=""):
        self.log.debug("%s: load from %s, ext '%s'", self, storPath, ext)
        if self.state != State.init:
            raise se.TaskMetaDataLoadError("task %s - can't load self: "
                                           "not in init state" % self)
        taskDir = os.path.join(storPath, str(self.id) + str(ext))
        if not getProcPool().os.path.exists(taskDir):
            raise se.TaskDirError("load: no such task dir '%s'" % taskDir)
        oldid = self.id
        self._loadTaskMetaFile(taskDir)
        if self.id != oldid:
            raise se.TaskMetaDataLoadError("task %s: loaded file do not match"
                                           " id (%s != %s)" %
                                           (self, self.id, oldid))
        if self.state == State.finished:
            self._loadTaskResultMetaFile(taskDir)
        for jn in range(self.njobs):
            self.jobs.append(Job("load", None))
            self._loadJobMetaFile(taskDir, jn)
            self.jobs[jn].setOwnerTask(self)
        for rn in range(self.nrecoveries):
            self.recoveries.append(Recovery("load", "load", "load", "load",
                                            ""))
            self._loadRecoveryMetaFile(taskDir, rn)
            self.recoveries[rn].setOwnerTask(self)

    def _save(self, storPath):
        origTaskDir = os.path.join(storPath, self.id)
        if not getProcPool().os.path.exists(origTaskDir):
            raise se.TaskDirError("_save: no such task dir '%s'" % origTaskDir)
        taskDir = os.path.join(storPath, self.id + TEMP_EXT)
        self.log.debug("_save: orig %s temp %s", origTaskDir, taskDir)
        if getProcPool().os.path.exists(taskDir):
            getProcPool().fileUtils.cleanupdir(taskDir)
        getProcPool().os.mkdir(taskDir)
        try:
            self.njobs = len(self.jobs)
            self.nrecoveries = len(self.recoveries)
            self._saveTaskMetaFile(taskDir)
            if self.state == State.finished:
                self._saveTaskResultMetaFile(taskDir)
            for jn in range(self.njobs):
                self._saveJobMetaFile(taskDir, jn)
            for rn in range(self.nrecoveries):
                self._saveRecoveryMetaFile(taskDir, rn)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            try:
                getProcPool().fileUtils.cleanupdir(taskDir)
            except:
                self.log.warning("can't remove temp taskdir %s" % taskDir)
            raise se.TaskPersistError("%s persist failed: %s" % (self, e))
        # Make sure backup dir doesn't exist
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(origTaskDir, origTaskDir + BACKUP_EXT)
        getProcPool().os.rename(taskDir, origTaskDir)
        getProcPool().fileUtils.cleanupdir(origTaskDir + BACKUP_EXT)
        getProcPool().fileUtils.fsyncPath(origTaskDir)

    def _clean(self, storPath):
        taskDir = os.path.join(storPath, self.id)
        getProcPool().fileUtils.cleanupdir(taskDir)

    def _recoverDone(self):
        # protect agains races with stop/abort
        self.log.debug("Recover Done: state %s", self.state)
        while True:
            try:
                if self.state == State.recovering:
                    self._updateState(State.recovered)
                elif self.state == State.raborting:
                    self._updateState(State.failed)
                return
            except se.TaskStateTransitionError:
                self.log.error("Unexpected error", exc_info=True)

    def _recover(self):
        self.log.debug("_recover")
        if not self.state == State.recovering:
            raise se.TaskStateError("%s: _recover in state %s" %
                                    (self, self.state))
        try:
            while self.state == State.recovering:
                rec = self.popRecovery()
                self.log.debug("running recovery %s", rec)
                if not rec:
                    break
                self._run(rec.run)
        except Exception as e:
            self.log.warning("task %s: recovery failed: %s",
                             self,
                             e,
                             exc_info=True)
            # protect agains races with stop/abort
            try:
                if self.state == State.recovering:
                    self._updateState(State.raborting)
            except se.TaskStateTransitionError:
                pass
        self._recoverDone()

    def resourceAcquired(self, namespace, resource, locktype):
        # Callback from resourceManager.Owner. May be called by another thread.
        self._incref()
        try:
            self.callbackLock.acquire()
            try:
                self.log.debug("_resourcesAcquired: %s.%s (%s)", namespace,
                               resource, locktype)
                if self.state == State.preparing:
                    return
                if self.state == State.acquiring:
                    self._updateState(State.acquiring)
                elif self.state == State.racquiring:
                    self._updateState(State.racquiring)
                elif self.state == State.blocked:
                    self._updateState(State.preparing)
                elif (self.state == State.aborting
                      or self.state == State.raborting):
                    self.log.debug("resource %s.%s acquired while in state %s",
                                   namespace, resource, self.state)
                else:
                    raise se.TaskStateError("acquire is not allowed in state"
                                            " %s" % self.state)
            finally:
                self.callbackLock.release()
        finally:
            self._decref()

    def resourceRegistered(self, namespace, resource, locktype):
        self._incref()
        try:
            self.callbackLock.acquire()
            try:
                # Callback from resourceManager.Owner. May be called
                # by another thread.
                self.log.debug("_resourcesAcquired: %s.%s (%s)", namespace,
                               resource, locktype)
                # Protect against races with stop/abort
                if self.state == State.preparing:
                    self._updateState(State.blocked)
            finally:
                self.callbackLock.release()
        finally:
            self._decref()

    def _setError(self, e=se.TaskAborted("Unknown error encountered")):
        self.log.error("Unexpected error", exc_info=True)
        self.error = e

    def _run(self, fn, *args, **kargs):
        code = 100
        message = "Unknown Error"
        try:
            return fn(*args, **kargs)
        except se.StorageException as e:
            code = e.code
            message = e.message
            self._setError(e)
        except Exception as e:
            message = unicode(e)
            self._setError(e)
        except:
            self._setError()

        self.log.debug("Task._run: %s %s %s failed - stopping task", self,
                       args, kargs)
        self.stop()
        raise se.TaskAborted(message, code)

    def _runJobs(self):
        result = ""
        code = 100
        message = "Unknown Error"
        i = 0
        j = None
        try:
            if self.aborting():
                raise se.TaskAborted("shutting down")
            if not self.state == State.running:
                raise se.TaskStateError("%s: can't run Jobs in state %s" %
                                        (self, self.state))
            # for now: result is the last job result, jobs are run sequentially
            for j in self.jobs:
                if self.aborting():
                    raise se.TaskAborted("shutting down")
                self.log.debug("Task.run: running job %s: %s" % (i, j))
                self._updateResult(
                    0, 'running job {0} of {0}'.format(i + 1, len(self.jobs)),
                    '')
                result = self._run(j.run)
                if result is None:
                    result = ""
                i += 1
            j = None
            self._updateResult(0, "%s jobs completed successfully" % i, result)
            self._updateState(State.finished)
            self.log.debug('Task.run: exit - success: result %s' % result)
            return result
        except se.TaskAborted as e:
            self.log.debug("aborting: %s", e)
            message = e.value
            code = e.abortedcode
            if not self.aborting():
                self.log.error("Aborted exception but not in aborting state")
                raise
        self._updateResult(code, message, "")

    def _doAbort(self, force=False):
        self.log.debug("Task._doAbort: force %s" % force)
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            return
        self.ref += 1
        self.lock.release()
        try:
            try:
                if (not self.state.canAbort()
                        and (force and not self.state.canAbortRecovery())):
                    self.log.warning(
                        "Task._doAbort %s: ignoring - "
                        "at state %s", self, self.state)
                    return
                self.resOwner.cancelAll()
                if self.state.canAbort():
                    self._updateState(State.aborting)
                else:
                    self._updateState(State.raborting)
            except se.TaskAborted:
                self._updateState(State.failed)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()
            # If something horrible went wrong. Just fail the task.
            if not self.state.isDone():
                self.log.warn("Task exited in non terminal state. "
                              "Setting tasks as failed.")
                self._updateState(State.failed)

    def _doRecover(self):
        self.lock.acquire()
        # Am I really the last?
        if self.ref != 0:
            self.lock.release()
            raise se.TaskHasRefs(unicode(self))
        self.ref += 1
        self.lock.release()
        try:
            self._updateState(State.racquiring)
        finally:
            self.lock.acquire()
            self.ref -= 1
            self.lock.release()

    def _incref(self, force=False):
        self.lock.acquire()
        try:
            if self.aborting() and (self._forceAbort or not force):
                raise se.TaskAborted(unicode(self))

            self.ref += 1
            ref = self.ref
            return ref
        finally:
            self.lock.release()

    def _decref(self, force=False):
        self.lock.acquire()
        self.ref -= 1
        ref = self.ref
        self.lock.release()

        self.log.debug("ref %d aborting %s", ref, self.aborting())
        if ref == 0 and self.aborting():
            self._doAbort(force)
        return ref

    ##########################################################################
    # Public Interface                                                       #
    ##########################################################################

    def setDefaultException(self, exceptionObj):
        # defaultException must have response method
        if exceptionObj and not hasattr(exceptionObj, "response"):
            raise se.InvalidDefaultExceptionException(unicode(exceptionObj))
        self.defaultException = exceptionObj

    def setTag(self, tag):
        if KEY_SEPARATOR in tag:
            raise ValueError("tag cannot include %s character" % KEY_SEPARATOR)
        self.tag = unicode(tag)

    def isDone(self):
        return self.state.isDone()

    def addJob(self, job):
        """
        Add async job to the task. Assumes all resources are acquired
        or registered.
        """
        if not self.mng:
            raise se.UnmanagedTask(unicode(self))
        if not isinstance(job, Job):
            raise TypeError("Job param %s(%s) must be Job object" %
                            (repr(job), type(job)))
        if self.state != State.preparing:
            raise Exception("Task.addJob: can't add job in non preparing state"
                            " (%s)" % self.state)
        if not job.name:
            raise ValueError("Task.addJob: name is required")
        name = job.name
        for j in self.jobs:
            if name == j.name:
                raise ValueError("addJob: name '%s' must be unique" % (name))
        job.setOwnerTask(self)
        self.jobs.append(job)
        self.njobs = len(self.jobs)

    def clean(self):
        if not self.store:
            return
        if not self.isDone():
            raise se.TaskStateError("can't clean in state %s" % self.state)
        self._clean(self.store)

    def pushRecovery(self, recovery):
        """
        Add recovery "job" to the task. Recoveries are committed in FILO order.
        Assumes that all required resources are acquired or registered.
        """
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("pushRecovery: name is required")
        name = recovery.name
        for r in self.recoveries:
            if name == r.name:
                raise ValueError("pushRecovery: name '%s' must be unique" %
                                 (name))
        recovery.setOwnerTask(self)
        self.recoveries.append(recovery)
        self.persist()

    def replaceRecoveries(self, recovery):
        if not isinstance(recovery, Recovery):
            raise TypeError("recovery param %s(%s) must be Recovery object" %
                            (repr(recovery), type(recovery)))
        if not recovery.name:
            raise ValueError("replaceRecoveries: name is required")
        recovery.setOwnerTask(self)
        rec = Recovery('stubName', 'stubMod', 'stubObj', 'stubFunc', [])
        while (rec and (rec.name != ROLLBACK_SENTINEL)):
            rec = self.popRecovery()
        self.recoveries.append(recovery)
        self.persist()

    def popRecovery(self):
        if self.recoveries:
            return self.recoveries.pop()

    def clearRecoveries(self):
        self.recoveries = []
        self.persist()

    def setManager(self, manager):
        # If need be, refactor out to "validateManager" method
        if not hasattr(manager, "queue"):
            raise se.InvalidTaskMng(unicode(manager))
        self.mng = manager

    def setCleanPolicy(self, clean):
        self.cleanPolicy = TaskCleanType(clean)

    def setPersistence(self,
                       store,
                       persistPolicy=TaskPersistType.auto,
                       cleanPolicy=TaskCleanType.auto):
        self.persistPolicy = TaskPersistType(persistPolicy)
        self.store = store
        self.setCleanPolicy(cleanPolicy)
        if self.persistPolicy != TaskPersistType.none and not self.store:
            raise se.TaskPersistError("no store defined")
        taskDir = os.path.join(self.store, self.id)
        try:
            getProcPool().fileUtils.createdir(taskDir)
        except Exception as e:
            self.log.error("Unexpected error", exc_info=True)
            raise se.TaskPersistError("%s: cannot access/create taskdir"
                                      " %s: %s" % (self, taskDir, e))
        if (self.persistPolicy == TaskPersistType.auto
                and self.state != State.init):
            self.persist()

    def setRecoveryPolicy(self, clean):
        self.recoveryPolicy = TaskRecoveryType(clean)

    def rollback(self):
        self.log.debug('(rollback): enter')
        if self.recoveryPolicy == TaskRecoveryType.none:
            self.log.debug("rollback is skipped")
            return
        if not self.isDone():
            raise se.TaskNotFinished("can't rollback in state %s" % self.state)
        self._doRecover()
        self.log.debug('(rollback): exit')

    def persist(self):
        if self.persistPolicy == TaskPersistType.none:
            return
        if not self.store:
            raise se.TaskPersistError("no store defined")
        if self.state == State.init:
            raise se.TaskStateError("can't persist in state %s" % self.state)
        self._save(self.store)

    @classmethod
    def loadTask(cls, store, taskid):
        t = Task(taskid)
        if getProcPool().os.path.exists(os.path.join(store, taskid)):
            ext = ""
        # TBD: is this the correct order (temp < backup) + should temp
        # be considered at all?
        elif getProcPool().os.path.exists(
                os.path.join(store, taskid + TEMP_EXT)):
            ext = TEMP_EXT
        elif getProcPool().os.path.exists(
                os.path.join(store, taskid + BACKUP_EXT)):
            ext = BACKUP_EXT
        else:
            raise se.TaskDirError("loadTask: no such task dir '%s/%s'" %
                                  (store, taskid))
        t._load(store, ext)
        return t

    @threadlocal_task
    def prepare(self, func, *args, **kwargs):
        message = self.error
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.preparing)
            result = None
            code = 0
            try:
                if func:
                    result = self._run(func, *args, **kwargs)
            except se.TaskAborted as e:
                self.log.info("aborting: %s", e)
                code = e.abortedcode
                message = e.value

            if self.aborting():
                self.log.debug("Prepare: aborted: %s", message)
                self._updateResult(code,
                                   "Task prepare failed: %s" % (message, ), "")
                raise self.error

            if self.jobs:
                self.log.debug("Prepare: %s jobs exist, move to acquiring",
                               self.njobs)
                self._updateState(State.acquiring)
                if self.aborting():
                    self.log.error('failed to acquire task %s', self.id)
                    raise self.error
                self.log.debug("returning")
                return dict(uuid=str(self.id))

            self.log.debug("finished: %s", result)
            self._updateResult(0, "OK", result)
            self._updateState(State.finished)
            return result
        finally:
            self._decref()

    @threadlocal_task
    def commit(self, args=None):
        self.log.debug("committing task: %s", self.id)
        try:
            self._incref()
        except se.TaskAborted:
            self._doAbort()
            return
        try:
            self._updateState(State.running)
        finally:
            self._decref()

    @contextmanager
    def abort_callback(self, callback):
        with self._abort_lock:
            if self.aborting():
                aborting = True
            else:
                aborting = False
                self._abort_callbacks.add(callback)

        if aborting:
            callback()

        try:
            yield
        finally:
            with self._abort_lock:
                self._abort_callbacks.discard(callback)

    def _execute_abort_callbacks(self):
        with self._abort_lock:
            self._aborting = True
            abort_callbacks = list(self._abort_callbacks)

        for callback in abort_callbacks:
            try:
                callback()
            except Exception:
                self.log.exception('failure running abort callback')

    def aborting(self):
        return (self._aborting or self.state == State.aborting
                or self.state == State.raborting)

    def stop(self, force=False):
        self.log.debug("stopping in state %s (force %s)", self.state, force)
        self._incref(force)
        try:
            if self.state.isDone():
                self.log.debug("Task already stopped (%s), ignoring",
                               self.state)
                return
            elif (self.state.isRecovering() and not force
                  and (self.cleanPolicy == TaskCleanType.auto)):
                self.log.debug(
                    "Task (%s) in recovery and force is false, "
                    "ignoring", self.state)
                return

            self._execute_abort_callbacks()
            self._forceAbort = force
        finally:
            self._decref(force)

    @threadlocal_task
    def recover(self, args=None):
        ''' Do not call this function while the task is actually running. this
            method should only be used to recover tasks state after
            (vdsmd) restart.
        '''
        self.log.debug('(recover): recovering: state %s', self.state)
        try:
            self._incref(force=True)
        except se.TaskAborted:
            self._doAbort(True)
            return
        try:
            if self.isDone():
                self.log.debug('(recover): task is done: state %s', self.state)
                return
            # if we are not during recover, just abort
            if self.state.canAbort():
                self.stop()
            # if we waited for recovery - keep waiting
            elif self.state == State.waitrecover:
                pass
            # if we started the recovery - restart it
            elif (self.state == State.racquiring
                  or self.state == State.recovering):
                self._updateState(State.racquiring, force=True)
            # else we were during failed recovery - abort it
            else:
                self.stop(force=True)
        finally:
            self._decref(force=True)
        self.log.debug('(recover): recovered: state %s', self.state)

    def getState(self):
        return str(self.state)

    def getInfo(self):
        return dict(id=self.id, verb=self.name)

    def deprecated_getStatus(self):
        oReturn = {}
        oReturn["taskID"] = self.id
        oReturn["taskState"] = self.state.DEPRECATED_STATE[self.state.state]
        oReturn["taskResult"] = self.state.DEPRECATED_RESULT[self.state.state]
        oReturn["code"] = self.result.code
        oReturn["message"] = self.result.message
        return oReturn

    def getStatus(self):
        oReturn = {}
        oReturn["state"] = {
            'code': self.result.code,
            'message': self.result.message
        }
        oReturn["task"] = {'id': self.id, 'state': str(self.state)}
        oReturn["result"] = self.result.result
        return oReturn

    def getDetails(self):
        return {
            "id": self.id,
            "verb": self.name,
            "state": str(self.state),
            "code": self.result.code,
            "message": self.result.message,
            "result": self.result.result,
            "tag": self.tag
        }

    def getID(self):
        return self.id

    def getTags(self):
        return self.tag

    def __str__(self):
        return str(self.id)

    # FIXME : Use StringIO and enumerate()
    def dumpTask(self):
        s = "Task: %s" % self._dump(self, Task.fields)
        i = 0
        for r in self.recoveries:
            s += " Recovery%d: %s" % (i, self._dump(r, Recovery.fields))
            i += 1
        i = 0
        for j in self.jobs:
            s += " Job%d: %s" % (i, self._dump(j, Job.fields))
            i += 1
        return s

    @misc.logskip("ResourceManager")
    def getExclusiveLock(self,
                         namespace,
                         resName,
                         timeout=config.getint(
                             'irs', 'task_resource_default_timeout')):
        self.resOwner.acquire(namespace, resName,
                              resourceManager.LockType.exclusive, timeout)

    @misc.logskip("ResourceManager")
    def getSharedLock(self,
                      namespace,
                      resName,
                      timeout=config.getint('irs',
                                            'task_resource_default_timeout')):
        self.resOwner.acquire(namespace, resName,
                              resourceManager.LockType.shared, timeout)