class FSMContext(dict):
    """ A finite state machine context instance.

    The dict portion holds the machine's context data; the machine configuration (states, queue, url,
    retry options, ...) is held in instance attributes.
    """

    def __init__(self, initialState, currentState=None, machineName=None, instanceName=None,
                 retryOptions=None, url=None, queueName=None, data=None, contextTypes=None,
                 method='GET', persistentLogging=False, obj=None, headers=None, globalTaskTarget=None,
                 useRunOnceSemaphore=True):
        """ Constructor

        @param initialState: a State instance
        @param currentState: a State instance
        @param machineName: the name of the fsm
        @param instanceName: the instance name of the fsm; generated when not supplied
        @param retryOptions: the TaskRetryOptions for the machine
        @param url: the url of the fsm
        @param queueName: the name of the appengine task queue (required)
        @param data: an optional mapping of initial key, value pairs for the context
        @param contextTypes: an optional mapping of key -> cast, applied over constants.PARAM_TYPES
        @param method: the method used for the Tasks built by this context
        @param persistentLogging: if True, use persistent _FantasmLog model
        @param obj: an object that the FSMContext can operate on
        @param headers: a dict of X-Fantasm request headers to pass along in Tasks
        @param globalTaskTarget: the machine-level target configuration parameter
        @param useRunOnceSemaphore: stored on the context; not read by this class
        """
        assert queueName
        super(FSMContext, self).__init__(data or {})
        self.initialState = initialState
        self.currentState = currentState
        self.currentAction = None
        if currentState:
            self.currentAction = currentState.exitAction
        self.machineName = machineName
        # NOTE: must come after self.machineName is set, the generated name embeds it
        self.instanceName = instanceName or self._generateUniqueInstanceName()
        self.queueName = queueName
        self.retryOptions = retryOptions
        self.url = url
        self.method = method
        self.startingEvent = None
        self.startingState = None
        self.contextTypes = constants.PARAM_TYPES.copy()
        if contextTypes:
            self.contextTypes.update(contextTypes)
        self.logger = Logger(self, obj=obj, persistentLogging=persistentLogging)
        self.__obj = obj
        self.headers = headers
        self.globalTaskTarget = globalTaskTarget
        self.useRunOnceSemaphore = useRunOnceSemaphore

        # the following is monkey-patched from handler.py for 'immediate mode'
        from google.appengine.api.taskqueue.taskqueue import Queue
        self.Queue = Queue # pylint: disable-msg=C0103

    def _generateUniqueInstanceName(self):
        """ Generates a unique instance name for this machine.

        @return: a FSMContext instanceName that is (pretty darn likely to be) unique
        """
        utcnow = datetime.datetime.utcnow()
        dateStr = utcnow.strftime('%Y%m%d%H%M%S')
        randomStr = ''.join(random.sample(constants.CHARS_FOR_RANDOM, 6))
        return '%s-%s-%s' % (self.machineName, dateStr, randomStr)

    def putTypedValue(self, key, value):
        """ Sets a value on context[key], but casts the value according to self.contextTypes.

        @param key: the context key; must have an entry in self.contextTypes (KeyError otherwise)
        @param value: the raw value, or a list of raw values which are cast element by element
        """
        # cast the value to the appropriate type TODO: should this be in FSMContext?
        cast = self.contextTypes[key]
        kwargs = {}
        if cast is simplejson.loads:
            kwargs = {'object_hook': models.decode}
        if cast is pickle.loads:
            # NOTE(review): unpickling is only safe for trusted input - confirm that the values
            #               reaching this method cannot be supplied by an untrusted caller
            value = pickle.loads(str(value))
        elif isinstance(value, list):
            value = [cast(v, **kwargs) for v in value]
        else:
            value = cast(value, **kwargs)

        # update the context
        self[key] = value

    def generateInitializationTask(self, countdown=0, taskName=None):
        """ Generates a task for initializing the machine.

        @param countdown: the number of seconds to countdown before the task fires
        @param taskName: an optional task name; defaults to self.getTaskName(FSM.PSEUDO_INIT)
        @return: a taskqueue.Task instance that has not been queued
        """
        assert self.currentState.name == FSM.PSEUDO_INIT

        url = self.buildUrl(self.currentState, FSM.PSEUDO_INIT)
        params = self.buildParams(self.currentState, FSM.PSEUDO_INIT)
        taskName = taskName or self.getTaskName(FSM.PSEUDO_INIT)
        transition = self.currentState.getTransition(FSM.PSEUDO_INIT)
        task = Task(name=taskName,
                    method=self.method,
                    url=url,
                    params=params,
                    countdown=countdown,
                    headers=self.headers,
                    retry_options=transition.retryOptions,
                    target=self.globalTaskTarget)
        return task

    def fork(self, data=None):
        """ Forks the FSMContext.

        When an FSMContext is forked, an identical copy of the finite state machine is generated
        that will have the same event dispatched to it as the machine that called .fork(). The data
        parameter is useful for allowing each forked instance to operate on a different bit of data.

        @param data: an option mapping of data to apply to the forked FSMContext
        """
        obj = self.__obj
        if obj.get(constants.FORKED_CONTEXTS_PARAM) is None:
            obj[constants.FORKED_CONTEXTS_PARAM] = []
        forkedContexts = obj.get(constants.FORKED_CONTEXTS_PARAM)
        # copy so that the caller's mapping does not pick up the fork index
        data = copy.copy(data) or {}
        data[constants.FORK_PARAM] = len(forkedContexts)
        forkedContexts.append(self.clone(updateData=data))

    def spawn(self, machineName, contexts, countdown=0, method='POST', _currentConfig=None, taskName=None):
        """ Spawns new machines.

        @param machineName the machine to spawn
        @param contexts a list of contexts (dictionaries) to spawn the new machine(s) with; multiple contexts
                        will spawn multiple machines
        @param countdown the countdown (in seconds) to wait before spawning machines
        @param method the method ('GET' or 'POST') to invoke the machine with (default: POST)
        @param _currentConfig test injection for configuration
        @param taskName used for idempotency; will become the root of the task name for the actual task queued
        """
        # using the current task name as a root to startStateMachine will make this idempotent
        taskName = taskName or self.__obj[constants.TASK_NAME_PARAM]
        startStateMachine(machineName, contexts, taskName=taskName, method=method, countdown=countdown,
                          _currentConfig=_currentConfig, headers=self.headers)

    def initialize(self):
        """ Initializes the FSMContext. Queues a Task (so that we can benefit from auto-retry) to dispatch
        an event and take the machine from 'pseudo-init' into the state machine's initial state, as
        defined in the fsm.yaml file.

        @return: an event string to dispatch to the FSMContext to put it into the initialState
        """
        self[constants.STEPS_PARAM] = 0
        task = self.generateInitializationTask()
        self.Queue(name=self.queueName).add(task)
        key = db.Key.from_path(_FantasmInstance.kind(), self.instanceName, namespace='')
        _FantasmInstance(key=key, instanceName=self.instanceName).put()

        return FSM.PSEUDO_INIT

    def dispatch(self, event, obj):
        """ The main entry point to move the machine according to an event.

        @param event: a string event to dispatch to the FSMContext
        @param obj: an object that the FSMContext can operate on
        @return: an event string to dispatch to the FSMContext
        """
        self.__obj = self.__obj or obj # hold the obj object for use during this context

        # store the starting state and event for the handleEvent() method
        self.startingState = self.currentState
        self.startingEvent = event

        nextEvent = None
        try:
            nextEvent = self.currentState.dispatch(self, event, obj)

            if obj.get(constants.FORKED_CONTEXTS_PARAM):
                # pylint: disable-msg=W0212
                # - accessing the protected method is fine here, since it is an instance of the same class
                tasks = []
                for context in obj[constants.FORKED_CONTEXTS_PARAM]:
                    context[constants.STEPS_PARAM] = int(context.get(constants.STEPS_PARAM, '0')) + 1
                    task = context.queueDispatch(nextEvent, queue=False)
                    if task: # fan-in magic
                        if not task.was_enqueued: # fan-in always queues
                            tasks.append(task)

                try:
                    if tasks:
                        transition = self.currentState.getTransition(nextEvent)
                        _queueTasks(self.Queue, transition.queueName, tasks)

                except (TaskAlreadyExistsError, TombstonedTaskError):
                    # unlike a similar block in self.continuation, this is well off the happy path
                    self.logger.critical(
                        'Unable to queue fork Tasks %s as it/they already exists. (Machine %s, State %s)',
                        [task.name for task in tasks if not task.was_enqueued],
                        self.machineName,
                        self.currentState.name)

            if nextEvent:
                self[constants.STEPS_PARAM] = int(self.get(constants.STEPS_PARAM, '0')) + 1

                try:
                    self.queueDispatch(nextEvent)

                except (TaskAlreadyExistsError, TombstonedTaskError):
                    # unlike a similar block in self.continuation, this is well off the happy path
                    #
                    # FIXME: when this happens, it means there was failure shortly after queuing the Task, or
                    #        possibly even with queuing the Task. when this happens there is a chance that
                    #        two states in the machine are executing simultaneously, which may or may not
                    #        be a good thing, depending on what each state does. gracefully handling this
                    #        exception at least means that this state will terminate.
                    self.logger.critical('Unable to queue next Task as it already exists. (Machine %s, State %s)',
                                         self.machineName,
                                         self.currentState.name)

            else:
                # if we're not in a final state, emit a log message
                # FIXME - somehow we should avoid this message if we're in the "last" step of a continuation...
                if not self.currentState.isFinalState and not obj.get(constants.TERMINATED_PARAM):
                    # the arguments are handed to the logger like every other log call in this class
                    self.logger.critical('Non-final state did not emit an event. Machine has terminated in an '
                                         'unknown state. (Machine %s, State %s)',
                                         self.machineName, self.currentState.name)
                # if it is a final state, then dispatch the pseudo-final event to finalize the state machine
                elif self.currentState.isFinalState and self.currentState.exitAction:
                    self[constants.STEPS_PARAM] = int(self.get(constants.STEPS_PARAM, '0')) + 1
                    self.queueDispatch(FSM.PSEUDO_FINAL)

        except Exception:
            self.logger.exception("FSMContext.dispatch is handling the following exception:")
            self._handleException(event, obj)

        return nextEvent

    def continuation(self, nextToken):
        """ Performs a continuation by re-queueing an FSMContext Task with a slightly modified continuation
        token. self.startingState and self.startingEvent are used in the re-queue, so this can be seen as a
        'fork' of the current context.

        @param nextToken: the next continuation token
        """
        assert not self.get(constants.INDEX_PARAM) # fan-out after fan-in is not allowed
        step = str(self[constants.STEPS_PARAM]) # needs to be a str key into a json dict

        # make a copy and set the currentState to the startingState of this context
        context = self.clone()
        context.currentState = self.startingState

        # update the generation and continuation params
        gen = context.get(constants.GEN_PARAM, {})
        gen[step] = gen.get(step, 0) + 1
        context[constants.GEN_PARAM] = gen
        context[constants.CONTINUATION_PARAM] = nextToken

        try:
            # pylint: disable-msg=W0212
            # - accessing the protected method is fine here, since it is an instance of the same class
            transition = self.startingState.getTransition(self.startingEvent)
            context._queueDispatchNormal(self.startingEvent, queue=True, queueName=transition.queueName,
                                         retryOptions=transition.retryOptions,
                                         taskTarget=transition.taskTarget)

        except (TaskAlreadyExistsError, TombstonedTaskError):
            # this can happen when currentState.dispatch() previously succeeded in queueing the continuation
            # Task, but failed with the doAction.execute() call in a _previous_ execution of this Task.
            # NOTE: this prevent the dreaded "fork bomb"
            self.logger.info('Unable to queue continuation Task as it already exists. (Machine %s, State %s)',
                             self.machineName,
                             self.currentState.name)

    def queueDispatch(self, nextEvent, queue=True):
        """ Queues a .dispatch(nextEvent) call in the appengine Task queue.

        @param nextEvent: a string event
        @param queue: a boolean indicating whether or not to queue a Task, or leave it to the caller
                      (ignored when the transition's target is a fan-in state)
        @return: a taskqueue.Task instance which may or may not have been queued already
        """
        assert nextEvent is not None

        # self.currentState is already transitioned away from self.startingState
        transition = self.currentState.getTransition(nextEvent)
        # a queue name carried on the request object wins over the transition's configured queue
        queueName = self.__obj.get(constants.QUEUE_NAME_PARAM) or transition.queueName
        if transition.target.isFanIn:
            task = self._queueDispatchFanIn(nextEvent, fanInPeriod=transition.target.fanInPeriod,
                                            retryOptions=transition.retryOptions,
                                            queueName=queueName,
                                            taskTarget=transition.taskTarget)
        else:
            task = self._queueDispatchNormal(nextEvent, queue=queue, countdown=transition.countdown,
                                             retryOptions=transition.retryOptions,
                                             queueName=queueName,
                                             taskTarget=transition.taskTarget)

        return task

    def _queueDispatchNormal(self, nextEvent, queue=True, countdown=0, retryOptions=None, queueName=None,
                             taskTarget=None):
        """ Queues a call to .dispatch(nextEvent) in the appengine Task queue.

        @param nextEvent: a string event
        @param queue: a boolean indicating whether or not to queue a Task, or leave it to the caller
        @param countdown: the number of seconds to countdown before the queued task fires
        @param retryOptions: the RetryOptions for the task
        @param queueName: the queue name to Queue into
        @param taskTarget: the task target parameter
        @return: a taskqueue.Task instance which may or may not have been queued already
        """
        assert nextEvent is not None
        assert queueName

        url = self.buildUrl(self.currentState, nextEvent)
        params = self.buildParams(self.currentState, nextEvent)
        taskName = self.getTaskName(nextEvent)

        task = Task(name=taskName, method=self.method, url=url, params=params, countdown=countdown,
                    retry_options=retryOptions, headers=self.headers, target=taskTarget)
        if queue:
            self.Queue(name=queueName).add(task)
            if not task.was_enqueued:
                self.logger.critical('Task "%s" was not enqueued.', taskName)

        return task

    def _queueDispatchFanIn(self, nextEvent, fanInPeriod=0, retryOptions=None, queueName=None,
                            taskTarget=None):
        """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the
        datastore for processing by the queued .dispatch(nextEvent)

        @param nextEvent: a string event
        @param fanInPeriod: the period of time between fan in Tasks
        @param retryOptions: the RetryOptions for the task
        @param queueName: the queue name to Queue into
        @param taskTarget: the task target parameter
        @return: a taskqueue.Task instance which has been queued, or None when a Task of the same
                 name already exists
        """
        assert nextEvent is not None
        assert not self.get(constants.INDEX_PARAM) # fan-in after fan-in is not allowed
        assert queueName

        # we pop this off here because we do not want the fan-out/continuation param as part of the
        # task name, otherwise we lose the fan-in - each fan-in gets one work unit.
        self.pop(constants.GEN_PARAM, None)
        fork = self.pop(constants.FORK_PARAM, None)

        # transfer the fan-in-group into the context (under a fixed value key) so that states beyond
        # the fan-in get unique Task names
        # FIXME: this will likely change once we formalize what to do post fan-in
        transition = self.currentState.getTransition(nextEvent)
        if self.get(transition.target.fanInGroup) is not None:
            self[constants.FAN_IN_GROUP_PARAM] = self[transition.target.fanInGroup]

        taskNameBase = self.getTaskName(nextEvent, fanIn=True)
        rwlock = ReadWriteLock(taskNameBase, self)
        index = rwlock.currentIndex()

        # (***)
        #
        # grab the lock - memcache.incr()
        #
        # on Task retry, multiple incr() calls are possible. possible ways to handle:
        #
        # 1. release the lock in a 'finally' clause, but then risk missing a work
        #    package because acquiring the read lock will succeed even though the
        #    work package was not written yet.
        #
        # 2. allow the lock to get too high. the fan-in logic attempts to wait for
        #    work packages across multiple-retry attempts, so this seems like the
        #    best option. we basically trade a bit of latency in fan-in for reliability.
        #
        rwlock.acquireWriteLock(index, nextEvent=nextEvent)

        # insert the work package, which is simply a serialized FSMContext
        workIndex = '%s-%d' % (taskNameBase, knuthHash(index))

        # on retry, we want to ensure we get the same work index for this task
        actualTaskName = self.__obj[constants.TASK_NAME_PARAM]
        # NOTE(review): '+' binds tighter than 'or', so the trailing 'or None' can never apply here
        #               (the string always starts with 'workIndex-'); left as-is to keep the key names
        # NOTE(review): 'if i' also drops a fork index of 0, so fork 0 gets the same key name as an
        #               un-forked context - confirm this is intended
        indexKeyName = 'workIndex-' + '-'.join([str(i) for i in [actualTaskName, fork] if i]) or None
        semaphore = RunOnceSemaphore(indexKeyName, self)

        # check if the workIndex changed during retry
        semaphoreWritten = False
        if self.__obj[constants.RETRY_COUNT_PARAM] > 0:
            # see comment (A) below
            time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)
            payload = semaphore.readRunOnceSemaphore(payload=workIndex, transactional=False)
            if payload:
                semaphoreWritten = True
                if payload != workIndex:
                    self.logger.info("Work index changed from '%s' to '%s' on retry.", payload, workIndex)
                    workIndex = payload

        # update this here so it gets written down into the work package too
        self[constants.INDEX_PARAM] = index

        # write down two models, one actual work package, one idempotency package
        keyName = '-'.join([str(i) for i in [actualTaskName, fork] if i]) or None
        key = db.Key.from_path(_FantasmFanIn.kind(), keyName, namespace='')
        work = _FantasmFanIn(context=self, workIndex=workIndex, key=key)

        # close enough to idempotent, but could still write only one of the entities
        # FIXME: could be made faster using a bulk put, but this interface is cleaner
        if not semaphoreWritten:
            semaphore.writeRunOnceSemaphore(payload=workIndex, transactional=False)

        # put the work item
        db.put(work)

        # (A) now the datastore is asynchronously writing the indices, so the work package may
        #     not show up in a query for a period of time. there is a corresponding time.sleep()
        #     in the fan-in of self.mergeJoinDispatch(...)

        # release the lock - memcache.decr()
        rwlock.releaseWriteLock(index)

        try:
            # insert a task to run in the future and process a bunch of work packages
            now = time.time()
            url = self.buildUrl(self.currentState, nextEvent)
            params = self.buildParams(self.currentState, nextEvent)
            taskName = '%s-%d' % (taskNameBase, index)
            task = Task(name=taskName,
                        method=self.method,
                        url=url,
                        params=params,
                        eta=datetime.datetime.utcfromtimestamp(now) + datetime.timedelta(seconds=fanInPeriod),
                        headers=self.headers,
                        retry_options=retryOptions,
                        target=taskTarget)
            self.Queue(name=queueName).add(task)
            if not task.was_enqueued:
                self.logger.critical('Task "%s" was not enqueued.', taskName)
            return task

        except (TaskAlreadyExistsError, TombstonedTaskError):
            # Fan-in magic: the fan-in Task for this index is already queued, so there is nothing
            # to hand back and the method returns None (callers check the result for truthiness)
            pass

    def mergeJoinDispatch(self, event, obj):
        """ Performs a merge join on the pending fan-in dispatches.

        @param event: an event that is being merge joined (destination state must be a fan in)
        @param obj: an object that the FSMContext can operate on
        @return: a list (possibly empty) of FSMContext instances
        """
        # this assertion comes from _queueDispatchFanIn - we never want fan-out info in a fan-in context
        assert not self.get(constants.GEN_PARAM)
        assert not self.get(constants.FORK_PARAM)

        # the work package index is stored in the url of the Task/FSMContext
        index = self.get(constants.INDEX_PARAM)
        self.logger.debug('Index: %s', index)
        taskNameBase = self.getTaskName(event, fanIn=True)

        # see comment (***) in self._queueDispatchFanIn
        #
        # in the case of failing to acquire a read lock (due to failed release of write lock)
        # we have decided to keep retrying
        raiseOnFail = False
        taskRetryLimit = self._getTaskRetryLimit()
        if taskRetryLimit is not None:
            raiseOnFail = (taskRetryLimit > self.__obj[constants.RETRY_COUNT_PARAM])

        rwlock = ReadWriteLock(taskNameBase, self)
        rwlock.acquireReadLock(index, raiseOnFail=raiseOnFail)

        # and return the FSMContexts list
        class FSMContextList(list):
            """ A list that supports .logger.info(), .logger.warning() etc.for fan-in actions """
            def __init__(self, context, contexts, guarded=False):
                """ setup a self.logger for fan-in actions """
                super(FSMContextList, self).__init__(contexts)
                self.logger = Logger(context)
                self.instanceName = context.instanceName
                self.guarded = guarded

        # see comment (A) in self._queueDispatchFanIn(...)
        time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)

        # the following step ensure that fan-in only ever operates one time over a list of data
        # the entity is created in State.dispatch(...) _after_ all the actions have executed
        # successfully
        khash = knuthHash(index)
        self.logger.debug('knuthHash of index: %s', khash)
        workIndex = '%s-%d' % (taskNameBase, khash)
        if obj[constants.RETRY_COUNT_PARAM] > 0:
            semaphore = RunOnceSemaphore(workIndex, self)
            if semaphore.readRunOnceSemaphore(payload=self.__obj[constants.TASK_NAME_PARAM]):
                self.logger.info("Fan-in idempotency guard for workIndex '%s', not processing any work items.",
                                 workIndex)
                return FSMContextList(self, [], guarded=True) # don't operate over the data again

        # fetch all the work packages in the current group for processing
        query = _FantasmFanIn.all(namespace='') \
                             .filter('workIndex =', workIndex) \
                             .order('__key__')

        # construct a list of FSMContexts
        contexts = [self.clone(replaceData=r.context) for r in query]
        return FSMContextList(self, contexts)

    def _getTaskRetryLimit(self):
        """ Method that returns the maximum number of retries for this particular dispatch

        The limit is read from the retry options of the transition for (startingState, startingEvent).
        The machine-level retry options are used when that transition is unknown, or when it carries
        no retry options.

        @return: the task_retry_limit, or None when no retry options are configured
        """
        # get task_retry_limit configuration
        try:
            transition = self.startingState.getTransition(self.startingEvent)
            retryOptions = transition.retryOptions
        except UnknownEventError:
            # can't find the transition, use the machine-level default
            retryOptions = self.retryOptions
        if retryOptions is None:
            retryOptions = self.retryOptions
        # retry options are optional everywhere; this runs while an exception is being handled
        # (see _handleException), so it must not raise and mask that exception
        return getattr(retryOptions, 'task_retry_limit', None)

    def _handleException(self, event, obj):
        """ Method for child classes to override to handle exceptions.

        Logs the failure and re-raises the exception currently being handled, so this must be
        called from within an except block.

        @param event: a string event
        @param obj: an object that the FSMContext can operate on
        """
        retryCount = obj.get(constants.RETRY_COUNT_PARAM, 0)
        taskRetryLimit = self._getTaskRetryLimit()

        # 'is not None' rather than truthiness: a limit of 0 means no retries at all
        if taskRetryLimit is not None and retryCount >= taskRetryLimit:
            # need to permanently fail
            self.logger.critical('Max-requeues reached. Machine has terminated in an unknown state. ' +
                                 '(Machine %s, State %s, Event %s)',
                                 self.machineName, self.startingState.name, event, exc_info=True)
            # re-raise, letting App Engine TaskRetryOptions kill the task
            raise
        else:
            # re-raise the exception
            self.logger.warning('Exception occurred processing event. Task will be retried. ' +
                                '(Machine %s, State %s)',
                                self.machineName, self.startingState.name, exc_info=True)

            # this line really just allows unit tests to work - the request is really dead at this point
            self.currentState = self.startingState

            raise

    def buildUrl(self, state, event):
        """ Builds the taskqueue url.

        @param state: the State to dispatch to
        @param event: the event to dispatch
        @return: a url that can be used to build a taskqueue.Task instance to .dispatch(event)
        """
        assert state and event
        return self.url + '%s/%s/%s/' % (state.name,
                                         event,
                                         state.getTransition(event).target.name)

    def buildParams(self, state, event):
        """ Builds the taskqueue params.

        @param state: the State to dispatch to
        @param event: the event to dispatch
        @return: a dict suitable to use in constructing a url (GET) or using as params (POST)
        """
        assert state and event
        params = {constants.STATE_PARAM: state.name,
                  constants.EVENT_PARAM: event,
                  constants.INSTANCE_NAME_PARAM: self.instanceName}
        for key, value in self.items():
            if key not in constants.NON_CONTEXT_PARAMS:
                # serialize according to the configured context type first
                if self.contextTypes.get(key) is simplejson.loads:
                    value = simplejson.dumps(value, cls=models.Encoder)
                if self.contextTypes.get(key) is pickle.loads:
                    value = pickle.dumps(value)
                if isinstance(value, datetime.datetime):
                    value = str(int(time.mktime(value.utctimetuple())))
                if isinstance(value, dict):
                    # FIXME: should we issue a warning that they should update fsm.yaml?
                    value = simplejson.dumps(value, cls=models.Encoder)
                # anything that is still not a string (or a list/tuple of strings) needs a context type
                if isinstance(value, (list, tuple)):
                    valueIsNotBasestring = any(not isinstance(v, basestring) for v in value)
                else:
                    valueIsNotBasestring = not isinstance(value, basestring)
                if valueIsNotBasestring and key not in self.contextTypes:
                    self.logger.warning("Attempting to put an object in the FSMContext without specifying an "
                                        "entry for key '%s' in 'context_types' in the yaml for machineName '%s'. "
                                        "There will likely be conversion issues (ie. booleans turned into "
                                        "strings).", key, self.machineName)
                if isinstance(value, (list, tuple)) and len(value) == 1:
                    key = key + '[]' # used to preserve lists of length=1 - see handler.py for inverse
                params[key] = value
        return params

    def getTaskName(self, nextEvent, instanceName=None, fanIn=False):
        """ Returns a task name that is unique for a specific dispatch

        @param nextEvent: the event to dispatch
        @param instanceName: an optional instance name to use instead of self.instanceName
        @param fanIn: if True, the work index is left out of the task name
        @return: a task name that can be used to build a taskqueue.Task instance to .dispatch(nextEvent)
        """
        transition = self.currentState.getTransition(nextEvent)
        parts = []
        parts.append(instanceName or self.instanceName)

        if self.get(constants.GEN_PARAM):
            for (step, gen) in self[constants.GEN_PARAM].items():
                parts.append('continuation-%s-%s' % (step, gen))
        if self.get(constants.FORK_PARAM):
            parts.append('fork-' + str(self[constants.FORK_PARAM]))
        # post-fan-in we need to store the workIndex in the task name to avoid duplicates, since
        # we popped the generation off during fan-in
        # FIXME: maybe not pop the generation in fan-in?
        # FIXME: maybe store this in the instanceName?
        # FIXME: i wish this was easier to get right :-)
        if (not fanIn) and self.get(constants.INDEX_PARAM):
            parts.append('work-index-' + str(self[constants.INDEX_PARAM]))
        parts.append(self.currentState.name)
        parts.append(nextEvent)
        parts.append(transition.target.name)
        parts.append('step-' + str(self[constants.STEPS_PARAM]))
        if self.get(constants.FAN_IN_GROUP_PARAM) is not None:
            parts.append('group-' + str(self[constants.FAN_IN_GROUP_PARAM]))
        return '--'.join(parts)

    def clone(self, instanceName=None, updateData=None, replaceData=None):
        """ Returns a copy of the FSMContext.

        The attributes of the context are shallow copied, the dict portion is deep copied.

        @param instanceName: the instance name to optionally apply to the clone
        @param updateData: a dict/mapping of data to optionally apply (.update()) to the clone
        @param replaceData: a dict/mapping of data to optionally apply (.clear()/.update()) to the clone
        @return: a new FSMContext instance
        """
        assert (not updateData) or (not replaceData), "cannot update and replace data at the same time"
        # shallow copy the context
        context = copy.copy(self)
        # deepcopy the dictionary portion of the context
        deepcopy_dict = copy.deepcopy(dict(self))
        context.update(deepcopy_dict)
        if instanceName:
            context.instanceName = instanceName
        if updateData:
            context.update(updateData)
        if replaceData:
            context.clear()
            context.update(replaceData)
        return context
class FSMContext(dict): """ A finite state machine context instance. """ def __init__(self, initialState, currentState=None, machineName=None, instanceName=None, retryOptions=None, url=None, queueName=None, data=None, contextTypes=None, method='GET', persistentLogging=False, obj=None, headers=None): """ Constructor @param initialState: a State instance @param currentState: a State instance @param machineName: the name of the fsm @param instanceName: the instance name of the fsm @param retryOptions: the TaskRetryOptions for the machine @param url: the url of the fsm @param queueName: the name of the appengine task queue @param headers: a dict of X-Fantasm request headers to pass along in Tasks @param persistentLogging: if True, use persistent _FantasmLog model @param obj: an object that the FSMContext can operate on """ assert queueName super(FSMContext, self).__init__(data or {}) self.initialState = initialState self.currentState = currentState self.currentAction = None if currentState: self.currentAction = currentState.exitAction self.machineName = machineName self.instanceName = instanceName or self._generateUniqueInstanceName() self.queueName = queueName self.retryOptions = retryOptions self.url = url self.method = method self.startingEvent = None self.startingState = None self.contextTypes = constants.PARAM_TYPES.copy() if contextTypes: self.contextTypes.update(contextTypes) self.logger = Logger(self, obj=obj, persistentLogging=persistentLogging) self.__obj = obj self.headers = headers # the following is monkey-patched from handler.py for 'immediate mode' from google.appengine.api.taskqueue.taskqueue import Queue self.Queue = Queue # pylint: disable-msg=C0103 def _generateUniqueInstanceName(self): """ Generates a unique instance name for this machine. 
@return: a FSMContext instanceName that is (pretty darn likely to be) unique """ utcnow = datetime.datetime.utcnow() dateStr = utcnow.strftime('%Y%m%d%H%M%S') randomStr = ''.join(random.sample(constants.CHARS_FOR_RANDOM, 6)) return '%s-%s-%s' % (self.machineName, dateStr, randomStr) def putTypedValue(self, key, value): """ Sets a value on context[key], but casts the value according to self.contextTypes. """ # cast the value to the appropriate type TODO: should this be in FSMContext? cast = self.contextTypes[key] kwargs = {} if cast is simplejson.loads: kwargs = {'object_hook': models.decode} if isinstance(value, list): value = [cast(v, **kwargs) for v in value] else: value = cast(value, **kwargs) # update the context self[key] = value def generateInitializationTask(self, countdown=0, taskName=None): """ Generates a task for initializing the machine. """ assert self.currentState.name == FSM.PSEUDO_INIT url = self.buildUrl(self.currentState, FSM.PSEUDO_INIT) params = self.buildParams(self.currentState, FSM.PSEUDO_INIT) taskName = taskName or self.getTaskName(FSM.PSEUDO_INIT) task = Task(name=taskName, method=self.method, url=url, params=params, countdown=countdown, headers=self.headers) return task def fork(self, data=None): """ Forks the FSMContext. When an FSMContext is forked, an identical copy of the finite state machine is generated that will have the same event dispatched to it as the machine that called .fork(). The data parameter is useful for allowing each forked instance to operate on a different bit of data. 
@param data: an option mapping of data to apply to the forked FSMContext """ obj = self.__obj if obj.get(constants.FORKED_CONTEXTS_PARAM) is None: obj[constants.FORKED_CONTEXTS_PARAM] = [] forkedContexts = obj.get(constants.FORKED_CONTEXTS_PARAM) data = copy.copy(data) or {} data[constants.FORK_PARAM] = len(forkedContexts) forkedContexts.append(self.clone(data=data)) def spawn(self, machineName, contexts, countdown=0, method='POST', _currentConfig=None): """ Spawns new machines. @param machineName the machine to spawn @param contexts a list of contexts (dictionaries) to spawn the new machine(s) with; multiple contexts will spawn multiple machines @param countdown the countdown (in seconds) to wait before spawning machines @param method the method ('GET' or 'POST') to invoke the machine with (default: POST) @param _currentConfig test injection for configuration """ # using the current task name as a root to startStateMachine will make this idempotent taskName = self.__obj[constants.TASK_NAME_PARAM] startStateMachine(machineName, contexts, taskName=taskName, method=method, countdown=countdown, _currentConfig=_currentConfig) def initialize(self): """ Initializes the FSMContext. Queues a Task (so that we can benefit from auto-retry) to dispatch an event and take the machine from 'pseudo-init' into the state machine's initial state, as defined in the fsm.yaml file. @param data: a dict of initial key, value pairs to stuff into the FSMContext @return: an event string to dispatch to the FSMContext to put it into the initialState """ self[constants.STEPS_PARAM] = 0 task = self.generateInitializationTask() self.Queue(name=self.queueName).add(task) _FantasmInstance(key_name=self.instanceName, instanceName=self.instanceName).put() return FSM.PSEUDO_INIT def dispatch(self, event, obj): """ The main entry point to move the machine according to an event. 
@param event: a string event to dispatch to the FSMContext @param obj: an object that the FSMContext can operate on @return: an event string to dispatch to the FSMContext """ self.__obj = self.__obj or obj # hold the obj object for use during this context # store the starting state and event for the handleEvent() method self.startingState = self.currentState self.startingEvent = event nextEvent = None try: nextEvent = self.currentState.dispatch(self, event, obj) if obj.get(constants.FORKED_CONTEXTS_PARAM): # pylint: disable-msg=W0212 # - accessing the protected method is fine here, since it is an instance of the same class tasks = [] for context in obj[constants.FORKED_CONTEXTS_PARAM]: context[constants.STEPS_PARAM] = int(context.get(constants.STEPS_PARAM, '0')) + 1 task = context.queueDispatch(nextEvent, queue=False) if task: # fan-in magic if not task.was_enqueued: # fan-in always queues tasks.append(task) try: if tasks: transition = self.currentState.getTransition(nextEvent) self.Queue(name=transition.queueName).add(tasks) except (TaskAlreadyExistsError, TombstonedTaskError): # unlike a similar block in self.continutation, this is well off the happy path self.logger.critical( 'Unable to queue fork Tasks %s as it/they already exists. (Machine %s, State %s)', [task.name for task in tasks if not task.was_enqueued], self.machineName, self.currentState.name) if nextEvent: self[constants.STEPS_PARAM] = int(self.get(constants.STEPS_PARAM, '0')) + 1 try: self.queueDispatch(nextEvent) except (TaskAlreadyExistsError, TombstonedTaskError): # unlike a similar block in self.continutation, this is well off the happy path # # FIXME: when this happens, it means there was failure shortly after queuing the Task, or # possibly even with queuing the Task. when this happens there is a chance that # two states in the machine are executing simultaneously, which is may or may not # be a good thing, depending on what each state does. 
gracefully handling this # exception at least means that this state will terminate. self.logger.critical('Unable to queue next Task as it already exists. (Machine %s, State %s)', self.machineName, self.currentState.name) else: # if we're not in a final state, emit a log message # FIXME - somehow we should avoid this message if we're in the "last" step of a continuation... if not self.currentState.isFinalState and not obj.get(constants.TERMINATED_PARAM): self.logger.critical('Non-final state did not emit an event. Machine has terminated in an ' + 'unknown state. (Machine %s, State %s)' % (self.machineName, self.currentState.name)) # if it is a final state, then dispatch the pseudo-final event to finalize the state machine elif self.currentState.isFinalState and self.currentState.exitAction: self[constants.STEPS_PARAM] = int(self.get(constants.STEPS_PARAM, '0')) + 1 self.queueDispatch(FSM.PSEUDO_FINAL) except Exception: self.logger.exception("FSMContext.dispatch is handling the following exception:") self._handleException(event, obj) return nextEvent def continuation(self, nextToken): """ Performs a continuation be re-queueing an FSMContext Task with a slightly modified continuation token. self.startingState and self.startingEvent are used in the re-queue, so this can be seen as a 'fork' of the current context. 
@param nextToken: the next continuation token """ assert not self.get(constants.INDEX_PARAM) # fan-out after fan-in is not allowed step = str(self[constants.STEPS_PARAM]) # needs to be a str key into a json dict # make a copy and set the currentState to the startingState of this context context = self.clone() context.currentState = self.startingState # update the generation and continuation params gen = context.get(constants.GEN_PARAM, {}) gen[step] = gen.get(step, 0) + 1 context[constants.GEN_PARAM] = gen context[constants.CONTINUATION_PARAM] = nextToken try: # pylint: disable-msg=W0212 # - accessing the protected method is fine here, since it is an instance of the same class transition = self.startingState.getTransition(self.startingEvent) context._queueDispatchNormal(self.startingEvent, queue=True, queueName=transition.queueName) except (TaskAlreadyExistsError, TombstonedTaskError): # this can happen when currentState.dispatch() previously succeeded in queueing the continuation # Task, but failed with the doAction.execute() call in a _previous_ execution of this Task. # NOTE: this prevent the dreaded "fork bomb" self.logger.info('Unable to queue continuation Task as it already exists. (Machine %s, State %s)', self.machineName, self.currentState.name) def queueDispatch(self, nextEvent, queue=True): """ Queues a .dispatch(nextEvent) call in the appengine Task queue. 
@param nextEvent: a string event @param queue: a boolean indicating whether or not to queue a Task, or leave it to the caller @return: a taskqueue.Task instance which may or may not have been queued already """ assert nextEvent is not None # self.currentState is already transitioned away from self.startingState transition = self.currentState.getTransition(nextEvent) if transition.target.isFanIn: task = self._queueDispatchFanIn(nextEvent, fanInPeriod=transition.target.fanInPeriod, queueName=transition.queueName) else: task = self._queueDispatchNormal(nextEvent, queue=queue, countdown=transition.countdown, retryOptions=transition.retryOptions, queueName=transition.queueName) return task def _queueDispatchNormal(self, nextEvent, queue=True, countdown=0, retryOptions=None, queueName=None): """ Queues a call to .dispatch(nextEvent) in the appengine Task queue. @param nextEvent: a string event @param queue: a boolean indicating whether or not to queue a Task, or leave it to the caller @param countdown: the number of seconds to countdown before the queued task fires @param retryOptions: the RetryOptions for the task @param queueName: the queue name to Queue into @return: a taskqueue.Task instance which may or may not have been queued already """ assert nextEvent is not None assert queueName url = self.buildUrl(self.currentState, nextEvent) params = self.buildParams(self.currentState, nextEvent) taskName = self.getTaskName(nextEvent) task = Task(name=taskName, method=self.method, url=url, params=params, countdown=countdown, retry_options=retryOptions, headers=self.headers) if queue: self.Queue(name=queueName).add(task) return task def _queueDispatchFanIn(self, nextEvent, fanInPeriod=0, queueName=None): """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the datastore for processing by the queued .dispatch(nextEvent) @param nextEvent: a string event @param fanInPeriod: the period of time between fan in Tasks @param queueName: the queue name 
to Queue into @return: a taskqueue.Task instance which may or may not have been queued already """ assert nextEvent is not None assert not self.get(constants.INDEX_PARAM) # fan-in after fan-in is not allowed assert queueName # we pop this off here because we do not want the fan-out/continuation param as part of the # task name, otherwise we loose the fan-in - each fan-in gets one work unit. self.pop(constants.GEN_PARAM, None) self.pop(constants.FORK_PARAM, None) taskNameBase = self.getTaskName(nextEvent, fanIn=True) index = memcache.get('index-' + taskNameBase) if index is None: # using 'random.randint' here instead of '1' helps when the index is ejected from memcache # instead of restarting at the same counter, we jump (likely) far way from existing task job # names. memcache.add('index-' + taskNameBase, random.randint(1, 2**32)) index = memcache.get('index-' + taskNameBase) # grab the lock lock = '%s-lock-%d' % (taskNameBase, index) writers = memcache.incr(lock, initial_value=2**16) if writers < 2**16: memcache.decr(lock) # this will escape as a 500 error and the Task will be re-tried by appengine raise FanInWriteLockFailureRuntimeError(nextEvent, self.machineName, self.currentState.name, self.instanceName) # insert the work package, which is simply a serialized FSMContext workIndex = '%s-%d' % (taskNameBase, knuthHash(index)) work = _FantasmFanIn(context=self, workIndex=workIndex) work.put() # insert a task to run in the future and process a bunch of work packages now = time.time() try: self[constants.INDEX_PARAM] = index url = self.buildUrl(self.currentState, nextEvent) params = self.buildParams(self.currentState, nextEvent) # int(now / (fanInPeriod - 1 + 30)) included because it was in [2], but is less needed now that # we use random.randint in seeding memcache. for long fan in periods, and the case where random.randint # hits the same value twice, this may cause problems for up to fanInPeriod + 30s. 
# see: http://www.mail-archive.com/[email protected]/msg30408.html task = Task(name='%s-%d-%d' % (taskNameBase, int(now / (fanInPeriod - 1 + 30)), index), method=self.method, url=url, params=params, eta=datetime.datetime.utcfromtimestamp(now) + datetime.timedelta(seconds=fanInPeriod), headers=self.headers) self.Queue(name=queueName).add(task) return task except (TaskAlreadyExistsError, TombstonedTaskError): pass # Fan-in magic finally: memcache.decr(lock) def mergeJoinDispatch(self, event, obj): """ Performs a merge join on the pending fan-in dispatches. @param event: an event that is being merge joined (destination state must be a fan in) @return: a list (possibly empty) of FSMContext instances """ # this assertion comes from _queueDispatchFanIn - we never want fan-out info in a fan-in context assert not self.get(constants.GEN_PARAM) assert not self.get(constants.FORK_PARAM) # the work package index is stored in the url of the Task/FSMContext index = self.get(constants.INDEX_PARAM) taskNameBase = self.getTaskName(event, fanIn=True) # tell writers to use another index memcache.incr('index-' + taskNameBase) lock = '%s-lock-%d' % (taskNameBase, index) memcache.decr(lock, 2**15) # tell writers they missed the boat # 20 iterations * 0.25s = 5s total wait time busyWaitIters = 20 busyWaitIterSecs = 0.250 # busy wait for writers for i in xrange(busyWaitIters): counter = memcache.get(lock) # counter is None --> ejected from memcache # int(counter) <= 2**15 --> writers have all called memcache.decr if counter is None or int(counter) <= 2**15: break time.sleep(busyWaitIterSecs) self.logger.debug("Tried to acquire lock '%s' %d times...", lock, i + 1) # FIXME: is there anything else that can be done? will work packages be lost? maybe queue another task # to sweep up later? if i >= (busyWaitIters - 1): # pylint: disable-msg=W0631 self.logger.error("Gave up waiting for all fan-in work items.") # at this point we could have two tasks trying to process the same work packages. 
in the # happy path this will not likely happen because the tasks are sent off with different ETAs, # however in the unhappy path, it is possible for multiple tasks to be executing (retry on # 500 etc.). we solve this with a read lock using memcache. # # FIXME: would using a transaction on db.delete work if using ancestors? one task would win the # race to delete the the work based on a transaction error? readlock = '%s-readlock-%d' % (taskNameBase, index) haveReadLock = False try: # put the actual name of the winning task into to lock actualTaskName = self.get(constants.TASK_NAME_PARAM) added = memcache.add(readlock, actualTaskName, time=30) # FIXME: is 30s appropriate? lockValue = memcache.get(readlock) # and return the FSMContexts list class FSMContextList(list): """ A list that supports .logger.info(), .logger.warning() etc.for fan-in actions """ def __init__(self, context, contexts): """ setup a self.logger for fan-in actions """ super(FSMContextList, self).__init__(contexts) self.logger = Logger(context) self.instanceName = context.instanceName # if the lock value is not equal to the added value, it means this task lost the race if not added or lockValue != actualTaskName: return FSMContextList(self, []) # raise FanInReadLockFailureRuntimeError(event, # self.machineName, # self.currentState.name, # self.instanceName) # flag used in finally block to decide whether or not to log an error message haveReadLock = True # fetch all the work packages in the current group for processing workIndex = '%s-%d' % (taskNameBase, knuthHash(index)) query = _FantasmFanIn.all() \ .filter('workIndex =', workIndex) \ .order('__key__') # iterate over the query to fetch results - this is done in 'small batches' fanInResults = list(query) # construct a list of FSMContexts contexts = [self.clone(data=r.context) for r in fanInResults] # hold the fanInResult around in case we need to re-put them (on an Exception) obj[constants.FAN_IN_RESULTS_PARAM] = fanInResults # and delete the work 
packages - bearing in mind appengine limits maxDeleteSize = 250 # appengine does not like to delete > 500 models at a time, 250 is a nice safe number if len(fanInResults) > maxDeleteSize: self.logger.warning("%d contexts in the current batch. Consider decreasing fan-in.", len(fanInResults)) i = 0 while fanInResults[i:i+maxDeleteSize]: db.delete(fanInResults[i:i+maxDeleteSize]) i += maxDeleteSize return FSMContextList(self, contexts) finally: deleted = memcache.delete(readlock) # FIXME: is there anything else that can be done? if haveReadLock and deleted == memcache.DELETE_NETWORK_FAILURE: self.logger.error("Unable to release the fan in read lock.") def _getTaskRetryLimit(self): """ Method that returns the maximum number of retries for this particular dispatch @param obj: an object that the FSMContext can operate on """ # get task_retry_limit configuration try: transition = self.startingState.getTransition(self.startingEvent) taskRetryLimit = transition.retryOptions.task_retry_limit except UnknownEventError: # can't find the transition, use the machine-level default taskRetryLimit = self.retryOptions.task_retry_limit return taskRetryLimit def _handleException(self, event, obj): """ Method for child classes to override to handle exceptions. @param event: a string event @param obj: an object that the FSMContext can operate on """ retryCount = obj.get(constants.RETRY_COUNT_PARAM, 0) taskRetryLimit = self._getTaskRetryLimit() if taskRetryLimit and retryCount >= taskRetryLimit: # need to permanently fail self.logger.critical('Max-requeues reached. Machine has terminated in an unknown state. ' + '(Machine %s, State %s, Event %s)', self.machineName, self.startingState.name, event, exc_info=True) # re-raise, letting App Engine TaskRetryOptions kill the task raise else: # re-raise the exception self.logger.warning('Exception occurred processing event. Task will be retried. 
' + '(Machine %s, State %s)', self.machineName, self.startingState.name, exc_info=True) # re-put fan-in work packages if obj.get(constants.FAN_IN_RESULTS_PARAM): try: fanInResults = obj[constants.FAN_IN_RESULTS_PARAM] maxPutSize = 250 # put in chunks, rather than the entire list which could be large i = 0 while(fanInResults[i:i+maxPutSize]): db.put(fanInResults[i:i+maxPutSize]) i += maxPutSize except Exception: self.logger.critical("Unable to re.put() for workIndex = %s", self.fanInResults[0].workIndex) raise # this line really just allows unit tests to work - the request is really dead at this point self.currentState = self.startingState raise def buildUrl(self, state, event): """ Builds the taskqueue url. @param state: the State to dispatch to @param event: the event to dispatch @return: a url that can be used to build a taskqueue.Task instance to .dispatch(event) """ assert state and event return self.url + '%s/%s/%s/' % (state.name, event, state.getTransition(event).target.name) def buildParams(self, state, event): """ Builds the taskqueue params. @param state: the State to dispatch to @param event: the event to dispatch @return: a dict suitable to use in constructing a url (GET) or using as params (POST) """ assert state and event params = {constants.STATE_PARAM: state.name, constants.EVENT_PARAM: event, constants.INSTANCE_NAME_PARAM: self.instanceName} for key, value in self.items(): if key not in constants.NON_CONTEXT_PARAMS: if self.contextTypes.get(key) is simplejson.loads: value = simplejson.dumps(value, cls=models.Encoder) if isinstance(value, datetime.datetime): value = str(int(time.mktime(value.utctimetuple()))) if isinstance(value, dict): # FIXME: should we issue a warning that they should update fsm.yaml? 
value = simplejson.dumps(value, cls=models.Encoder) if isinstance(value, list) and len(value) == 1: key = key + '[]' # used to preserve lists of length=1 - see handler.py for inverse params[key] = value return params def getTaskName(self, nextEvent, instanceName=None, fanIn=False): """ Returns a task name that is unique for a specific dispatch @param nextEvent: the event to dispatch @return: a task name that can be used to build a taskqueue.Task instance to .dispatch(nextEvent) """ transition = self.currentState.getTransition(nextEvent) parts = [] parts.append(instanceName or self.instanceName) if self.get(constants.GEN_PARAM): for (step, gen) in self[constants.GEN_PARAM].items(): parts.append('continuation-%s-%s' % (step, gen)) if self.get(constants.FORK_PARAM): parts.append('fork-' + str(self[constants.FORK_PARAM])) # post-fan-in we need to store the workIndex in the task name to avoid duplicates, since # we popped the generation off during fan-in # FIXME: maybe not pop the generation in fan-in? # FIXME: maybe store this in the instanceName? # FIXME: i wish this was easier to get right :-) if (not fanIn) and self.get(constants.INDEX_PARAM): parts.append('work-index-' + str(self[constants.INDEX_PARAM])) parts.append(self.currentState.name) parts.append(nextEvent) parts.append(transition.target.name) parts.append('step-' + str(self[constants.STEPS_PARAM])) return '--'.join(parts) def clone(self, instanceName=None, data=None): """ Returns a copy of the FSMContext. @param instanceName: the instance name to optionally apply to the clone @param data: a dict/mapping of data to optionally apply (.update()) to the clone @return: a new FSMContext instance """ context = copy.deepcopy(self) if instanceName: context.instanceName = instanceName if data: context.update(data) return context