def mergeJoinDispatch(self, event, obj):
    """ Performs a merge join on the pending fan-in dispatches.

    Takes the read lock for the fan-in work index, sleeps to let the datastore indices catch up,
    and then clones this context once per work package stored under that work index.

    @param event: an event that is being merge joined (destination state must be a fan in)
    @param obj: an object indexed by constants.RETRY_COUNT_PARAM to decide whether this is a retry
    @return: a list (possibly empty) of FSMContext instances
    """

    class FSMContextList(list):
        """ A list that supports .logger.info(), .logger.warning() etc.for fan-in actions """

        def __init__(self, context, contexts):
            """ setup a self.logger for fan-in actions """
            super(FSMContextList, self).__init__(contexts)
            self.logger = Logger(context)
            self.instanceName = context.instanceName

    # this assertion comes from _queueDispatchFanIn - we never want fan-out info in a fan-in context
    assert not self.get(constants.GEN_PARAM)
    assert not self.get(constants.FORK_PARAM)

    # the work package index is stored in the url of the Task/FSMContext
    workPackageIndex = self.get(constants.INDEX_PARAM)
    fanInTaskName = self.getTaskName(event, fanIn=True)

    # see comment (***) in self._queueDispatchFanIn
    #
    # in the case of failing to acquire a read lock (due to failed release of write lock)
    # we have decided to keep retrying
    raiseOnFail = (self._getTaskRetryLimit() is not None and
                   self._getTaskRetryLimit() > self.__obj[constants.RETRY_COUNT_PARAM])

    readWriteLock = ReadWriteLock(fanInTaskName, self)
    readWriteLock.acquireReadLock(workPackageIndex, raiseOnFail=raiseOnFail)

    # see comment (A) in self._queueDispatchFanIn(...)
    time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)

    # the following step ensure that fan-in only ever operates one time over a list of data
    # the entity is created in State.dispatch(...) _after_ all the actions have executed
    # successfully
    workIndex = '%s-%d' % (fanInTaskName, knuthHash(workPackageIndex))
    isRetry = obj[constants.RETRY_COUNT_PARAM] > 0
    if isRetry and RunOnceSemaphore(workIndex, self).readRunOnceSemaphore(
            payload=self.__obj[constants.TASK_NAME_PARAM]):
        self.logger.info(
            "Fan-in idempotency guard for workIndex '%s', not processing any work items.",
            workIndex)
        return FSMContextList(self, [])  # don't operate over the data again

    # fetch all the work packages in the current group for processing
    pendingWork = _FantasmFanIn.all()
    pendingWork = pendingWork.filter('workIndex =', workIndex)
    pendingWork = pendingWork.order('__key__')

    # construct a list of FSMContexts, one clone per stored work package
    contexts = []
    for workPackage in pendingWork:
        contexts.append(self.clone(data=workPackage.context))
    return FSMContextList(self, contexts)
def test_readRunOnceSemaphore_memcache_expired(self):
    """ The written payload is still readable after the 'foo' memcache key has been deleted """
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    # drop the memcache entry before reading the semaphore back
    memcache.delete('foo')

    readBack = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)
def mergeJoinDispatch(self, event, obj):
    """ Performs a merge join on the pending fan-in dispatches.

    Takes the read lock for the fan-in work index, sleeps to let the datastore indices catch up,
    and then clones this context once per work package stored under that work index. When the
    run-once semaphore shows the work index was already handled, the returned (empty) list has
    .guarded set to True.

    @param event: an event that is being merge joined (destination state must be a fan in)
    @param obj: an object indexed by constants.RETRY_COUNT_PARAM to decide whether this is a retry
    @return: a list (possibly empty) of FSMContext instances
    """

    class FSMContextList(list):
        """ A list that supports .logger.info(), .logger.warning() etc.for fan-in actions """

        def __init__(self, context, contexts, guarded=False):
            """ setup a self.logger for fan-in actions """
            super(FSMContextList, self).__init__(contexts)
            self.logger = Logger(context)
            self.instanceName = context.instanceName
            self.guarded = guarded

    # this assertion comes from _queueDispatchFanIn - we never want fan-out info in a fan-in context
    assert not self.get(constants.GEN_PARAM)
    assert not self.get(constants.FORK_PARAM)

    # the work package index is stored in the url of the Task/FSMContext
    workPackageIndex = self.get(constants.INDEX_PARAM)
    self.logger.debug('Index: %s', workPackageIndex)
    fanInTaskName = self.getTaskName(event, fanIn=True)

    # see comment (***) in self._queueDispatchFanIn
    #
    # in the case of failing to acquire a read lock (due to failed release of write lock)
    # we have decided to keep retrying
    raiseOnFail = (self._getTaskRetryLimit() is not None and
                   self._getTaskRetryLimit() > self.__obj[constants.RETRY_COUNT_PARAM])

    readWriteLock = ReadWriteLock(fanInTaskName, self)
    readWriteLock.acquireReadLock(workPackageIndex, raiseOnFail=raiseOnFail)

    # see comment (A) in self._queueDispatchFanIn(...)
    time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)

    # the following step ensure that fan-in only ever operates one time over a list of data
    # the entity is created in State.dispatch(...) _after_ all the actions have executed
    # successfully
    hashedIndex = knuthHash(workPackageIndex)
    self.logger.debug('knuthHash of index: %s', hashedIndex)
    workIndex = '%s-%d' % (fanInTaskName, hashedIndex)

    isRetry = obj[constants.RETRY_COUNT_PARAM] > 0
    if isRetry and RunOnceSemaphore(workIndex, self).readRunOnceSemaphore(
            payload=self.__obj[constants.TASK_NAME_PARAM]):
        self.logger.info(
            "Fan-in idempotency guard for workIndex '%s', not processing any work items.",
            workIndex)
        return FSMContextList(self, [], guarded=True)  # don't operate over the data again

    # fetch all the work packages in the current group for processing
    pendingWork = _FantasmFanIn.all(namespace='')
    pendingWork = pendingWork.filter('workIndex =', workIndex)
    pendingWork = pendingWork.order('__key__')

    # construct a list of FSMContexts, one clone per stored work package
    contexts = []
    for workPackage in pendingWork:
        contexts.append(self.clone(replaceData=workPackage.context))
    return FSMContextList(self, contexts)
def execute(self, context, obj):
    """ Writes the CSV file

    The file is only written when the run-once semaphore keyed on context.instanceName has not
    been written yet; after a successful write the semaphore is set and the key of the progress
    counter is stored in the context under COUNTER_KEY_PARAM.

    @param context: the context; context.instanceName keys the semaphore, counter and results
    @param obj: passed through unchanged to self.getRows(...) and self.getAggregatedRows(...)
    @return: OK_EVENT
    """

    # if already wrote a file, don't do anything
    semaphore = RunOnceSemaphore(context.instanceName, context)
    if not semaphore.readRunOnceSemaphore(payload='payload'):

        # fetch the CsvCounter, since it is the parent of all the other Models
        counter = CsvProgressCounter.get_by_key_name(context.instanceName)

        # fetch the single aggregated results Model
        aggResults = CsvIntermediateResults.get_by_key_name(context.instanceName, counter)

        # the aggregated Model is optional (every other use is guarded by "if aggResults"), so
        # resolve its data once here instead of dereferencing a possible None inside the loop
        aggData = aggResults.data if aggResults else None

        # open the file
        fileName = files.blobstore.create(mime_type='application/octet-stream')
        with files.open(fileName, 'a') as f:

            # the csv module has a convenient row writing interface
            writer = csv.writer(f)

            # this queries for all the intermediate results
            query = CsvIntermediateResults.all().ancestor(counter)
            for results in query:

                # the aggregated results may also be in the results, so skip them
                if aggResults and results.key() == aggResults.key():
                    continue

                # for all the intermediate data, write the rows
                for item in results.data:
                    rows = self.getRows(context, obj, item, aggData)
                    if rows:
                        writer.writerows(rows)

            if aggResults:
                # now also write down any specific aggregated data rows
                rows = self.getAggregatedRows(context, obj, aggData)
                if rows:
                    writer.writerows(rows)

        # finalize the file (only once the file handle has been closed by the with-block)
        files.finalize(fileName)

        # FIXME: what to do with this?
        blobKey = files.blobstore.get_blob_key(fileName)

        # at this point we have successfully written the file, lets make sure we don't do it again
        # if a retry occurs downstream
        semaphore.writeRunOnceSemaphore(payload='payload')

        # store the key of the counter (ie. parent of intermediate results) for cleanup
        context[COUNTER_KEY_PARAM] = counter.key()

    return OK_EVENT
def test_readRunOnceSemaphore_payload_error_memcache_expired(self):
    """ Reading with a mismatched payload returns the stored payload and logs one critical """
    # NOTE(review): the name says "memcache_expired" but nothing here evicts the memcache
    # entry before the read - confirm whether a memcache.delete(...) is missing.
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    readBack = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)

    expectedCriticals = ["Run-once semaphore memcache payload read error."]
    self.assertEqual(expectedCriticals, self.loggingDouble.messages['critical'])
def test_readRunOnceSemaphore_payload_error_memcache_expired(self):
    """ Reading with a mismatched payload returns the stored payload and logs one critical """
    # NOTE(review): the name says "memcache_expired" but nothing here evicts the memcache
    # entry before the read - confirm whether a memcache.delete(...) is missing.
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    readBack = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)

    # exactly one critical message, which only has to start with the expected text
    criticals = self.loggingDouble.messages['critical']
    self.assertEqual(1, len(criticals))
    expectedPrefix = "Run-once semaphore memcache payload read error."
    self.assertTrue(criticals[0].startswith(expectedPrefix))
def test_readRunOnceSemaphore_payload_error(self):
    """ Reading with a mismatched payload returns the stored payload and logs one critical """
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    readBack = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)

    # exactly one critical message, which only has to start with the expected text
    criticals = self.loggingDouble.messages['critical']
    self.assertEqual(1, len(criticals))
    expectedPrefix = "Run-once semaphore memcache payload read error."
    self.assertTrue(criticals[0].startswith(expectedPrefix))
def execute(self, context, obj):
    """ Writes the CSV file

    The file is only written when the run-once semaphore keyed on context.instanceName has not
    been written yet; after a successful write the semaphore is set and the key of the progress
    counter is stored in the context under COUNTER_KEY_PARAM.

    @param context: the context; context.instanceName keys the semaphore, counter and results
    @param obj: passed through unchanged to self.getRows(...) and self.getAggregatedRows(...)
    @return: OK_EVENT
    """

    # if already wrote a file, don't do anything
    semaphore = RunOnceSemaphore(context.instanceName, context)
    if not semaphore.readRunOnceSemaphore(payload='payload'):

        # fetch the CsvCounter, since it is the parent of all the other Models
        counter = CsvProgressCounter.get_by_key_name(context.instanceName)

        # fetch the single aggregated results Model
        aggResults = CsvIntermediateResults.get_by_key_name(context.instanceName, counter)

        # the aggregated Model is optional (every other use is guarded by "if aggResults"), so
        # resolve its data once here instead of dereferencing a possible None inside the loop
        aggData = aggResults.data if aggResults else None

        # open the file
        fileName = files.blobstore.create(mime_type='application/octet-stream')
        with files.open(fileName, 'a') as f:

            # the csv module has a convenient row writing interface
            writer = csv.writer(f)

            # this queries for all the intermediate results
            query = CsvIntermediateResults.all().ancestor(counter)
            for results in query:

                # the aggregated results may also be in the results, so skip them
                if aggResults and results.key() == aggResults.key():
                    continue

                # for all the intermediate data, write the rows
                for item in results.data:
                    rows = self.getRows(context, obj, item, aggData)
                    if rows:
                        writer.writerows(rows)

            if aggResults:
                # now also write down any specific aggregated data rows
                rows = self.getAggregatedRows(context, obj, aggData)
                if rows:
                    writer.writerows(rows)

        # finalize the file (only once the file handle has been closed by the with-block)
        files.finalize(fileName)

        # FIXME: what to do with this?
        blobKey = files.blobstore.get_blob_key(fileName)

        # at this point we have successfully written the file, lets make sure we don't do it again
        # if a retry occurs downstream
        semaphore.writeRunOnceSemaphore(payload='payload')

        # store the key of the counter (ie. parent of intermediate results) for cleanup
        context[COUNTER_KEY_PARAM] = counter.key()

    return OK_EVENT
def test_readRunOnceSemaphore(self):
    """ A payload that was written is returned by a read with the same payload """
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    readBack = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)
def test_readRunOnceSemaphore_not_written(self):
    """ Reading a semaphore that was never written yields None """
    semaphore = RunOnceSemaphore('foo', None)
    readBack = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
    self.assertEqual(None, readBack)
def _queueDispatchFanIn(self, nextEvent, fanInPeriod=0, retryOptions=None, queueName=None):
    """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the
    datastore for processing by the queued .dispatch(nextEvent)

    @param nextEvent: a string event
    @param fanInPeriod: the period of time between fan in Tasks
    @param retryOptions: passed to the Task as its retry_options
    @param queueName: the queue name to Queue into
    @return: the taskqueue.Task that was added, or None when adding it raised
             TaskAlreadyExistsError or TombstonedTaskError
    """
    assert nextEvent is not None
    assert not self.get(constants.INDEX_PARAM)  # fan-in after fan-in is not allowed
    assert queueName

    # we pop this off here because we do not want the fan-out/continuation param as part of the
    # task name, otherwise we lose the fan-in - each fan-in gets one work unit.
    self.pop(constants.GEN_PARAM, None)
    fork = self.pop(constants.FORK_PARAM, None)

    # transfer the fan-in-group into the context (under a fixed value key) so that states beyond
    # the fan-in get unique Task names
    # FIXME: this will likely change once we formalize what to do post fan-in
    fanInGroup = self.currentState.getTransition(nextEvent).target.fanInGroup
    if self.get(fanInGroup) is not None:
        self[constants.FAN_IN_GROUP_PARAM] = self[fanInGroup]

    taskNameBase = self.getTaskName(nextEvent, fanIn=True)
    rwlock = ReadWriteLock(taskNameBase, self)
    index = rwlock.currentIndex()

    # (***)
    #
    # grab the lock - memcache.incr()
    #
    # on Task retry, multiple incr() calls are possible. possible ways to handle:
    #
    # 1. release the lock in a 'finally' clause, but then risk missing a work
    #    package because acquiring the read lock will succeed even though the
    #    work package was not written yet.
    #
    # 2. allow the lock to get too high. the fan-in logic attempts to wait for
    #    work packages across multiple-retry attempts, so this seems like the
    #    best option. we basically trade a bit of latency in fan-in for reliability.
    #
    rwlock.acquireWriteLock(index, nextEvent=nextEvent)

    # insert the work package, which is simply a serialized FSMContext
    workIndex = '%s-%d' % (taskNameBase, knuthHash(index))

    # on retry, we want to ensure we get the same work index for this task
    actualTaskName = self.__obj[constants.TASK_NAME_PARAM]
    nameSuffix = '-'.join([str(part) for part in (actualTaskName, fork) if part])
    semaphore = RunOnceSemaphore('workIndex-' + nameSuffix, self)

    # check if the workIndex changed during retry
    semaphoreWritten = False
    if self.__obj[constants.RETRY_COUNT_PARAM] > 0:
        # see comment (A) in self._queueDispatchFanIn(...)
        time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)
        previousWorkIndex = semaphore.readRunOnceSemaphore(payload=workIndex, transactional=False)
        if previousWorkIndex:
            semaphoreWritten = True
            if previousWorkIndex != workIndex:
                self.logger.info("Work index changed from '%s' to '%s' on retry.",
                                 previousWorkIndex, workIndex)
                workIndex = previousWorkIndex

    # update this here so it gets written down into the work package too
    self[constants.INDEX_PARAM] = index

    # write down two models, one actual work package, one idempotency package
    work = _FantasmFanIn(context=self, workIndex=workIndex, key_name=nameSuffix or None)

    # close enough to idempotent, but could still write only one of the entities
    # FIXME: could be made faster using a bulk put, but this interface is cleaner
    if not semaphoreWritten:
        semaphore.writeRunOnceSemaphore(payload=workIndex, transactional=False)

    # put the work item
    db.put(work)

    # (A) now the datastore is asynchronously writing the indices, so the work package may
    #     not show up in a query for a period of time. there is a corresponding time.sleep()
    #     in the fan-in of self.mergeJoinDispatch(...)

    # release the lock - memcache.decr()
    rwlock.releaseWriteLock(index)

    try:
        # insert a task to run in the future and process a bunch of work packages
        now = time.time()
        url = self.buildUrl(self.currentState, nextEvent)
        params = self.buildParams(self.currentState, nextEvent)
        eta = datetime.datetime.utcfromtimestamp(now) + datetime.timedelta(seconds=fanInPeriod)
        task = Task(name='%s-%d' % (taskNameBase, index),
                    method=self.method,
                    url=url,
                    params=params,
                    eta=eta,
                    headers=self.headers,
                    retry_options=retryOptions)
        self.Queue(name=queueName).add(task)
        return task

    except (TaskAlreadyExistsError, TombstonedTaskError):
        pass  # Fan-in magic
def _queueDispatchFanIn(self, nextEvent, fanInPeriod=0, retryOptions=None, queueName=None):
    """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the
    datastore for processing by the queued .dispatch(nextEvent)

    @param nextEvent: a string event
    @param fanInPeriod: the period of time between fan in Tasks
    @param retryOptions: passed to the Task as its retry_options
    @param queueName: the queue name to Queue into
    @return: the taskqueue.Task that was added, or None when adding it raised
             TaskAlreadyExistsError or TombstonedTaskError
    """
    assert nextEvent is not None
    assert not self.get(constants.INDEX_PARAM)  # fan-in after fan-in is not allowed
    assert queueName

    # we pop this off here because we do not want the fan-out/continuation param as part of the
    # task name, otherwise we lose the fan-in - each fan-in gets one work unit.
    self.pop(constants.GEN_PARAM, None)
    fork = self.pop(constants.FORK_PARAM, None)

    taskNameBase = self.getTaskName(nextEvent, fanIn=True)
    rwlock = ReadWriteLock(taskNameBase, self)
    index = rwlock.currentIndex()

    # (***)
    #
    # grab the lock - memcache.incr()
    #
    # on Task retry, multiple incr() calls are possible. possible ways to handle:
    #
    # 1. release the lock in a 'finally' clause, but then risk missing a work
    #    package because acquiring the read lock will succeed even though the
    #    work package was not written yet.
    #
    # 2. allow the lock to get too high. the fan-in logic attempts to wait for
    #    work packages across multiple-retry attempts, so this seems like the
    #    best option. we basically trade a bit of latency in fan-in for reliability.
    #
    rwlock.acquireWriteLock(index, nextEvent=nextEvent)

    # insert the work package, which is simply a serialized FSMContext
    workIndex = '%s-%d' % (taskNameBase, knuthHash(index))

    # on retry, we want to ensure we get the same work index for this task
    actualTaskName = self.__obj[constants.TASK_NAME_PARAM]
    nameSuffix = '-'.join([str(part) for part in (actualTaskName, fork) if part])
    semaphore = RunOnceSemaphore('workIndex-' + nameSuffix, self)

    # check if the workIndex changed during retry
    semaphoreWritten = False
    if self.__obj[constants.RETRY_COUNT_PARAM] > 0:
        # see comment (A) in self._queueDispatchFanIn(...)
        time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)
        previousWorkIndex = semaphore.readRunOnceSemaphore(payload=workIndex, transactional=False)
        if previousWorkIndex:
            semaphoreWritten = True
            if previousWorkIndex != workIndex:
                self.logger.info("Work index changed from '%s' to '%s' on retry.",
                                 previousWorkIndex, workIndex)
                workIndex = previousWorkIndex

    # write down two models, one actual work package, one idempotency package
    work = _FantasmFanIn(context=self, workIndex=workIndex, key_name=nameSuffix or None)

    # close enough to idempotent, but could still write only one of the entities
    # FIXME: could be made faster using a bulk put, but this interface is cleaner
    if not semaphoreWritten:
        semaphore.writeRunOnceSemaphore(payload=workIndex, transactional=False)

    # put the work item
    db.put(work)

    # (A) now the datastore is asynchronously writing the indices, so the work package may
    #     not show up in a query for a period of time. there is a corresponding time.sleep()
    #     in the fan-in of self.mergeJoinDispatch(...)

    # release the lock - memcache.decr()
    rwlock.releaseWriteLock(index)

    try:
        # insert a task to run in the future and process a bunch of work packages
        now = time.time()
        # the index is only set on the context after the work package has been put
        self[constants.INDEX_PARAM] = index
        url = self.buildUrl(self.currentState, nextEvent)
        params = self.buildParams(self.currentState, nextEvent)
        eta = datetime.datetime.utcfromtimestamp(now) + datetime.timedelta(seconds=fanInPeriod)
        task = Task(name='%s-%d' % (taskNameBase, index),
                    method=self.method,
                    url=url,
                    params=params,
                    eta=eta,
                    headers=self.headers,
                    retry_options=retryOptions)
        self.Queue(name=queueName).add(task)
        return task

    except (TaskAlreadyExistsError, TombstonedTaskError):
        pass  # Fan-in magic
def test_readRunOnceSemaphore_not_written(self):
    """ Reading a semaphore that was never written yields None """
    semaphore = RunOnceSemaphore('foo', None)
    readBack = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
    self.assertEqual(None, readBack)
def test_readRunOnceSemaphore_payload_error(self):
    """ Reading with a mismatched payload returns the stored payload and logs one critical """
    semaphore = RunOnceSemaphore('foo', None)
    semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)

    readBack = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
    self.assertEqual('payload', readBack)

    expectedCriticals = ["Run-once semaphore memcache payload read error."]
    self.assertEqual(expectedCriticals, self.loggingDouble.messages['critical'])