コード例 #1
0
    def mergeJoinDispatch(self, event, obj):
        """ Gathers the pending fan-in work packages for an event into one list of contexts.

        @param event: the event being merge joined (destination state must be a fan in)
        @param obj: mapping read for constants.RETRY_COUNT_PARAM to detect a retried Task
        @return: a list (possibly empty) of FSMContext instances; the list additionally
                 exposes .logger and .instanceName for fan-in actions
        """
        # fan-out info must never be present in a fan-in context (same rule as in
        # _queueDispatchFanIn)
        assert not self.get(constants.GEN_PARAM)
        assert not self.get(constants.FORK_PARAM)

        class FSMContextList(list):
            """ A list that supports .logger.info(), .logger.warning() etc. for fan-in actions """
            def __init__(self, context, contexts):
                """ fill the list and attach a logger / instance name taken from context """
                list.__init__(self, contexts)
                self.logger = Logger(context)
                self.instanceName = context.instanceName

        # the url of the Task/FSMContext carries the work package index
        index = self.get(constants.INDEX_PARAM)
        taskNameBase = self.getTaskName(event, fanIn=True)

        # see comment (***) in self._queueDispatchFanIn
        #
        # when the read lock cannot be acquired (due to failed release of write lock) the
        # decision was to keep retrying, so only raise while the retry limit exceeds the
        # current retry count
        if self._getTaskRetryLimit() is None:
            raiseOnFail = False
        else:
            raiseOnFail = self._getTaskRetryLimit() > self.__obj[constants.RETRY_COUNT_PARAM]
        ReadWriteLock(taskNameBase, self).acquireReadLock(index, raiseOnFail=raiseOnFail)

        # see comment (A) in self._queueDispatchFanIn(...)
        time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)

        # idempotency guard: fan-in must only ever operate one time over a list of data.
        # the run-once entity is created in State.dispatch(...) _after_ all the actions have
        # executed successfully, so it is only consulted on a retry
        workIndex = '%s-%d' % (taskNameBase, knuthHash(index))
        if obj[constants.RETRY_COUNT_PARAM] > 0:
            alreadyProcessed = RunOnceSemaphore(workIndex, self).readRunOnceSemaphore(
                payload=self.__obj[constants.TASK_NAME_PARAM])
            if alreadyProcessed:
                self.logger.info(
                    "Fan-in idempotency guard for workIndex '%s', not processing any work items.",
                    workIndex)
                # don't operate over the data again
                return FSMContextList(self, [])

        # all the work packages of the current group, in key order
        workPackages = _FantasmFanIn.all()
        workPackages = workPackages.filter('workIndex =', workIndex)
        workPackages = workPackages.order('__key__')

        # one cloned context per work package
        clones = []
        for package in workPackages:
            clones.append(self.clone(data=package.context))
        return FSMContextList(self, clones)
コード例 #2
0
ファイル: lock_test.py プロジェクト: insad/fantasm
 def test_readRunOnceSemaphore_memcache_expired(self):
     """ The written payload is still readable after the 'foo' memcache entry is deleted. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # simulate memcache expiry before reading back
     memcache.delete('foo')
     self.assertEqual(
         'payload',
         semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL))
コード例 #3
0
ファイル: fsmcontext.py プロジェクト: GTxx/fsm
    def mergeJoinDispatch(self, event, obj):
        """ Gathers the pending fan-in work packages for an event into one list of contexts.

        @param event: the event being merge joined (destination state must be a fan in)
        @param obj: mapping read for constants.RETRY_COUNT_PARAM to detect a retried Task
        @return: a list (possibly empty) of FSMContext instances; the list additionally
                 exposes .logger, .instanceName and .guarded (True only when the
                 idempotency guard short-circuited the merge)
        """
        # fan-out info must never be present in a fan-in context (same rule as in
        # _queueDispatchFanIn)
        assert not self.get(constants.GEN_PARAM)
        assert not self.get(constants.FORK_PARAM)

        class FSMContextList(list):
            """ A list that supports .logger.info(), .logger.warning() etc. for fan-in actions """
            def __init__(self, context, contexts, guarded=False):
                """ fill the list and attach a logger, instance name and the guarded flag """
                list.__init__(self, contexts)
                self.logger = Logger(context)
                self.instanceName = context.instanceName
                self.guarded = guarded

        # the url of the Task/FSMContext carries the work package index
        index = self.get(constants.INDEX_PARAM)
        self.logger.debug('Index: %s', index)
        taskNameBase = self.getTaskName(event, fanIn=True)

        # see comment (***) in self._queueDispatchFanIn
        #
        # when the read lock cannot be acquired (due to failed release of write lock) the
        # decision was to keep retrying, so only raise while the retry limit exceeds the
        # current retry count
        if self._getTaskRetryLimit() is None:
            raiseOnFail = False
        else:
            raiseOnFail = self._getTaskRetryLimit() > self.__obj[constants.RETRY_COUNT_PARAM]
        ReadWriteLock(taskNameBase, self).acquireReadLock(index, raiseOnFail=raiseOnFail)

        # see comment (A) in self._queueDispatchFanIn(...)
        time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)

        # idempotency guard: fan-in must only ever operate one time over a list of data.
        # the run-once entity is created in State.dispatch(...) _after_ all the actions have
        # executed successfully, so it is only consulted on a retry
        hashedIndex = knuthHash(index)
        self.logger.debug('knuthHash of index: %s', hashedIndex)
        workIndex = '%s-%d' % (taskNameBase, hashedIndex)
        if obj[constants.RETRY_COUNT_PARAM] > 0:
            alreadyProcessed = RunOnceSemaphore(workIndex, self).readRunOnceSemaphore(
                payload=self.__obj[constants.TASK_NAME_PARAM])
            if alreadyProcessed:
                self.logger.info("Fan-in idempotency guard for workIndex '%s', not processing any work items.",
                                 workIndex)
                # don't operate over the data again
                return FSMContextList(self, [], guarded=True)

        # all the work packages of the current group, in key order
        workPackages = _FantasmFanIn.all(namespace='')
        workPackages = workPackages.filter('workIndex =', workIndex)
        workPackages = workPackages.order('__key__')

        # one cloned context per work package
        clones = []
        for package in workPackages:
            clones.append(self.clone(replaceData=package.context))
        return FSMContextList(self, clones)
コード例 #4
0
    def execute(self, context, obj):
        """ Writes the CSV file, unless the run-once semaphore shows it was already written.

        @param context: the machine context; context.instanceName keys the semaphore, the
                        CsvProgressCounter and the aggregated CsvIntermediateResults
        @param obj: passed through to self.getRows(...) and self.getAggregatedRows(...)
        @return: OK_EVENT
        """
        # fetch the CsvCounter, since it is the parent of all the other Models. this is done
        # outside the run-once guard because its key is stored on the context below on every
        # path - including a retry where the file has already been written
        counter = CsvProgressCounter.get_by_key_name(context.instanceName)

        # if already wrote a file, don't do anything
        semaphore = RunOnceSemaphore(context.instanceName, context)
        if not semaphore.readRunOnceSemaphore(payload='payload'):

            # fetch the single aggregated results Model (may be missing)
            aggResults = CsvIntermediateResults.get_by_key_name(
                context.instanceName, counter)
            # guard the attribute access: the aggregated Model is treated as optional below
            aggData = aggResults.data if aggResults else None

            # open the file
            fileName = files.blobstore.create(
                mime_type='application/octet-stream')
            with files.open(fileName, 'a') as f:

                # the csv module has a convenient row writing interface
                writer = csv.writer(f)

                # this queries for all the intermediate results
                query = CsvIntermediateResults.all().ancestor(counter)
                for results in query:

                    # the aggregated results may also be in the results, so skip them
                    if aggResults and results.key() == aggResults.key():
                        continue

                    # for all the intermediate data, write the rows
                    for item in results.data:
                        rows = self.getRows(context, obj, item, aggData)
                        if rows:
                            writer.writerows(rows)

                if aggResults:
                    # now also write down any specific aggregated data rows
                    rows = self.getAggregatedRows(context, obj, aggData)
                    if rows:
                        writer.writerows(rows)

            # finalize the file
            files.finalize(fileName)

            # FIXME: what to do with this?
            blobKey = files.blobstore.get_blob_key(fileName)

            # at this point we have successfully written the file, lets make sure we don't do it again
            # if a retry occurs downstream
            semaphore.writeRunOnceSemaphore(payload='payload')

        # store the key of the counter (ie. parent of intermediate results) for cleanup
        context[COUNTER_KEY_PARAM] = counter.key()
        return OK_EVENT
コード例 #5
0
ファイル: lock_test.py プロジェクト: insad/fantasm
 def test_readRunOnceSemaphore_payload_error_memcache_expired(self):
     """ Reading with a different payload returns the written one and logs one critical message. """
     # NOTE(review): despite the test name, nothing here deletes/expires the memcache
     # entry before the read - confirm this is intended
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # 'bar' does not match what was written
     stored = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
     self.assertEqual('payload', stored)
     criticalMessages = self.loggingDouble.messages['critical']
     self.assertEqual(["Run-once semaphore memcache payload read error."],
                      criticalMessages)
コード例 #6
0
ファイル: lock_test.py プロジェクト: BobDohnal/fantasm
 def test_readRunOnceSemaphore_payload_error_memcache_expired(self):
     """ Reading with a different payload returns the written one and logs one critical message. """
     # NOTE(review): despite the test name, nothing here deletes/expires the memcache
     # entry before the read - confirm this is intended
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # 'bar' does not match what was written
     stored = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
     self.assertEqual('payload', stored)
     # exactly one critical message, beginning with the payload read error text
     self.assertEqual(1, len(self.loggingDouble.messages['critical']))
     firstCritical = self.loggingDouble.messages['critical'][0]
     self.assertTrue(firstCritical.startswith("Run-once semaphore memcache payload read error."))
コード例 #7
0
ファイル: lock_test.py プロジェクト: oikmar/fantasm
 def test_readRunOnceSemaphore_payload_error(self):
     """ Reading with a different payload returns the written one and logs one critical message. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # 'bar' does not match what was written
     stored = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
     self.assertEqual('payload', stored)
     # exactly one critical message, beginning with the payload read error text
     self.assertEqual(1, len(self.loggingDouble.messages['critical']))
     firstCritical = self.loggingDouble.messages['critical'][0]
     self.assertTrue(firstCritical.startswith("Run-once semaphore memcache payload read error."))
コード例 #8
0
 def execute(self, context, obj):
     """ Writes the CSV file, unless the run-once semaphore shows it was already written.

     @param context: the machine context; context.instanceName keys the semaphore, the
                     CsvProgressCounter and the aggregated CsvIntermediateResults
     @param obj: passed through to self.getRows(...) and self.getAggregatedRows(...)
     @return: OK_EVENT
     """
     # fetch the CsvCounter, since it is the parent of all the other Models. this is done
     # outside the run-once guard because its key is stored on the context below on every
     # path - including a retry where the file has already been written
     counter = CsvProgressCounter.get_by_key_name(context.instanceName)

     # if already wrote a file, don't do anything
     semaphore = RunOnceSemaphore(context.instanceName, context)
     if not semaphore.readRunOnceSemaphore(payload='payload'):

         # fetch the single aggregated results Model (may be missing)
         aggResults = CsvIntermediateResults.get_by_key_name(context.instanceName, counter)
         # guard the attribute access: the aggregated Model is treated as optional below
         aggData = aggResults.data if aggResults else None

         # open the file
         fileName = files.blobstore.create(mime_type='application/octet-stream')
         with files.open(fileName, 'a') as f:

             # the csv module has a convenient row writing interface
             writer = csv.writer(f)

             # this queries for all the intermediate results
             query = CsvIntermediateResults.all().ancestor(counter)
             for results in query:

                 # the aggregated results may also be in the results, so skip them
                 if aggResults and results.key() == aggResults.key():
                     continue

                 # for all the intermediate data, write the rows
                 for item in results.data:
                     rows = self.getRows(context, obj, item, aggData)
                     if rows:
                         writer.writerows(rows)

             if aggResults:
                 # now also write down any specific aggregated data rows
                 rows = self.getAggregatedRows(context, obj, aggData)
                 if rows:
                     writer.writerows(rows)

         # finalize the file
         files.finalize(fileName)

         # FIXME: what to do with this?
         blobKey = files.blobstore.get_blob_key(fileName)

         # at this point we have successfully written the file, lets make sure we don't do it again
         # if a retry occurs downstream
         semaphore.writeRunOnceSemaphore(payload='payload')

     # store the key of the counter (ie. parent of intermediate results) for cleanup
     context[COUNTER_KEY_PARAM] = counter.key()
     return OK_EVENT
コード例 #9
0
ファイル: lock_test.py プロジェクト: insad/fantasm
 def test_readRunOnceSemaphore(self):
     """ A payload written to the semaphore is returned when read with the same payload. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     self.assertEqual(
         'payload',
         semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL))
コード例 #10
0
ファイル: lock_test.py プロジェクト: insad/fantasm
 def test_readRunOnceSemaphore_not_written(self):
     """ Reading a semaphore that was never written yields None. """
     semaphore = RunOnceSemaphore('foo', None)
     result = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     self.assertEqual(None, result)
コード例 #11
0
ファイル: fsm.py プロジェクト: iki/fantasm
 def _queueDispatchFanIn(self, nextEvent, fanInPeriod=0, retryOptions=None, queueName=None):
     """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the
     datastore for processing by the queued .dispatch(nextEvent)

     The context is written as a _FantasmFanIn work package under a write lock, then a
     Task named after the fan-in task name and lock index is added to the queue.

     @param nextEvent: a string event
     @param fanInPeriod: the period of time between fan in Tasks (seconds added to the Task eta)
     @param retryOptions: passed to the Task as retry_options
     @param queueName: the queue name to Queue into (required, asserted below)
     @return: a taskqueue.Task instance; falls through (returning None) when adding the Task
              raises TaskAlreadyExistsError or TombstonedTaskError
     """
     assert nextEvent is not None
     assert not self.get(constants.INDEX_PARAM) # fan-in after fan-in is not allowed
     assert queueName
     
     # we pop this off here because we do not want the fan-out/continuation param as part of the
     # task name, otherwise we lose the fan-in - each fan-in gets one work unit.
     self.pop(constants.GEN_PARAM, None)
     fork = self.pop(constants.FORK_PARAM, None)
     
     # transfer the fan-in-group into the context (under a fixed value key) so that states beyond 
     # the fan-in get unique Task names
     # FIXME: this will likely change once we formalize what to do post fan-in
     transition = self.currentState.getTransition(nextEvent)
     if self.get(transition.target.fanInGroup) is not None:
         self[constants.FAN_IN_GROUP_PARAM] = self[transition.target.fanInGroup]
     
     taskNameBase = self.getTaskName(nextEvent, fanIn=True)
     rwlock = ReadWriteLock(taskNameBase, self)
     index = rwlock.currentIndex()
         
     # (***)
     #
     # grab the lock - memcache.incr()
     # 
     # on Task retry, multiple incr() calls are possible. possible ways to handle:
     #
     # 1. release the lock in a 'finally' clause, but then risk missing a work
     #    package because acquiring the read lock will succeed even though the
     #    work package was not written yet.
     #
     # 2. allow the lock to get too high. the fan-in logic attempts to wait for 
     #    work packages across multiple-retry attempts, so this seems like the 
     #    best option. we basically trade a bit of latency in fan-in for reliability.
     #    
     rwlock.acquireWriteLock(index, nextEvent=nextEvent)
     
     # insert the work package, which is simply a serialized FSMContext
     workIndex = '%s-%d' % (taskNameBase, knuthHash(index))
     
     # on retry, we want to ensure we get the same work index for this task
     actualTaskName = self.__obj[constants.TASK_NAME_PARAM]
     # NOTE: `+` binds tighter than `or`, so the trailing `or None` never applies here -
     # the 'workIndex-' prefix always makes the string non-empty
     indexKeyName = 'workIndex-' + '-'.join([str(i) for i in [actualTaskName, fork] if i]) or None
     semaphore = RunOnceSemaphore(indexKeyName, self)
     
     # check if the workIndex changed during retry; if a previous attempt recorded a
     # work index, reuse it instead of the freshly computed one
     semaphoreWritten = False
     if self.__obj[constants.RETRY_COUNT_PARAM] > 0:
         # see comment (A) further down in this method
         time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)
         payload = semaphore.readRunOnceSemaphore(payload=workIndex, transactional=False)
         if payload:
             semaphoreWritten = True
             if payload != workIndex:
                 self.logger.info("Work index changed from '%s' to '%s' on retry.", payload, workIndex)
                 workIndex = payload
     
     # update this here so it gets written down into the work package too
     self[constants.INDEX_PARAM] = index
             
     # write down two models, one actual work package, one idempotency package
     # (key_name is None when both the task name and fork are falsy)
     keyName = '-'.join([str(i) for i in [actualTaskName, fork] if i]) or None
     work = _FantasmFanIn(context=self, workIndex=workIndex, key_name=keyName)
     
     # close enough to idempotent, but could still write only one of the entities
     # FIXME: could be made faster using a bulk put, but this interface is cleaner
     if not semaphoreWritten:
         semaphore.writeRunOnceSemaphore(payload=workIndex, transactional=False)
     
     # put the work item
     db.put(work)
     
     # (A) now the datastore is asynchronously writing the indices, so the work package may
     #     not show up in a query for a period of time. there is a corresponding time.sleep()
     #     in the fan-in of self.mergeJoinDispatch(...) 
         
     # release the lock - memcache.decr()
     # (not in a 'finally': see option 2 in comment (***) above)
     rwlock.releaseWriteLock(index)
         
     try:
         
         # insert a task to run in the future and process a bunch of work packages
         now = time.time()
         url = self.buildUrl(self.currentState, nextEvent)
         params = self.buildParams(self.currentState, nextEvent)
         task = Task(name='%s-%d' % (taskNameBase, index),
                     method=self.method,
                     url=url,
                     params=params,
                     eta=datetime.datetime.utcfromtimestamp(now) + datetime.timedelta(seconds=fanInPeriod),
                     headers=self.headers,
                     retry_options=retryOptions)
         self.Queue(name=queueName).add(task)
         return task
     
     except (TaskAlreadyExistsError, TombstonedTaskError):
         # a Task with this name (same fan-in task name and index) was already added;
         # the work package written above is left for that Task. returns None implicitly.
         pass # Fan-in magic
コード例 #12
0
    def _queueDispatchFanIn(self,
                            nextEvent,
                            fanInPeriod=0,
                            retryOptions=None,
                            queueName=None):
        """ Queues a call to .dispatch(nextEvent) in the task queue, or saves the context to the
        datastore for processing by the queued .dispatch(nextEvent)

        The context is written as a _FantasmFanIn work package under a write lock, then a
        Task named after the fan-in task name and lock index is added to the queue.

        @param nextEvent: a string event
        @param fanInPeriod: the period of time between fan in Tasks (seconds added to the Task eta)
        @param retryOptions: passed to the Task as retry_options
        @param queueName: the queue name to Queue into (required, asserted below)
        @return: a taskqueue.Task instance; falls through (returning None) when adding the Task
                 raises TaskAlreadyExistsError or TombstonedTaskError
        """
        assert nextEvent is not None
        assert not self.get(
            constants.INDEX_PARAM)  # fan-in after fan-in is not allowed
        assert queueName

        # we pop this off here because we do not want the fan-out/continuation param as part of the
        # task name, otherwise we lose the fan-in - each fan-in gets one work unit.
        self.pop(constants.GEN_PARAM, None)
        fork = self.pop(constants.FORK_PARAM, None)

        taskNameBase = self.getTaskName(nextEvent, fanIn=True)
        rwlock = ReadWriteLock(taskNameBase, self)
        index = rwlock.currentIndex()

        # (***)
        #
        # grab the lock - memcache.incr()
        #
        # on Task retry, multiple incr() calls are possible. possible ways to handle:
        #
        # 1. release the lock in a 'finally' clause, but then risk missing a work
        #    package because acquiring the read lock will succeed even though the
        #    work package was not written yet.
        #
        # 2. allow the lock to get too high. the fan-in logic attempts to wait for
        #    work packages across multiple-retry attempts, so this seems like the
        #    best option. we basically trade a bit of latency in fan-in for reliability.
        #
        rwlock.acquireWriteLock(index, nextEvent=nextEvent)

        # insert the work package, which is simply a serialized FSMContext
        workIndex = '%s-%d' % (taskNameBase, knuthHash(index))

        # on retry, we want to ensure we get the same work index for this task
        actualTaskName = self.__obj[constants.TASK_NAME_PARAM]
        # NOTE: `+` binds tighter than `or`, so the trailing `or None` never applies here -
        # the 'workIndex-' prefix always makes the string non-empty
        indexKeyName = 'workIndex-' + '-'.join(
            [str(i) for i in [actualTaskName, fork] if i]) or None
        semaphore = RunOnceSemaphore(indexKeyName, self)

        # check if the workIndex changed during retry; if a previous attempt recorded a
        # work index, reuse it instead of the freshly computed one
        semaphoreWritten = False
        if self.__obj[constants.RETRY_COUNT_PARAM] > 0:
            # see comment (A) further down in this method
            time.sleep(constants.DATASTORE_ASYNCRONOUS_INDEX_WRITE_WAIT_TIME)
            payload = semaphore.readRunOnceSemaphore(payload=workIndex,
                                                     transactional=False)
            if payload:
                semaphoreWritten = True
                if payload != workIndex:
                    self.logger.info(
                        "Work index changed from '%s' to '%s' on retry.",
                        payload, workIndex)
                    workIndex = payload

        # write down two models, one actual work package, one idempotency package
        # (key_name is None when both the task name and fork are falsy)
        keyName = '-'.join([str(i)
                            for i in [actualTaskName, fork] if i]) or None
        work = _FantasmFanIn(context=self,
                             workIndex=workIndex,
                             key_name=keyName)

        # close enough to idempotent, but could still write only one of the entities
        # FIXME: could be made faster using a bulk put, but this interface is cleaner
        if not semaphoreWritten:
            semaphore.writeRunOnceSemaphore(payload=workIndex,
                                            transactional=False)

        # put the work item
        db.put(work)

        # (A) now the datastore is asynchronously writing the indices, so the work package may
        #     not show up in a query for a period of time. there is a corresponding time.sleep()
        #     in the fan-in of self.mergeJoinDispatch(...)

        # release the lock - memcache.decr()
        # (not in a 'finally': see option 2 in comment (***) above)
        rwlock.releaseWriteLock(index)

        try:

            # insert a task to run in the future and process a bunch of work packages
            now = time.time()
            # the index is set on the context only now, after the work package has been put
            # and before the Task url/params are built
            self[constants.INDEX_PARAM] = index
            url = self.buildUrl(self.currentState, nextEvent)
            params = self.buildParams(self.currentState, nextEvent)
            task = Task(name='%s-%d' % (taskNameBase, index),
                        method=self.method,
                        url=url,
                        params=params,
                        eta=datetime.datetime.utcfromtimestamp(now) +
                        datetime.timedelta(seconds=fanInPeriod),
                        headers=self.headers,
                        retry_options=retryOptions)
            self.Queue(name=queueName).add(task)
            return task

        except (TaskAlreadyExistsError, TombstonedTaskError):
            # a Task with this name (same fan-in task name and index) was already added;
            # the work package written above is left for that Task. returns None implicitly.
            pass  # Fan-in magic
コード例 #13
0
ファイル: lock_test.py プロジェクト: BobDohnal/fantasm
 def test_readRunOnceSemaphore_memcache_expired(self):
     """ The written payload is still readable after the 'foo' memcache entry is deleted. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # simulate memcache expiry before reading back
     memcache.delete('foo')
     self.assertEqual(
         'payload',
         semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL))
コード例 #14
0
ファイル: lock_test.py プロジェクト: BobDohnal/fantasm
 def test_readRunOnceSemaphore(self):
     """ A payload written to the semaphore is returned when read with the same payload. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     self.assertEqual(
         'payload',
         semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL))
コード例 #15
0
ファイル: lock_test.py プロジェクト: BobDohnal/fantasm
 def test_readRunOnceSemaphore_not_written(self):
     """ Reading a semaphore that was never written yields None. """
     semaphore = RunOnceSemaphore('foo', None)
     result = semaphore.readRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     self.assertEqual(None, result)
コード例 #16
0
ファイル: lock_test.py プロジェクト: iki/fantasm
 def test_readRunOnceSemaphore_payload_error(self):
     """ Reading with a different payload returns the written one and logs one critical message. """
     semaphore = RunOnceSemaphore('foo', None)
     semaphore.writeRunOnceSemaphore('payload', transactional=self.TRANSACTIONAL)
     # 'bar' does not match what was written
     stored = semaphore.readRunOnceSemaphore('bar', transactional=self.TRANSACTIONAL)
     self.assertEqual('payload', stored)
     criticalMessages = self.loggingDouble.messages['critical']
     self.assertEqual(["Run-once semaphore memcache payload read error."], criticalMessages)