def smoke(self): correlation_id = uuid.uuid4().hex # can acquire lease acquired = aws.acquire_lease(correlation_id, 1, 1, primary=True) self.assertTrue(acquired is 1) # cannot re-acquire own lease (not re-entrant) acquired = aws.acquire_lease(correlation_id, 1, 1, primary=True) self.assertTrue(acquired is False) # cannot acquire someone else's lease acquired = aws.acquire_lease(correlation_id, 1, 2, primary=True) self.assertTrue(acquired is False) # cannot release someone else's lease released = aws.release_lease(correlation_id, 1, 2, 1, primary=True) self.assertTrue(released is False) # cannot release own lease (different fence token) released = aws.release_lease(correlation_id, 1, 1, 99, primary=True) self.assertTrue(released is False) # can release own lease released = aws.release_lease(correlation_id, 1, 1, 1, primary=True) self.assertTrue(released is True) # someone else can acquire new lease with short timeout acquired = aws.acquire_lease(correlation_id, 1, 2, primary=True, timeout=1) self.assertTrue(acquired is 2) time.sleep(2) # someone else can acquire new lease when previous times out acquired = aws.acquire_lease(correlation_id, 1, 2, primary=True) self.assertTrue(acquired is 3)
def dispatch(self, event, obj): """ Acquires an exclusive lease for the machine's correlation_id, and executes all the machinery of the framework and all the user code. :param event: a str event. :param obj: a dict. """ fence_token = None try: # attempt to acquire the lease and execute the state transition fence_token = acquire_lease(self.correlation_id, self.steps, self.retries, primary=self.lease_primary) if fence_token == 0: self._queue_error( ERRORS.CACHE, 'System error acquiring primary=%s lease.' % self.lease_primary) self.lease_primary = not self.lease_primary fence_token = acquire_lease(self.correlation_id, self.steps, self.retries, primary=self.lease_primary) if not fence_token: # could not get the lease. something is going wrong self._queue_error(ERRORS.CACHE, 'Could not acquire lease. Retrying.') self._retry(obj) else: # lease acquired, execute the state transition self._dispatch_and_retry(event, obj) finally: released = release_lease(self.correlation_id, self.steps, self.retries, fence_token, primary=self.lease_primary) if not released: self._queue_error(ERRORS.CACHE, 'Could not release lease.')
def dispatch(self, event, obj): """ Acquires an exclusive lease for the machine's correlation_id, and executes all the machinery of the framework and all the user code. In Marting Kleppmann's blog entry "How to do distributed locking" (http://martin.kleppmann.com/2016/02/08/how-to-do-distributed-locking.html) he points out that the following code is broken (in a distributed system): // THIS CODE IS BROKEN function writeData(filename, data) { var lock = lockService.acquireLock(filename); if (!lock) { throw 'Failed to acquire lock'; } // GARBAGE COLLECTION try { var file = storage.readFile(filename); var updated = updateContents(file, data); storage.writeFile(filename, updated); } finally { lock.release(); } } It is broken under the following circumstances: time 0 - lease acquired by process 1 time 1 - huge garbage collection pause in process 1 at "// GARBAGE COLLECTION" time 2 - lease expires time 3 - lease acquired by process 2 time 4 - writeData executed in process 2 time 5 - garbage collection done in process 1 time 6 - writeData loses race in process 1 In order to fix it, a monotonically increasing "fence token" must be supplied by the locking/leasing system and the storage system must be able to use the fence token to detect late writes. The framework makes the current fence token available in obj[OBJ.FENCE_TOKEN] so that developers writing Action code can implement the above advice. :param event: a str event. :param obj: a dict. """ fence_token = None try: # attempt to acquire the lease and execute the state transition fence_token = acquire_lease(self.correlation_id, self.steps, self.retries, primary=self.lease_primary) # 0 indicates system error, False indicates lease acquisition failure if fence_token == 0: self._queue_error( ERRORS.CACHE, 'System error acquiring primary=%s lease.' % self.lease_primary) self.lease_primary = not self.lease_primary fence_token = acquire_lease(self.correlation_id, self.steps, self.retries, primary=self.lease_primary) if not fence_token: # could not get the lease. something is going wrong self._queue_error(ERRORS.CACHE, 'Could not acquire lease. Retrying.') self._retry(obj) else: # lease acquired, execute the state transition # NOTE: idempotency # # In the happy path, each state transition is associated with a # correlation_id and a monotonically increasing step value. So, # a typical machine executes with the following values for # step, retry count, and fence token # # | correlation_id | steps | retries | fence | # +----------------+-------+---------+-------+ # | abc123 | 0 | 0 | 1 | # | abc123 | 1 | 0 | 2 | # | abc123 | 2 | 0 | 3 | # | abc123 | 3 | 0 | 4 | # # In the event of system or user-code error, the framework may # retry a given step multiple times # # | correlation_id | steps | retries | fence | # +----------------+-------+---------+-------+ # | abc123 | 0 | 0 | 1 | # | abc123 | 1 | 0 | 2 | # | abc123 | 1 | 1 | 3 | # retry # | abc123 | 1 | 2 | 4 | # retry # | abc123 | 2 | 0 | 5 | # | abc123 | 3 | 0 | 6 | # # so user-code should ensure it is idempotent (details below). # # The most interesting case arises when the AWS Lambda function # is re-executed with a duplicate message, poentially out-of-order. # This can happen for any number of reasons, since AWS Lambda ensure # at-least-once delivery of messages. In this case, one may see messages # like the following # # | correlation_id | steps | retries | fence | # +----------------+-------+---------+-------+ # | abc123 | 0 | 0 | 1 | # | abc123 | 1 | 0 | 2 | # | abc123 | 1 | 2 | 3 | # out-of-order retry # | abc123 | 1 | 1 | 4 | # out-of-order retry # | abc123 | 1 | 2 | 5 | # duplicate message # | abc123 | 2 | 0 | 6 | # | abc123 | 3 | 0 | 7 | # # User code can use (correlation_id, steps) as an idempotency token # for any resource unique to a specific state machine, and # (correlation_id, steps, fence) as an idempotency token for any # global resource. # # In the latter case, the global storage/resource system needs to # understand fence tokens. # make the fence token available if isinstance(fence_token, (int, long)): obj[OBJ.FENCE_TOKEN] = fence_token self._dispatch_and_retry(event, obj) finally: released = release_lease(self.correlation_id, self.steps, self.retries, fence_token, primary=self.lease_primary) if not released: self._queue_error(ERRORS.CACHE, 'Could not release lease.')