Example #1
class MemcachedNonceCache(object):
    """Object for managing a cache of used nonce values in memcached.

    This class allows easy timestamp-based management of client-generated
    nonces for hawkauthlib.

    It stores the nonces in memcached so that they can be shared between
    different webserver processes.  Each timestamp+nonce combo is stored
    under a key sha1(<timestamp>:<nonce>).
    """
    def __init__(self,
                 window=None,
                 get_time=None,
                 cache_server=None,
                 cache_key_prefix="noncecache:",
                 cache_pool_size=None,
                 cache_pool_timeout=60,
                 **kwds):
        # Memcached ttls are in integer seconds, so round up to the nearest second.
        if window is None:
            window = DEFAULT_TIMESTAMP_WINDOW
        else:
            window = int(math.ceil(window))
        self.window = window
        self.get_time = get_time or time.time
        self.mcclient = MemcachedClient(cache_server, cache_key_prefix,
                                        cache_pool_size, cache_pool_timeout)

    def __len__(self):
        raise NotImplementedError

    def check_nonce(self, timestamp, nonce):
        """Check if the given timestamp+nonce is fresh.

        This method checks that the given timestamp is within the configured
        time window, and that the given nonce has not previously been seen
        with that timestamp.  It returns True if the nonce is fresh and False
        if it is stale.

        Fresh nonces are stored in memcache so that subsequent checks of the
        same nonce will return False.
        """
        now = self.get_time()
        # Check if the timestamp is within the configured window.
        ts_min = now - self.window
        ts_max = now + self.window
        if not ts_min < timestamp < ts_max:
            return False
        # Check if it's in memcached, adding it if not.
        # Fortunately memcached 'add' has precisely the right semantics
        # of "create if not exists"
        key = urlsafe_b64encode(sha1("%d:%s" % (timestamp, nonce)).digest())
        try:
            if not self.mcclient.add(key, 1, time=self.window):
                return False
        except ValueError:
            return False
        # Successfully added, the nonce must be fresh.
        return True
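
A minimal usage sketch of the class above. It is not part of the original source: _FakeMemcachedClient is a hypothetical in-memory stand-in for MemcachedClient so the example runs without a memcached server, and check_nonce is restated as a stand-alone function with the same logic:

import time
from base64 import urlsafe_b64encode
from hashlib import sha1


class _FakeMemcachedClient(object):
    """Hypothetical in-memory stand-in for MemcachedClient (illustration only)."""

    def __init__(self):
        self._data = {}

    def add(self, key, value, time=0):
        # Same "create if not exists" semantics as the memcached 'add' command.
        if key in self._data:
            return False
        self._data[key] = value
        return True


def check_nonce(cache, window, timestamp, nonce):
    """Stand-alone restatement of MemcachedNonceCache.check_nonce."""
    now = time.time()
    ts_min = now - window
    ts_max = now + window
    if not ts_min < timestamp < ts_max:
        return False
    key = urlsafe_b64encode(sha1(("%d:%s" % (timestamp, nonce)).encode("utf8")).digest())
    return bool(cache.add(key, 1, time=window))


cache = _FakeMemcachedClient()
ts = int(time.time())
assert check_nonce(cache, 60, ts, "abc123")          # fresh nonce: accepted
assert not check_nonce(cache, 60, ts, "abc123")      # replayed nonce: rejected
assert not check_nonce(cache, 60, ts - 3600, "zzz")  # outside the window: rejected
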
class MemcachedStorage(SyncStorage):
    """Memcached caching wrapper for SyncStorage backends.

    The SyncStorage implementation wraps another storage backend to provide
    a caching layer.  You may specify the following arguments:

        * storage:  the underlying SyncStorage object that is to be wrapped.
        * cache_servers:  a list of memcached server URLs.
        * cached_collections:  a list of names of collections that should
                               be duplicated into memcache for fast access.
        * cache_only_collections:  a list of names of collections that should
                                   be stored *only* in memcached, and never
                                   written through to the backend.
        * cache_key_prefix:  a string to be prepended to all memcached keys,
                             useful for namespacing in shared cache setups.
        * cache_pool_size:  the maximum number of active memcache clients.
        * cache_pool_timeout:  the maximum lifetime of each memcache client.

    """

    def __init__(self, storage, cache_servers=None, cache_key_prefix="",
                 cache_pool_size=None, cache_pool_timeout=60,
                 cached_collections=(), cache_only_collections=(),
                 cache_lock=False, cache_lock_ttl=None, **kwds):
        self.storage = storage
        self.cache = MemcachedClient(cache_servers, cache_key_prefix,
                                     cache_pool_size, cache_pool_timeout)
        self.cached_collections = {}
        for collection in aslist(cached_collections):
            colmgr = CachedManager(self, collection)
            self.cached_collections[collection] = colmgr
        self.cache_only_collections = {}
        for collection in aslist(cache_only_collections):
            colmgr = CacheOnlyManager(self, collection)
            self.cache_only_collections[collection] = colmgr
        self.cache_lock = cache_lock
        if cache_lock_ttl is None:
            self.cache_lock_ttl = DEFAULT_CACHE_LOCK_TTL
        else:
            self.cache_lock_ttl = cache_lock_ttl
        # Keep a threadlocal to track the currently-held locks.
        # This is needed to make the read locking API reentrant.
        self._tldata = threading.local()

    def iter_cache_keys(self, userid):
        """Iterator over all potential cache keys for the given userid.

        This method yields all potential cache keys for the given userid,
        including their metadata key and the keys for any cached collections.
        The yielded keys do *not* include the key prefix, if any.
        """
        yield _key(userid, "metadata")
        for colmgr in self.cached_collections.itervalues():
            yield colmgr.get_key(userid)
        for colmgr in self.cache_only_collections.itervalues():
            yield colmgr.get_key(userid)

    def _get_collection_manager(self, collection):
        """Get a collection-management object for the named collection.

        This class delegates all collection-level operations to a "collection
        manager" object.  The manager for a given collection will be different
        depending on the required caching characteristics, and this method
        gets and returns an appropriate manager for the named collection.
        """
        try:
            return self.cached_collections[collection]
        except KeyError:
            try:
                return self.cache_only_collections[collection]
            except KeyError:
                return UncachedManager(self, collection)

    #
    # APIs for collection-level locking.
    #
    # This class provides the option of locking at the memcache level rather
    # than calling through to the underlying storage engine.  Such locks
    # are just simple mutex keys in memcache, one per collection.  If you
    # can successfully add the key then you get the lock, if it already
    # exists then someone else holds the lock.  If you crash while holding
    # the lock, it will eventually expire.
    #

    def lock_for_read(self, userid, collection):
        """Acquire a shared read lock on the named collection."""
        if self.cache_lock or collection in self.cache_only_collections:
            return self._lock_in_memcache(userid, collection)
        else:
            return self.storage.lock_for_read(userid, collection)

    def lock_for_write(self, userid, collection):
        """Acquire an exclusive write lock on the named collection."""
        if self.cache_lock or collection in self.cache_only_collections:
            return self._lock_in_memcache(userid, collection)
        else:
            return self.storage.lock_for_write(userid, collection)

    @contextlib.contextmanager
    def _lock_in_memcache(self, userid, collection):
        """Helper method to take a memcache-level lock on a collection."""
        # Use a thread-local set of held locks to make this reentrant.
        try:
            locked_collections = self._tldata.locked_collections
        except AttributeError:
            locked_collections = self._tldata.locked_collections = set()
        if (userid, collection) in locked_collections:
            yield None
            return
        # Take the lock in memcached.
        ttl = self.cache_lock_ttl
        now = time.time()
        key = _key(userid, "lock", collection)
        if not self.cache.add(key, True, time=ttl):
            raise ConflictError
        locked_collections.add((userid, collection))
        try:
            yield None
        finally:
            locked_collections.remove((userid, collection))
            if time.time() - now >= ttl:
                msg = "Lock expired while we were holding it"
                raise RuntimeError(msg)
            self.cache.delete(key)

    #
    # APIs to operate on the entire storage.
    #

    def get_storage_timestamp(self, userid):
        """Returns the last-modified timestamp for the entire storage."""
        # Try to use the cached value.
        ts = self._get_metadata(userid)["modified"]
        # Fall back to live data if it's dirty.
        if ts is None:
            ts = self.storage.get_storage_timestamp(userid)
            for colmgr in self.cache_only_collections.itervalues():
                try:
                    ts = max(ts, colmgr.get_timestamp(userid))
                except CollectionNotFoundError:
                    pass
        return ts

    def get_collection_timestamps(self, userid):
        """Returns the collection timestamps for a user."""
        # Try to use the cached value.
        timestamps = self._get_metadata(userid)["collections"]
        # Fall back to live data for any collections that are dirty.
        for collection, ts in timestamps.items():
            if ts is None:
                colmgr = self._get_collection_manager(collection)
                try:
                    timestamps[collection] = colmgr.get_timestamp(userid)
                except CollectionNotFoundError:
                    del timestamps[collection]
        return timestamps

    def get_collection_counts(self, userid):
        """Returns the collection counts."""
        # Read most of the data from the database.
        counts = self.storage.get_collection_counts(userid)
        # Add in counts for collections stored only in memcache.
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(userid)["items"]
            except CollectionNotFoundError:
                pass
            else:
                counts[colmgr.collection] = len(items)
        return counts

    def get_collection_sizes(self, userid):
        """Returns the total size for each collection."""
        # Read most of the data from the database.
        sizes = self.storage.get_collection_sizes(userid)
        # Add in sizes for collections stored only in memcache.
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(userid)["items"]
                payloads = (item.get("payload", "") for item in items)
                sizes[colmgr.collection] = sum(len(p) for p in payloads)
            except CollectionNotFoundError:
                pass
        # Since we've just gone to the trouble of recalculating sizes,
        # we might as well update the cached total size as well.
        self._update_total_size(userid, sum(sizes.itervalues()))
        return sizes

    def get_total_size(self, userid, recalculate=False):
        """Returns the total size of a user's storage data."""
        return self._get_metadata(userid, recalculate)["size"]

    def delete_storage(self, userid):
        """Removes all data for the user."""
        for key in self.iter_cache_keys(userid):
            self.cache.delete(key)
        self.storage.delete_storage(userid)

    #
    # APIs to operate on an individual collection
    #

    def get_collection_timestamp(self, userid, collection):
        """Returns the last-modified timestamp for the named collection."""
        # It's likely cheaper to read all cached timestamps out of memcache
        # than to read just the single timestamp from the database.
        timestamps = self.get_collection_timestamps(userid)
        try:
            ts = timestamps[collection]
        except KeyError:
            raise CollectionNotFoundError
        # Refresh from the live data if dirty.
        if ts is None:
            colmgr = self._get_collection_manager(collection)
            ts = colmgr.get_timestamp(userid)
        return ts

    def get_items(self, userid, collection, **kwds):
        """Returns items from a collection"""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_items(userid, **kwds)

    def get_item_ids(self, userid, collection, **kwds):
        """Returns item idss from a collection"""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item_ids(userid, **kwds)

    def set_items(self, userid, collection, items):
        """Creates or updates multiple items in a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(userid, collection) as update:
            ts = colmgr.set_items(userid, items)
            size = sum(len(item.get("payload", "")) for item in items)
            update(ts, ts, size)
            return ts

    def delete_collection(self, userid, collection):
        """Deletes an entire collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(userid, collection) as update:
            ts = colmgr.del_collection(userid)
            update(ts, None)
            return ts

    def delete_items(self, userid, collection, items):
        """Deletes multiple items from a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(userid, collection) as update:
            ts = colmgr.del_items(userid, items)
            update(ts, ts)
            return ts

    #
    # Items APIs
    #

    def get_item_timestamp(self, userid, collection, item):
        """Returns the last-modified timestamp for the named item."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item_timestamp(userid, item)

    def get_item(self, userid, collection, item):
        """Returns one item from a collection."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item(userid, item)

    def set_item(self, userid, collection, item, data):
        """Creates or updates a single item in a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(userid, collection) as update:
            res = colmgr.set_item(userid, item, data)
            size = len(data.get("payload", ""))
            update(res["modified"], res["modified"], size)
            return res

    def delete_item(self, userid, collection, item):
        """Deletes a single item from a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(userid, collection) as update:
            ts = colmgr.del_item(userid, item)
            update(ts, ts)
            return ts

    #
    # Administrative/maintenance methods.
    #

    def purge_expired_items(self, grace_period=0, max_per_loop=1000):
        """Purges items with an expired TTL from the database."""
        # We have no way to purge expired items from memcached, as
        # there's no way to enumerate all the userids.  Purging is
        # instead done on each write for cached collections, with the
        # expectation that this will be cheap due to low item count.
        # Therefore, the only thing we can do here is pass on the call.
        return self.storage.purge_expired_items(grace_period, max_per_loop)

    #
    #  Private APIs for managing the cached metadata
    #

    def _get_metadata(self, userid, recalculate_size=False):
        """Get the metadata dict, recalculating things if necessary.

        This method pulls the dict of metadata out of memcache and returns it.
        If there is no information yet in memcache then it pulls the data from
        the underlying storage, caches it and then returns it.

        If recalculate_size is given and True, then the cache size value will
        be recalculated from the store if it is more than an hour old.
        """
        key = _key(userid, "metadata")
        data, casid = self.cache.gets(key)
        # If there is no cached metadata, initialize it from the storage.
        # Use CAS to avoid overwriting other changes, but don't error out if
        # the write fails - it just means that someone else beat us to it.
        if data is None:
            # Get the mapping of collection names to timestamps.
            # Make sure to include any cache-only collections.
            timestamps = self.storage.get_collection_timestamps(userid)
            for colmgr in self.cached_collections.itervalues():
                if colmgr.collection not in timestamps:
                    try:
                        ts = colmgr.get_timestamp(userid)
                        timestamps[colmgr.collection] = ts
                    except CollectionNotFoundError:
                        pass
            # Get the storage-level modified time.
            # Make sure it's not less than any collection-level timestamp.
            ts = self.storage.get_storage_timestamp(userid)
            if timestamps:
                ts = max(ts, max(timestamps.itervalues()))
            # Calculate the total size if requested,
            # but don't bother if it's not necessary.
            if not recalculate_size:
                last_size_recalc = 0
                size = 0
            else:
                last_size_recalc = int(time.time())
                size = self._recalculate_total_size(userid)
            # Store it all back into the cache.
            data = {
                "size": size,
                "last_size_recalc": last_size_recalc,
                "modified": ts,
                "collections": timestamps,
            }
            self.cache.cas(key, data, casid)
        # Recalculate the size if it appears to be out of date.
        # Use CAS to avoid clobbering changes but don't let it fail us.
        elif recalculate_size:
            recalc_period = time.time() - data["last_size_recalc"]
            if recalc_period > SIZE_RECALCULATION_PERIOD:
                data["last_size_recalc"] = int(time.time())
                data["size"] = self._recalculate_total_size(userid)
                self.cache.cas(key, data, casid)
        return data

    def _update_total_size(self, userid, size):
        """Update the cached value for total storage size."""
        key = _key(userid, "metadata")
        data, casid = self.cache.gets(key)
        if data is None:
            self._get_metadata(userid)
            data, casid = self.cache.gets(key)
        data["last_size_recalc"] = int(time.time())
        data["size"] = size
        self.cache.cas(key, data, casid)

    def _recalculate_total_size(self, userid):
        """Re-calculate total size from the database."""
        size = self.storage.get_total_size(userid)
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(userid)["items"]
                payloads = (item.get("payload", "") for item in items)
                size += sum(len(p) for p in payloads)
            except CollectionNotFoundError:
                pass
        return size

    @contextlib.contextmanager
    def _mark_collection_dirty(self, userid, collection):
        """Context manager for marking collections as dirty during write.

        To prevent the cache from getting out of sync with the underlying store,
        it is necessary to mark a collection as dirty before performing any
        modifications on it.  This is a handy context manager that can take
        care of that, as well as update the timestamps with new results when
        the modification is complete.

        The context object associated with this method is a callback function
        that can be used to update the stored metadata.  It accepts the top-
        level storage timestamp, collection-level timestamp, and a total size
        increment as its three arguments.  Example usage::

            with self._mark_collection_dirty(userid, collection) as update:
                colobj = self._get_collection_manager(collection)
                ts = colobj.set_item(userid, "test", {"payload": "TEST"})
                update(ts, ts, len("TEST"))

        """
        # Get the old values from the metadata.
        # We can't call _get_metadata directly because we also want the casid.
        key = _key(userid, "metadata")
        data, casid = self.cache.gets(key)
        if data is None:
            # No cached data, so refresh.
            self._get_metadata(userid)
            data, casid = self.cache.gets(key)

        # Write None into the metadata to mark things as dirty.
        ts = data["modified"]
        col_ts = data["collections"].get(collection)
        data["modified"] = None
        data["collections"][collection] = None
        if not self.cache.cas(key, data, casid):
            raise ConflictError

        # Define the callback function for the calling code to use.
        # We also use this function internally to recover from errors.
        update_was_called = []

        def update(ts=ts, col_ts=col_ts, size_incr=0):
            assert not update_was_called
            update_was_called.append(True)
            data["modified"] = ts
            if col_ts is None:
                del data["collections"][collection]
            else:
                data["collections"][collection] = col_ts
            data["size"] += size_incr
            # We assume the write lock is held to avoid conflicting changes.
            # Sadly, using CAS again would require another round-trip.
            self.cache.set(key, data)

        # Yield out to the calling code.
        # It can call the yielded function to provide new metadata.
        # If they don't call it, then we cannot make any assumptions about
        # the consistency of the cached data and must leave things marked
        # as dirty until another write cleans it up.
        try:
            yield update
        except StorageError:
            # If a storage-related error occurs, then we know that the
            # operation wrapped by the calling code did not succeed.
            # It's therefore safe to roll back to previous values.
            if not update_was_called:
                update()
            raise
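
The collection-level locking described in the comments above reduces to a plain memcached mutex: 'add' a key with a TTL to acquire the lock, 'delete' it to release, and let the TTL reclaim it if the holder crashes. Below is a minimal stand-alone sketch of that pattern (not part of the original source); _FakeCache is a hypothetical in-memory stand-in for the memcached client so the example runs without a server:

import contextlib
import time


class ConflictError(Exception):
    pass


class _FakeCache(object):
    """Hypothetical in-memory stand-in for the memcached add/delete calls."""

    def __init__(self):
        self._data = {}

    def add(self, key, value, time=0):
        # memcached 'add' only succeeds if the key does not already exist.
        if key in self._data:
            return False
        self._data[key] = value
        return True

    def delete(self, key):
        self._data.pop(key, None)


@contextlib.contextmanager
def memcache_lock(cache, key, ttl=60):
    """Add-to-acquire / delete-to-release mutex, reclaimed by the TTL on crash."""
    start = time.time()
    if not cache.add(key, True, time=ttl):
        raise ConflictError("lock %r is already held" % (key,))
    try:
        yield
    finally:
        if time.time() - start >= ttl:
            # The key may already have expired and been grabbed by someone else.
            raise RuntimeError("Lock expired while we were holding it")
        cache.delete(key)


cache = _FakeCache()
with memcache_lock(cache, "42:lock:bookmarks"):
    pass  # protected work happens here; the lock key exists for the duration

try:
    with memcache_lock(cache, "42:lock:bookmarks"):
        with memcache_lock(cache, "42:lock:bookmarks"):  # second acquire fails
            pass
except ConflictError:
    pass  # expected: the lock was already held
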
class MemcachedStorage(SyncStorage):
    """Memcached caching wrapper for SyncStorage backends.

    The SyncStorage implementation wraps another storage backend to provide
    a caching layer.  You may specify the following arguments:

        * storage:  the underlying SyncStorage object that is to be wrapped.
        * cache_servers:  a list of memcached server URLs.
        * cached_collections:  a list of names of collections that should
                               be duplicated into memcache for fast access.
        * cache_only_collections:  a list of names of collections that should
                                   be stored *only* in memcached, and never
                                   written through to the backend.
        * cache_key_prefix:  a string to be prepended to all memcached keys,
                             useful for namespacing in shared cache setups.
        * cache_pool_size:  the maximum number of active memcache clients.
        * cache_pool_timeout:  the maximum lifetime of each memcache client.

    """

    def __init__(self, storage, cache_servers=None, cache_key_prefix="",
                 cache_pool_size=None, cache_pool_timeout=60,
                 cached_collections=(), cache_only_collections=(),
                 cache_lock=False, cache_lock_ttl=None, **kwds):
        self.storage = storage
        self.cache = MemcachedClient(cache_servers, cache_key_prefix,
                                     cache_pool_size, cache_pool_timeout)
        self.cached_collections = {}
        for collection in aslist(cached_collections):
            colmgr = CachedManager(self, collection)
            self.cached_collections[collection] = colmgr
        self.cache_only_collections = {}
        for collection in aslist(cache_only_collections):
            colmgr = CacheOnlyManager(self, collection)
            self.cache_only_collections[collection] = colmgr
        self.cache_lock = cache_lock
        if cache_lock_ttl is None:
            self.cache_lock_ttl = DEFAULT_CACHE_LOCK_TTL
        else:
            self.cache_lock_ttl = cache_lock_ttl
        # Keep a threadlocal to track the currently-held locks.
        # This is needed to make the read locking API reentrant.
        self._tldata = threading.local()

    def iter_cache_keys(self, user):
        """Iterator over all potential cache keys for the given user.

        This method yields all potential cache keys for the given user,
        including their metadata key and the keys for any cached collections.
        The yielded keys do *not* include the key prefix, if any.
        """
        yield _key(user["uid"], "metadata")
        for colmgr in self.cached_collections.itervalues():
            for key in colmgr.iter_cache_keys(user):
                yield key
        for colmgr in self.cache_only_collections.itervalues():
            for key in colmgr.iter_cache_keys(user):
                yield key

    def _get_collection_manager(self, collection):
        """Get a collection-management object for the named collection.

        This class delegates all collection-level operations to a "collection
        manager" object.  The manager for a given collection will be different
        depending on the required caching characteristics, and this method
        gets and returns an appropriate manager for the named collection.
        """
        try:
            return self.cached_collections[collection]
        except KeyError:
            try:
                return self.cache_only_collections[collection]
            except KeyError:
                return UncachedManager(self, collection)

    #
    # APIs for collection-level locking.
    #
    # This class provides the option of locking at the memcache level rather
    # than calling through to the underlying storage engine.  Such locks
    # are just simple mutex keys in memcache, one per collection.  If you
    # can successfully add the key then you get the lock, if it already
    # exists then someone else holds the lock.  If you crash while holding
    # the lock, it will eventually expire.
    #

    def lock_for_read(self, user, collection):
        """Acquire a shared read lock on the named collection."""
        if self.cache_lock or collection in self.cache_only_collections:
            return self._lock_in_memcache(user, collection)
        else:
            return self.storage.lock_for_read(user, collection)

    def lock_for_write(self, user, collection):
        """Acquire an exclusive write lock on the named collection."""
        if self.cache_lock or collection in self.cache_only_collections:
            return self._lock_in_memcache(user, collection)
        else:
            return self.storage.lock_for_write(user, collection)

    @contextlib.contextmanager
    def _lock_in_memcache(self, user, collection):
        """Helper method to take a memcache-level lock on a collection."""
        userid = user["uid"]
        # Use a thread-local set of held locks to make this reentrant.
        try:
            locked_collections = self._tldata.locked_collections
        except AttributeError:
            locked_collections = self._tldata.locked_collections = set()
        if (userid, collection) in locked_collections:
            yield None
            return
        # Take the lock in memcached.
        ttl = self.cache_lock_ttl
        now = time.time()
        key = _key(userid, "lock", collection)
        if not self.cache.add(key, True, time=ttl):
            raise ConflictError
        locked_collections.add((userid, collection))
        try:
            yield None
        finally:
            locked_collections.remove((userid, collection))
            if time.time() - now >= ttl:
                msg = "Lock expired while we were holding it"
                raise RuntimeError(msg)
            self.cache.delete(key)

    #
    # APIs to operate on the entire storage.
    #

    def get_storage_timestamp(self, user):
        """Returns the last-modified timestamp for the entire storage."""
        # Try to use the cached value.
        ts = self._get_metadata(user)["modified"]
        # Fall back to live data if it's dirty.
        if ts is None:
            ts = self.storage.get_storage_timestamp(user)
            for colmgr in self.cache_only_collections.itervalues():
                try:
                    ts = max(ts, colmgr.get_timestamp(user))
                except CollectionNotFoundError:
                    pass
        return ts

    def get_collection_timestamps(self, user):
        """Returns the collection timestamps for a user."""
        # Try to use the cached value.
        timestamps = self._get_metadata(user)["collections"]
        # Fall back to live data for any collections that are dirty.
        for collection, ts in timestamps.items():
            if ts is None:
                colmgr = self._get_collection_manager(collection)
                try:
                    timestamps[collection] = colmgr.get_timestamp(user)
                except CollectionNotFoundError:
                    del timestamps[collection]
        return timestamps

    def get_collection_counts(self, user):
        """Returns the collection counts."""
        # Read most of the data from the database.
        counts = self.storage.get_collection_counts(user)
        # Add in counts for collections stored only in memcache.
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(user)["items"]
            except CollectionNotFoundError:
                pass
            else:
                counts[colmgr.collection] = len(items)
        return counts

    def get_collection_sizes(self, user):
        """Returns the total size for each collection."""
        # Read most of the data from the database.
        sizes = self.storage.get_collection_sizes(user)
        # Add in sizes for collections stored only in memcache.
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(user)["items"]
                payloads = (item.get("payload", "") for item in items)
                sizes[colmgr.collection] = sum(len(p) for p in payloads)
            except CollectionNotFoundError:
                pass
        # Since we've just gone to the trouble of recalculating sizes,
        # we might as well update the cached total size as well.
        self._update_total_size(user, sum(sizes.itervalues()))
        return sizes

    def get_total_size(self, user, recalculate=False):
        """Returns the total size of a user's storage data."""
        return self._get_metadata(user, recalculate)["size"]

    def delete_storage(self, user):
        """Removes all data for the user."""
        for key in self.iter_cache_keys(user):
            self.cache.delete(key)
        self.storage.delete_storage(user)

    #
    # APIs to operate on an individual collection
    #

    def get_collection_timestamp(self, user, collection):
        """Returns the last-modified timestamp for the named collection."""
        # It's likely cheaper to read all cached timestamps out of memcache
        # than to read just the single timestamp from the database.
        timestamps = self.get_collection_timestamps(user)
        try:
            ts = timestamps[collection]
        except KeyError:
            raise CollectionNotFoundError
        # Refresh from the live data if dirty.
        if ts is None:
            colmgr = self._get_collection_manager(collection)
            ts = colmgr.get_timestamp(user)
        return ts

    def get_items(self, user, collection, **kwds):
        """Returns items from a collection"""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_items(user, **kwds)

    def get_item_ids(self, user, collection, **kwds):
        """Returns item idss from a collection"""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item_ids(user, **kwds)

    def set_items(self, user, collection, items):
        """Creates or updates multiple items in a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.set_items(user, items)
            size = sum(len(item.get("payload", "")) for item in items)
            update(ts, ts, size)
            return ts

    def delete_collection(self, user, collection):
        """Deletes an entire collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.del_collection(user)
            update(ts, None)
            return ts

    def delete_items(self, user, collection, items):
        """Deletes multiple items from a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.del_items(user, items)
            update(ts, ts)
            return ts

    def create_batch(self, user, collection):
        """Creates batch for a give user's collection."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.create_batch(user)

    def valid_batch(self, user, collection, batchid):
        """Verifies that a batch ID is valid"""
        colmgr = self._get_collection_manager(collection)
        return colmgr.valid_batch(user, batchid)

    def append_items_to_batch(self, user, collection, batchid, items):
        """Appends items to the pending batch."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.append_items_to_batch(user, batchid, items)
            # Account for the size of the new items as they come in,
            # since that's the only opportunity we have to see them.
            # Don't update the timestamp yet though, as they're not committed.
            size = sum(len(item.get("payload", "")) for item in items)
            update(size_incr=size)
            return ts

    def apply_batch(self, user, collection, batchid):
        """Applies the batch"""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.apply_batch(user, batchid)
            update(ts, ts)
            return ts

    def close_batch(self, user, collection, batchid):
        """Closes the pending batch for a user's collection."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.close_batch(user, batchid)

    #
    # Items APIs
    #

    def get_item_timestamp(self, user, collection, item):
        """Returns the last-modified timestamp for the named item."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item_timestamp(user, item)

    def get_item(self, user, collection, item):
        """Returns one item from a collection."""
        colmgr = self._get_collection_manager(collection)
        return colmgr.get_item(user, item)

    def set_item(self, user, collection, item, data):
        """Creates or updates a single item in a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            res = colmgr.set_item(user, item, data)
            size = len(data.get("payload", ""))
            update(res["modified"], res["modified"], size)
            return res

    def delete_item(self, user, collection, item):
        """Deletes a single item from a collection."""
        colmgr = self._get_collection_manager(collection)
        with self._mark_collection_dirty(user, collection) as update:
            ts = colmgr.del_item(user, item)
            update(ts, ts)
            return ts

    def is_migrating(self, user):
        """Is the user in migration?"""
        return self.storage.is_migrating(user)

    #
    # Administrative/maintenance methods.
    #

    def purge_expired_items(self, grace_period=0, max_per_loop=1000):
        """Purges items with an expired TTL from the database."""
        # We have no way to purge expired items from memcached, as
        # there's no way to enumerate all the userids.  Purging is
        # instead done on each write for cached collections, with the
        # expectation that this will be cheap due to low item count.
        # Therefore, the only thing we can do here is pass on the call.
        return self.storage.purge_expired_items(grace_period, max_per_loop)

    #
    #  Private APIs for managing the cached metadata
    #

    def _get_metadata(self, user, recalculate_size=False):
        """Get the metadata dict, recalculating things if necessary.

        This method pulls the dict of metadata out of memcache and returns it.
        If there is no information yet in memcache then it pulls the data from
        the underlying storage, caches it and then returns it.

        If recalculate_size is given and True, then the cache size value will
        be recalculated from the store if it is more than an hour old.
        """
        key = _key(user["uid"], "metadata")
        data, casid = self.cache.gets(key)
        # If there is no cached metadata, initialize it from the storage.
        # Use CAS to avoid overwriting other changes, but don't error out if
        # the write fails - it just means that someone else beat us to it.
        if data is None:
            # Get the mapping of collection names to timestamps.
            # Make sure to include any cache-only collections.
            timestamps = self.storage.get_collection_timestamps(user)
            for colmgr in self.cached_collections.itervalues():
                if colmgr.collection not in timestamps:
                    try:
                        ts = colmgr.get_timestamp(user)
                        timestamps[colmgr.collection] = ts
                    except CollectionNotFoundError:
                        pass
            # Get the storage-level modified time.
            # Make sure it's not less than any collection-level timestamp.
            ts = self.storage.get_storage_timestamp(user)
            if timestamps:
                ts = max(ts, max(timestamps.itervalues()))
            # Calculate the total size if requested,
            # but don't bother if it's not necessary.
            if not recalculate_size:
                last_size_recalc = 0
                size = 0
            else:
                last_size_recalc = int(time.time())
                size = self._recalculate_total_size(user)
            # Store it all back into the cache.
            data = {
                "size": size,
                "last_size_recalc": last_size_recalc,
                "modified": ts,
                "collections": timestamps,
            }
            self.cache.cas(key, data, casid)
        # Recalculate the size if it appears to be out of date.
        # Use CAS to avoid clobbering changes but don't let it fail us.
        elif recalculate_size:
            recalc_period = time.time() - data["last_size_recalc"]
            if recalc_period > SIZE_RECALCULATION_PERIOD:
                data["last_size_recalc"] = int(time.time())
                data["size"] = self._recalculate_total_size(user)
                self.cache.cas(key, data, casid)
        return data

    def _update_total_size(self, user, size):
        """Update the cached value for total storage size."""
        key = _key(user["uid"], "metadata")
        data, casid = self.cache.gets(key)
        if data is None:
            self._get_metadata(user)
            data, casid = self.cache.gets(key)
        data["last_size_recalc"] = int(time.time())
        data["size"] = size
        self.cache.cas(key, data, casid)

    def _recalculate_total_size(self, user):
        """Re-calculate total size from the database."""
        size = self.storage.get_total_size(user)
        for colmgr in self.cache_only_collections.itervalues():
            try:
                items = colmgr.get_items(user)["items"]
                payloads = (item.get("payload", "") for item in items)
                size += sum(len(p) for p in payloads)
            except CollectionNotFoundError:
                pass
        return size

    @contextlib.contextmanager
    def _mark_collection_dirty(self, user, collection):
        """Context manager for marking collections as dirty during write.

        To prevent the cache from getting out of sync with the underlying store,
        it is necessary to mark a collection as dirty before performing any
        modifications on it.  This is a handy context manager that can take
        care of that, as well as update the timestamps with new results when
        the modification is complete.

        The context object associated with this method is a callback function
        that can be used to update the stored metadata.  It accepts the top-
        level storage timestamp, collection-level timestamp, and a total size
        increment as its three arguments.  Example usage::

            with self._mark_collection_dirty(user, collection) as update:
                colobj = self._get_collection_manager(collection)
                ts = colobj.set_item(user, "test", {"payload": "TEST"})
                update(ts, ts, len("TEST"))

        """
        # Get the old values from the metadata.
        # We can't call _get_metadata directly because we also want the casid.
        key = _key(user["uid"], "metadata")
        data, casid = self.cache.gets(key)
        if data is None:
            # No cached data, so refresh.
            self._get_metadata(user)
            data, casid = self.cache.gets(key)

        # Write None into the metadata to mark things as dirty.
        ts = data["modified"]
        col_ts = data["collections"].get(collection)
        data["modified"] = None
        data["collections"][collection] = None
        if not self.cache.cas(key, data, casid):
            raise ConflictError

        # Define the callback function for the calling code to use.
        # We also use this function internally to recover from errors.
        update_was_called = []

        def update(ts=ts, col_ts=col_ts, size_incr=0):
            assert not update_was_called
            update_was_called.append(True)
            data["modified"] = ts
            if col_ts is None:
                del data["collections"][collection]
            else:
                data["collections"][collection] = col_ts
            data["size"] += size_incr
            # We assume the write lock is held to avoid conflicting changes.
            # Sadly, using CAS again would require another round-trip.
            self.cache.set(key, data)

        # Yield out to the calling code.
        # It can call the yielded function to provide new metadata.
        # If they don't call it, then we cannot make any assumptions about
        # the consistency of the cached data and must leave things marked
        # as dirty until another write cleans it up.
        try:
            yield update
        except StorageError:
            # If a storage-related error occurs, then we know that the
            # operation wrapped by the calling code did not succeed.
            # It's therefore safe to roll back to previous values.
            if not update_was_called:
                update()
            raise
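
The private metadata helpers above (_get_metadata, _update_total_size and _mark_collection_dirty) all use the same gets/cas read-modify-write pattern to avoid clobbering concurrent updates. A condensed sketch of just that pattern follows (not part of the original source); _FakeCASCache is a hypothetical in-memory stand-in for the memcached client's gets/cas calls:

class ConflictError(Exception):
    pass


class _FakeCASCache(object):
    """Hypothetical in-memory stand-in for the memcached gets/cas calls."""

    def __init__(self):
        self._data = {}  # key -> (value, casid)

    def gets(self, key):
        return self._data.get(key, (None, None))

    def cas(self, key, value, casid):
        _, current = self._data.get(key, (None, None))
        if current != casid:
            return False  # someone else wrote in between; the caller decides
        self._data[key] = (value, (casid or 0) + 1)
        return True


def mark_collection_dirty(cache, key, collection):
    """CAS-protected read-modify-write, as in _mark_collection_dirty above."""
    data, casid = cache.gets(key)
    data["modified"] = None
    data["collections"][collection] = None
    if not cache.cas(key, data, casid):
        raise ConflictError("metadata changed underneath us")
    return data


cache = _FakeCASCache()
# Seed some metadata (in this stub, cas with casid=None creates the key).
cache.cas("42:metadata", {"modified": 100, "collections": {"bookmarks": 100}}, None)
mark_collection_dirty(cache, "42:metadata", "bookmarks")
assert cache.gets("42:metadata")[0]["collections"]["bookmarks"] is None
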
Example #5
class MemcachedNonceCache(object):
    """Object for managing a cache of used nonce values in memcached.

    This class allows easy timestamp-based management of client-generated
    nonces according to the rules of RFC-TODO:

        * Maintain a measure of clock skew for each MAC id.
        * Reject nonces with a timestamp outside the configured range.
        * Reject nonces that have already been seen.

    It stores the nonces in memcached so that they can be shared between
    different webserver processes.  The clock-skew for each id token is
    stored under the key sha1(<id>:skew), while each nonce seen for the id
    is marked by a key sha1(<id>:nonce:<timestamp>:<nonce>).

    NOTE: the "MAC id" here corresponds to the full authentication token
    issued by the tokenserver, not to the numeric userid of an individual
    user.  So it is entirely possible to have multiple skew records for
    each user, corresponding to different active tokens.
    """

    def __init__(self, nonce_ttl=None, id_ttl=None, cache_servers=None,
                 cache_key_prefix="noncecache:", cache_pool_size=None,
                 cache_pool_timeout=60, **kwds):
        # Memcached ttls are in integer seconds, so round up to the nearest second.
        if nonce_ttl is None:
            nonce_ttl = DEFAULT_NONCE_TTL
        else:
            nonce_ttl = int(math.ceil(nonce_ttl))
        if id_ttl is None:
            id_ttl = DEFAULT_ID_TTL
        else:
            id_ttl = int(math.ceil(id_ttl))
        self.nonce_ttl = nonce_ttl
        self.id_ttl = id_ttl
        self.mcclient = MemcachedClient(cache_servers, cache_key_prefix,
                                        cache_pool_size, cache_pool_timeout)

    def _key(self, *names):
        """Get a memcached key built from the given component names.

        This method returns the memcached key to use for the given component
        names, by concatenating them together and then hashing them.  The
        hashing serves both to ensure confidentiality of the macauth tokens
        stored in memcached, and to reduce the size of the keys.
        """
        return urlsafe_b64encode(sha1(":".join(names)).digest())

    def check_nonce(self, id, timestamp, nonce):
        """Check if the given timestamp+nonce is fresh for the given id.

        This method checks that the given timestamp+nonce has not previously
        been seen for the given id.  It returns True if the nonce is fresh
        and False if not.

        Fresh nonces are added to the cache, so that subsequent checks of the
        same nonce will return False.
        """
        # We want to fetch the recorded clock skew for this id, along with
        # any existing cache entry for the provided nonce.
        key_skew = self._key(id, "skew")
        key_nonce = self._key(id, "nonce", str(timestamp), nonce)
        # Use get_multi to fetch both keys in a single request.
        # If the data appears to be corrupted then fail out for safety.
        try:
            cached = self.mcclient.get_multi([key_skew, key_nonce])
        except ValueError:
            return False
        # If the nonce appears in the cache, it must be stale.
        if key_nonce in cached:
            return False
        # If we've never recorded a clock skew for this id, record it now.
        try:
            skew = cached[key_skew]
        except KeyError:
            skew = int(time.time() - timestamp)
            self.mcclient.add(key_skew, skew, time=self.id_ttl)
        # If the adjusted timestamp is too old or too new, it is stale.
        # XXX TODO: we should use a monotonic clock here.
        if abs(timestamp + skew - time.time()) >= self.nonce_ttl:
            return False
        # The nonce is fresh, add it into the cache.
        self.mcclient.add(key_nonce, True, time=self.nonce_ttl)
        return True
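
A stand-alone walk-through of the clock-skew logic in check_nonce above (not part of the original source). The memcached keys are replaced by a plain dict and set, and all numbers are invented for illustration:

# The first request seen for a MAC id records that client's clock skew, and
# later requests are judged against the server clock adjusted by that skew.
NONCE_TTL = 30
SERVER_NOW = 1000000.0  # pretend server clock
CLIENT_OFFSET = -120    # this client's clock runs two minutes behind

skew_by_id = {}
seen_nonces = set()


def check_nonce(id, timestamp, nonce, now):
    if (id, timestamp, nonce) in seen_nonces:
        return False  # replayed nonce
    skew = skew_by_id.setdefault(id, int(now - timestamp))
    if abs(timestamp + skew - now) >= NONCE_TTL:
        return False  # adjusted timestamp too far from the server clock
    seen_nonces.add((id, timestamp, nonce))
    return True


assert check_nonce("token-A", SERVER_NOW + CLIENT_OFFSET, "n1", SERVER_NOW)
assert not check_nonce("token-A", SERVER_NOW + CLIENT_OFFSET, "n1", SERVER_NOW)      # replay
assert check_nonce("token-A", SERVER_NOW + CLIENT_OFFSET + 5, "n2", SERVER_NOW + 5)  # later request, same skew
assert not check_nonce("token-A", SERVER_NOW - 3600, "n3", SERVER_NOW)               # stale timestamp
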