def __init__(self, **options):
    """Initialize the quota backend and connect to its Redis cluster.

    ``options`` may contain a ``hosts`` map for the rb ``Cluster``; when no
    options are given, defaults are inherited from ``SENTRY_REDIS_OPTIONS``.
    """
    if not options:
        # Inherit default options from REDIS_OPTIONS. Copy the dict so the
        # setdefault below cannot mutate the global settings object.
        options = settings.SENTRY_REDIS_OPTIONS.copy()
    # Base class is initialized before the hosts default is injected, so it
    # only ever sees caller-supplied options (preserves original ordering).
    super(RedisQuota, self).__init__(**options)
    options.setdefault('hosts', {0: {}})
    self.cluster = Cluster(options['hosts'])
class RedisRateLimiter(RateLimiter):
    """Rate limiter that counts hits per (key, project) in fixed time windows
    stored in Redis."""

    # Length of a rate-limit window, in seconds.
    ttl = 60

    def __init__(self, **options):
        if not options:
            # Inherit default options from REDIS_OPTIONS. Copy the dict so
            # the setdefault below cannot mutate the global settings object.
            options = settings.SENTRY_REDIS_OPTIONS.copy()
        options.setdefault('hosts', {0: {}})
        self.cluster = Cluster(options['hosts'])

    def validate(self):
        """Ping every host in the cluster; raise ``InvalidConfiguration``
        when any host is unreachable."""
        try:
            with self.cluster.all() as client:
                client.ping()
        except Exception as e:
            raise InvalidConfiguration(unicode(e))

    def is_limited(self, project, key, limit):
        """Record one hit and return True when the counter for the current
        window exceeds ``limit``."""
        # Window identifier changes every ``ttl`` seconds, so counters reset
        # automatically at window boundaries.
        key = 'rl:%s:%s:%s' % (
            key, project.id, int(time() / self.ttl)
        )

        with self.cluster.map() as client:
            proj_result = client.incr(key)
            # Expire the key so stale windows do not accumulate.
            client.expire(key, self.ttl)

        return proj_result.value > limit
def __init__(self, **options):
    """Connect to the configured Redis cluster and build a routing client."""
    if not options:
        # Inherit default options from REDIS_OPTIONS. Copy the dict so the
        # setdefault below cannot mutate the global settings object.
        options = settings.SENTRY_REDIS_OPTIONS.copy()
    options.setdefault('hosts', {
        0: {},
    })
    self.cluster = Cluster(options['hosts'])
    self.client = self.cluster.get_routing_client()
def __init__(self, hosts=None, prefix='ts:', vnodes=64, **kwargs):
    """Set up the Redis cluster, key prefix and vnode count for the TSDB."""
    if hosts is None:
        # Fall back to the host map from the global REDIS_OPTIONS.
        hosts = settings.SENTRY_REDIS_OPTIONS.get('hosts', {0: {}})

    self.cluster = Cluster(hosts)
    self.prefix = prefix
    self.vnodes = vnodes
    super(RedisTSDB, self).__init__(**kwargs)
def test_redis_blaster_operation_disable_instance():
    """Exercise rb fan-out and routing-client operations against the test
    database configured in ``DB_SETTINGS``."""
    from rb import Cluster

    host_map = {0: {'port': DB_SETTINGS['port']}}
    cluster = Cluster(
        hosts=host_map,
        host_defaults={'host': DB_SETTINGS['host']},
    )
    exercise_fanout(cluster)

    routing_client = cluster.get_routing_client()
    exercise_redis(routing_client)
def __init__(self, version=None, prefix=None, **options):
    """Connect to the configured Redis cluster, then delegate version/prefix
    handling to the base cache."""
    if not options:
        # Inherit default options from REDIS_OPTIONS. Copy the dict so the
        # setdefault below cannot mutate the global settings object.
        options = settings.SENTRY_REDIS_OPTIONS.copy()
    options.setdefault('hosts', {
        0: {},
    })
    self.cluster = Cluster(options['hosts'])
    self.client = self.cluster.get_routing_client()
    super(RedisCache, self).__init__(version=version, prefix=prefix)
class RedisCache(BaseCache):
    """JSON-serializing cache backed by a Redis cluster.

    Keys are namespaced via ``BaseCache.make_key`` (version + prefix).
    """

    key_expire = 60 * 60  # 1 hour

    def __init__(self, version=None, prefix=None, **options):
        if not options:
            # Inherit default options from REDIS_OPTIONS. Copy the dict so
            # the setdefault below cannot mutate the global settings object.
            options = settings.SENTRY_REDIS_OPTIONS.copy()
        options.setdefault('hosts', {
            0: {},
        })
        self.cluster = Cluster(options['hosts'])
        self.client = self.cluster.get_routing_client()
        super(RedisCache, self).__init__(version=version, prefix=prefix)

    def set(self, key, value, timeout, version=None):
        """Store ``value`` (JSON-encoded) under ``key``; expire after
        ``timeout`` seconds when ``timeout`` is truthy, otherwise persist."""
        key = self.make_key(key, version=version)
        v = json.dumps(value)
        if timeout:
            self.client.setex(key, int(timeout), v)
        else:
            self.client.set(key, v)

    def delete(self, key, version=None):
        """Remove ``key`` from the cache (no-op if absent)."""
        key = self.make_key(key, version=version)
        self.client.delete(key)

    def get(self, key, version=None):
        """Return the JSON-decoded value for ``key``, or None when missing."""
        key = self.make_key(key, version=version)
        result = self.client.get(key)
        if result is not None:
            result = json.loads(result)
        return result
class RedisCache(local):
    """Thread-local JSON-serializing cache backed by a Redis cluster."""

    key_expire = 60 * 60  # 1 hour

    def __init__(self, **options):
        if not options:
            # Inherit default options from REDIS_OPTIONS. Copy the dict so
            # the setdefault below cannot mutate the global settings object.
            options = settings.SENTRY_REDIS_OPTIONS.copy()
        options.setdefault('hosts', {
            0: {},
        })
        self.cluster = Cluster(options['hosts'])
        self.client = self.cluster.get_routing_client()

    def set(self, key, value, timeout):
        """Store ``value`` (JSON-encoded) under ``key``; expire after
        ``timeout`` seconds when ``timeout`` is truthy, otherwise persist."""
        v = json.dumps(value)
        if timeout:
            self.client.setex(key, int(timeout), v)
        else:
            self.client.set(key, v)

    def delete(self, key):
        """Remove ``key`` from the cache (no-op if absent)."""
        self.client.delete(key)

    def get(self, key):
        """Return the JSON-decoded value for ``key``, or None when missing."""
        result = self.client.get(key)
        if result is not None:
            result = json.loads(result)
        return result
def __init__(self, **options):
    """Configure the digest backend from ``options``.

    Recognized keys: ``cluster`` (kwargs for the rb ``Cluster``, defaulting
    to ``SENTRY_REDIS_OPTIONS``), ``namespace`` (key prefix, default
    ``'d'``) and ``ttl`` (seconds, default one hour). Remaining keys are
    reported and discarded.
    """
    super(RedisBackend, self).__init__(**options)

    cluster_options = options.pop('cluster', settings.SENTRY_REDIS_OPTIONS)
    self.cluster = Cluster(**cluster_options)

    self.namespace = options.pop('namespace', 'd')

    # Records, timelines and digests are deleted once processed, so this
    # TTL mainly keeps stale data from lingering after a configuration
    # error. It should exceed the maximum backoff value so that live data
    # is not evicted too early.
    self.ttl = options.pop('ttl', 60 * 60)

    if options:
        logger.warning('Discarding invalid options: %r', options)
class RedisQuota(Quota):
    """Quota backend tracking system, team and project event counters in
    Redis, bucketed into fixed windows of ``ttl`` seconds."""

    # Length of a quota window, in seconds.
    ttl = 60

    def __init__(self, **options):
        if not options:
            # Inherit default options from REDIS_OPTIONS. Copy the dict so
            # the setdefault below cannot mutate the global settings object.
            options = settings.SENTRY_REDIS_OPTIONS.copy()
        # Base class is initialized before the hosts default is injected,
        # so it only ever sees caller-supplied options.
        super(RedisQuota, self).__init__(**options)
        options.setdefault('hosts', {0: {}})
        self.cluster = Cluster(options['hosts'])

    def validate(self):
        """Ping every host in the cluster; raise ``InvalidConfiguration``
        when any host is unreachable."""
        try:
            with self.cluster.all() as client:
                client.ping()
        except Exception as e:
            raise InvalidConfiguration(unicode(e))

    def is_rate_limited(self, project):
        """Increment all applicable counters for ``project`` and return a
        ``RateLimited`` result if any configured quota is exceeded."""
        proj_quota = self.get_project_quota(project)
        if project.team:
            team_quota = self.get_team_quota(project.team)
        else:
            team_quota = 0
        system_quota = self.get_system_quota()

        # No quotas configured at any level: skip the Redis round-trip.
        if not (proj_quota or system_quota or team_quota):
            return NotRateLimited

        sys_result, team_result, proj_result = self._incr_project(project)

        if proj_quota and proj_result > proj_quota:
            return RateLimited(retry_after=self.get_time_remaining())
        if team_quota and team_result > team_quota:
            return RateLimited(retry_after=self.get_time_remaining())
        if system_quota and sys_result > system_quota:
            return RateLimited(retry_after=self.get_time_remaining())
        return NotRateLimited

    def get_time_remaining(self):
        """Return the number of seconds left in the current quota window."""
        return int(self.ttl - (
            time.time() - int(time.time() / self.ttl) * self.ttl))

    def _get_system_key(self):
        # Window index is embedded in the key so counters reset per window.
        return 'quota:s:%s' % (int(time.time() / self.ttl),)

    def _get_team_key(self, team):
        return 'quota:t:%s:%s' % (team.id, int(time.time() / self.ttl))

    def _get_project_key(self, project):
        return 'quota:p:%s:%s' % (project.id, int(time.time() / self.ttl))

    def _incr_project(self, project):
        """Increment system, team (when present) and project counters and
        return their new values as ``(system, team, project)`` ints."""
        if project.team:
            team_key = self._get_team_key(project.team)
        else:
            team_key = None
            team_result = None

        proj_key = self._get_project_key(project)
        sys_key = self._get_system_key()
        with self.cluster.map() as client:
            proj_result = client.incr(proj_key)
            client.expire(proj_key, self.ttl)
            sys_result = client.incr(sys_key)
            client.expire(sys_key, self.ttl)
            if team_key:
                team_result = client.incr(team_key)
                client.expire(team_key, self.ttl)

        return (
            int(sys_result.value),
            # team_result is None when the project has no team.
            int(team_result and team_result.value or 0),
            int(proj_result.value),
        )
def init_redis_cache():
    """Build the module-level redis-blaster ``cluster`` and return it."""
    global cluster
    cluster = Cluster(
        hosts=redis_map.nodes,
        host_defaults=redis_map.node_options,
        router_cls=KemonoRouter,
    )
    return cluster
class RedisBackend(Backend): """ Implements the digest backend API, backed by Redis. Each timeline is modeled as a sorted set, and also maintains a separate key that contains the iteration counter for implementing backoff strategies that require this value as an argument, such as exponential backoff. .. code:: redis:6379> GET "d:t:mail:p:1:i" "1" redis:6379> ZREVRANGEBYSCORE "d:t:mail:p:1" inf -inf WITHSCORES 1) "433be20b807c4cd49a132de69c0f6c55" 2) "1444847625" 3) "0f9d5fe4b5b3400fab85d9a841aa8467" 4) "1444847625" ... In the example above, the timeline ``mail:p:1`` has already been digested once, as evidenced by the iteration counter (the key that ends with ``:i``.) The timeline also contains references to several records, which are stored separately, encoded using the codec provided to the backend: .. code:: redis:6379> GET "d:t:mail:p:1:r:433be20b807c4cd49a132de69c0f6c55" [ binary content ] When the timeline is ready to be digested, the timeline set is renamed, creating a digest set (in this case the key would be ``d:t:mail:p:1:d``), that represents a snapshot of the timeline contents at that point in time. (If the digest set already exists, the timeline contents are instead unioned into the digest set and then the timeline is cleared.) This allows new records to be added to the timeline that will be processed after the next scheduling interval without the risk of data loss due to race conditions between the record addition and digest generation and delivery. Schedules are modeled as two sorted sets -- one for ``waiting`` items, and one for ``ready`` items. Items in the ``waiting`` set are scored by the time at which they should be transitioned to the ``ready`` set. Items in the ``ready`` set are scored by the time at which they were scheduled to be added to the ``ready`` set. 
Iterating each set from oldest to newest yields the highest priority items for action (moving from the ``waiting`` to ``ready`` set, or delivering a digest for the ``waiting`` and ``ready`` set, respectively.) .. code:: redis:6379> ZREVRANGEBYSCORE "d:s:w" inf -inf WITHSCORES 1) "mail:p:1" 2) "1444847638" """ def __init__(self, **options): super(RedisBackend, self).__init__(**options) self.cluster = Cluster(**options.pop('cluster', settings.SENTRY_REDIS_OPTIONS)) self.namespace = options.pop('namespace', 'd') # Sets the time-to-live (in seconds) for records, timelines, and # digests. This can (and should) be a relatively high value, since # timelines, digests, and records should all be deleted after they have # been processed -- this is mainly to ensure stale data doesn't hang # around too long in the case of a configuration error. This should be # larger than the maximum backoff value to ensure data is not evicted # too early. self.ttl = options.pop('ttl', 60 * 60) if options: logger.warning('Discarding invalid options: %r', options) def add(self, key, record): timeline_key = make_timeline_key(self.namespace, key) record_key = make_record_key(timeline_key, record.key) connection = self.cluster.get_local_client_for_key(timeline_key) with connection.pipeline() as pipeline: pipeline.multi() pipeline.set( record_key, self.codec.encode(record.value), ex=self.ttl, ) pipeline.set(make_iteration_key(timeline_key), 0, nx=True) pipeline.expire(make_iteration_key(timeline_key), self.ttl) # In the future, it might make sense to prefix the entry with the # timestamp (lexicographically sortable) to ensure that we can # maintain the correct sort order with abitrary precision: # http://redis.io/commands/ZADD#elements-with-the-same-score pipeline.zadd(timeline_key, record.timestamp, record.key) pipeline.expire(timeline_key, self.ttl) ensure_timeline_scheduled( map( functools.partial(make_schedule_key, self.namespace), (SCHEDULE_STATE_WAITING, SCHEDULE_STATE_READY,), ), (key, 
record.timestamp + self.backoff(0)), pipeline, ) should_truncate = random.random() < self.truncation_chance if should_truncate: truncate_timeline((timeline_key,), (self.capacity,), pipeline) results = pipeline.execute() if should_truncate: logger.info('Removed %s extra records from %s.', results[-1], key) def schedule(self, deadline, chunk=1000): # TODO: This doesn't lead to a fair balancing of workers, ideally each # scheduling task would be executed by a different process for each # host. There is also no failure isolation here, so a single shard # failure will cause the remainder of the shards to not be able to be # scheduled. for host in self.cluster.hosts: connection = self.cluster.get_local_client(host) with Lock('{0}:s:{1}'.format(self.namespace, host), nowait=True, timeout=30): # Prevent a runaway loop by setting a maximum number of # iterations. Note that this limits the total number of # expected items in any specific scheduling interval to chunk * # maximum_iterations. maximum_iterations = 1000 for i in xrange(maximum_iterations): items = connection.zrangebyscore( make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING), min=0, max=deadline, withscores=True, start=0, num=chunk, ) # XXX: Redis will error if we try and execute an empty # transaction. If there are no items to move between states, we # need to exit the loop now. (This can happen on the first # iteration of the loop if there is nothing to do, or on a # subsequent iteration if there was exactly the same number of # items to change states as the chunk size.) 
if not items: break with connection.pipeline() as pipeline: pipeline.multi() pipeline.zrem( make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING), *[key for key, timestamp in items] ) pipeline.zadd( make_schedule_key(self.namespace, SCHEDULE_STATE_READY), *itertools.chain.from_iterable([(timestamp, key) for (key, timestamp) in items]) ) for key, timestamp in items: yield ScheduleEntry(key, timestamp) pipeline.execute() # If we retrieved less than the chunk size of items, we don't # need try to retrieve more items. if len(items) < chunk: break else: raise RuntimeError('loop exceeded maximum iterations (%s)' % (maximum_iterations,)) def maintenance(self, deadline, chunk=1000): # TODO: This needs tests! # TODO: This suffers from the same shard isolation issues as # ``schedule``. Ideally, this would also return the number of items # that were rescheduled (and possibly even how late they were at the # point of rescheduling) but that causes a bit of an API issue since in # the case of an error, this can be considered a partial success (but # still should raise an exception.) for host in self.cluster.hosts: connection = self.cluster.get_local_client(host) extra = 0 start = 0 maximum_iterations = 1000 for i in xrange(maximum_iterations): fetch_size = chunk + extra items = connection.zrangebyscore( make_schedule_key(self.namespace, SCHEDULE_STATE_READY), min=start, max=deadline, withscores=True, start=0, num=fetch_size, ) def try_lock(item): """ Attempt to immedately acquire a lock on the timeline at key, returning the lock if it can be acquired, otherwise returning ``None``. """ key, timestamp = item lock = Lock(make_timeline_key(self.namespace, key), timeout=5, nowait=True) return lock if lock.acquire() else None, item # Try to take out a lock on each item. If we can't acquire the # lock, that means this is currently being digested and cannot # be rescheduled. 
can_reschedule = { True: [], False: [], } for result in map(try_lock, items): can_reschedule[result[0] is not None].append(result) logger.debug('Fetched %s items, able to reschedule %s.', len(items), len(can_reschedule[True])) # Set the start position for the next query. (If there are no # items, we don't need to worry about this, since there won't # be a next query.) If all items share the same score and are # locked, the iterator will never advance (we will keep trying # to schedule the same locked items over and over) and either # eventually progress slowly as items are unlocked, or hit the # maximum iterations boundary. A possible solution to this # would be to count the number of items that have the maximum # score in this page that we assume we can't acquire (since we # couldn't acquire the lock this iteration) and add that count # to the next query limit. (This unfortunately could also # lead to unbounded growth too, so we have to limit it as well.) if items: start = items[-1][0] # (This value is (key, timestamp).) extra = min( ilen( itertools.takewhile( lambda (lock, (key, timestamp)): timestamp == start, can_reschedule[False][::-1], ), ), chunk, ) # XXX: We need to perform this check before the transaction to # ensure that we don't execute an empty transaction. (We'll # need to perform a similar check after the completion of the # transaction as well.) if not can_reschedule[True]: if len(items) == fetch_size: # There is nothing to reschedule in this chunk, but we # need check if there are others after this chunk. continue else: # There is nothing to unlock, and we've exhausted all items. 
break try: with connection.pipeline() as pipeline: pipeline.multi() pipeline.zrem( make_schedule_key(self.namespace, SCHEDULE_STATE_READY), *[key for (lock, (key, timestamp)) in can_reschedule[True]] ) pipeline.zadd( make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING), *itertools.chain.from_iterable([(timestamp, key) for (lock, (key, timestamp)) in can_reschedule[True]]) ) pipeline.execute() finally: # Regardless of the outcome of the transaction, we should # try to unlock the items for processing. for lock, item in can_reschedule[True]: try: lock.release() except Exception as error: # XXX: This shouldn't be hit (the ``Lock`` code # should swallow the exception) but this is here # for safety anyway. logger.warning('Could not unlock %r: %s', item, error) # If we retrieved less than the chunk size of items, we don't # need try to retrieve more items. if len(items) < fetch_size: break else: raise RuntimeError('loop exceeded maximum iterations (%s)' % (maximum_iterations,)) @contextmanager def digest(self, key): timeline_key = make_timeline_key(self.namespace, key) digest_key = make_digest_key(timeline_key) connection = self.cluster.get_local_client_for_key(timeline_key) with Lock(timeline_key, nowait=True, timeout=30): if connection.zscore(make_schedule_key(self.namespace, SCHEDULE_STATE_READY), key) is None: raise Exception('Cannot digest timeline, timeline is not in the ready state.') with connection.pipeline() as pipeline: pipeline.watch(digest_key) # This shouldn't be necessary, but better safe than sorry? 
if pipeline.exists(digest_key): pipeline.multi() pipeline.zunionstore(digest_key, (timeline_key, digest_key), aggregate='max') pipeline.delete(timeline_key) pipeline.expire(digest_key, self.ttl) pipeline.execute() else: pipeline.multi() pipeline.rename(timeline_key, digest_key) pipeline.expire(digest_key, self.ttl) try: pipeline.execute() except ResponseError as error: if 'no such key' in str(error): logger.debug('Could not move timeline for digestion (likely has no contents.)') else: raise # XXX: This must select all records, even though not all of them will # be returned if they exceed the capacity, to ensure that all records # will be garbage collected. records = connection.zrevrange(digest_key, 0, -1, withscores=True) if not records: logger.info('Retrieved timeline containing no records.') def get_iteration_count(default=0): value = connection.get(make_iteration_key(timeline_key)) if not value: logger.warning('Could not retrieve iteration counter for %s, defaulting to %s.', key, default) return default return int(value) iteration = get_iteration_count() def get_records_for_digest(): with connection.pipeline(transaction=False) as pipeline: for record_key, timestamp in records: pipeline.get(make_record_key(timeline_key, record_key)) for (record_key, timestamp), value in zip(records, pipeline.execute()): # We have to handle failures if the key does not exist -- # this could happen due to evictions or race conditions # where the record was added to a timeline while it was # already being digested. 
if value is None: logger.warning('Could not retrieve event for timeline.') else: yield Record(record_key, self.codec.decode(value), timestamp) yield itertools.islice(get_records_for_digest(), self.capacity) def cleanup_records(pipeline): record_keys = [make_record_key(timeline_key, record_key) for record_key, score in records] pipeline.delete(digest_key, *record_keys) def reschedule(): with connection.pipeline() as pipeline: pipeline.watch(digest_key) # This shouldn't be necessary, but better safe than sorry? pipeline.multi() cleanup_records(pipeline) pipeline.zrem(make_schedule_key(self.namespace, SCHEDULE_STATE_READY), key) pipeline.zadd(make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING), time.time() + self.backoff(iteration + 1), key) pipeline.set(make_iteration_key(timeline_key), iteration + 1) pipeline.execute() def unschedule(): with connection.pipeline() as pipeline: # Watch the timeline to ensure that no other transactions add # events to the timeline while we are trying to delete it. pipeline.watch(timeline_key) pipeline.multi() if connection.zcard(timeline_key) is 0: cleanup_records(pipeline) pipeline.delete(make_iteration_key(timeline_key)) pipeline.zrem(make_schedule_key(self.namespace, SCHEDULE_STATE_READY), key) pipeline.zrem(make_schedule_key(self.namespace, SCHEDULE_STATE_WAITING), key) pipeline.execute() # If there were records in the digest, we need to schedule it so that # we schedule any records that were added during digestion with the # appropriate backoff. If there were no items, we can try to remove the # timeline from the digestion schedule. if records: reschedule() else: try: unschedule() except WatchError: logger.debug('Could not remove timeline from schedule, rescheduling instead') reschedule()
class RedisTSDB(BaseTSDB):
    """
    A time series storage backend for Redis.

    The time series API supports two data types:

        * simple counters
        * distinct counters (number of unique elements seen)

    The backend also supports virtual nodes (``vnodes``) which controls shard
    distribution. This value should be set to the anticipated maximum number of
    physical hosts and not modified after data has been written.

    Simple counters are stored in hashes. The key of the hash is composed of
    the model, epoch (which defines the start of the rollup period), and a
    shard identifier. This allows TTLs to be applied to the entire bucket,
    instead of having to be stored for every individual element in the rollup
    period. This results in a data layout that looks something like this::

        {
            "<model>:<epoch>:<shard id>": {
                "<key>": value,
                ...
            },
            ...
        }

    Distinct counters are stored using HyperLogLog, which provides a
    cardinality estimate with a standard error of 0.8%. The data layout looks
    something like this::

        {
            "<model>:<epoch>:<key>": value,
            ...
        }
    """
    def __init__(self, hosts=None, prefix='ts:', vnodes=64, **kwargs):
        # inherit default options from REDIS_OPTIONS
        defaults = settings.SENTRY_REDIS_OPTIONS

        if hosts is None:
            hosts = defaults.get('hosts', {0: {}})

        self.cluster = Cluster(hosts)
        self.prefix = prefix
        self.vnodes = vnodes
        super(RedisTSDB, self).__init__(**kwargs)

    def validate(self):
        # Checks that every host in the cluster is reachable and runs a Redis
        # version within the supported range; raises InvalidConfiguration
        # otherwise.
        logger.info('Validating Redis version...')

        try:
            with self.cluster.all() as client:
                results = client.info()
        except Exception as e:
            # Any connection issues should be caught here.
            raise InvalidConfiguration(unicode(e))

        versions = {}
        for id, info in results.value.items():
            host = self.cluster.hosts[id]
            # NOTE: This assumes there is no routing magic going on here, and
            # all requests to this host are being served by the same database.
            key = '{host}:{port}'.format(host=host.host, port=host.port)
            versions[key] = Version(map(int, info['redis_version'].split('.', 3)))

        check_versions('Redis (TSDB)', versions, Version((2, 8, 9)), Version((3, 0, 4)))

    def make_key(self, model, epoch, model_key):
        # Builds the bucket (hash) key: prefix + model + rollup epoch + vnode
        # shard derived from the model key.
        if isinstance(model_key, six.integer_types):
            vnode = model_key % self.vnodes
        else:
            vnode = crc32(model_key) % self.vnodes

        return '{0}{1}:{2}:{3}'.format(self.prefix, model.value, epoch, vnode)

    def get_model_key(self, key):
        # We specialize integers so that a pure int-map can be optimized by
        # Redis, whereas long strings (say tag values) will store in a more
        # efficient hashed format.
        if not isinstance(key, six.integer_types):
            # enforce utf-8 encoding
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            return md5(repr(key)).hexdigest()
        return key

    def incr(self, model, key, timestamp=None, count=1):
        # Convenience wrapper around incr_multi for a single (model, key).
        self.incr_multi([(model, key)], timestamp, count)

    def incr_multi(self, items, timestamp=None, count=1):
        """
        Increment project ID=1 and group ID=5:

        >>> incr_multi([(TimeSeriesModel.project, 1), (TimeSeriesModel.group, 5)])
        """
        make_key = self.make_key
        normalize_to_rollup = self.normalize_to_rollup
        if timestamp is None:
            timestamp = timezone.now()

        with self.cluster.map() as client:
            # One hash-field increment per (item, rollup resolution).
            for rollup, max_values in self.rollups:
                norm_rollup = normalize_to_rollup(timestamp, rollup)
                for model, key in items:
                    model_key = self.get_model_key(key)
                    hash_key = make_key(model, norm_rollup, model_key)
                    client.hincrby(hash_key, model_key, count)
                    # Expire the whole bucket at the end of its retention.
                    client.expireat(
                        hash_key,
                        self.calculate_expiry(rollup, max_values, timestamp),
                    )

    def get_range(self, model, keys, start, end, rollup=None):
        """
        To get a range of data for group ID=[1, 2, 3]:

        Start and end are both inclusive.

        >>> now = timezone.now()
        >>> get_keys(TimeSeriesModel.group, [1, 2, 3],
        >>>          start=now - timedelta(days=1),
        >>>          end=now)
        """
        normalize_to_epoch = self.normalize_to_epoch
        normalize_to_rollup = self.normalize_to_rollup
        make_key = self.make_key

        if rollup is None:
            rollup = self.get_optimal_rollup(start, end)

        results = []
        timestamp = end
        with self.cluster.map() as client:
            # Walk backwards from ``end`` to ``start`` one rollup at a time,
            # issuing one HGET per (interval, key); promises resolve on exit.
            while timestamp >= start:
                real_epoch = normalize_to_epoch(timestamp, rollup)
                norm_epoch = normalize_to_rollup(timestamp, rollup)

                for key in keys:
                    model_key = self.get_model_key(key)
                    hash_key = make_key(model, norm_epoch, model_key)
                    results.append((real_epoch, key, client.hget(hash_key, model_key)))

                timestamp = timestamp - timedelta(seconds=rollup)

        results_by_key = defaultdict(dict)
        for epoch, key, count in results:
            # Missing buckets read as None; coerce to 0.
            results_by_key[key][epoch] = int(count.value or 0)

        for key, points in results_by_key.iteritems():
            results_by_key[key] = sorted(points.items())
        return dict(results_by_key)

    def make_distinct_counter_key(self, model, rollup, timestamp, key):
        # HLL keys are per (model, rollup epoch, key); no vnode sharding here.
        return '{prefix}{model}:{epoch}:{key}'.format(
            prefix=self.prefix,
            model=model.value,
            epoch=self.normalize_ts_to_rollup(timestamp, rollup),
            key=self.get_model_key(key),
        )

    def record(self, model, key, values, timestamp=None):
        self.record_multi(((model, key, values),), timestamp)

    def record_multi(self, items, timestamp=None):
        """
        Record an occurence of an item in a distinct counter.
        """
        if timestamp is None:
            timestamp = timezone.now()

        ts = int(to_timestamp(timestamp))  # ``timestamp`` is not actually a timestamp :(

        with self.cluster.fanout() as client:
            for model, key, values in items:
                # Route by the counter key so all rollups for it land on the
                # same shard.
                c = client.target_key(key)
                for rollup, max_values in self.rollups:
                    k = self.make_distinct_counter_key(
                        model,
                        rollup,
                        ts,
                        key,
                    )
                    c.pfadd(k, *values)
                    c.expireat(
                        k,
                        self.calculate_expiry(
                            rollup,
                            max_values,
                            timestamp,
                        ),
                    )

    def get_distinct_counts_series(self, model, keys, start, end=None, rollup=None):
        """
        Fetch counts of distinct items for each rollup interval within the range.
        """
        rollup, series = self.get_optimal_rollup_series(start, end, rollup)

        responses = {}
        with self.cluster.fanout() as client:
            for key in keys:
                c = client.target_key(key)
                r = responses[key] = []
                for timestamp in series:
                    # One PFCOUNT promise per interval; resolved below.
                    r.append((
                        timestamp,
                        c.pfcount(
                            self.make_distinct_counter_key(
                                model,
                                rollup,
                                timestamp,
                                key,
                            ),
                        ),
                    ))

        return {key: [(timestamp, promise.value) for timestamp, promise in value] for key, value in responses.iteritems()}

    def get_distinct_counts_totals(self, model, keys, start, end=None, rollup=None):
        """
        Count distinct items during a time range.
        """
        rollup, series = self.get_optimal_rollup_series(start, end, rollup)

        responses = {}
        with self.cluster.fanout() as client:
            for key in keys:
                # XXX: The current versions of the Redis driver don't implement
                # ``PFCOUNT`` correctly (although this is fixed in the Git
                # master, so should be available in the next release) and only
                # supports a single key argument -- not the variadic signature
                # supported by the protocol -- so we have to call the commnand
                # directly here instead.
                ks = []
                for timestamp in series:
                    ks.append(self.make_distinct_counter_key(model, rollup, timestamp, key))

                responses[key] = client.target_key(key).execute_command('PFCOUNT', *ks)

        return {key: value.value for key, value in responses.iteritems()}
class RedisBuffer(Buffer):
    """Write-behind buffer that accumulates counter increments in Redis
    hashes and flushes them to the database via celery tasks."""

    key_expire = 60 * 60  # 1 hour
    # Sorted set of hash keys awaiting flush (one per shard).
    pending_key = 'b:p'

    def __init__(self, **options):
        if not options:
            # Inherit default options from REDIS_OPTIONS. Copy the dict so
            # the setdefault below cannot mutate the global settings object.
            options = settings.SENTRY_REDIS_OPTIONS.copy()
        options.setdefault('hosts', {
            0: {},
        })
        self.cluster = Cluster(options['hosts'])

    def validate(self):
        """Ping every host in the cluster; raise ``InvalidConfiguration``
        when any host is unreachable."""
        try:
            with self.cluster.all() as client:
                client.ping()
        except Exception as e:
            raise InvalidConfiguration(unicode(e))

    def _coerce_val(self, value):
        # Model instances are keyed by primary key; everything else by its
        # string representation.
        if isinstance(value, models.Model):
            value = value.pk
        return smart_str(value)

    def _make_key(self, model, filters):
        """
        Returns a Redis-compatible key for the model given filters.
        """
        return 'b:k:%s:%s' % (
            model._meta,
            md5(smart_str('&'.join('%s=%s' % (k, self._coerce_val(v))
                for k, v in sorted(filters.iteritems())))).hexdigest(),
        )

    def _make_lock_key(self, key):
        return 'l:%s' % (key,)

    def incr(self, model, columns, filters, extra=None):
        """
        Increment the key by doing the following:

        - Insert/update a hashmap based on (model, columns)
        - Perform an incrby on counters
        - Perform a set (last write wins) on extra
        - Add hashmap key to pending flushes
        """
        # TODO(dcramer): longer term we'd rather not have to serialize values
        # here (unless it's to JSON)
        key = self._make_key(model, filters)
        # We can't use conn.map() due to wanting to support multiple pending
        # keys (one per Redis shard)
        conn = self.cluster.get_local_client_for_key(key)

        pipe = conn.pipeline()
        pipe.hsetnx(key, 'm', '%s.%s' % (model.__module__, model.__name__))
        pipe.hsetnx(key, 'f', pickle.dumps(filters))
        for column, amount in columns.iteritems():
            pipe.hincrby(key, 'i+' + column, amount)

        if extra:
            for column, value in extra.iteritems():
                pipe.hset(key, 'e+' + column, pickle.dumps(value))
        pipe.expire(key, self.key_expire)
        pipe.zadd(self.pending_key, time(), key)
        pipe.execute()

    def process_pending(self):
        """Dispatch a ``process_incr`` task for every pending key on every
        shard, then clear the pending set."""
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(self.pending_key)
        # prevent a stampede due to celerybeat + periodic task
        if not client.set(lock_key, '1', nx=True, ex=60):
            return

        try:
            for host_id in self.cluster.hosts.iterkeys():
                conn = self.cluster.get_local_client(host_id)
                keys = conn.zrange(self.pending_key, 0, -1)
                if not keys:
                    continue
                for key in keys:
                    process_incr.apply_async(kwargs={
                        'key': key,
                    })
                pipe = conn.pipeline()
                pipe.zrem(self.pending_key, *keys)
                pipe.execute()
        finally:
            client.delete(lock_key)

    def process(self, key):
        """Read and delete the buffered hash at ``key`` and apply the
        accumulated increments/extras via the base class."""
        client = self.cluster.get_routing_client()
        lock_key = self._make_lock_key(key)
        # prevent a stampede due to the way we use celery etas + duplicate
        # tasks
        if not client.set(lock_key, '1', nx=True, ex=10):
            return

        with self.cluster.map() as conn:
            values = conn.hgetall(key)
            conn.delete(key)

        if not values.value:
            return

        model = import_string(values.value['m'])
        # SECURITY NOTE: pickle.loads on data read back from Redis -- only
        # safe while the Redis instance is fully trusted; data written here
        # originates from ``incr`` above.
        filters = pickle.loads(values.value['f'])
        incr_values = {}
        extra_values = {}
        for k, v in values.value.iteritems():
            if k.startswith('i+'):
                incr_values[k[2:]] = int(v)
            elif k.startswith('e+'):
                extra_values[k[2:]] = pickle.loads(v)

        super(RedisBuffer, self).process(model, incr_values, filters, extra_values)
class RedisTSDB(BaseTSDB):
    """
    A time series storage backend for Redis.

    The time series API supports two data types:

        * simple counters
        * distinct counters (number of unique elements seen)

    The backend also supports virtual nodes (``vnodes``) which controls shard
    distribution. This value should be set to the anticipated maximum number
    of physical hosts and not modified after data has been written.

    Simple counters are stored in hashes. The key of the hash is composed of
    the model, epoch (which defines the start of the rollup period), and a
    shard identifier. This allows TTLs to be applied to the entire bucket,
    instead of having to be stored for every individual element in the rollup
    period. This results in a data layout that looks something like this::

        {
            "<model>:<epoch>:<shard id>": {
                "<key>": value,
                ...
            },
            ...
        }

    Distinct counters are stored using HyperLogLog, which provides a
    cardinality estimate with a standard error of 0.8%. The data layout looks
    something like this::

        {
            "<model>:<epoch>:<key>": value,
            ...
        }
    """
    def __init__(self, hosts=None, prefix='ts:', vnodes=64, **kwargs):
        # inherit default options from REDIS_OPTIONS
        defaults = settings.SENTRY_REDIS_OPTIONS
        if hosts is None:
            hosts = defaults.get('hosts', {0: {}})
        self.cluster = Cluster(hosts)
        self.prefix = prefix
        self.vnodes = vnodes
        super(RedisTSDB, self).__init__(**kwargs)

    def validate(self):
        """
        Verify connectivity and the Redis server version on every host;
        raises ``InvalidConfiguration`` when a host is unreachable.
        """
        logger.info('Validating Redis version...')
        try:
            with self.cluster.all() as client:
                results = client.info()
        except Exception as e:
            # Any connection issues should be caught here.
            raise InvalidConfiguration(unicode(e))
        versions = {}
        for id, info in results.value.items():
            host = self.cluster.hosts[id]
            # NOTE: This assumes there is no routing magic going on here, and
            # all requests to this host are being served by the same database.
            key = '{host}:{port}'.format(host=host.host, port=host.port)
            versions[key] = Version(
                map(int, info['redis_version'].split('.', 3)))
        # NOTE(review): 2.8.9 is presumably the minimum version for the
        # HyperLogLog commands used below -- confirm.
        check_versions('Redis (TSDB)', versions,
                       Version((2, 8, 9)),
                       Version((3, 0, 4)))

    def make_key(self, model, epoch, model_key):
        """
        Build the bucket key for simple counters:
        ``<prefix><model>:<epoch>:<vnode>``.
        """
        # Integer keys shard directly; everything else shards by CRC32.
        if isinstance(model_key, six.integer_types):
            vnode = model_key % self.vnodes
        else:
            vnode = crc32(model_key) % self.vnodes
        return '{0}{1}:{2}:{3}'.format(self.prefix, model.value, epoch, vnode)

    def get_model_key(self, key):
        # We specialize integers so that a pure int-map can be optimized by
        # Redis, whereas long strings (say tag values) will store in a more
        # efficient hashed format.
        if not isinstance(key, six.integer_types):
            # enforce utf-8 encoding
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            return md5(repr(key)).hexdigest()
        return key

    def incr(self, model, key, timestamp=None, count=1):
        """Increment a single counter; convenience wrapper for ``incr_multi``."""
        self.incr_multi([(model, key)], timestamp, count)

    def incr_multi(self, items, timestamp=None, count=1):
        """
        Increment project ID=1 and group ID=5:

        >>> incr_multi([(TimeSeriesModel.project, 1), (TimeSeriesModel.group, 5)])
        """
        make_key = self.make_key
        normalize_to_rollup = self.normalize_to_rollup
        if timestamp is None:
            timestamp = timezone.now()

        with self.cluster.map() as client:
            for rollup, max_values in self.rollups:
                norm_rollup = normalize_to_rollup(timestamp, rollup)
                for model, key in items:
                    model_key = self.get_model_key(key)
                    hash_key = make_key(model, norm_rollup, model_key)
                    client.hincrby(hash_key, model_key, count)
                    # Absolute expiry covering the bucket's retention window.
                    client.expireat(
                        hash_key,
                        self.calculate_expiry(rollup, max_values, timestamp),
                    )

    def get_range(self, model, keys, start, end, rollup=None):
        """
        To get a range of data for group ID=[1, 2, 3]:

        Start and end are both inclusive.

        >>> now = timezone.now()
        >>> get_keys(TimeSeriesModel.group, [1, 2, 3],
        >>>          start=now - timedelta(days=1),
        >>>          end=now)
        """
        normalize_to_epoch = self.normalize_to_epoch
        normalize_to_rollup = self.normalize_to_rollup
        make_key = self.make_key

        if rollup is None:
            rollup = self.get_optimal_rollup(start, end)

        results = []
        timestamp = end
        with self.cluster.map() as client:
            # Walk backwards from ``end`` one rollup interval at a time,
            # queueing a read per (interval, key) pair.
            while timestamp >= start:
                real_epoch = normalize_to_epoch(timestamp, rollup)
                norm_epoch = normalize_to_rollup(timestamp, rollup)

                for key in keys:
                    model_key = self.get_model_key(key)
                    hash_key = make_key(model, norm_epoch, model_key)
                    results.append(
                        (real_epoch, key, client.hget(hash_key, model_key)))

                timestamp = timestamp - timedelta(seconds=rollup)

        # Promise values become available once the map context exits;
        # missing buckets count as zero.
        results_by_key = defaultdict(dict)
        for epoch, key, count in results:
            results_by_key[key][epoch] = int(count.value or 0)

        for key, points in results_by_key.iteritems():
            results_by_key[key] = sorted(points.items())
        return dict(results_by_key)

    def make_distinct_counter_key(self, model, rollup, timestamp, key):
        """
        Build the key for distinct (HyperLogLog) counters:
        ``<prefix><model>:<epoch>:<key>``.
        """
        return '{prefix}{model}:{epoch}:{key}'.format(
            prefix=self.prefix,
            model=model.value,
            epoch=self.normalize_ts_to_rollup(timestamp, rollup),
            key=self.get_model_key(key),
        )

    def record(self, model, key, values, timestamp=None):
        """Record occurrences for one distinct counter; see ``record_multi``."""
        self.record_multi(((model, key, values), ), timestamp)

    def record_multi(self, items, timestamp=None):
        """
        Record an occurrence of an item in a distinct counter.
        """
        if timestamp is None:
            timestamp = timezone.now()

        ts = int(to_timestamp(
            timestamp))  # ``timestamp`` is not actually a timestamp :(

        with self.cluster.fanout() as client:
            for model, key, values in items:
                c = client.target_key(key)
                for rollup, max_values in self.rollups:
                    k = self.make_distinct_counter_key(
                        model,
                        rollup,
                        ts,
                        key,
                    )
                    c.pfadd(k, *values)
                    c.expireat(
                        k,
                        self.calculate_expiry(
                            rollup,
                            max_values,
                            timestamp,
                        ),
                    )

    def get_distinct_counts_series(self, model, keys, start, end=None, rollup=None):
        """
        Fetch counts of distinct items for each rollup interval within the range.
        """
        rollup, series = self.get_optimal_rollup_series(start, end, rollup)

        responses = {}
        with self.cluster.fanout() as client:
            for key in keys:
                c = client.target_key(key)
                r = responses[key] = []
                for timestamp in series:
                    r.append((
                        timestamp,
                        c.pfcount(
                            self.make_distinct_counter_key(
                                model,
                                rollup,
                                timestamp,
                                key,
                            ),
                        ),
                    ))

        return {
            key: [(timestamp, promise.value) for timestamp, promise in value]
            for key, value in responses.iteritems()
        }

    def get_distinct_counts_totals(self, model, keys, start, end=None, rollup=None):
        """
        Count distinct items during a time range.
        """
        rollup, series = self.get_optimal_rollup_series(start, end, rollup)

        responses = {}
        with self.cluster.fanout() as client:
            for key in keys:
                # XXX: The current versions of the Redis driver don't implement
                # ``PFCOUNT`` correctly (although this is fixed in the Git
                # master, so should be available in the next release) and only
                # supports a single key argument -- not the variadic signature
                # supported by the protocol -- so we have to call the command
                # directly here instead.
                ks = []
                for timestamp in series:
                    ks.append(
                        self.make_distinct_counter_key(model, rollup, timestamp, key))

                responses[key] = client.target_key(key).execute_command(
                    'PFCOUNT', *ks)

        return {key: value.value for key, value in responses.iteritems()}
class RedisTSDB(BaseTSDB):
    """
    Redis-backed time series storage.

    Counters are grouped into hash buckets keyed by type + normalized epoch +
    shard, so a single TTL on the hash covers every member of that rollup
    period::

        {
            "TSDBModel:epoch:shard": {
                "Key": Count
            }
        }

    which in practice looks like::

        {
            "Group:epoch:shard": {
                "GroupID": Count
            }
        }

    ``vnodes`` controls the shard distribution and should ideally be set to
    the maximum number of physical hosts.
    """
    def __init__(self, hosts=None, prefix='ts:', vnodes=64, **kwargs):
        # Fall back to the globally configured Redis hosts when none are
        # passed explicitly.
        redis_options = settings.SENTRY_REDIS_OPTIONS
        if hosts is None:
            hosts = redis_options.get('hosts', {0: {}})

        self.cluster = Cluster(hosts)
        self.prefix = prefix
        self.vnodes = vnodes
        super(RedisTSDB, self).__init__(**kwargs)

    def validate(self):
        """
        Verify connectivity and the server version of every cluster member;
        raises ``InvalidConfiguration`` when a host cannot be reached.
        """
        logger.info('Validating Redis version...')

        try:
            with self.cluster.all() as client:
                responses = client.info()
        except Exception as e:
            # Connection failures surface as configuration errors.
            raise InvalidConfiguration(unicode(e))

        versions = {}
        for host_id, server_info in responses.value.items():
            host = self.cluster.hosts[host_id]
            # Assumes no routing magic: every request to this host is served
            # by the same database.
            address = '{host}:{port}'.format(host=host.host, port=host.port)
            versions[address] = Version(
                map(int, server_info['redis_version'].split('.', 3)))

        check_versions('Redis (TSDB)', versions,
                       Version((2, 8, 9)), Version((3, 0, 4)))

    def make_key(self, model, epoch, model_key):
        """Compose ``<prefix><model>:<epoch>:<vnode>`` for a counter bucket."""
        # Integers shard by value, everything else by CRC32 of the key.
        shard = (model_key if isinstance(model_key, six.integer_types)
                 else crc32(model_key)) % self.vnodes
        return '{0}{1}:{2}:{3}'.format(self.prefix, model.value, epoch, shard)

    def get_model_key(self, key):
        """
        Normalize ``key`` into a hash field: integers pass through so Redis
        can keep an optimized int-map; anything else becomes an md5 digest.
        """
        if isinstance(key, six.integer_types):
            return key
        if isinstance(key, unicode):
            # enforce utf-8 encoding before hashing
            key = key.encode('utf-8')
        return md5(repr(key)).hexdigest()

    def incr(self, model, key, timestamp=None, count=1):
        """Increment a single counter; delegates to ``incr_multi``."""
        self.incr_multi([(model, key)], timestamp, count)

    def incr_multi(self, items, timestamp=None, count=1):
        """
        Increment several counters at once, e.g. project ID=1 and group ID=5:

        >>> incr_multi([(TimeSeriesModel.project, 1), (TimeSeriesModel.group, 5)])
        """
        if timestamp is None:
            timestamp = timezone.now()

        with self.cluster.map() as client:
            for rollup, max_values in self.rollups:
                rollup_epoch = self.normalize_to_rollup(timestamp, rollup)
                for model, key in items:
                    field = self.get_model_key(key)
                    bucket = self.make_key(model, rollup_epoch, field)
                    client.hincrby(bucket, field, count)
                    # Absolute expiry covering the bucket's retention window.
                    client.expireat(
                        bucket,
                        self.calculate_expiry(rollup, max_values, timestamp),
                    )

    def get_range(self, model, keys, start, end, rollup=None):
        """
        Fetch counter data over an inclusive [start, end] range, e.g. for
        group ID=[1, 2, 3]:

        >>> now = timezone.now()
        >>> get_keys(TimeSeriesModel.group, [1, 2, 3],
        >>>          start=now - timedelta(days=1),
        >>>          end=now)
        """
        if rollup is None:
            rollup = self.get_optimal_rollup(start, end)

        promised = []
        cursor = end
        with self.cluster.map() as client:
            # Walk backwards from ``end`` one rollup interval at a time,
            # queueing a read per (interval, key) pair.
            while cursor >= start:
                real_epoch = self.normalize_to_epoch(cursor, rollup)
                bucket_epoch = self.normalize_to_rollup(cursor, rollup)

                for key in keys:
                    field = self.get_model_key(key)
                    bucket = self.make_key(model, bucket_epoch, field)
                    promised.append((real_epoch, key, client.hget(bucket, field)))

                cursor = cursor - timedelta(seconds=rollup)

        # Promise values resolve once the map block exits; missing buckets
        # count as zero.
        series = defaultdict(dict)
        for epoch, key, promise in promised:
            series[key][epoch] = int(promise.value or 0)

        return dict((key, sorted(points.items()))
                    for key, points in series.iteritems())
class RedisTSDB(BaseTSDB):
    """
    Redis time series storage.

    Types plus normalized epochs map onto hash buckets, so one TTL applies to
    an entire rollup period::

        {
            "TSDBModel:epoch:shard": {
                "Key": Count
            }
        }

    In our case this translates to::

        {
            "Group:epoch:shard": {
                "GroupID": Count
            }
        }

    ``vnodes`` controls the shard distribution and should ideally be set to
    the maximum number of physical hosts.
    """
    def __init__(self, hosts=None, prefix='ts:', vnodes=64, **kwargs):
        # The host map defaults to the global Redis options when the caller
        # does not supply one.
        defaults = settings.SENTRY_REDIS_OPTIONS
        if hosts is None:
            hosts = defaults.get('hosts', {0: {}})

        self.cluster = Cluster(hosts)
        self.prefix = prefix
        self.vnodes = vnodes
        super(RedisTSDB, self).__init__(**kwargs)

    def validate(self):
        """Ping each cluster host; raise ``InvalidConfiguration`` on failure."""
        try:
            with self.cluster.all() as client:
                client.ping()
        except Exception as e:
            raise InvalidConfiguration(unicode(e))

    def make_key(self, model, epoch, model_key):
        """Compose ``<prefix><model>:<epoch>:<vnode>`` for a counter bucket."""
        if isinstance(model_key, six.integer_types):
            shard = model_key % self.vnodes
        else:
            shard = crc32(model_key) % self.vnodes
        return '{0}{1}:{2}:{3}'.format(self.prefix, model.value, epoch, shard)

    def get_model_key(self, key):
        """
        Normalize ``key`` into a hash field. Integers pass through untouched
        so Redis can keep a compact int-map; anything else (say tag values)
        becomes an md5 digest.
        """
        if isinstance(key, six.integer_types):
            return key
        if isinstance(key, unicode):
            # enforce utf-8 encoding before hashing
            key = key.encode('utf-8')
        return md5(repr(key)).hexdigest()

    def incr(self, model, key, timestamp=None, count=1):
        """Increment a single counter; see ``incr_multi``."""
        self.incr_multi([(model, key)], timestamp, count)

    def incr_multi(self, items, timestamp=None, count=1):
        """
        Increment a batch of counters, e.g. project ID=1 and group ID=5:

        >>> incr_multi([(TimeSeriesModel.project, 1), (TimeSeriesModel.group, 5)])
        """
        if timestamp is None:
            timestamp = timezone.now()

        with self.cluster.map() as client:
            for rollup, max_values in self.rollups:
                rollup_epoch = self.normalize_to_rollup(timestamp, rollup)
                ttl = rollup * max_values
                for model, key in items:
                    field = self.get_model_key(key)
                    bucket = self.make_key(model, rollup_epoch, field)
                    client.hincrby(bucket, field, count)
                    # Sliding TTL: refreshed on every write, covering the
                    # full retention window of this rollup.
                    client.expire(bucket, ttl)

    def get_range(self, model, keys, start, end, rollup=None):
        """
        Fetch counter data over an inclusive [start, end] range, e.g. for
        group ID=[1, 2, 3]:

        >>> now = timezone.now()
        >>> get_keys(TimeSeriesModel.group, [1, 2, 3],
        >>>          start=now - timedelta(days=1),
        >>>          end=now)
        """
        if rollup is None:
            rollup = self.get_optimal_rollup(start, end)

        step = timedelta(seconds=rollup)
        pending = []
        cursor = end
        with self.cluster.map() as client:
            # Queue one read per (interval, key) pair, walking backwards
            # from ``end`` in rollup-sized steps.
            while cursor >= start:
                epoch = self.normalize_to_epoch(cursor, rollup)
                bucket_epoch = self.normalize_to_rollup(cursor, rollup)
                for key in keys:
                    field = self.get_model_key(key)
                    bucket = self.make_key(model, bucket_epoch, field)
                    pending.append((epoch, key, client.hget(bucket, field)))
                cursor = cursor - step

        # Promise values are available once the map block exits; missing
        # buckets count as zero.
        series = defaultdict(dict)
        for epoch, key, promise in pending:
            series[key][epoch] = int(promise.value or 0)

        result = {}
        for key, points in series.iteritems():
            result[key] = sorted(points.items())
        return result