def aggregate_raw_times(self, key, service_name, max_batch_size=None):
    """ Aggregates values from a list living under a given key. Returns its
    min, max, mean and an overall usage count. 'max_batch_size' controls how
    many items will be fetched from the list so it's possible to fetch less
    items than its LLEN returns.

    key - Redis key of a list of raw (integer) response times
    service_name - name of the service the key belongs to, used to look up
        the service's configured mean percentile
    max_batch_size - optional upper bound on how many items to fetch
    """
    key_len = self.server.kvdb.conn.llen(key)
    if max_batch_size:
        batch_size = min(key_len, max_batch_size)
        if batch_size < key_len:
            msg = 'batch_size:`%s` < key_len:`%s`, max_batch_size:`%s`, key:`%s`, ' \
                'consider decreasing the job interval or increasing max_batch_size'
            # .warning is the non-deprecated spelling of .warn
            self.logger.warning(msg, batch_size, key_len, max_batch_size, key)
    else:
        batch_size = key_len

    # Redis LRANGE treats the stop index as inclusive, so fetching at most
    # batch_size items requires a stop index of batch_size - 1. (Passing
    # batch_size itself fetched one item more than requested.)
    times = [int(elem) for elem in self.server.kvdb.conn.lrange(key, 0, batch_size - 1)]

    if times:
        # Percentile to cut the mean off at, configured per service (0 if unset)
        mean_percentile = int(
            self.server.kvdb.conn.hget(KVDB.SERVICE_TIME_BASIC + service_name, 'mean_percentile') or 0)
        max_score = int(percentile(times, mean_percentile))

        return min(times), max(times), (tmean(times, limit_to=max_score) or 0), len(times)
    else:
        return 0, 0, 0, 0
def create_summary(self, target, *pattern_names):
    """ Builds a per-service summary (min/max/mean/usage plus a usage rate)
    for a given target period and stores it under an aggregated key.

    target - summary period name, e.g. 'by-week'; anything else is treated
        as starting at the current day's midnight
    pattern_names - names resolved to 'get_by_{name}_patterns' methods which
        yield key patterns to collect statistics from

    Any exception is logged at DEBUG level and the summary is simply not
    stored (best-effort semantics preserved from the original).
    """
    try:
        now = datetime.utcnow()
        key_prefix = KVDB.SERVICE_SUMMARY_PREFIX_PATTERN.format(target)

        if target == 'by-week':
            start = parse_datetime((now + relativedelta(weekday=MO(-1))).strftime('%Y-%m-%d 00:00:00')) # Current week start
            key_suffix = start.strftime(DT_PATTERNS.SUMMARY_SUFFIX_PATTERNS[target])
        else:
            start = parse_datetime(now.strftime('%Y-%m-%d 00:00:00')) # Current day start
            key_suffix = now.strftime(DT_PATTERNS.SUMMARY_SUFFIX_PATTERNS[target])

        total_seconds = (now - start).total_seconds()

        patterns = []
        for pattern_name in pattern_names:
            patterns.append(getattr(self, 'get_by_{}_patterns'.format(pattern_name))(now))

        services = {}

        for elem in chain(*patterns):
            prefix, suffix = elem.split('*')
            suffix = suffix[1:]

            # NOTE: kept under its own name - the original rebound 'stats'
            # inside the loop that iterated it, which was confusing shadowing.
            collected = self.collect_service_stats(elem, prefix, suffix, None, False, False, False)

            for service_name, values in collected.items():
                stats = services.setdefault(service_name, deepcopy(DEFAULT_STATS))

                for stat_key in STATS_KEYS:
                    value = values[stat_key]
                    if stat_key == 'usage':
                        stats[stat_key] += value
                    elif stat_key == 'max':
                        stats[stat_key] = max(stats[stat_key], value)
                    elif stat_key == 'mean':
                        # Means are accumulated in a list and averaged below
                        stats[stat_key].append(value)
                    elif stat_key == 'min':
                        stats[stat_key] = min(stats[stat_key], value)

        for service_name, values in services.items():
            values['mean'] = round(tmean(values['mean']), 2)
            values['rate'] = round(values['usage'] / total_seconds, 2)

    except Exception:
        self.logger.debug('Could not store mean/rate. e=`%r`, locals=`%r`', format_exc(), locals())
    else:
        self.hset_aggr_keys(services, key_prefix, key_suffix)
def collect_service_stats(self, keys_pattern, key_prefix, key_suffix, total_seconds,
        suffix_needs_colon=True, chop_off_service_name=True, needs_rate=True):
    """ Collects and merges per-service statistics from all Redis hashes
    matching keys_pattern.

    keys_pattern - glob pattern passed to Redis KEYS
    key_prefix/key_suffix - stripped off each key to recover the service name
    total_seconds - period length used to compute the usage rate
    suffix_needs_colon - whether ':' must be prepended to key_suffix first
    chop_off_service_name - whether to drop the last 3 characters of the
        recovered service name
    needs_rate - whether to add a 'rate' entry (usage / total_seconds)

    Returns a dict of service_name -> dict of aggregated STATS_KEYS values.
    """
    service_stats = {}
    if suffix_needs_colon:
        key_suffix = ':' + key_suffix

    for key in self.kvdb.conn.keys(keys_pattern):
        service_name = key.replace(key_prefix, '').replace(key_suffix, '')

        if chop_off_service_name:
            service_name = service_name[:-3]

        values = self.kvdb.conn.hgetall(key)
        stats = service_stats.setdefault(service_name, {})

        for name in STATS_KEYS:
            value = values.get(name)

            # Zero or missing values are treated as absent and skipped -
            # note this also skips a legitimate 0 min/usage on purpose
            if value:
                if name in ('rate', 'mean'):
                    value = float(value)
                else:
                    value = int(value)

                # Lazily initialize the accumulator with an identity element
                # appropriate for how the stat is merged below
                if name not in stats:
                    if name == 'mean':
                        stats[name] = []
                    elif name == 'min':
                        stats[name] = maxint
                    else:
                        stats[name] = 0

                if name == 'usage':
                    stats[name] += value
                elif name == 'max':
                    stats[name] = max(stats[name], value)
                elif name == 'mean':
                    stats[name].append(value)
                elif name == 'min':
                    stats[name] = min(stats[name], value)

    for service_name, values in service_stats.items():
        mean = values.get('mean')
        if mean:
            # Collapse the list of means gathered above into a single mean
            values['mean'] = tmean(mean)

        if needs_rate:
            values['rate'] = values['usage'] / total_seconds

    return service_stats
def handle(self):
    """ Folds each service's raw response times into its all-time
    min/max/mean aggregates and trims the already-processed raw entries.
    Does nothing when statistics are disabled.
    """
    if not self.stats_enabled():
        return

    #
    # Sample config values
    #
    # global_slow_threshold=120
    # max_batch_size=99999
    #
    config = Bunch()
    for item in self.request.payload.splitlines():
        item = item.strip()
        if not item:
            continue # Be lenient towards blank lines in the payload
        # maxsplit=1 guards against values that themselves contain '='
        key, value = item.split('=', 1)
        config[key] = int(value)

    for key in self.server.kvdb.conn.keys(KVDB.SERVICE_TIME_RAW + '*'):

        service_name = key.replace(KVDB.SERVICE_TIME_RAW, '')

        current_mean = float(
            self.server.kvdb.conn.hget(KVDB.SERVICE_TIME_BASIC + service_name, 'mean_all_time') or 0)
        current_min = float(
            self.server.kvdb.conn.hget(KVDB.SERVICE_TIME_BASIC + service_name, 'min_all_time') or 0)
        current_max = float(
            self.server.kvdb.conn.hget(KVDB.SERVICE_TIME_BASIC + service_name, 'max_all_time') or 0)

        batch_min, batch_max, batch_mean, batch_total = self.aggregate_raw_times(
            key, service_name, config.max_batch_size)

        self.server.kvdb.conn.hset(KVDB.SERVICE_TIME_BASIC + service_name, 'mean_all_time',
            tmean(batch_mean, limit_to=current_mean))
        self.server.kvdb.conn.hset(KVDB.SERVICE_TIME_BASIC + service_name, 'min_all_time',
            min(current_min, batch_min))
        self.server.kvdb.conn.hset(KVDB.SERVICE_TIME_BASIC + service_name, 'max_all_time',
            max(current_max, batch_max))

        # Services use RPUSH for storing raw times so we are safe to use LTRIM
        # in order to do away with the already processed ones
        self.server.kvdb.conn.ltrim(key, batch_total, -1)
def get_stats(self, start, stop, service='*', n=None, n_type=None, needs_trends=True,
        stats_key_prefix=None, suffixes=None):
    """ Returns statistics for a given interval, as defined by 'start' and 'stop'.
    service default to '*' for all services in that period and may be set to return
    a one-element list of information regarding that particular service. Setting 'n'
    to a positive integer will make it return only top n services.

    This is a generator yielding StatsElem instances, one per service found.
    'n_type' selects what 'top n' means when 'n' is given (passed through to
    yield_top_n). 'needs_trends' controls whether the comma-joined trend
    strings are built. 'stats_key_prefix' and 'suffixes' default to values
    derived from the instance and the interval, respectively.
    """
    if not stats_key_prefix:
        stats_key_prefix = self.stats_key_prefix

    stats_elems = {}
    all_services_stats = Bunch({'usage': 0, 'time': 0})

    # All mean values
    mean_all_services_list = []

    # A mean value of all the mean values (mean_all_services_list)
    mean_all_services = 0

    start = parse_datetime(start)
    stop = parse_datetime(stop)
    delta = (stop - start)

    # total_seconds is unavailable on some timedelta-like objects here,
    # hence the hasattr guard; note .seconds ignores the .days component
    if hasattr(delta, 'total_seconds'):
        delta_seconds = delta.total_seconds()
    else:
        delta_seconds = delta.seconds

    if not suffixes:
        suffixes = self.get_suffixes(start, stop)

    # We make several passes. First two passes are made over Redis keys, one gathers
    # the services, if any at all, and another one actually collects statistics for
    # each service found. Next pass, a partly optional one, computes trends for mean
    # response time and service usage. Another one computes each of the service's
    # average rate and updates other attributes basing on values collected in the
    # previous step. Optionally, the last one will pick only top n elements of a
    # given type (top mean response time or top usage).

    # 1st pass
    for suffix in suffixes:
        keys = self.server.kvdb.conn.keys('{}{}:{}'.format(stats_key_prefix, service, suffix))
        for key in keys:
            service_name = key.replace(stats_key_prefix, '').replace(':{}'.format(suffix), '')

            stats_elem = StatsElem(service_name)
            stats_elems[service_name] = stats_elem

            # When building statistics, we can't expect there will be data for all the time
            # elems built above so to guard against it, this is a dictionary whose keys are
            # the said elems and values are mean/usage for each elem. The values will remain
            # 0/0.0 if there is no data for the time elem, which may mean that in this
            # particular time slice the service wasn't invoked at all.
            stats_elem.expected_time_elems = OrderedDict(
                (elem, Bunch({'mean': 0, 'usage': 0.0})) for elem in suffixes)

    # 2nd pass
    for service, stats_elem in stats_elems.items():
        for suffix in suffixes:
            key = '{}{}:{}'.format(stats_key_prefix, service, suffix)

            # We can convert all the values to floats here to ease with computing
            # all the stuff and convert them still to integers later on, when necessary.
            key_values = Bunch(
                ((name, float(value)) for (name, value) in iteritems(self.server.kvdb.conn.hgetall(key))))

            if key_values:
                time = (key_values.usage * key_values.mean)
                stats_elem.time += time

                mean_all_services_list.append(key_values.mean)
                all_services_stats.time += time
                all_services_stats.usage += key_values.usage

                stats_elem.min_resp_time = min(stats_elem.min_resp_time, key_values.min)
                stats_elem.max_resp_time = max(stats_elem.max_resp_time, key_values.max)

                for attr in ('mean', 'usage'):
                    stats_elem.expected_time_elems[suffix][attr] = key_values[attr]

    # NOTE(review): this becomes a *string* when there is data, and the int 0
    # otherwise - downstream consumers presumably accept both; confirm.
    mean_all_services = '{:.0f}'.format(tmean(mean_all_services_list)) if mean_all_services_list else 0

    # 3rd pass (partly optional)
    for stats_elem in stats_elems.values():
        stats_elem.mean_all_services = mean_all_services
        stats_elem.all_services_time = int(all_services_stats.time)
        stats_elem.all_services_usage = int(all_services_stats.usage)

        values = stats_elem.expected_time_elems.values()

        stats_elem.mean_trend_int = [int(elem.mean) for elem in values]
        stats_elem.usage_trend_int = [int(elem.usage) for elem in values]

        stats_elem.mean = float('{:.2f}'.format(tmean(stats_elem.mean_trend_int)))
        stats_elem.usage = sum(stats_elem.usage_trend_int)
        stats_elem.rate = float('{:.2f}'.format(sum(stats_elem.usage_trend_int) / delta_seconds))

        self.set_percent_of_all_services(all_services_stats, stats_elem)

        if needs_trends:
            stats_elem.mean_trend = ','.join(str(elem) for elem in stats_elem.mean_trend_int)
            stats_elem.usage_trend = ','.join(str(elem) for elem in stats_elem.usage_trend_int)

    # 4th pass (optional)
    if n:
        for stats_elem in self.yield_top_n(n, n_type, stats_elems):
            yield stats_elem
    else:
        for stats_elem in stats_elems.values():
            yield stats_elem