def test_get_metric_statistics(self):
    """Smoke test: request daily Average/Sum statistics for the first
    available metric over the last two weeks."""
    conn = CloudWatchConnection()
    metric = conn.list_metrics()[0]
    window_end = datetime.datetime.utcnow()
    window_start = window_end - datetime.timedelta(days=14)
    conn.get_metric_statistics(
        86400, window_start, window_end,
        metric.name, metric.namespace, ['Average', 'Sum'])
def test_get_metric_statistics(self):
    """Smoke test: request daily Average/Sum statistics for the first
    available metric over the last two weeks.

    Fix: the window is now built from UTC. CloudWatch interprets
    start/end timestamps as UTC, so using local ``datetime.now()``
    skewed the query window on any host not running in UTC.
    """
    c = CloudWatchConnection()
    m = c.list_metrics()[0]
    # was datetime.datetime.now(): local time gave a wrong UTC window
    end = datetime.datetime.utcnow()
    start = end - datetime.timedelta(hours=24 * 14)
    c.get_metric_statistics(
        3600 * 24, start, end, m.name, m.namespace, ['Average', 'Sum'])
class BotoWatchInterface(WatchInterface):
    """Thin pass-through from the WatchInterface API to a boto
    CloudWatchConnection, optionally mirroring responses to mockdata/
    JSON files when ``saveclcdata`` is set."""

    # Connection handle, created in __init__.
    conn = None
    # When True, each response is also dumped to mockdata/*.json.
    saveclcdata = False

    def __init__(self, clc_host, access_id, secret_key, token):
        """Connect to either a Eucalyptus CLC (port 8773,
        /services/CloudWatch) or real AWS (port 443, path /)."""
        #boto.set_stream_logger('foo')
        path='/services/CloudWatch'
        port=8773
        # Host ends in 'amazonaws.com' -> real AWS: swap the ec2 endpoint
        # for the monitoring one and use standard HTTPS at the root path.
        if clc_host[len(clc_host)-13:] == 'amazonaws.com':
            clc_host = clc_host.replace('ec2', 'monitoring', 1)
            path = '/'
            reg = None  # dead assignment: reg is rebound unconditionally below
            port=443
        reg = RegionInfo(name='eucalyptus', endpoint=clc_host)
        self.conn = CloudWatchConnection(access_id, secret_key, region=reg,
                                         port=port, path=path, is_secure=True,
                                         security_token=token, debug=0)
        # NOTE(review): certificate validation is disabled -- tolerable for a
        # self-signed Eucalyptus CLC cert, but it also applies when talking
        # to real AWS; confirm this is intended.
        self.conn.https_validate_certificates = False
        self.conn.http_connection_kwargs['timeout'] = 30

    def __save_json__(self, obj, name):
        """Dump obj as indented JSON to the named mock-data file."""
        f = open(name, 'w')
        json.dump(obj, f, cls=BotoJsonWatchEncoder, indent=2)
        f.close()

    def get_metric_statistics(self, period, start_name, end_time, metric_name,
                              namespace, statistics, dimensions, unit):
        """Forward GetMetricStatistics; capture the reply if requested."""
        obj = self.conn.get_metric_statistics(period, start_name, end_time,
                                              metric_name, namespace,
                                              statistics, dimensions, unit)
        if self.saveclcdata:
            self.__save_json__(obj, "mockdata/CW_Statistics.json")
        return obj

    def list_metrics(self, next_token=None, dimensions=None, metric_name=None,
                     namespace=None):
        """Forward ListMetrics; capture the reply if requested."""
        obj = self.conn.list_metrics(next_token, dimensions, metric_name,
                                     namespace)
        if self.saveclcdata:
            self.__save_json__(obj, "mockdata/CW_Metrics.json")
        return obj

    def put_metric_data(self, namespace, name, value, timestamp, unit,
                        dimensions, statistics):
        """Forward PutMetricData unchanged."""
        return self.conn.put_metric_data(namespace, name, value, timestamp,
                                         unit, dimensions, statistics)

    def describe_alarms(self, action_prefix=None, alarm_name_prefix=None,
                        alarm_names=None, max_records=None, state_value=None,
                        next_token=None):
        """Forward DescribeAlarms; capture the reply if requested."""
        obj = self.conn.describe_alarms(action_prefix, alarm_name_prefix,
                                        alarm_names, max_records, state_value,
                                        next_token)
        if self.saveclcdata:
            self.__save_json__(obj, "mockdata/CW_Alarms.json")
        return obj

    def delete_alarms(self, alarm_names):
        """Forward DeleteAlarms unchanged."""
        return self.conn.delete_alarms(alarm_names)

    def enable_alarm_actions(self, alarm_names):
        """Forward EnableAlarmActions unchanged."""
        return self.conn.enable_alarm_actions(alarm_names)

    def disable_alarm_actions(self, alarm_names):
        """Forward DisableAlarmActions unchanged."""
        return self.conn.disable_alarm_actions(alarm_names)

    def put_metric_alarm(self, alarm):
        """Forward PutMetricAlarm unchanged."""
        return self.conn.put_metric_alarm(alarm)
def getCloudWatchMetric(): end_time = datetime.datetime.now() # adding 65 seconds due amazon caracteristic end_time = end_time - datetime.timedelta(seconds=65) start_time = end_time - datetime.timedelta(seconds=args.interval) if args.verbose: debug = args.verbose else: debug = 0 regions = boto.ec2.cloudwatch.regions() for reg in regions: if reg.name == args.region: cloudwatch = CloudWatchConnection(is_secure=True, debug=debug, region=reg) cloudwatch_result = None # Check if the metric has collected statistics. If it does not, say so metricsList = cloudwatch.list_metrics(dimensions=dimension, namespace=args.namespace) metricTest = 'Metric:' + args.metric strMetricsList = [] for item in metricsList: strMetricsList.append(str(item)) if metricTest in strMetricsList: # Specify the application load balancer as follows: app/load-balancer-name/1234567890123456 (the final portion of the load balancer ARN) #tested metrics for ALB: TargetResponseTime(Average),RequestCount(Sum),ActiveConnectionCount(Sum),NewConnectionCount(Sum),HTTPCode_Target_4XX_Count(Sum),HTTPCode_Target_5XX_Count(Sum),HealthyHostCount(Average) cloudwatch_result = cloudwatch.get_metric_statistics( args.interval, start_time, end_time, args.metric, args.namespace, statistics=args.statistic, dimensions=dimension) if len(cloudwatch_result) > 0: cloudwatch_result = cloudwatch_result[0] if len(cloudwatch_result) > 0: if len(repr(cloudwatch_result[args.statistic])) > 6: cloudwatch_result = long(cloudwatch_result[args.statistic]) else: cloudwatch_result = float( cloudwatch_result[args.statistic]) else: # Assuming value is 0 if AWS returned empty list cloudwatch_result = 0 print cloudwatch_result else: print 'Unsupported Metric' return
def get_cloudwatch_top_metrics():
    """Build a plain-text report of per-metric response counts.

    Lists every metric in the configured namespace/dimensions, sums each
    one over the START_DELTA_AGO window, keeps those at or above
    TOP_THRESHOLD_COUNT, and renders them sorted by count (descending).
    The total-sent metric is relabelled 'TOTAL'; the processing-time
    metric is skipped.
    """
    conn = CloudWatchConnection()

    # Gather all metric names, following ListMetrics pagination.
    all_names = []
    token = None
    while True:
        page = conn.list_metrics(next_token=token,
                                 dimensions=settings.CLOUDWATCH_DIMENSIONS,
                                 namespace=settings.CLOUDWATCH_NAMESPACE)
        all_names += [m.name for m in page]
        token = page.next_token
        if token is None:
            break

    # (metric_name, count) pairs above the reporting threshold.
    counted = []
    for name in all_names:
        datapoints = conn.get_metric_statistics(
            int(START_DELTA_AGO.total_seconds()),
            datetime.datetime.now() - START_DELTA_AGO,
            datetime.datetime.now(),
            name,
            settings.CLOUDWATCH_NAMESPACE,
            'Sum',
            settings.CLOUDWATCH_DIMENSIONS,
            'Count')
        if not datapoints:
            # Some metrics will not have (or no longer have) results.
            continue
        total = int(datapoints[0]['Sum'])
        if total >= TOP_THRESHOLD_COUNT:
            counted.append((name, total))
    counted.sort(key=lambda pair: pair[1], reverse=True)

    report = 'Responses sent\n----------------------\n'
    for name, total in counted:
        if name == settings.CLOUDWATCH_TOTAL_SENT_METRIC_NAME:
            label = 'TOTAL'
        else:
            label = name
        if label == settings.CLOUDWATCH_PROCESSING_TIME_METRIC_NAME:
            continue
        report += '%s %s\n' % (str(total).rjust(5), label)
    return report
class BotoWatchInterface(WatchInterface):
    """Pass-through from the WatchInterface API to a boto
    CloudWatchConnection, selecting constructor arguments based on the
    installed boto version."""

    # Connection handle, created in __init__.
    conn = None
    # When True, each response is also dumped to mockdata/*.json.
    saveclcdata = False

    @staticmethod
    def _version_tuple(version):
        """Parse a dotted version string into a comparable tuple of ints.

        Non-numeric suffixes are dropped ('2.6b3' -> (2, 6)); parsing stops
        at the first component with no leading digits.
        """
        parts = []
        for part in version.split('.'):
            digits = ''
            for ch in part:
                if not ch.isdigit():
                    break
                digits += ch
            if not digits:
                break
            parts.append(int(digits))
        return tuple(parts)

    def __init__(self, clc_host, access_id, secret_key, token):
        """Connect to either a Eucalyptus CLC (port 8773,
        /services/CloudWatch) or real AWS (port 443, path /)."""
        #boto.set_stream_logger('foo')
        path = '/services/CloudWatch'
        port = 8773
        # Host ends in 'amazonaws.com' -> real AWS: swap the ec2 endpoint
        # for the monitoring one and use standard HTTPS at the root path.
        if clc_host[len(clc_host) - 13:] == 'amazonaws.com':
            clc_host = clc_host.replace('ec2', 'monitoring', 1)
            path = '/'
            port = 443
        reg = RegionInfo(name='eucalyptus', endpoint=clc_host)
        # BUG FIX: the original compared version *strings*
        # (boto.__version__ < '2.6'), and lexicographically '2.10' < '2.6'
        # is True, so boto >= 2.10 wrongly took the legacy branch and never
        # received validate_certs=False. Compare numeric tuples instead.
        if self._version_tuple(boto.__version__) < (2, 6):
            # boto releases before 2.6 do not accept validate_certs.
            self.conn = CloudWatchConnection(access_id, secret_key,
                                             region=reg, port=port, path=path,
                                             is_secure=True,
                                             security_token=token, debug=0)
        else:
            self.conn = CloudWatchConnection(access_id, secret_key,
                                             region=reg, port=port, path=path,
                                             validate_certs=False,
                                             is_secure=True,
                                             security_token=token, debug=0)
        self.conn.http_connection_kwargs['timeout'] = 30

    def __save_json__(self, obj, name):
        """Dump obj as indented JSON to the named mock-data file."""
        f = open(name, 'w')
        json.dump(obj, f, cls=BotoJsonWatchEncoder, indent=2)
        f.close()

    def get_metric_statistics(self, period, start_name, end_time, metric_name,
                              namespace, statistics, dimensions, unit):
        """Forward GetMetricStatistics; capture the reply if requested."""
        obj = self.conn.get_metric_statistics(period, start_name, end_time,
                                              metric_name, namespace,
                                              statistics, dimensions, unit)
        if self.saveclcdata:
            self.__save_json__(obj, "mockdata/CW_Statistics.json")
        return obj

    def list_metrics(self, next_token, dimensions, metric_name, namespace):
        """Forward ListMetrics; capture the reply if requested."""
        obj = self.conn.list_metrics(next_token, dimensions, metric_name,
                                     namespace)
        if self.saveclcdata:
            self.__save_json__(obj, "mockdata/CW_Metrics.json")
        return obj

    def put_metric_data(self, namespace, name, value, timestamp, unit,
                        dimensions, statistics):
        """Forward PutMetricData unchanged."""
        return self.conn.put_metric_data(namespace, name, value, timestamp,
                                         unit, dimensions, statistics)
class BotoWatchInterface(WatchInterface):
    """Watch interface backed by a boto CloudWatchConnection; responses can
    be mirrored to mockdata/ JSON files for offline testing."""

    conn = None          # CloudWatch connection, built in __init__
    saveclcdata = False  # True -> also dump each response to mockdata/*.json

    def __init__(self, clc_host, access_id, secret_key, token):
        # boto.set_stream_logger('foo')
        service_path = "/services/CloudWatch"
        service_port = 8773
        # A host ending in 'amazonaws.com' means real AWS: switch to the
        # monitoring endpoint on standard HTTPS at the service root.
        if clc_host[len(clc_host) - 13 :] == "amazonaws.com":
            clc_host = clc_host.replace("ec2", "monitoring", 1)
            service_path = "/"
            service_port = 443
        region = RegionInfo(name="eucalyptus", endpoint=clc_host)
        self.conn = CloudWatchConnection(
            access_id,
            secret_key,
            region=region,
            port=service_port,
            path=service_path,
            is_secure=True,
            security_token=token,
            debug=0,
        )
        self.conn.https_validate_certificates = False
        self.conn.http_connection_kwargs["timeout"] = 30

    def __save_json__(self, obj, name):
        """Serialize obj to the named file as indented JSON."""
        with open(name, "w") as fp:
            json.dump(obj, fp, cls=BotoJsonWatchEncoder, indent=2)

    def get_metric_statistics(self, period, start_name, end_time, metric_name, namespace, statistics, dimensions, unit):
        """Proxy GetMetricStatistics, optionally saving the reply."""
        stats = self.conn.get_metric_statistics(
            period, start_name, end_time, metric_name, namespace, statistics, dimensions, unit
        )
        if self.saveclcdata:
            self.__save_json__(stats, "mockdata/CW_Statistics.json")
        return stats

    def list_metrics(self, next_token=None, dimensions=None, metric_name=None, namespace=None):
        """Proxy ListMetrics, optionally saving the reply."""
        found = self.conn.list_metrics(next_token, dimensions, metric_name, namespace)
        if self.saveclcdata:
            self.__save_json__(found, "mockdata/CW_Metrics.json")
        return found

    def put_metric_data(self, namespace, name, value, timestamp, unit, dimensions, statistics):
        """Proxy PutMetricData."""
        return self.conn.put_metric_data(namespace, name, value, timestamp, unit, dimensions, statistics)

    def describe_alarms(
        self,
        action_prefix=None,
        alarm_name_prefix=None,
        alarm_names=None,
        max_records=None,
        state_value=None,
        next_token=None,
    ):
        """Proxy DescribeAlarms, optionally saving the reply."""
        alarms = self.conn.describe_alarms(
            action_prefix, alarm_name_prefix, alarm_names, max_records, state_value, next_token
        )
        if self.saveclcdata:
            self.__save_json__(alarms, "mockdata/CW_Alarms.json")
        return alarms

    def delete_alarms(self, alarm_names):
        """Proxy DeleteAlarms."""
        return self.conn.delete_alarms(alarm_names)

    def enable_alarm_actions(self, alarm_names):
        """Proxy EnableAlarmActions."""
        return self.conn.enable_alarm_actions(alarm_names)

    def disable_alarm_actions(self, alarm_names):
        """Proxy DisableAlarmActions."""
        return self.conn.disable_alarm_actions(alarm_names)

    def put_metric_alarm(self, alarm):
        """Proxy PutMetricAlarm."""
        return self.conn.put_metric_alarm(alarm)
class Monitor:
    """Collects PostgreSQL and pgbouncer health metrics on an EC2 host and
    pushes them to CloudWatch under the '9apps/postgres' namespace.

    Python 2 code (print statement, bare excepts). Depends on module-level
    json, urlopen, sys, psycopg2, datetime, RegionInfo,
    CloudWatchConnection and a project `settings` module.
    """

    def __init__(self, key, access):
        # Identity and placement come from the EC2 instance metadata service.
        try:
            url = "http://169.254.169.254/latest/"
            self.userdata = json.load(urlopen(url + "user-data/"))
            public_hostname = urlopen(url + "meta-data/public-hostname/").read()
            zone = urlopen(url + "meta-data/placement/availability-zone/").read()
            # zone looks like 'us-east-1a'; dropping the last char yields
            # the region name.
            region = zone[:-1]
        except:
            sys.exit("We should be getting user-data here...")
        # the name (and identity) of the cluster (the master)
        self.cluster = self.userdata['cluster']
        self.name = "{0}.{1}".format(self.userdata['name'], self.cluster)
        endpoint = "monitoring.{0}.amazonaws.com".format(region)
        region_info = RegionInfo(name=region, endpoint=endpoint)
        self.cloudwatch = CloudWatchConnection(key, access, region=region_info)
        self.namespace = '9apps/postgres'
        # Maintenance connection to the configured main database.
        self.connection = psycopg2.connect(host=settings.host, port=5432,
                dbname=settings.database_name,
                user=settings.database_user,
                password=settings.database_password)
        # now, the non-system database connections
        self.databases = []
        try:
            database_cursor = self.connection.cursor()
            database_cursor.execute("select datname from pg_stat_database where datname !~ '(template[0-9]+|root|postgres)'")
            for database in database_cursor:
                # Each entry is [dbname, open connection to that db].
                self.databases.append([database[0],
                        psycopg2.connect(host=settings.host, port=5432,
                                dbname=database[0],
                                user=settings.database_user,
                                password=settings.database_password)])
        finally:
            database_cursor.close()
        # pgbouncer's admin console speaks SQL on port 6432.
        self.pgbouncer = psycopg2.connect(host=settings.host, port=6432,
                dbname='pgbouncer',
                user=settings.database_user,
                password=settings.database_password)
        # pgbouncer's console requires autocommit (isolation level 0);
        # without this it doesn't work.
        self.pgbouncer.set_isolation_level(0)

    def __del__(self):
        # Close only the main connection; per-database and pgbouncer
        # connections are left to interpreter teardown.
        self.connection.close()

    def is_in_recovery(self):
        """Return True when the server reports it is a recovering standby."""
        self.connection.autocommit = True
        try:
            cur = self.connection.cursor()
            cur.execute("select pg_is_in_recovery()")
            in_recovery = cur.fetchone()[0]
        finally:
            cur.close()
        return in_recovery == True

    def collect(self, monitoring = 'on'):
        """Gather all metric samples.

        Returns [names, values, units, dimensions]; empty lists/dict when
        monitoring is neither 'on' nor 'all'.
        """
        if monitoring not in ['on', 'all']:
            return [[], [], [], {}]
        now = datetime.now()  # NOTE(review): unused
        names = []
        values = []
        units = []
        dimensions = { 'name' : self.name, 'cluster' : self.cluster }
        # Standby lag is only meaningful when userdata names a master.
        if 'master' in self.userdata:
            [offset, receive_offset, replay_offset] = self._get_standby_lag()
            if receive_offset != None:
                names.append('receive_lag')
                values.append(int(offset - receive_offset))
                units.append('Bytes')
            if replay_offset != None:
                names.append('replay_lag')
                values.append(int(offset - replay_offset))
                units.append('Bytes')
        for database in self.databases:
            # Cache hit ratios for both table heaps and indexes.
            for relation in ["heap", "idx"]:
                [read, hit, hitratio] = self._get_hitratio(database[1], relation)
                names.append("{0}_{1}_read".format(database[0], relation))
                values.append(int(read))
                units.append("Count")
                names.append("{0}_{1}_hit".format(database[0], relation))
                values.append(int(hit))
                units.append("Count")
                # hitratio is None when there were no block hits at all.
                if hitratio != None:
                    names.append("{0}_{1}_hitratio".format(database[0], relation))
                    values.append(float(hitratio * 100))
                    units.append("Percent")
            # Recovery-conflict counters from pg_stat_database_conflicts.
            conflicts = self._get_conflicts(database[0])
            names.append("{0}_{1}".format(database[0], 'confl_tablespace'))
            values.append(int(conflicts[0]))
            units.append("Count")
            names.append("{0}_{1}".format(database[0], 'confl_lock'))
            values.append(int(conflicts[1]))
            units.append("Count")
            names.append("{0}_{1}".format(database[0], 'confl_snapshot'))
            values.append(int(conflicts[2]))
            units.append("Count")
            names.append("{0}_{1}".format(database[0], 'confl_bufferpin'))
            values.append(int(conflicts[3]))
            units.append("Count")
            names.append("{0}_{1}".format(database[0], 'confl_deadlock'))
            values.append(int(conflicts[4]))
            units.append("Count")
            indexes_size = self._get_indexes_size(database[1])
            names.append("{0}_indexes_size".format(database[0]))
            values.append(int(indexes_size))
            units.append("Bytes")
            tables_size = self._get_tables_size(database[1])
            names.append("{0}_tables_size".format(database[0]))
            values.append(int(tables_size))
            units.append("Bytes")
        # Number of WAL files.
        size = self._get_nr_wal_files()
        names.append("wal_files")
        values.append(int(size))
        units.append("Count")
        # pgbouncer aggregate statistics ('show stats').
        stats = self._get_pgbouncer_stats()
        names.append("pgbouncer_avg_req")
        values.append(int(stats[0]))
        units.append("Count/Second")
        names.append("pgbouncer_avg_recv")
        values.append(int(stats[1]))
        units.append("Bytes/Second")
        names.append("pgbouncer_avg_sent")
        values.append(int(stats[2]))
        units.append("Bytes/Second")
        names.append("pgbouncer_avg_query")
        # avg_query is reported in microseconds; convert to seconds.
        # NOTE(review): on Python 2 this is integer division when stats[3]
        # is an int -- sub-second averages collapse to 0; confirm intended.
        values.append(float(stats[3] / 1000000))
        units.append("Seconds")
        # pgbouncer pool counters ('show pools', summed over pools).
        pools = self._get_pgbouncer_pools()
        names.append("pgbouncer_cl_active")
        values.append(float(pools[0]))
        units.append("Count")
        names.append("pgbouncer_cl_waiting")
        values.append(float(pools[1]))
        units.append("Count")
        names.append("pgbouncer_sv_active")
        values.append(float(pools[2]))
        units.append("Count")
        names.append("pgbouncer_sv_idle")
        values.append(float(pools[3]))
        units.append("Count")
        names.append("pgbouncer_sv_used")
        values.append(float(pools[4]))
        units.append("Count")
        names.append("pgbouncer_sv_tested")
        values.append(float(pools[5]))
        units.append("Count")
        names.append("pgbouncer_sv_login")
        values.append(float(pools[6]))
        units.append("Count")
        names.append("pgbouncer_maxwait")
        values.append(float(pools[7]))
        units.append("Count")
        return [names, values, units, dimensions]

    def put(self):
        """Collect and push metrics to CloudWatch, 20 samples per call
        (the PutMetricData batch limit)."""
        result = False
        try:
            # only monitor if we are told to (this will break, if not set)
            monitoring = self.userdata['monitoring']
        except:
            monitoring = 'on'
        if monitoring in ['on', 'all']:
            # first get all we need
            [names, values, units, dimensions] = self.collect(monitoring)
            while len(names) > 0:
                names20 = names[:20]
                values20 = values[:20]
                units20 = units[:20]
                # We can't send all at once, only 20 at a time:
                # first aggregated over all dimensions...
                result = self.cloudwatch.put_metric_data(self.namespace,
                        names20, value=values20, unit=units20)
                # ...then once per individual dimension.
                for dimension in dimensions:
                    dimension = { dimension : dimensions[dimension] }
                    result &= self.cloudwatch.put_metric_data(
                            self.namespace, names20, value=values20,
                            unit=units20, dimensions=dimension)
                del names[:20]
                del values[:20]
                del units[:20]
        else:
            print "we are not monitoring"
        return result

    def metrics(self):
        """Return the CloudWatch metric listing for this account/region."""
        return self.cloudwatch.list_metrics()

    def _get_nr_wal_files(self):
        # Count WAL segment files in pg_xlog (excluding archive_status).
        try:
            cursor = self.connection.cursor()
            sql = "select count(name) from (select pg_ls_dir('pg_xlog') as name) as xlogs where name != 'archive_status'"
            cursor.execute(sql)
            [size] = cursor.fetchone()
        finally:
            cursor.close()
        return size

    def _get_tables_size(self, connection):
        # Total on-disk size of all user tables in the given database.
        try:
            cursor = connection.cursor()
            sql = "select sum(pg_relation_size(relid)) from pg_stat_user_tables"
            cursor.execute(sql)
            [size] = cursor.fetchone()
        finally:
            cursor.close()
        return size

    def _get_indexes_size(self, connection):
        # Total on-disk size of all user indexes in the given database.
        try:
            cursor = connection.cursor()
            sql = "select sum(pg_relation_size(indexrelid)) from pg_stat_user_indexes"
            cursor.execute(sql)
            [size] = cursor.fetchone()
        finally:
            cursor.close()
        return size

    def _get_conflicts(self, database):
        # Recovery-conflict counters for one database; columns 2..6 are
        # tablespace, lock, snapshot, bufferpin, deadlock.
        try:
            cursor = self.connection.cursor()
            sql = "select * from pg_stat_database_conflicts where datname = '{0}'".format(database)
            cursor.execute(sql)
            conflicts = cursor.fetchone()
        finally:
            cursor.close()
        return [conflicts[2], conflicts[3], conflicts[4],
                conflicts[5], conflicts[6]]

    def _get_hitratio(self, connection, relation="heap"):
        """Return [blocks_read, blocks_hit, hit_ratio] for user tables
        ('heap') or user indexes ('idx'); hit_ratio is None when there
        were no hits (nullif guards division by zero)."""
        if relation == "heap":
            table = "tables"
        else:
            table = "indexes"
        try:
            cursor = connection.cursor()
            sql = "select sum({0}_blks_read) as read, sum({0}_blks_hit) as hit, (sum({0}_blks_hit) - sum({0}_blks_read)) / nullif(sum({0}_blks_hit),0) as hitratio from pg_statio_user_{1}".format(relation, table)
            cursor.execute(sql)
            [read, hit, hitratio] = cursor.fetchone()
        finally:
            cursor.close()
        return [read, hit, hitratio]

    def _get_standby_lag(self):
        """Return [master_offset, receive_offset, replay_offset] in bytes.

        WAL positions come back as 'XXXXXXXX/YYYYYYYY' hex pairs; each is
        folded into a single byte offset. receive/replay are None when the
        corresponding position is unavailable (e.g. not streaming).
        """
        try:
            master = psycopg2.connect(host=self.userdata['master'],
                    dbname=settings.database_name,
                    user=settings.database_user,
                    password=settings.database_password)
            master.autocommit = True
            try:
                cursor = master.cursor()
                cursor.execute(
                        "SELECT pg_current_xlog_location() AS location")
                [x, y] = (cursor.fetchone()[0]).split('/')
                # 0xff000000 is the per-segment-file multiplier used to
                # linearize the two-part WAL position.
                offset = (int('ff000000', 16) * int(x, 16)) + int(y, 16)
            finally:
                cursor.close()
            try:
                cursor = self.connection.cursor()
                cursor.execute(
                        "SELECT pg_last_xlog_receive_location(), pg_last_xlog_replay_location()")
                one = cursor.fetchone()
                try:
                    [x, y] = (one[0]).split('/')
                    receive_offset = (int('ff000000', 16) * int(x, 16)) + int(y, 16)
                except:
                    receive_offset = None
                try:
                    [x, y] = (one[1]).split('/')
                    replay_offset = (int('ff000000', 16) * int(x, 16)) + int(y, 16)
                except:
                    replay_offset = None
            finally:
                cursor.close()
        finally:
            master.close()
        return [offset, receive_offset, replay_offset]

    def _get_pgbouncer_stats(self):
        # 'show stats' row layout:
        # ('pgbouncer\x00', 119L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)
        try:
            cursor = self.pgbouncer.cursor()
            cursor.execute('show stats')
            [name, total_requests, total_received,
             total_sent, total_query_time,
             avg_req, avg_recv, avg_sent, avg_query] = cursor.fetchone()
        finally:
            cursor.close()
        return [avg_req, avg_recv, avg_sent, avg_query]

    def _get_pgbouncer_pools(self):
        """Sum the 'show pools' counters across all pools; maxwait is the
        maximum rather than the sum."""
        cl_active = cl_waiting = sv_active = sv_idle = 0
        sv_used = sv_tested = sv_login = maxwait = 0
        try:
            cursor = self.pgbouncer.cursor()
            cursor.execute('show pools')
            # ('pgbouncer\x00', 'pgbouncer\x00', 1, 0, 0, 0, 0, 0, 0, 0)
            for pool in cursor:
                cl_active += pool[2]
                cl_waiting += pool[3]
                sv_active += pool[4]
                sv_idle += pool[5]
                sv_used += pool[6]
                sv_tested += pool[7]
                sv_login += pool[8]
                maxwait = max(maxwait, pool[9])
        finally:
            cursor.close()
        return [cl_active, cl_waiting, sv_active, sv_idle,
                sv_used, sv_tested, sv_login, maxwait]
class Monitor:
    """Collects Redis INFO metrics on an EC2 host and pushes them to
    CloudWatch under the '9apps/redis' namespace.

    Python 2 code (print statement, bare except). Depends on module-level
    json, urlopen, sys, redis, datetime, RegionInfo, CloudWatchConnection.
    """

    def __init__(self, key, access, cluster):
        # Identity and placement come from the EC2 instance metadata service.
        try:
            url = "http://169.254.169.254/latest/"
            self.userdata = json.load(urlopen(url + "user-data/"))
            public_hostname = urlopen(url + "meta-data/public-hostname/").read()
            zone = urlopen(url + "meta-data/placement/availability-zone/").read()
            # zone looks like 'us-east-1a'; dropping the last char yields
            # the region name.
            region = zone[:-1]
        except:
            sys.exit("We should be getting user-data here...")
        # the name (and identity) of the cluster (the master)
        self.cluster = cluster
        self.redis = redis.StrictRedis(host='localhost', port=6379)
        endpoint = "monitoring.{0}.amazonaws.com".format(region)
        region_info = RegionInfo(name=region, endpoint=endpoint)
        self.cloudwatch = CloudWatchConnection(key, access, region=region_info)
        self.namespace = '9apps/redis'
        # get the host, but without the logging
        self.node = public_hostname

    def collect(self, monitoring = 'on'):
        """Gather metric samples from Redis INFO (plus per-key statistics
        when monitoring == 'all').

        Returns [names, values, units, dimensions]; empty lists/dict when
        monitoring is neither 'on' nor 'all'.
        """
        if monitoring not in ['on', 'all']:
            return [[], [], [], {}]
        now = datetime.now()  # NOTE(review): unused
        items = self.redis.info()
        names = []
        values = []
        units = []
        dimensions = { 'node' : self.node, 'cluster' : self.cluster }
        # Slow-log length.
        slowlog_len = self.redis.execute_command('SLOWLOG','LEN')
        names.append('slowlog_len')
        values.append(slowlog_len)
        units.append('Count')
        # Append-only-file metrics, present only when AOF is enabled.
        if items['aof_enabled']:
            names.append('bgrewriteaof_in_progress')
            values.append(items['bgrewriteaof_in_progress'])
            units.append('Count')
            names.append('aof_pending_bio_fsync')
            values.append(items['aof_pending_bio_fsync'])
            units.append('Count')
            names.append('aof_buffer_length')
            values.append(items['aof_buffer_length'])
            units.append('Count')
            names.append('aof_current_size')
            values.append(items['aof_current_size'])
            units.append('Bytes')
            names.append('aof_pending_rewrite')
            values.append(items['aof_pending_rewrite'])
            units.append('Count')
            names.append('aof_base_size')
            values.append(items['aof_base_size'])
            units.append('Bytes')
        # master/slave: emit the role itself as a 1-valued metric.
        names.append(items['role'])
        values.append(1)
        units.append('Count')
        for item in items:
            # Lexicographic trick: matches 'db0'..'db15' style INFO keys.
            if item >= 'db0' and item < 'dc':
                names.append("{0}_keys".format(item))
                values.append(items[item]['keys'])
                units.append('Count')
                names.append("{0}_expires".format(item))
                values.append(items[item]['expires'])
                units.append('Count')
                # and now add some info on the keys, if we want
                if monitoring == 'all':
                    nr = item.lstrip('db')
                    db = redis.StrictRedis(host='localhost', port=6379, db=nr)
                    keys = db.keys('*')
                    for key in keys:
                        key_type = db.type(key)
                        # Dots are not wanted in CloudWatch metric names.
                        key = key.replace( '.', '_')
                        if key_type == "list":
                            llen = db.llen(key)
                            names.append("{0}_{1}_llen".format(item, key))
                            values.append(llen)
                            units.append('Count')
                        elif key_type == "hash":
                            hlen = db.hlen(key)
                            names.append("{0}_{1}_hlen".format(item, key))
                            values.append(hlen)
                            units.append('Count')
                        elif key_type == "set":
                            scard = db.scard(key)
                            names.append("{0}_{1}_scard".format(item, key))
                            values.append(scard)
                            units.append('Count')
                        elif key_type == "zset":
                            zcard = db.zcard(key)
                            names.append("{0}_{1}_zcard".format(item, key))
                            values.append(zcard)
                            units.append('Count')
                        elif key_type == "string":
                            strlen = db.strlen(key)
                            names.append("{0}_{1}_strlen".format(item, key))
                            values.append(strlen)
                            units.append('Count')
        # pub/sub
        names.append('pubsub_channels')
        values.append(items['pubsub_channels'])
        units.append('Count')
        names.append('pubsub_patterns')
        values.append(items['pubsub_patterns'])
        units.append('Count')
        # memory
        names.append('used_memory')
        values.append(items['used_memory'])
        units.append('Bytes')
        names.append('used_memory_peak')
        values.append(items['used_memory_peak'])
        units.append('Bytes')
        names.append('used_memory_rss')
        values.append(items['used_memory_rss'])
        units.append('Bytes')
        names.append('mem_fragmentation_ratio')
        values.append(items['mem_fragmentation_ratio'])
        units.append('None')
        names.append('connected_slaves')
        values.append(items['connected_slaves'])
        units.append('Count')
        # persistence state
        names.append('loading')
        values.append(items['loading'])
        units.append('Count')
        names.append('bgsave_in_progress')
        values.append(items['bgsave_in_progress'])
        units.append('Count')
        # clients
        names.append('connected_clients')
        values.append(items['connected_clients'])
        units.append('Count')
        names.append('blocked_clients')
        values.append(items['blocked_clients'])
        units.append('Count')
        # connection/command totals (deliberately disabled)
        #names.append('total_connections_received')
        #values.append(items['total_connections_received'])
        #units.append('Count')
        #names.append('total_commands_processed')
        #values.append(items['total_commands_processed'])
        #units.append('Count')
        # client input/output
        names.append('client_biggest_input_buf')
        values.append(items['client_biggest_input_buf'])
        units.append('Bytes')
        names.append('client_longest_output_list')
        values.append(items['client_longest_output_list'])
        units.append('Bytes')
        # keys
        names.append('expired_keys')
        values.append(items['expired_keys'])
        units.append('Count')
        names.append('evicted_keys')
        values.append(items['evicted_keys'])
        units.append('Count')
        # last_save
        names.append('changes_since_last_save')
        values.append(items['changes_since_last_save'])
        units.append('Count')
        # keyspace (deliberately disabled)
        #names.append('keyspace_misses')
        #values.append(items['keyspace_misses'])
        #units.append('Count')
        #names.append('keyspace_hits')
        #values.append(items['keyspace_hits'])
        #units.append('Count')
        return [names, values, units, dimensions]

    def put(self):
        """Collect and push metrics to CloudWatch, 20 samples per call
        (the PutMetricData batch limit)."""
        result = False
        try:
            # only monitor if we are told to (this will break, if not set)
            monitoring = self.userdata['monitoring']
        except:
            monitoring = 'on'
        if monitoring in ['on', 'all']:
            # first get all we need
            [names, values, units, dimensions] = self.collect(monitoring)
            print [names, values, units, dimensions]
            while len(names) > 0:
                names20 = names[:20]
                values20 = values[:20]
                units20 = units[:20]
                # We can't send all at once, only 20 at a time:
                # first aggregated over all dimensions...
                result = self.cloudwatch.put_metric_data(self.namespace,
                        names20, value=values20, unit=units20)
                # ...then once per individual dimension.
                for dimension in dimensions:
                    dimension = { dimension : dimensions[dimension] }
                    result &= self.cloudwatch.put_metric_data(
                            self.namespace, names20, value=values20,
                            unit=units20, dimensions=dimension)
                del names[:20]
                del values[:20]
                del units[:20]
        else:
            print "we are not monitoring"
        return result

    def metrics(self):
        """Return the CloudWatch metric listing for this account/region."""
        return self.cloudwatch.list_metrics()
class Monitor:
    """Collects Redis INFO metrics on an EC2 host, pushes them to
    CloudWatch under '9apps/redis', and logs its own activity through a
    project-provided Events/Host abstraction.

    Depends on module-level urlopen, sys, redis, datetime, RegionInfo,
    CloudWatchConnection, Events and Host.
    """

    def __init__(self, key, access, cluster):
        # Placement comes from the EC2 instance metadata service.
        try:
            url = "http://169.254.169.254/latest/meta-data/"
            public_hostname = urlopen(url + "public-hostname").read()
            zone = urlopen(url + "placement/availability-zone").read()
            # zone looks like 'us-east-1a'; dropping the last char yields
            # the region name.
            region = zone[:-1]
        except:
            sys.exit("We should be getting user-data here...")
        # the name (and identity) of the cluster (the master)
        self.cluster = cluster
        self.redis = redis.StrictRedis(host='localhost', port=6379)
        endpoint = "monitoring.{0}.amazonaws.com".format(region)
        region_info = RegionInfo(name=region, endpoint=endpoint)
        self.cloudwatch = CloudWatchConnection(key, access, region=region_info)
        self.namespace = '9apps/redis'
        self.events = Events(key, access, cluster)
        # get the host, but without the logging
        self.host = Host(cluster)
        self.node = self.host.get_node()

    def __log(self, message, logging='warning'):
        # Route every log line through the shared Events channel.
        self.events.log(self.node, 'Monitor', message, logging)

    def collect(self):
        """Gather metric samples from Redis INFO plus per-key statistics.

        Returns [names, values, units, dimensions].
        """
        self.__log('collecting metrics data from Redis INFO', 'info')
        now = datetime.now()  # NOTE(review): unused
        items = self.redis.info()
        names = []
        values = []
        units = []
        dimensions = { 'node' : self.node, 'cluster' : self.cluster }
        # Append-only-file metrics, present only when AOF is enabled.
        if items['aof_enabled']:
            self.__log('aof enabled: getting metrics data for the AOF', 'info')
            names.append('bgrewriteaof_in_progress')
            values.append(items['bgrewriteaof_in_progress'])
            units.append('Count')
            names.append('aof_pending_bio_fsync')
            values.append(items['aof_pending_bio_fsync'])
            units.append('Count')
            names.append('aof_buffer_length')
            values.append(items['aof_buffer_length'])
            units.append('Count')
            names.append('aof_current_size')
            values.append(items['aof_current_size'])
            units.append('Bytes')
            names.append('aof_pending_rewrite')
            values.append(items['aof_pending_rewrite'])
            units.append('Count')
            names.append('aof_base_size')
            values.append(items['aof_base_size'])
            units.append('Bytes')
        # master/slave: emit the role itself as a 1-valued metric.
        names.append(items['role'])
        values.append(1)
        units.append('Count')
        for item in items:
            # Lexicographic trick: matches 'db0'..'db15' style INFO keys.
            if item >= 'db0' and item < 'dc':
                self.__log('adding metrics data for database: {0}'.format(item), 'info')
                names.append("{0}_keys".format(item))
                values.append(items[item]['keys'])
                units.append('Count')
                names.append("{0}_expires".format(item))
                values.append(items[item]['expires'])
                units.append('Count')
                # and now add some info on the keys
                nr = item.lstrip('db')
                db = redis.StrictRedis(host='localhost', port=6379, db=nr)
                keys = db.keys('*')
                for key in keys:
                    # NOTE(review): the key name is truncated *before* the
                    # TYPE lookup, so db.type() is queried with the shortened
                    # name, not the original key -- the sibling variant of
                    # this class does it the other way round; confirm which
                    # is intended.
                    key = key.split('.')[-1]
                    key_type = db.type(key)
                    if key_type == "list":
                        llen = db.llen(key)
                        names.append("{0}_{1}_llen".format(item, key))
                        values.append(llen)
                        units.append('Count')
                    elif key_type == "hash":
                        hlen = db.hlen(key)
                        names.append("{0}_{1}_hlen".format(item, key))
                        values.append(hlen)
                        units.append('Count')
                    elif key_type == "set":
                        scard = db.scard(key)
                        names.append("{0}_{1}_scard".format(item, key))
                        values.append(scard)
                        units.append('Count')
                    elif key_type == "zset":
                        zcard = db.zcard(key)
                        names.append("{0}_{1}_zcard".format(item, key))
                        values.append(zcard)
                        units.append('Count')
                    elif key_type == "string":
                        strlen = db.strlen(key)
                        names.append("{0}_{1}_strlen".format(item, key))
                        values.append(strlen)
                        units.append('Count')
        # pub/sub
        names.append('pubsub_channels')
        values.append(items['pubsub_channels'])
        units.append('Count')
        names.append('pubsub_patterns')
        values.append(items['pubsub_patterns'])
        units.append('Count')
        # memory
        names.append('used_memory')
        values.append(items['used_memory'])
        units.append('Bytes')
        names.append('used_memory_peak')
        values.append(items['used_memory_peak'])
        units.append('Bytes')
        names.append('used_memory_rss')
        values.append(items['used_memory_rss'])
        units.append('Bytes')
        names.append('mem_fragmentation_ratio')
        values.append(items['mem_fragmentation_ratio'])
        units.append('None')
        names.append('connected_slaves')
        values.append(items['connected_slaves'])
        units.append('Count')
        # persistence state
        names.append('loading')
        values.append(items['loading'])
        units.append('Count')
        names.append('bgsave_in_progress')
        values.append(items['bgsave_in_progress'])
        units.append('Count')
        # clients
        names.append('connected_clients')
        values.append(items['connected_clients'])
        units.append('Count')
        names.append('blocked_clients')
        values.append(items['blocked_clients'])
        units.append('Count')
        # connection/command totals
        names.append('total_connections_received')
        values.append(items['total_connections_received'])
        units.append('Count')
        names.append('total_commands_processed')
        values.append(items['total_commands_processed'])
        units.append('Count')
        # client input/output
        names.append('client_biggest_input_buf')
        values.append(items['client_biggest_input_buf'])
        units.append('Bytes')
        names.append('client_longest_output_list')
        values.append(items['client_longest_output_list'])
        units.append('Bytes')
        # keys
        names.append('expired_keys')
        values.append(items['expired_keys'])
        units.append('Count')
        names.append('evicted_keys')
        values.append(items['evicted_keys'])
        units.append('Count')
        # last_save
        names.append('changes_since_last_save')
        values.append(items['changes_since_last_save'])
        units.append('Count')
        # keyspace
        names.append('keyspace_misses')
        values.append(items['keyspace_misses'])
        units.append('Count')
        names.append('keyspace_hits')
        values.append(items['keyspace_hits'])
        units.append('Count')
        return [names, values, units, dimensions]

    def put(self):
        """Collect and push metrics to CloudWatch, 20 samples per call
        (the PutMetricData batch limit)."""
        # first get all we need
        [names, values, units, dimensions] = self.collect()
        # NOTE(review): if collect() ever returns an empty names list, the
        # loop body never runs and `result` below is unbound (NameError).
        while len(names) > 0:
            names20 = names[:20]
            values20 = values[:20]
            units20 = units[:20]
            # We can't send all at once, only 20 at a time:
            # first aggregated over all dimensions...
            self.__log('put aggregated ReDiS metrics data', 'info')
            result = self.cloudwatch.put_metric_data(self.namespace,
                    names20, value=values20, unit=units20)
            # ...then once per individual dimension.
            for dimension in dimensions:
                self.__log('put ReDiS metrics data for {0}'.format(dimensions[dimension]), 'info')
                dimension = { dimension : dimensions[dimension] }
                result &= self.cloudwatch.put_metric_data(self.namespace,
                        names20, value=values20, unit=units20,
                        dimensions=dimension)
            del names[:20]
            del values[:20]
            del units[:20]
        return result

    def metrics(self):
        """Return the CloudWatch metric listing for this account/region."""
        return self.cloudwatch.list_metrics()