def get_jmx_status():
    """Read the two jmxfetch status files (YAML) in the temp directory and
    convert them into a list of CheckStatus objects.

    There are 2 files:
        - One generated by the Agent itself, for jmx checks that can't be
          initialized because there are missing stuff.
          Its format is as following:

            ###
            invalid_checks:
                jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration
                    [You need to have at least one instance defined in the
                    YAML file for this check]
            timestamp: 1391040927.136523
            ###

        - One generated by jmxfetch that return information about the
          collection of metrics. Its format is as following:

            ###
            timestamp: 1391037347435
            checks:
              failed_checks:
                jmx:
                - {message: Unable to create instance. Please check your yaml file,
                   status: ERROR}
              initialized_checks:
                tomcat:
                - {message: null, status: OK, metric_count: 7,
                   instance_name: jmx-remihakim.fr-3000}
            ###

    Returns:
        list of CheckStatus -- empty when no status file exists or on any
        parse error (errors are logged, never raised).
    """
    check_statuses = []
    java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
    python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
        return []

    # check_name -> {'statuses': [...], 'metric_count': [...]}
    check_data = defaultdict(lambda: defaultdict(list))
    try:
        if os.path.exists(java_status_path):
            # Use a context manager instead of the bare `file()` builtin so
            # the handle is always closed (the original code leaked it).
            # NOTE(review): yaml.load on a file the agent itself writes; do
            # not point this at untrusted input.
            with open(java_status_path) as java_status_file:
                java_jmx_stats = yaml.load(java_status_file)

            status_age = time.time() - java_jmx_stats.get('timestamp') / 1000  # JMX timestamp is saved in milliseconds
            jmx_checks = java_jmx_stats.get('checks', {})

            if status_age > 60:
                # jmxfetch is expected to refresh the file at least once a
                # minute; a stale file means it stopped reporting.
                check_statuses.append(CheckStatus("jmx", [InstanceStatus(
                    0,
                    STATUS_ERROR,
                    error="JMXfetch didn't return any metrics during the last minute"
                )]))
            else:
                # .items() instead of .iteritems(): identical behavior here,
                # and compatible with both Python 2 and 3.
                for check_name, instances in jmx_checks.get('failed_checks', {}).items():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, instances in jmx_checks.get('initialized_checks', {}).items():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, data in check_data.items():
                    check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']))
                    check_statuses.append(check_status)

        if os.path.exists(python_status_path):
            # This file embeds python-object tags
            # (!!python/object/apply:jmxfetch.InvalidJMXConfiguration), so it
            # requires the full yaml.load; yaml.safe_load would reject it.
            with open(python_status_path) as python_status_file:
                python_jmx_stats = yaml.load(python_status_file)
            jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.items():
                check_statuses.append(CheckStatus(check_name, [], init_failed_error=excep))
        return check_statuses

    except Exception:
        log.exception("Couldn't load latest jmx status")
        return []
def get_jmx_status():
    """Read the two jmxfetch status files (YAML) in the temp directory and
    convert them into a list of CheckStatus objects.

    There are 2 files:
        - One generated by the Agent itself, for jmx checks that can't be
          initialized because there are missing stuff.
          Its format is as following:

            ###
            invalid_checks:
                jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration
                    [You need to have at least one instance defined in the
                    YAML file for this check]
            timestamp: 1391040927.136523
            ###

        - One generated by jmxfetch that return information about the
          collection of metrics. Its format is as following:

            ###
            timestamp: 1391037347435
            checks:
              failed_checks:
                jmx:
                - {message: Unable to create instance. Please check your yaml file,
                   status: ERROR}
              initialized_checks:
                tomcat:
                - {message: null, status: OK, metric_count: 7,
                   instance_name: jmx-remihakim.fr-3000}
            ###

    Returns:
        list of CheckStatus -- empty when no status file exists or on any
        parse error (errors are logged, never raised).
    """
    check_statuses = []
    java_status_path = os.path.join(tempfile.gettempdir(), "jmx_status.yaml")
    python_status_path = os.path.join(tempfile.gettempdir(), "jmx_status_python.yaml")
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
        return []

    # check_name -> {'statuses': [...], 'metric_count': [...]}
    check_data = defaultdict(lambda: defaultdict(list))
    try:
        if os.path.exists(java_status_path):
            # Use a context manager instead of the bare `file()` builtin so
            # the handle is always closed (the original code leaked it).
            # NOTE(review): yaml.load on a file the agent itself writes; do
            # not point this at untrusted input.
            with open(java_status_path) as java_status_file:
                java_jmx_stats = yaml.load(java_status_file)

            status_age = time.time() - java_jmx_stats.get('timestamp') / 1000  # JMX timestamp is saved in milliseconds
            jmx_checks = java_jmx_stats.get('checks', {})

            if status_age > 60:
                # jmxfetch is expected to refresh the file at least once a
                # minute; a stale file means it stopped reporting.
                check_statuses.append(CheckStatus("jmx", [InstanceStatus(
                    0,
                    STATUS_ERROR,
                    error="JMXfetch didn't return any metrics during the last minute"
                )], 0, 0))
            else:
                # .items() instead of .iteritems(): identical behavior here,
                # and compatible with both Python 2 and 3.
                for check_name, instances in jmx_checks.get('failed_checks', {}).items():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, instances in jmx_checks.get('initialized_checks', {}).items():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(
                            get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)

                for check_name, data in check_data.items():
                    check_status = CheckStatus(check_name, data['statuses'], sum(data['metric_count']), 0)
                    check_statuses.append(check_status)

        if os.path.exists(python_status_path):
            # This file embeds python-object tags
            # (!!python/object/apply:jmxfetch.InvalidJMXConfiguration), so it
            # requires the full yaml.load; yaml.safe_load would reject it.
            with open(python_status_path) as python_status_file:
                python_jmx_stats = yaml.load(python_status_file)
            jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.items():
                check_statuses.append(CheckStatus(check_name, [], 0, 0, init_failed_error=excep))
        return check_statuses

    # `except Exception, e:` bound an unused name and is a Python 3 syntax
    # error; plain `except Exception:` is behaviorally identical.
    except Exception:
        log.exception("Couldn't load latest jmx status")
        return []
def check(self, instance):
    """Collect Kafka consumer offsets from Zookeeper and broker offsets from
    Kafka, then emit them (and the per-consumer lag) as gauges.

    Args:
        instance: check instance config dict; reads 'consumer_groups',
            'zk_connect_str', 'kafka_connect_str' and optional 'zk_prefix'.

    Emits gauges: kafka.broker_offset, kafka.consumer_offset,
    kafka.consumer_lag.
    """
    consumer_groups = self.read_config(instance, 'consumer_groups',
                                       cast=self._validate_consumer_groups)
    zk_connect_str = self.read_config(instance, 'zk_connect_str')
    kafka_host_ports = self.read_config(instance, 'kafka_connect_str',
                                        cast=self._parse_connect_str)

    # Construct the Zookeeper path pattern:
    # <prefix>/consumers/<group>/offsets/<topic>/<partition>
    zk_prefix = instance.get('zk_prefix', '')
    zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s'

    # Connect to Zookeeper
    zk_conn = KazooClient(zk_connect_str)
    zk_conn.start()

    try:
        # Query Zookeeper for consumer offsets
        consumer_offsets = {}
        topics = defaultdict(set)
        for consumer_group, topic_partitions in consumer_groups.items():
            for topic, partitions in topic_partitions.items():
                # Remember the topic partitions that we've seen so that we
                # can look up their broker offsets later
                topics[topic].update(set(partitions))
                for partition in partitions:
                    zk_path = zk_path_tmpl % (consumer_group, topic, partition)
                    try:
                        consumer_offset = int(zk_conn.get(zk_path)[0])
                        key = (consumer_group, topic, partition)
                        consumer_offsets[key] = consumer_offset
                    except NoNodeError:
                        self.log.warn('No zookeeper node at %s' % zk_path)
                    except Exception:
                        self.log.exception('Could not read consumer offset from %s' % zk_path)
    finally:
        # Best-effort teardown: never let cleanup mask the real work above.
        try:
            zk_conn.stop()
            zk_conn.close()
        except Exception:
            self.log.exception('Error cleaning up Zookeeper connection')

    # Connect to Kafka (any broker can answer the offset request)
    kafka_host, kafka_port = random.choice(kafka_host_ports)
    kafka_conn = KafkaClient(kafka_host, kafka_port)
    try:
        # Query Kafka for the broker offsets (-1 = latest, 1 offset back)
        broker_offsets = {}
        for topic, partitions in topics.items():
            offset_responses = kafka_conn.send_offset_request(
                [OffsetRequest(topic, p, -1, 1) for p in partitions])
            for resp in offset_responses:
                broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
    finally:
        try:
            kafka_conn.close()
        except Exception:
            self.log.exception('Error cleaning up Kafka connection')

    # Report the broker data.
    # (The original re-fetched broker_offset from the dict it was already
    # iterating -- redundant lookup removed.)
    for (topic, partition), broker_offset in broker_offsets.items():
        broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
        self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags)

    # Report the consumer offsets and lag
    for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():
        tags = ['topic:%s' % topic, 'partition:%s' % partition,
                'consumer_group:%s' % consumer_group]
        self.gauge('kafka.consumer_offset', consumer_offset, tags=tags)

        # Get the broker offset; it can legitimately be missing if Kafka
        # returned no offset for this partition -- the original would have
        # raised TypeError on None and aborted the whole check.
        broker_offset = broker_offsets.get((topic, partition))
        if broker_offset is None:
            self.log.warn('No broker offset for topic %s partition %s; skipping lag' % (topic, partition))
            continue
        self.gauge('kafka.consumer_lag', broker_offset - consumer_offset, tags=tags)
def check(self, instance):
    """Collect Kafka consumer offsets from Zookeeper and broker offsets from
    Kafka, then emit them (and the per-consumer lag) as gauges.

    Args:
        instance: check instance config dict; reads 'consumer_groups',
            'zk_connect_str', 'kafka_connect_str' and optional 'zk_prefix'.

    Emits gauges: kafka.broker_offset, kafka.consumer_offset,
    kafka.consumer_lag.
    """
    consumer_groups = self.read_config(instance, 'consumer_groups',
                                       cast=self._validate_consumer_groups)
    zk_connect_str = self.read_config(instance, 'zk_connect_str')
    kafka_host_ports = self.read_config(instance, 'kafka_connect_str',
                                        cast=self._parse_connect_str)

    # Construct the Zookeeper path pattern:
    # <prefix>/consumers/<group>/offsets/<topic>/<partition>
    zk_prefix = instance.get('zk_prefix', '')
    zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s'

    # Connect to Zookeeper
    zk_conn = KazooClient(zk_connect_str)
    zk_conn.start()

    try:
        # Query Zookeeper for consumer offsets
        consumer_offsets = {}
        topics = defaultdict(set)
        for consumer_group, topic_partitions in consumer_groups.items():
            for topic, partitions in topic_partitions.items():
                # Remember the topic partitions that we've seen so that we
                # can look up their broker offsets later
                topics[topic].update(set(partitions))
                for partition in partitions:
                    zk_path = zk_path_tmpl % (consumer_group, topic, partition)
                    try:
                        consumer_offset = int(zk_conn.get(zk_path)[0])
                        key = (consumer_group, topic, partition)
                        consumer_offsets[key] = consumer_offset
                    except NoNodeError:
                        self.log.warn('No zookeeper node at %s' % zk_path)
                    except Exception:
                        self.log.exception(
                            'Could not read consumer offset from %s' % zk_path)
    finally:
        # Best-effort teardown: never let cleanup mask the real work above.
        try:
            zk_conn.stop()
            zk_conn.close()
        except Exception:
            self.log.exception('Error cleaning up Zookeeper connection')

    # Connect to Kafka (any broker can answer the offset request)
    kafka_host, kafka_port = random.choice(kafka_host_ports)
    kafka_conn = KafkaClient(kafka_host, kafka_port)
    try:
        # Query Kafka for the broker offsets (-1 = latest, 1 offset back)
        broker_offsets = {}
        for topic, partitions in topics.items():
            offset_responses = kafka_conn.send_offset_request(
                [OffsetRequest(topic, p, -1, 1) for p in partitions])
            for resp in offset_responses:
                broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0]
    finally:
        try:
            kafka_conn.close()
        except Exception:
            self.log.exception('Error cleaning up Kafka connection')

    # Report the broker data.
    # (The original re-fetched broker_offset from the dict it was already
    # iterating -- redundant lookup removed.)
    for (topic, partition), broker_offset in broker_offsets.items():
        broker_tags = ['topic:%s' % topic, 'partition:%s' % partition]
        self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags)

    # Report the consumer offsets and lag
    for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items():
        tags = [
            'topic:%s' % topic, 'partition:%s' % partition,
            'consumer_group:%s' % consumer_group
        ]
        self.gauge('kafka.consumer_offset', consumer_offset, tags=tags)

        # Get the broker offset; it can legitimately be missing if Kafka
        # returned no offset for this partition -- the original would have
        # raised TypeError on None and aborted the whole check.
        broker_offset = broker_offsets.get((topic, partition))
        if broker_offset is None:
            self.log.warn('No broker offset for topic %s partition %s; skipping lag' % (topic, partition))
            continue
        self.gauge('kafka.consumer_lag', broker_offset - consumer_offset, tags=tags)