def configure_logging(json=False, verbose=False):
    log_handler = logging.StreamHandler()
    log_format = ('[%(asctime)s] %(name)s.%(levelname)s %(threadName)s '
                  '%(module)s.%(funcName)s %(filename)s:%(lineno)s %(message)s')
    formatter = JogFormatter(log_format) if json else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
                        handlers=[log_handler])
    logging.captureWarnings(True)
def main():
    signal.signal(signal.SIGTERM, signal_handler)

    parser = argparse.ArgumentParser(description='Export example metrics for Prometheus consumption.')
    parser.add_argument('-p', '--port', type=int, default=9900,
                        help='port to serve the metrics endpoint on. (default: 9900)')
    parser.add_argument('-c', '--config-file', default='example.cfg',
                        help='path to query config file. Can be absolute, or relative to the '
                             'current working directory. (default: example.cfg)')
    parser.add_argument('--example1-disable', action='store_true',
                        help='disable example 1 monitoring.')
    parser.add_argument('--example2-disable', action='store_true',
                        help='disable example 2 monitoring.')
    parser.add_argument('-j', '--json-logging', action='store_true',
                        help='turn on json logging.')
    parser.add_argument('--log-level', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='detail level to log. (default: INFO)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='turn on verbose (DEBUG) logging. Overrides --log-level.')
    args = parser.parse_args()

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) if args.json_logging else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    log_level = getattr(logging, args.log_level)
    logging.basicConfig(handlers=[log_handler],
                        level=logging.DEBUG if args.verbose else log_level)
    logging.captureWarnings(True)

    port = args.port

    scheduler = None

    if not args.example1_disable:
        REGISTRY.register(Example1Collector())
    if not args.example2_disable:
        REGISTRY.register(Example2Collector())

    logging.info('Starting server...')
    start_http_server(port)
    logging.info('Server started on port %s', port)

    try:
        while True:
            time.sleep(5)
    except KeyboardInterrupt:
        pass

    shutdown()
def main():
    signal.signal(signal.SIGTERM, signal_handler)

    parser = argparse.ArgumentParser(description='Export ES query results to Prometheus.')
    parser.add_argument('-e', '--es-cluster', default='localhost',
                        help='addresses of nodes in an Elasticsearch cluster to run queries on. '
                             'Nodes should be separated by commas e.g. es1,es2. '
                             'Ports can be provided if non-standard (9200) e.g. es1:9999 (default: localhost)')
    parser.add_argument('--ca-certs',
                        help='path to a CA certificate bundle. '
                             'Can be absolute, or relative to the current working directory. '
                             'If not specified, SSL certificate verification is disabled.')
    parser.add_argument('-p', '--port', type=int, default=9206,
                        help='port to serve the metrics endpoint on. (default: 9206)')
    parser.add_argument('--basic-user',
                        help='User for authentication. (default: no user)')
    parser.add_argument('--basic-password',
                        help='Password for authentication. (default: no password)')
    parser.add_argument('--query-disable', action='store_true',
                        help='disable query monitoring. Config file does not need to be present '
                             'if query monitoring is disabled.')
    parser.add_argument('-c', '--config-file', default='exporter.cfg',
                        help='path to query config file. Can be absolute, or relative to the '
                             'current working directory. (default: exporter.cfg)')
    parser.add_argument('--config-dir', default='./config',
                        help='path to query config directory. Besides including the single config file '
                             'specified by "--config-file" at first, all config files in the config '
                             'directory will be sorted, merged, then included. Can be absolute, or '
                             'relative to the current working directory. (default: ./config)')
    parser.add_argument('--cluster-health-disable', action='store_true',
                        help='disable cluster health monitoring.')
    parser.add_argument('--cluster-health-timeout', type=float, default=10.0,
                        help='request timeout for cluster health monitoring, in seconds. (default: 10)')
    parser.add_argument('--cluster-health-level', default='indices',
                        choices=['cluster', 'indices', 'shards'],
                        help='level of detail for cluster health monitoring. (default: indices)')
    parser.add_argument('--nodes-stats-disable', action='store_true',
                        help='disable nodes stats monitoring.')
    parser.add_argument('--nodes-stats-timeout', type=float, default=10.0,
                        help='request timeout for nodes stats monitoring, in seconds. (default: 10)')
    parser.add_argument('--nodes-stats-metrics', type=nodes_stats_metrics_parser,
                        help='limit nodes stats to specific metrics. '
                             'Metrics should be separated by commas e.g. indices,fs.')
    parser.add_argument('--indices-stats-disable', action='store_true',
                        help='disable indices stats monitoring.')
    parser.add_argument('--indices-stats-timeout', type=float, default=10.0,
                        help='request timeout for indices stats monitoring, in seconds. (default: 10)')
    parser.add_argument('--indices-stats-mode', default='cluster',
                        choices=['cluster', 'indices'],
                        help='detail mode for indices stats monitoring. (default: cluster)')
    parser.add_argument('--indices-stats-metrics', type=indices_stats_metrics_parser,
                        help='limit indices stats to specific metrics. '
                             'Metrics should be separated by commas e.g. indices,fs.')
    parser.add_argument('--indices-stats-fields', type=indices_stats_fields_parser,
                        help='include fielddata info for specific fields. '
                             'Fields should be separated by commas e.g. indices,fs. Use \'*\' for all.')
    parser.add_argument('-j', '--json-logging', action='store_true',
                        help='turn on json logging.')
    parser.add_argument('--log-level', default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='detail level to log. (default: INFO)')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='turn on verbose (DEBUG) logging. Overrides --log-level.')
    args = parser.parse_args()

    if args.basic_user and args.basic_password is None:
        parser.error('Username provided with no password.')
    elif args.basic_user is None and args.basic_password:
        parser.error('Password provided with no username.')
    elif args.basic_user:
        http_auth = (args.basic_user, args.basic_password)
    else:
        http_auth = None

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) if args.json_logging else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    log_level = getattr(logging, args.log_level)
    logging.basicConfig(handlers=[log_handler],
                        level=logging.DEBUG if args.verbose else log_level)
    logging.captureWarnings(True)

    port = args.port
    es_cluster = args.es_cluster.split(',')

    if args.ca_certs:
        es_client = Elasticsearch(es_cluster, verify_certs=True,
                                  ca_certs=args.ca_certs, http_auth=http_auth)
    else:
        es_client = Elasticsearch(es_cluster, verify_certs=False, http_auth=http_auth)

    scheduler = None

    if not args.query_disable:
        scheduler = sched.scheduler()

        config = configparser.ConfigParser()
        # Close the config file handle once it has been read.
        with open(args.config_file) as config_file:
            config.read_file(config_file)

        config_dir_sorted_files = sorted(glob.glob(os.path.join(args.config_dir, '*.cfg')))
        config.read(config_dir_sorted_files)

        query_prefix = 'query_'
        queries = {}
        for section in config.sections():
            if section.startswith(query_prefix):
                query_name = section[len(query_prefix):]
                query_interval = config.getfloat(section, 'QueryIntervalSecs', fallback=15)
                query_timeout = config.getfloat(section, 'QueryTimeoutSecs', fallback=10)
                query_indices = config.get(section, 'QueryIndices', fallback='_all')
                query = json.loads(config.get(section, 'QueryJson'))

                queries[query_name] = (query_interval, query_timeout, query_indices, query)

        if queries:
            for name, (interval, timeout, indices, query) in queries.items():
                func = partial(run_query, es_client, name, indices, query, timeout)
                run_scheduler(scheduler, interval, func)
        else:
            # logging.warn() is deprecated - use logging.warning().
            logging.warning('No queries found in config file %s', args.config_file)

    if not args.cluster_health_disable:
        REGISTRY.register(ClusterHealthCollector(es_client,
                                                 args.cluster_health_timeout,
                                                 args.cluster_health_level))

    if not args.nodes_stats_disable:
        REGISTRY.register(NodesStatsCollector(es_client,
                                              args.nodes_stats_timeout,
                                              metrics=args.nodes_stats_metrics))

    if not args.indices_stats_disable:
        parse_indices = args.indices_stats_mode == 'indices'
        REGISTRY.register(IndicesStatsCollector(es_client,
                                                args.indices_stats_timeout,
                                                parse_indices=parse_indices,
                                                metrics=args.indices_stats_metrics,
                                                fields=args.indices_stats_fields))

    logging.info('Starting server...')
    start_http_server(port)
    logging.info('Server started on port %s', port)

    try:
        if scheduler:
            scheduler.run()
        else:
            while True:
                time.sleep(5)
    except KeyboardInterrupt:
        pass

    shutdown()
def cli(**options):
    """Export Elasticsearch query results to Prometheus."""
    if options['basic_user'] and options['basic_password'] is None:
        # These usage errors must be raised, not just instantiated,
        # for click to abort with an error message.
        raise click.BadOptionUsage('basic_user', 'Username provided with no password.')
    elif options['basic_user'] is None and options['basic_password']:
        raise click.BadOptionUsage('basic_password', 'Password provided with no username.')
    elif options['basic_user']:
        http_auth = (options['basic_user'], options['basic_password'])
    else:
        http_auth = None

    if not options['ca_certs'] and options['client_cert']:
        raise click.BadOptionUsage('client_cert',
                                   '--client-cert can only be used when --ca-certs is provided.')
    elif not options['ca_certs'] and options['client_key']:
        raise click.BadOptionUsage('client_key',
                                   '--client-key can only be used when --ca-certs is provided.')
    elif options['client_cert'] and not options['client_key']:
        raise click.BadOptionUsage('client_cert',
                                   '--client-key must be provided when --client-cert is used.')
    elif not options['client_cert'] and options['client_key']:
        raise click.BadOptionUsage('client_key',
                                   '--client-cert must be provided when --client-key is used.')

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) if options['json_logging'] else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    log_level = getattr(logging, options['log_level'])
    logging.basicConfig(handlers=[log_handler],
                        level=logging.DEBUG if options['verbose'] else log_level)
    logging.captureWarnings(True)

    port = options['port']
    es_cluster = options['es_cluster'].split(',')

    if options['ca_certs']:
        es_client = Elasticsearch(es_cluster, verify_certs=True,
                                  ca_certs=options['ca_certs'],
                                  client_cert=options['client_cert'],
                                  client_key=options['client_key'],
                                  http_auth=http_auth)
    else:
        es_client = Elasticsearch(es_cluster, verify_certs=False, http_auth=http_auth)

    if options['indices_stats_indices'] and options['indices_stats_mode'] != 'indices':
        raise click.BadOptionUsage('indices_stats_indices',
                                   '--indices-stats-mode must be "indices" for '
                                   '--indices-stats-indices to be used.')

    scheduler = sched.scheduler()
    indices_for_stats = []

    config = None
    config_file_ext = '*.cfg'
    if not options['query_disable']:
        config = configparser.ConfigParser(converters=CONFIGPARSER_CONVERTERS)
        config.read(options['config_file'])

        config_dir_file_pattern = os.path.join(options['config_dir'], config_file_ext)
        config_dir_sorted_files = sorted(glob.glob(config_dir_file_pattern))
        config.read(config_dir_sorted_files)

        query_prefix = 'query_'
        queries = {}
        for section in config.sections():
            if section.startswith(query_prefix):
                query_name = section[len(query_prefix):]
                interval = config.getfloat(section, 'QueryIntervalSecs', fallback=15)
                timeout = config.getfloat(section, 'QueryTimeoutSecs', fallback=10)
                indices = config.get(section, 'QueryIndices', fallback='_all')
                query = json.loads(config.get(section, 'QueryJson'))
                on_error = config.getenum(section, 'QueryOnError', fallback='drop')
                on_missing = config.getenum(section, 'QueryOnMissing', fallback='drop')

                queries[query_name] = (interval, timeout, indices, query, on_error, on_missing)

        if queries:
            for query_name, (interval, timeout, indices, query,
                             on_error, on_missing) in queries.items():
                schedule_job(scheduler, interval, run_query, es_client, query_name,
                             indices, query, timeout, on_error, on_missing)
        else:
            log.error('No queries found in config file(s)')
            return

        chain_query_prefix = 'chain_query_'
        chain_queries = {}
        for section in config.sections():
            if section.startswith(chain_query_prefix):
                query_name = section[len(chain_query_prefix):]
                interval = config.getfloat(section, 'QueryIntervalSecs', fallback=15)
                timeout = config.getfloat(section, 'QueryTimeoutSecs', fallback=10)
                query_def = json.loads(config.get(section, 'QueryJson'))
                on_error = config.getenum(section, 'QueryOnError', fallback='drop')
                on_missing = config.getenum(section, 'QueryOnMissing', fallback='drop')

                chain_queries[query_name] = (interval, timeout, query_def, on_error, on_missing)

        if chain_queries:
            for query_name, (interval, timeout, query_def,
                             on_error, on_missing) in chain_queries.items():
                schedule_job(scheduler, interval, run_chain_query, es_client,
                             query_name, query_def, timeout, on_error, on_missing)

    if not options['cluster_health_disable']:
        REGISTRY.register(ClusterHealthCollector(es_client,
                                                 options['cluster_health_timeout'],
                                                 options['cluster_health_level']))

    if not options['nodes_stats_disable']:
        REGISTRY.register(NodesStatsCollector(es_client,
                                              options['nodes_stats_timeout'],
                                              metrics=options['nodes_stats_metrics']))

    if not options['indices_aliases_disable']:
        REGISTRY.register(IndicesAliasesCollector(es_client,
                                                  options['indices_aliases_timeout']))

    if not options['indices_mappings_disable']:
        REGISTRY.register(IndicesMappingsCollector(es_client,
                                                   options['indices_mappings_timeout']))

    if not options['indices_stats_disable']:
        parse_indices = options['indices_stats_mode'] == 'indices'
        REGISTRY.register(IndicesStatsCollector(es_client,
                                                options['indices_stats_timeout'],
                                                parse_indices=parse_indices,
                                                indices=options['indices_stats_indices'],
                                                metrics=options['indices_stats_metrics'],
                                                fields=options['indices_stats_fields']))

    if scheduler:
        REGISTRY.register(QueryMetricCollector())

    Thread(target=start_prometheus_server, args=(port,)).start()
    Thread(target=start_scheduler, args=(scheduler,)).start()
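# A minimal sketch (not taken from the source) of what a query section in the
# exporter config file above could look like. The keys mirror the options read
# by the config loop (QueryIntervalSecs, QueryTimeoutSecs, QueryIndices,
# QueryJson, QueryOnError, QueryOnMissing); the section name after the 'query_'
# prefix is used as the query name. The section name and the QueryJson body
# here are purely illustrative.
EXAMPLE_QUERY_CONFIG = """
[query_all_documents]
QueryIntervalSecs = 15
QueryTimeoutSecs = 10
QueryIndices = _all
QueryJson = {"size": 0, "query": {"match_all": {}}}
QueryOnError = drop
QueryOnMissing = drop
"""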
def main(): signal.signal(signal.SIGTERM, signal_handler) parser = argparse.ArgumentParser( description='Export Kafka consumer offsets to Prometheus.') parser.add_argument( '-b', '--bootstrap-brokers', help='Addresses of brokers in a Kafka cluster to talk to.' + ' Brokers should be separated by commas e.g. broker1,broker2.' + ' Ports can be provided if non-standard (9092) e.g. brokers1:9999.' + ' (default: localhost)') parser.add_argument( '-p', '--port', type=int, default=9208, help='Port to serve the metrics endpoint on. (default: 9208)') parser.add_argument( '-s', '--from-start', action='store_true', help='Start from the beginning of the `__consumer_offsets` topic.') parser.add_argument( '--topic-interval', type=float, default=30.0, help='How often to refresh topic information, in seconds. (default: 30)' ) parser.add_argument( '--high-water-interval', type=float, default=10.0, help= 'How often to refresh high-water information, in seconds. (default: 10)' ) parser.add_argument( '--low-water-interval', type=float, default=10.0, help= 'How often to refresh low-water information, in seconds. (default: 10)' ) parser.add_argument( '--consumer-config', action='append', default=[], help= 'Provide additional Kafka consumer config as a consumer.properties file. Multiple files will be merged, later files having precedence.' ) parser.add_argument('-j', '--json-logging', action='store_true', help='Turn on json logging.') parser.add_argument( '--log-level', default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Detail level to log. (default: INFO)') parser.add_argument( '-v', '--verbose', action='store_true', help='Turn on verbose (DEBUG) logging. Overrides --log-level.') args = parser.parse_args() log_handler = logging.StreamHandler() log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s' formatter = JogFormatter(log_format) \ if args.json_logging \ else logging.Formatter(log_format) log_handler.setFormatter(formatter) log_level = getattr(logging, args.log_level) logging.basicConfig(handlers=[log_handler], level=logging.DEBUG if args.verbose else log_level) logging.captureWarnings(True) port = args.port consumer_config = { 'bootstrap_servers': 'localhost', 'auto_offset_reset': 'latest', 'group_id': None, 'consumer_timeout_ms': 500 } args.consumer_config.append(os.environ.get('CONSUMER_CONFIG')) for filename in args.consumer_config: with open(filename) as f: raw_config = javaproperties.load(f) for k, v in raw_config.items(): if v == '': # Treat empty values as if they weren't set continue if v.lower() in ['true', 'false']: # Convert boolean values v = True if v.lower() == 'true' else False else: # Try and convert numeric values try: v = int(v) except ValueError: try: v = float(v) except ValueError: pass consumer_config[k.replace('.', '_')] = v if args.bootstrap_brokers: consumer_config['bootstrap_servers'] = args.bootstrap_brokers consumer_config['bootstrap_servers'] = consumer_config[ 'bootstrap_servers'].split(',') if args.from_start: consumer_config['auto_offset_reset'] = 'earliest' consumer = KafkaConsumer('__consumer_offsets', **consumer_config) client = consumer._client topic_interval = args.topic_interval high_water_interval = args.high_water_interval low_water_interval = args.low_water_interval logging.info('Starting server...') start_http_server(port) logging.info('Server started on port %s', port) REGISTRY.register(collectors.HighwaterCollector()) REGISTRY.register(collectors.LowwaterCollector()) 
REGISTRY.register(collectors.ConsumerOffsetCollector()) REGISTRY.register(collectors.ConsumerLagCollector()) REGISTRY.register(collectors.ConsumerLeadCollector()) REGISTRY.register(collectors.ConsumerCommitsCollector()) REGISTRY.register(collectors.ConsumerCommitTimestampCollector()) REGISTRY.register(collectors.ExporterOffsetCollector()) REGISTRY.register(collectors.ExporterLagCollector()) REGISTRY.register(collectors.ExporterLeadCollector()) scheduled_jobs = setup_fetch_jobs(topic_interval, high_water_interval, low_water_interval, client) scheduler.run_scheduled_jobs(scheduled_jobs) try: while True: for message in consumer: offsets = collectors.get_offsets() commits = collectors.get_commits() commit_timestamps = collectors.get_commit_timestamps() exporter_offsets = collectors.get_exporter_offsets() # Commits store the offset a consumer should read from next, # so we need to add one to the current offset for semantic parity exporter_partition = message.partition exporter_offset = message.offset + 1 exporter_offsets = ensure_dict_key(exporter_offsets, exporter_partition, exporter_offset) exporter_offsets[exporter_partition] = exporter_offset collectors.set_exporter_offsets(exporter_offsets) if message.key: key_dict = parse_key(message.key) # Only key versions 0 and 1 are offset commit messages. # Ignore other versions. if key_dict is not None and key_dict['version'] in (0, 1): if message.value: value_dict = parse_value(message.value) if value_dict is not None: group = key_dict['group'] topic = key_dict['topic'] partition = key_dict['partition'] offset = value_dict['offset'] commit_timestamp = value_dict[ 'commit_timestamp'] / 1000 offsets = ensure_dict_key(offsets, group, {}) offsets[group] = ensure_dict_key( offsets[group], topic, {}) offsets[group][topic] = ensure_dict_key( offsets[group][topic], partition, offset) offsets[group][topic][partition] = offset collectors.set_offsets(offsets) commits = ensure_dict_key(commits, group, {}) commits[group] = ensure_dict_key( commits[group], topic, {}) commits[group][topic] = ensure_dict_key( commits[group][topic], partition, 0) commits[group][topic][partition] += 1 collectors.set_commits(commits) commit_timestamps = ensure_dict_key( commit_timestamps, group, {}) commit_timestamps[group] = ensure_dict_key( commit_timestamps[group], topic, {}) commit_timestamps[group][ topic] = ensure_dict_key( commit_timestamps[group][topic], partition, 0) commit_timestamps[group][topic][ partition] = commit_timestamp collectors.set_commit_timestamps( commit_timestamps) else: # The group has been removed, so we should not report metrics group = key_dict['group'] topic = key_dict['topic'] partition = key_dict['partition'] if group in offsets: if topic in offsets[group]: if partition in offsets[group][topic]: del offsets[group][topic][partition] if group in commits: if topic in commits[group]: if partition in commits[group][topic]: del commits[group][topic][partition] if group in commit_timestamps: if topic in commit_timestamps[group]: if partition in commit_timestamps[group][ topic]: del commit_timestamps[group][topic][ partition] # Check if we need to run any scheduled jobs # each message. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) # Also check if we need to run any scheduled jobs # each time the consumer times out, in case there # aren't any messages to consume. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) except KeyboardInterrupt: pass shutdown()
def cli(**options):
    """Export Elasticsearch query results to Prometheus."""
    if options['basic_user'] and options['basic_password'] is None:
        # These usage errors must be raised, not just instantiated,
        # for click to abort with an error message.
        raise click.BadOptionUsage('basic_user', 'Username provided with no password.')
    elif options['basic_user'] is None and options['basic_password']:
        raise click.BadOptionUsage('basic_password', 'Password provided with no username.')
    elif options['basic_user']:
        http_auth = (options['basic_user'], options['basic_password'])
    else:
        http_auth = None

    if not options['ca_certs'] and options['client_cert']:
        raise click.BadOptionUsage('client_cert',
                                   '--client-cert can only be used when --ca-certs is provided.')
    elif not options['ca_certs'] and options['client_key']:
        raise click.BadOptionUsage('client_key',
                                   '--client-key can only be used when --ca-certs is provided.')
    elif options['client_cert'] and not options['client_key']:
        raise click.BadOptionUsage('client_cert',
                                   '--client-key must be provided when --client-cert is used.')
    elif not options['client_cert'] and options['client_key']:
        raise click.BadOptionUsage('client_key',
                                   '--client-cert must be provided when --client-key is used.')

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) if options['json_logging'] else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    log_level = getattr(logging, options['log_level'])
    logging.basicConfig(handlers=[log_handler],
                        level=logging.DEBUG if options['verbose'] else log_level)
    logging.captureWarnings(True)

    port = options['port']
    es_cluster = options['es_cluster'].split(',')

    if es_cluster == ['consul']:
        consul_host = options['consul_host']
        consul_port = options['consul_port']

        # Check the Consul connection, retrying until the agent is reachable.
        while True:
            try:
                log.info('Connecting to Consul agent at {}:{}...'.format(consul_host, consul_port))
                consul_client.connect(consul_host, consul_port)
                es_cluster = [consul_client.get_service_address(options['es_service'])]
                break
            except Exception:
                log.info('Retrying connection to Consul in 5 seconds...')
                time.sleep(5)
                continue
        log.info('Found Elasticsearch registered at {}.'.format(es_cluster))

    if options['ca_certs']:
        es_client = Elasticsearch(es_cluster, verify_certs=True,
                                  ca_certs=options['ca_certs'],
                                  client_cert=options['client_cert'],
                                  client_key=options['client_key'],
                                  http_auth=http_auth)
    else:
        es_client = Elasticsearch(es_cluster, verify_certs=False, http_auth=http_auth)

    # Check Elasticsearch health, retrying until the cluster responds.
    while True:
        try:
            log.info('Checking Elasticsearch client health...')
            es_client.cluster.health()
            break
        except Exception:
            log.info('Elasticsearch client is not ready. Retrying in 5 seconds...')
            time.sleep(5)
            continue
    log.info('Elasticsearch is ready')

    scheduler = None

    if not options['query_disable']:
        config = configparser.ConfigParser(converters=CONFIGPARSER_CONVERTERS)
        config.read(options['config_file'])

        config_dir_file_pattern = os.path.join(options['config_dir'], '*.cfg')
        config_dir_sorted_files = sorted(glob.glob(config_dir_file_pattern))
        config.read(config_dir_sorted_files)

        query_prefix = 'query_'
        queries = {}
        for section in config.sections():
            if section.startswith(query_prefix):
                query_name = section[len(query_prefix):]
                interval = config.getfloat(section, 'QueryIntervalSecs', fallback=15)
                timeout = config.getfloat(section, 'QueryTimeoutSecs', fallback=10)
                indices = config.get(section, 'QueryIndices', fallback='_all')
                query = json.loads(config.get(section, 'QueryJson'))
                on_error = config.getenum(section, 'QueryOnError', fallback='drop')
                on_missing = config.getenum(section, 'QueryOnMissing', fallback='drop')

                queries[query_name] = (interval, timeout, indices, query, on_error, on_missing)

        scheduler = sched.scheduler()

        if queries:
            for query_name, (interval, timeout, indices, query,
                             on_error, on_missing) in queries.items():
                schedule_job(scheduler, interval, run_query, es_client, query_name,
                             indices, query, timeout, on_error, on_missing)
        else:
            log.error('No queries found in config file(s)')
            return

    if not options['cluster_health_disable']:
        REGISTRY.register(ClusterHealthCollector(es_client,
                                                 options['cluster_health_timeout'],
                                                 options['cluster_health_level']))

    if not options['nodes_stats_disable']:
        REGISTRY.register(NodesStatsCollector(es_client,
                                              options['nodes_stats_timeout'],
                                              metrics=options['nodes_stats_metrics']))

    if not options['indices_aliases_disable']:
        REGISTRY.register(IndicesAliasesCollector(es_client,
                                                  options['indices_aliases_timeout']))

    if not options['indices_mappings_disable']:
        REGISTRY.register(IndicesMappingsCollector(es_client,
                                                   options['indices_mappings_timeout']))

    if not options['indices_stats_disable']:
        parse_indices = options['indices_stats_mode'] == 'indices'
        REGISTRY.register(IndicesStatsCollector(es_client,
                                                options['indices_stats_timeout'],
                                                parse_indices=parse_indices,
                                                metrics=options['indices_stats_metrics'],
                                                fields=options['indices_stats_fields']))

    if scheduler:
        REGISTRY.register(QueryMetricCollector())

    log.info('Starting server...')
    start_http_server(port)
    log.info('Server started on port %(port)s', {'port': port})

    if scheduler:
        scheduler.run()
    else:
        while True:
            time.sleep(5)
def main(): signal.signal(signal.SIGTERM, signal_handler) parser = argparse.ArgumentParser( description='Export Kafka consumer offsets to Prometheus.') parser.add_argument( '-b', '--bootstrap-brokers', default='localhost', help='Addresses of brokers in a Kafka cluster to talk to.' + ' Brokers should be separated by commas e.g. broker1,broker2.' + ' Ports can be provided if non-standard (9092) e.g. brokers1:9999.' + ' (default: localhost)') parser.add_argument( '-p', '--port', type=int, default=9208, help='Port to serve the metrics endpoint on. (default: 9208)') parser.add_argument( '-s', '--from-start', action='store_true', help='Start from the beginning of the `__consumer_offsets` topic.') parser.add_argument( '--topic-interval', type=float, default=30.0, help='How often to refresh topic information, in seconds. (default: 30)' ) parser.add_argument( '--high-water-interval', type=float, default=10.0, help= 'How often to refresh high-water information, in seconds. (default: 10)' ) parser.add_argument( '--low-water-interval', type=float, default=10.0, help= 'How often to refresh low-water information, in seconds. (default: 10)' ) parser.add_argument( '--consumer-config', action='append', default=[], help= 'Provide additional Kafka consumer config as a consumer.properties file. Multiple files will be merged, later files having precedence.' ) parser.add_argument('-j', '--json-logging', action='store_true', help='Turn on json logging.') parser.add_argument( '--log-level', default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='detail level to log. (default: INFO)') parser.add_argument( '-v', '--verbose', action='store_true', help='turn on verbose (DEBUG) logging. Overrides --log-level.') args = parser.parse_args() log_handler = logging.StreamHandler() log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s' formatter = JogFormatter(log_format) \ if args.json_logging \ else logging.Formatter(log_format) log_handler.setFormatter(formatter) log_level = getattr(logging, args.log_level) logging.basicConfig(handlers=[log_handler], level=logging.DEBUG if args.verbose else log_level) logging.captureWarnings(True) port = args.port consumer_config = { 'bootstrap_servers': 'localhost', 'auto_offset_reset': 'latest', 'group_id': None, 'consumer_timeout_ms': 500 } for filename in args.consumer_config: with open(filename) as f: raw_config = javaproperties.load(f) converted_config = { k.replace('.', '_'): v for k, v in raw_config.items() } consumer_config.update(converted_config) if args.bootstrap_brokers: consumer_config['bootstrap_servers'] = args.bootstrap_brokers.split( ',') if args.from_start: consumer_config['auto_offset_reset'] = 'earliest' consumer = KafkaConsumer('__consumer_offsets', **consumer_config) client = consumer._client topic_interval = args.topic_interval high_water_interval = args.high_water_interval low_water_interval = args.low_water_interval logging.info('Starting server...') start_http_server(port) logging.info('Server started on port %s', port) REGISTRY.register(collectors.HighwaterCollector()) REGISTRY.register(collectors.LowwaterCollector()) REGISTRY.register(collectors.ConsumerOffsetCollector()) REGISTRY.register(collectors.ConsumerLagCollector()) REGISTRY.register(collectors.ConsumerLeadCollector()) REGISTRY.register(collectors.ConsumerCommitsCollector()) REGISTRY.register(collectors.ExporterOffsetCollector()) REGISTRY.register(collectors.ExporterLagCollector()) REGISTRY.register(collectors.ExporterLeadCollector()) scheduled_jobs = 
setup_fetch_jobs(topic_interval, high_water_interval, low_water_interval, client) try: while True: for message in consumer: offsets = collectors.get_offsets() commits = collectors.get_commits() exporter_offsets = collectors.get_exporter_offsets() exporter_partition = message.partition exporter_offset = message.offset exporter_offsets = ensure_dict_key(exporter_offsets, exporter_partition, exporter_offset) exporter_offsets[exporter_partition] = exporter_offset collectors.set_exporter_offsets(exporter_offsets) if message.key and message.value: key = parse_key(message.key) if key: value = parse_value(message.value) group = key[1] topic = key[2] partition = key[3] offset = value[1] offsets = ensure_dict_key(offsets, group, {}) offsets[group] = ensure_dict_key( offsets[group], topic, {}) offsets[group][topic] = ensure_dict_key( offsets[group][topic], partition, offset) offsets[group][topic][partition] = offset collectors.set_offsets(offsets) commits = ensure_dict_key(commits, group, {}) commits[group] = ensure_dict_key( commits[group], topic, {}) commits[group][topic] = ensure_dict_key( commits[group][topic], partition, 0) commits[group][topic][partition] += 1 collectors.set_commits(commits) # Check if we need to run any scheduled jobs # each message. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) # Also check if we need to run any scheduled jobs # each time the consumer times out, in case there # aren't any messages to consume. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) except KeyboardInterrupt: pass shutdown()
def main(): signal.signal(signal.SIGTERM, signal_handler) parser = argparse.ArgumentParser( description='Export Kafka consumer offsets to Prometheus.') parser.add_argument( '-b', '--bootstrap-brokers', default='localhost', help='Addresses of brokers in a Kafka cluster to talk to.' + ' Brokers should be separated by commas e.g. broker1,broker2.' + ' Ports can be provided if non-standard (9092) e.g. brokers1:9999.' + ' (default: localhost)') parser.add_argument( '-p', '--port', type=int, default=9208, help='Port to serve the metrics endpoint on. (default: 9208)') parser.add_argument( '-s', '--from-start', action='store_true', help='Start from the beginning of the `__consumer_offsets` topic.') parser.add_argument( '--topic-interval', type=float, default=30.0, help='How often to refresh topic information, in seconds. (default: 30)' ) parser.add_argument( '--high-water-interval', type=float, default=10.0, help= 'How often to refresh high-water information, in seconds. (default: 10)' ) parser.add_argument( '--consumer-config', action='append', default=[], help= 'Provide additional Kafka consumer config as a consumer.properties file. Multiple files will be merged, later files having precedence.' ) parser.add_argument('-j', '--json-logging', action='store_true', help='Turn on json logging.') parser.add_argument( '--log-level', default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='detail level to log. (default: INFO)') parser.add_argument( '-v', '--verbose', action='store_true', help='turn on verbose (DEBUG) logging. Overrides --log-level.') args = parser.parse_args() log_handler = logging.StreamHandler() log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s' formatter = JogFormatter(log_format) \ if args.json_logging \ else logging.Formatter(log_format) log_handler.setFormatter(formatter) log_level = getattr(logging, args.log_level) logging.basicConfig(handlers=[log_handler], level=logging.DEBUG if args.verbose else log_level) logging.captureWarnings(True) port = args.port consumer_config = { 'bootstrap_servers': 'localhost', 'auto_offset_reset': 'latest', 'group_id': None, 'consumer_timeout_ms': 500 } for filename in args.consumer_config: with open(filename) as f: raw_config = javaproperties.load(f) converted_config = { k.replace('.', '_'): v for k, v in raw_config.items() } consumer_config.update(converted_config) if args.bootstrap_brokers: consumer_config['bootstrap_servers'] = args.bootstrap_brokers.split( ',') if args.from_start: consumer_config['auto_offset_reset'] = 'earliest' consumer = KafkaConsumer('__consumer_offsets', **consumer_config) client = consumer._client topic_interval = args.topic_interval high_water_interval = args.high_water_interval logging.info('Starting server...') start_http_server(port) logging.info('Server started on port %s', port) def read_short(bytes): num = unpack_from('>h', bytes)[0] remaining = bytes[2:] return (num, remaining) def read_int(bytes): num = unpack_from('>i', bytes)[0] remaining = bytes[4:] return (num, remaining) def read_long_long(bytes): num = unpack_from('>q', bytes)[0] remaining = bytes[8:] return (num, remaining) def read_string(bytes): length, remaining = read_short(bytes) string = remaining[:length].decode('utf-8') remaining = remaining[length:] return (string, remaining) def parse_key(bytes): (version, remaining_key) = read_short(bytes) if version == 1 or version == 0: (group, remaining_key) = read_string(remaining_key) (topic, remaining_key) = read_string(remaining_key) (partition, 
remaining_key) = read_int(remaining_key) return (version, group, topic, partition) def parse_value(bytes): (version, remaining_key) = read_short(bytes) if version == 0: (offset, remaining_key) = read_long_long(remaining_key) (metadata, remaining_key) = read_string(remaining_key) (timestamp, remaining_key) = read_long_long(remaining_key) return (version, offset, metadata, timestamp) elif version == 1: (offset, remaining_key) = read_long_long(remaining_key) (metadata, remaining_key) = read_string(remaining_key) (commit_timestamp, remaining_key) = read_long_long(remaining_key) (expire_timestamp, remaining_key) = read_long_long(remaining_key) return (version, offset, metadata, commit_timestamp, expire_timestamp) def update_topics(api_version, metadata): logging.info('Received topics and partition assignments') global topics if api_version == 0: TOPIC_ERROR = 0 TOPIC_NAME = 1 TOPIC_PARTITIONS = 2 PARTITION_ERROR = 0 PARTITION_NUMBER = 1 PARTITION_LEADER = 2 else: TOPIC_ERROR = 0 TOPIC_NAME = 1 TOPIC_PARTITIONS = 3 PARTITION_ERROR = 0 PARTITION_NUMBER = 1 PARTITION_LEADER = 2 new_topics = {} for t in metadata.topics: error_code = t[TOPIC_ERROR] if error_code: error = Errors.for_code(error_code)(t) logging.warning( 'Received error in metadata response at topic level: %s', error) else: topic = t[TOPIC_NAME] partitions = t[TOPIC_PARTITIONS] new_partitions = {} for p in partitions: error_code = p[PARTITION_ERROR] if error_code: error = Errors.for_code(error_code)(p) logging.warning( 'Received error in metadata response at partition level for topic %(topic)s: %(error)s', { 'topic': topic, 'error': error }) else: partition = p[PARTITION_NUMBER] leader = p[PARTITION_LEADER] logging.debug( 'Received partition assignment for partition %(partition)s of topic %(topic)s', { 'partition': partition, 'topic': topic }) new_partitions[partition] = leader new_topics[topic] = new_partitions topics = new_topics def update_highwater(offsets): logging.info('Received high-water marks') for topic, partitions in offsets.topics: for partition, error_code, offsets in partitions: if error_code: error = Errors.for_code(error_code)( (partition, error_code, offsets)) logging.warning( 'Received error in offset response for topic %(topic)s: %(error)s', { 'topic': topic, 'error': error }) else: logging.debug( 'Received high-water marks for partition %(partition)s of topic %(topic)s', { 'partition': partition, 'topic': topic }) update_gauge( metric_name='kafka_topic_highwater', label_dict={ 'topic': topic, 'partition': partition }, value=offsets[0], doc='The offset of the head of a partition in a topic.' 
) def fetch_topics(this_time): logging.info('Requesting topics and partition assignments') next_time = this_time + topic_interval try: node = client.least_loaded_node() logging.debug( 'Requesting topics and partition assignments from %(node)s', {'node': node}) api_version = 0 if client.config['api_version'] < (0, 10) else 1 request = MetadataRequest[api_version](None) f = client.send(node, request) f.add_callback(update_topics, api_version) except Exception: logging.exception( 'Error requesting topics and partition assignments') finally: client.schedule(partial(fetch_topics, next_time), next_time) def fetch_highwater(this_time): logging.info('Requesting high-water marks') next_time = this_time + high_water_interval try: global topics if topics: nodes = {} for topic, partition_map in topics.items(): for partition, leader in partition_map.items(): if leader not in nodes: nodes[leader] = {} if topic not in nodes[leader]: nodes[leader][topic] = [] nodes[leader][topic].append(partition) for node, topic_map in nodes.items(): logging.debug('Requesting high-water marks from %(node)s', { 'topic': topic, 'node': node }) request = OffsetRequest[0]( -1, [(topic, [(partition, OffsetResetStrategy.LATEST, 1) for partition in partitions]) for topic, partitions in topic_map.items()]) f = client.send(node, request) f.add_callback(update_highwater) except Exception: logging.exception('Error requesting high-water marks') finally: client.schedule(partial(fetch_highwater, next_time), next_time) now_time = time.time() fetch_topics(now_time) fetch_highwater(now_time) try: while True: for message in consumer: update_gauge( metric_name=METRIC_PREFIX + 'exporter_offset', label_dict={'partition': message.partition}, value=message.offset, doc= 'The current offset of the exporter consumer in a partition of the __consumer_offsets topic.' ) if message.key and message.value: key = parse_key(message.key) if key: value = parse_value(message.value) update_gauge( metric_name=METRIC_PREFIX + 'offset', label_dict={ 'group': key[1], 'topic': key[2], 'partition': key[3] }, value=value[1], doc= 'The current offset of a consumer group in a partition of a topic.' ) increment_counter( metric_name=METRIC_PREFIX + 'commits', label_dict={ 'group': key[1], 'topic': key[2], 'partition': key[3] }, doc= 'The number of commit messages read by the exporter consumer from a consumer group for a partition of a topic.' ) except KeyboardInterrupt: pass shutdown()
def cli(**options): """Export MySQL query results to Prometheus.""" log_handler = logging.StreamHandler() log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s' formatter = JogFormatter( log_format) if options['json_logging'] else logging.Formatter( log_format) log_handler.setFormatter(formatter) log_level = getattr(logging, options['log_level']) logging.basicConfig( handlers=[log_handler], level=logging.DEBUG if options['verbose'] else log_level) logging.captureWarnings(True) port = options['port'] mysql_host, mysql_port = options['mysql_server'] username = options['mysql_user'] password = options['mysql_password'] timezone = options['mysql_local_timezone'] config = configparser.ConfigParser(converters=CONFIGPARSER_CONVERTERS) config.read_file(options['config_file']) config_dir_file_pattern = os.path.join(options['config_dir'], '*.cfg') config_dir_sorted_files = sorted(glob.glob(config_dir_file_pattern)) config.read(config_dir_sorted_files) query_prefix = 'query_' queries = {} for section in config.sections(): if section.startswith(query_prefix): query_name = section[len(query_prefix):] interval = config.getfloat(section, 'QueryIntervalSecs', fallback=15) db_name = config.get(section, 'QueryDatabase') query = config.get(section, 'QueryStatement') value_columns = config.get(section, 'QueryValueColumns').split(',') on_error = config.getenum(section, 'QueryOnError', fallback='drop') on_missing = config.getenum(section, 'QueryOnMissing', fallback='drop') queries[query_name] = (interval, db_name, query, value_columns, on_error, on_missing) scheduler = sched.scheduler() mysql_kwargs = dict( host=mysql_host, port=mysql_port, user=username, password=password, # Use autocommit mode to avoid keeping the same transaction across query # runs when the connection is reused. Using the same transaction would # prevent changes from being reflected in results, and therefore metrics. # Note: Queries could theoretically change data... autocommit=True) if timezone: mysql_kwargs['init_command'] = "SET time_zone = '{}'".format(timezone) mysql_client = PersistentDB(creator=pymysql, **mysql_kwargs) if queries: for query_name, (interval, db_name, query, value_columns, on_error, on_missing) in queries.items(): schedule_job(scheduler, interval, run_query, mysql_client, query_name, db_name, query, value_columns, on_error, on_missing) else: log.warning('No queries found in config file(s)') REGISTRY.register(QueryMetricCollector()) log.info('Starting server...') start_http_server(port) log.info('Server started on port %(port)s', {'port': port}) scheduler.run()
def main(): signal.signal(signal.SIGTERM, signal_handler) parser = argparse.ArgumentParser( description='Export Kafka consumer offsets to Prometheus.') parser.add_argument( '-b', '--bootstrap-brokers', help='Addresses of brokers in a Kafka cluster to talk to.' + ' Brokers should be separated by commas e.g. broker1,broker2.' + ' Ports can be provided if non-standard (9092) e.g. brokers1:9999.' + ' (default: localhost)') parser.add_argument( '-p', '--port', type=int, default=9208, help='Port to serve the metrics endpoint on. (default: 9208)') parser.add_argument('-c', '--consumers', type=int, default=1, help='Number of Kakfa consumers to use (parallelism)') parser.add_argument( '--use-confluent-kafka', action='store_true', help='Use confluent_kafka rather than kafka-python for consumption') parser.add_argument( '-s', '--from-start', action='store_true', help='Start from the beginning of the `__consumer_offsets` topic.') parser.add_argument( '--topic-interval', type=float, default=30.0, help='How often to refresh topic information, in seconds. (default: 30)' ) parser.add_argument( '--high-water-interval', type=float, default=10.0, help= 'How often to refresh high-water information, in seconds. (default: 10)' ) parser.add_argument( '--low-water-interval', type=float, default=10.0, help= 'How often to refresh low-water information, in seconds. (default: 10)' ) parser.add_argument( '--consumer-config', action='append', default=[], help= 'Provide additional Kafka consumer config as a consumer.properties file. Multiple files will be merged, later files having precedence.' ) parser.add_argument('-j', '--json-logging', action='store_true', help='Turn on json logging.') parser.add_argument( '--log-level', default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='detail level to log. (default: INFO)') parser.add_argument( '-v', '--verbose', action='store_true', help='turn on verbose (DEBUG) logging. Overrides --log-level.') args = parser.parse_args() log_handler = logging.StreamHandler() log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s' formatter = JogFormatter(log_format) \ if args.json_logging \ else logging.Formatter(log_format) log_handler.setFormatter(formatter) log_level = getattr(logging, args.log_level) logging.basicConfig(handlers=[log_handler], level=logging.DEBUG if args.verbose else log_level) logging.captureWarnings(True) port = args.port consumer_config = { 'auto_offset_reset': 'latest', 'group_id': None, 'consumer_timeout_ms': 500 } # the same config is used both for kafka-python and confluent_kafka # most important properties have the same names (except _ being used instead of .) 
# one difference is that in case of single consumer kafka-python requires group_id not to be set # while confluent_kafka always requires to have group_id if not args.use_confluent_kafka: if args.consumers > 1: consumer_config[ 'group_id'] = 'prometheus-kafka-consumer-exporter-' + id_generator( ) consumer_config['enable_auto_commit'] = False else: consumer_config[ 'group_id'] = 'prometheus-kafka-consumer-exporter-' + id_generator( ) for filename in args.consumer_config: with open(filename) as f: raw_config = javaproperties.load(f) converted_config = { k: int(v) if v.isdigit() else True if v == 'True' else False if v == 'False' else v for k, v in raw_config.items() } consumer_config.update(converted_config) if not 'bootstrap_servers' in consumer_config: consumer_config['bootstrap_servers'] = 'localhost' logging.info('bootstrap_servers not specified - using localhost') if args.bootstrap_brokers: consumer_config['bootstrap_servers'] = args.bootstrap_brokers.split( ',') if args.from_start: consumer_config['auto_offset_reset'] = 'earliest' # retain only settings relevant for kafka-python kafka_python_consumer_config = cleanup_conf(consumer_config) consumer = KafkaConsumer(**kafka_python_consumer_config) client = consumer._client topic_interval = args.topic_interval high_water_interval = args.high_water_interval low_water_interval = args.low_water_interval logging.info('Starting server...') start_http_server(port) logging.info('Server started on port %s', port) REGISTRY.register(collectors.HighwaterCollector()) REGISTRY.register(collectors.LowwaterCollector()) REGISTRY.register(collectors.ConsumerOffsetCollector()) REGISTRY.register(collectors.ConsumerLagCollector()) REGISTRY.register(collectors.ConsumerLeadCollector()) REGISTRY.register(collectors.ConsumerCommitsCollector()) REGISTRY.register(collectors.ExporterOffsetCollector()) REGISTRY.register(collectors.ExporterLagCollector()) REGISTRY.register(collectors.ExporterLeadCollector()) scheduled_jobs = setup_fetch_jobs(topic_interval, high_water_interval, low_water_interval, client) mpc = MultiProcessConsumer(args.use_confluent_kafka, args.consumers, 5, args.json_logging, args.log_level, args.verbose, **consumer_config) try: while True: for item in mpc: offsets = collectors.get_offsets() commits = collectors.get_commits() exporter_offsets = collectors.get_exporter_offsets() exporter_offsets = merge_exporter_offsets( exporter_offsets, item[0]) offsets = merge_offsets(offsets, item[1]) commits = merge_offsets(commits, item[2]) collectors.set_exporter_offsets(exporter_offsets) collectors.set_offsets(offsets) collectors.set_commits(commits) # Check if we need to run any scheduled jobs # each message. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) # Also check if we need to run any scheduled jobs # each time the consumer times out, in case there # aren't any messages to consume. scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs) except KeyboardInterrupt: pass mpc.stop() shutdown()
def _mp_consume(message_queue, report_interval, json_logging, log_level,
                verbose, events, **consumer_options):
    conf = cleanup_conf(consumer_options)

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) if json_logging else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)
    log_level = getattr(logging, log_level)
    logging.basicConfig(handlers=[log_handler],
                        level=logging.DEBUG if verbose else log_level)
    logging.captureWarnings(True)

    this.logger = logging.getLogger(__name__)

    offsets = {}
    commits = {}
    exporter_offsets = {}

    while not events.exit.is_set():
        # Wait until the controller indicates we should start consumption.
        events.start.wait()

        this.logger.info('Initialising Consumer')
        consumer = Consumer(conf, logger=this.logger)
        consumer.subscribe(['__consumer_offsets'], on_assign=on_assignment)

        start_time = time.time()
        current_report_interval = randint(1, report_interval + 1)
        i = 0
        while True:
            # If we are asked to quit, do so - but do not check too frequently.
            if time.time() - start_time > current_report_interval and (
                    events.exit.is_set() or events.stop.is_set()):
                consumer.close()
                break

            message = consumer.poll(timeout=1.0)
            if message is None:
                continue
            if message.error():
                if message.error().code() == KafkaError._PARTITION_EOF:
                    this.logger.debug('Reached end of [%d] at offset %d',
                                      message.partition(), message.offset())
                else:
                    this.logger.error('poll() failed: %r', message.error())
                # Either way, there is nothing to process for this message.
                continue

            exporter_partition = message.partition()
            exporter_offset = message.offset()
            exporter_offsets = ensure_dict_key(exporter_offsets,
                                               exporter_partition, exporter_offset)
            exporter_offsets[exporter_partition] = exporter_offset

            if message.key() and message.value():
                key = parse_key(message.key())
                if key:
                    i += 1
                    value = parse_value(message.value())

                    group = key[1]
                    topic = key[2]
                    partition = key[3]
                    offset = value[1]

                    offsets = ensure_dict_key(offsets, group, {})
                    offsets[group] = ensure_dict_key(offsets[group], topic, {})
                    offsets[group][topic] = ensure_dict_key(offsets[group][topic],
                                                            partition, offset)
                    offsets[group][topic][partition] = offset

                    commits = ensure_dict_key(commits, group, {})
                    commits[group] = ensure_dict_key(commits[group], topic, {})
                    commits[group][topic] = ensure_dict_key(commits[group][topic],
                                                            partition, 0)
                    commits[group][topic][partition] += 1

            try:
                if time.time() - start_time > current_report_interval:
                    this.logger.debug('Processed %d messages/sec since last report',
                                      i / current_report_interval)
                    current_report_interval = randint(1, report_interval + 1)
                    start_time = time.time()
                    message_queue.put((exporter_offsets, offsets, commits),
                                      timeout=report_interval * 2)
                    clear_commits(commits)
                    i = 0
            except queue.Full:
                this.logger.error('Queue is full, backing off')
                current_report_interval *= 2
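# Hedged sketch of the ensure_dict_key() helper used throughout these consumers
# (its definition is not shown in this section). Based on its call sites -
# `d = ensure_dict_key(d, key, default)` followed by `d[key] = ...` - it appears
# to return the mapping with the key guaranteed to exist, setdefault-style.
# This is an assumption about the helper's behaviour, not its actual source.
def ensure_dict_key(mapping, key, default):
    # Assumption: callers only rely on the key being present afterwards.
    if key not in mapping:
        mapping[key] = default
    return mapping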