class TestTomcat(unittest.TestCase): def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) pid_file = PidFile('dogstatsd') self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.environ['VOLATILE_DIR'], 'jmx_yaml') self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def test_tomcat_metrics(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 25: raise Exception("No metrics were received in 25 seconds") metrics = self.reporter.metrics self.assertTrue(type(metrics) == type([])) self.assertTrue(len(metrics) > 0) self.assertEquals(len([t for t in metrics if t['metric'] == "tomcat.threads.busy" and "instance:tomcat_instance" in t['tags']]), 2, metrics) self.assertEquals(len([t for t in metrics if t['metric'] == "tomcat.bytes_sent" and "instance:tomcat_instance" in t['tags']]), 0, metrics) self.assertTrue(len([t for t in metrics if "jvm." in t['metric'] and "instance:tomcat_instance" in t['tags']]) > 4, metrics)
class JMXTestCase(unittest.TestCase): def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = Fixtures.directory() self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testCustomJMXMetric(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 25: raise Exception("No metrics were received in 25 seconds") metrics = self.reporter.metrics self.assertTrue(isinstance(metrics, ListType)) self.assertTrue(len(metrics) > 0) self.assertTrue(len([t for t in metrics if "cassandra.db." in t['metric'] and "instance:cassandra_instance" in t['tags']]) > 40, metrics)
class JMXTestCase(unittest.TestCase): def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.path.dirname(__file__)) self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testTomcatMetrics(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 25: raise Exception("No metrics were received in 25 seconds") metrics = self.reporter.metrics self.assertTrue(isinstance(metrics, ListType)) self.assertTrue(len(metrics) > 8, metrics) self.assertEquals(len([t for t in metrics if 'instance:solr_instance' in t['tags'] and t['metric'] == "jvm.thread_count"]), 1, metrics) self.assertTrue(len([t for t in metrics if "jvm." in t['metric'] and 'instance:solr_instance' in t['tags']]) > 4, metrics) self.assertTrue(len([t for t in metrics if "solr." in t['metric'] and 'instance:solr_instance' in t['tags']]) > 4, metrics)
class JMXFetchProcess(multiprocessing.Process): def __init__(self, agentConfig, hostname): multiprocessing.Process.__init__(self, name='jmxfetch') self.config = agentConfig self.hostname = hostname try: confd_path = get_confd_path() self.jmx_daemon = JMXFetch(confd_path, agentConfig) self.jmx_daemon.configure() self.is_enabled = self.jmx_daemon.should_run() except PathNotFound: self.is_enabled = False def run(self): if self.is_enabled: JMXFiles.clean_exit_file() self.jmx_daemon.run() def terminate(self): """ Override `terminate` method to properly exit JMXFetch. """ JMXFiles.write_exit_file() self.join()
class JMXTestCase(unittest.TestCase): def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.environ['VOLATILE_DIR'], 'jmx_yaml') self.jmx_daemon = JMXFetch(confd_path, {'sdstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testCustomJMXMetric(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 20: raise Exception("No metrics were received in 20 seconds") metrics = self.reporter.metrics self.assertTrue(isinstance(metrics, ListType)) self.assertTrue(len(metrics) > 0) self.assertEquals(len([t for t in metrics if t['metric'] == "my.metric.buf" and "instance:jmx_instance1" in t['tags']]), 2, metrics) self.assertTrue(len([t for t in metrics if 'type:ThreadPool' in t['tags'] and "instance:jmx_instance1" in t['tags'] and "jmx.catalina" in t['metric']]) > 8, metrics) self.assertTrue(len([t for t in metrics if "jvm." in t['metric'] and "instance:jmx_instance1" in t['tags']]) == 13, metrics)
class JMXFetchProcess(multiprocessing.Process): def __init__(self, agentConfig, hostname, **options): multiprocessing.Process.__init__(self, name='jmxfetch') self.config = agentConfig self.hostname = hostname self.options = options try: confd_path = get_confd_path() self.jmx_daemon = JMXFetch(confd_path, agentConfig) self.jmx_daemon.configure() self.is_enabled = self.jmx_daemon.should_run() except PathNotFound: self.is_enabled = False def run(self): from config import initialize_logging initialize_logging('jmxfetch') if self.is_enabled: log.debug("Windows Service - Starting JMXFetch") JMXFiles.clean_exit_file() self.jmx_daemon.run() else: log.info("Windows Service - Not starting JMXFetch: no valid configuration found") def terminate(self): """ Override `terminate` method to properly exit JMXFetch. """ JMXFiles.write_exit_file() self.join()
def _handle_sigterm(self, signum, frame): log.debug("Caught sigterm. Stopping run loop.") self.run_forever = False if JMXFetch.is_running(): JMXFetch.stop() if self.collector: self.collector.stop() log.debug("Collector is stopped.")
def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) pid_file = PidFile('dogstatsd') self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.environ['VOLATILE_DIR'], 'jmx_yaml') JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15, JMX_COLLECT_COMMAND)
def _get_jmxfetch_subprocess_args(self, yaml_jmx_conf, mock_subprocess_call): # Helper function # Returns the Java JMX subprocess_args called from a YAML configuration tmp_dir = tempfile.mkdtemp() filename = "jmx.yaml" with open(os.path.join(tmp_dir, filename), 'wb') as temp_file: temp_file.write(yaml.dump(yaml_jmx_conf)) jmx = JMXFetch(tmp_dir, {}) jmx.run(reporter="console") return mock_subprocess_call.call_args[0][0]
def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) pid_file = PidFile('dogstatsd') self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.realpath(os.path.join(os.path.abspath(__file__), "..", "jmx_yamls")) JMXFetch.init(confd_path, {'dogstatsd_port':STATSD_PORT}, get_logging_config(), 15)
class TestKafka(unittest.TestCase): """Basic Test for kafka integration.""" def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.path.dirname(__file__), 'resources/') self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testCustomJMXMetric(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 25: raise Exception("No metrics were received in 25 seconds") metrics = self.reporter.metrics # expected_tags = ['env:test', 'instance:kafka-172.17.0.1-9999', 'kafka:broker'] self.assertTrue(isinstance(metrics, ListType)) self.assertTrue(len(metrics) > 0) log.info(metrics) log.info(len(metrics)) self.assertTrue( len([t for t in metrics if "jvm." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags']]) >= 13, metrics) self.assertTrue( len([t for t in metrics if "kafka.request." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags']]) == 12, metrics) self.assertTrue( len([t for t in metrics if "kafka.replication." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags']]) == 6, metrics) # CLIENT metrics. # kafka.producer.request_latency_avg self.assertTrue( len([t for t in metrics if "kafka.producer." in t['metric'] and "instance:kafka-172.17.0.1-7777" in t['tags']]) == 1, metrics) # kafka.consumer.fetch_rate, kafka.consumer.max_lag self.assertTrue( len([t for t in metrics if "kafka.consumer." in t['metric'] and "instance:kafka-172.17.0.1-7777" in t['tags']]) == 2, metrics)
def _add_jmxinfo_tar(self): _, _, should_run_jmx = self._capture_output(self._should_run_jmx) if should_run_jmx: # status files (before listing beans because executing jmxfetch overwrites status files) for file_name, file_path in [ (JMXFiles._STATUS_FILE, JMXFiles.get_status_file_path()), (JMXFiles._PYTHON_STATUS_FILE, JMXFiles.get_python_status_file_path()) ]: if self._can_read(file_path, warn=False): self._add_file_tar( file_path, os.path.join('jmxinfo', file_name) ) # beans lists for command in ['list_matching_attributes', 'list_everything']: log.info(" * datadog-agent jmx {0} output".format(command)) self._add_command_output_tar( os.path.join('jmxinfo', '{0}.log'.format(command)), partial(self._jmx_command_call, command) ) # java version log.info(" * java -version output") _, _, java_bin_path = self._capture_output( lambda: JMXFetch.get_configuration(get_confd_path())[2] or 'java') self._add_command_output_tar( os.path.join('jmxinfo', 'java_version.log'), lambda: self._java_version(java_bin_path), command_desc="{0} -version".format(java_bin_path) )
def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.path.dirname(__file__)) self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start()
def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.environ['VOLATILE_DIR'], 'jmx_yaml') self.jmx_daemon = JMXFetch(confd_path, {'sdstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start()
def jmx_command(args, agent_config, redirect_std_streams=False): """ Run JMXFetch with the given command if it is valid (and print user-friendly info if it's not) """ from jmxfetch import JMX_LIST_COMMANDS, JMXFetch if len(args) < 1 or args[0] not in JMX_LIST_COMMANDS.keys(): print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[0] checks_list = args[1:] confd_directory = get_confd_path() jmx_process = JMXFetch(confd_directory, agent_config) jmx_process.configure() should_run = jmx_process.should_run() if should_run: jmx_process.run(jmx_command, checks_list, reporter="console", redirect_std_streams=redirect_std_streams) else: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support"
def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) pid_file = PidFile('dogstatsd') self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = Fixtures.directory() self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start()
class JMXFetchProcess(multiprocessing.Process): def __init__(self, agentConfig, hostname): multiprocessing.Process.__init__(self, name='jmxfetch') self.config = agentConfig self.hostname = hostname try: osname = get_os() confd_path = get_confd_path(osname) self.jmx_daemon = JMXFetch(confd_path, agentConfig) self.jmx_daemon.configure() self.is_enabled = self.jmx_daemon.should_run() except PathNotFound: self.is_enabled = False def run(self): if self.is_enabled: self.jmx_daemon.run() def stop(self): pass
def __init__(self, agentConfig, hostname): multiprocessing.Process.__init__(self, name='jmxfetch') self.config = agentConfig self.hostname = hostname try: confd_path = get_confd_path() self.jmx_daemon = JMXFetch(confd_path, agentConfig) self.jmx_daemon.configure() self.is_enabled = self.jmx_daemon.should_run() except PathNotFound: self.is_enabled = False
def main(): options, args = get_parsed_args() agentConfig = get_config(options=options) # todo autorestart isn't used remove autorestart = agentConfig.get('autorestart', False) COMMANDS = [ 'start', 'stop', 'restart', 'foreground', 'status', 'info', 'check', 'configcheck', 'jmx', ] if len(args) < 1: sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS))) return 2 command = args[0] if command not in COMMANDS: sys.stderr.write("Unknown command: %s\n" % command) return 3 pid_file = PidFile('mon-agent') if options.clean: pid_file.clean() agent = CollectorDaemon(pid_file.get_path(), autorestart) if command in START_COMMANDS: log.info('Agent version %s' % get_version()) if 'start' == command: log.info('Start daemon') agent.start() elif 'stop' == command: log.info('Stop daemon') agent.stop() elif 'restart' == command: log.info('Restart daemon') agent.restart() elif 'status' == command: agent.status() elif 'info' == command: return agent.info(verbose=options.verbose) elif 'foreground' == command: logging.info('Running in foreground') if autorestart: # Set-up the supervisor callbacks and fork it. logging.info('Running Agent with auto-restart ON') def child_func(): agent.run() def parent_func(): agent.start_event = False AgentSupervisor.start(parent_func, child_func) else: # Run in the standard foreground. agent.run(config=agentConfig) elif 'check' == command: check_name = args[1] try: # Try the old-style check first print getattr(collector.checks.collector, check_name)(log).check(agentConfig) except Exception: # If not an old-style check, try checks_d checks = load_check_directory(agentConfig) for check in checks['initialized_checks']: if check.name == check_name: check.run() print check.get_metrics() print check.get_events() if len(args) == 3 and args[2] == 'check_rate': print "Running 2nd iteration to capture rate metrics" time.sleep(1) check.run() print check.get_metrics() print check.get_events() elif 'configcheck' == command or 'configtest' == command: osname = get_os() all_valid = True for conf_path in glob.glob(os.path.join(get_confd_path(osname), "*.yaml")): basename = os.path.basename(conf_path) try: check_yaml(conf_path) except Exception as e: all_valid = False print "%s contains errors:\n %s" % (basename, e) else: print "%s is valid" % basename if all_valid: print "All yaml files passed. You can now run the Monitoring agent." return 0 else: print("Fix the invalid yaml files above in order to start the Monitoring agent. " "A useful external tool for yaml parsing can be found at " "http://yaml-online-parser.appspot.com/") return 1 elif 'jmx' == command: from collector.jmxfetch import JMX_LIST_COMMANDS, JMXFetch if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys(): print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following command:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/mon-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) should_run = JMXFetch.init( confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console") if not should_run: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" return 0
def _is_jmxfetch_enabled(self, config): confd_path = get_confd_path() jmxfetch = JMXFetch(confd_path, config) jmxfetch.configure() return jmxfetch.should_run()
def tearDown(self): self.server.stop() self.reporter.finished = True JMXFetch.stop()
checksd_path = get_checksd_path(osname) checks_paths.append(glob.glob(os.path.join(checksd_path, '*.py'))) except PathNotFound, e: log.error(e.args[0]) sys.exit(3) try: confd_path = get_confd_path(osname) except PathNotFound, e: log.error("No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0]) sys.exit(3) from migration import migrate_old_style_configuration migrate_old_style_configuration(agentConfig, confd_path) JMXFetch.init(confd_path, agentConfig, get_logging_config(), DEFAULT_CHECK_FREQUENCY) # For backwards-compatability with old style checks, we have to load every # checks.d module and check for a corresponding config OR check if the old # config will "activate" the check. # # Once old-style checks aren't supported, we'll just read the configs and # import the corresponding check module for check in itertools.chain(*checks_paths): check_name = os.path.basename(check).split('.')[0] if check_name in initialized_checks or check_name in init_failed_checks: log.debug('Skipping check %s because it has already been loaded from another location', check) continue try: check_module = imp.load_source('checksd_%s' % check_name, check) except Exception, e:
def stop(self): log.debug("Windows Service - Stopping collector") self.collector.stop() if JMXFetch.is_running(): JMXFetch.stop() self.running = False
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) if not Platform.is_windows(): # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # A SIGHUP signals a configuration reload signal.signal(signal.SIGHUP, self._handle_sighup) else: sdk_integrations = get_sdk_integration_paths() for name, path in sdk_integrations.iteritems(): lib_path = os.path.join(path, 'lib') if os.path.exists(lib_path): sys.path.append(lib_path) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) self._agentConfig = self._set_agent_config_hostname(config) hostname = get_hostname(self._agentConfig) systemStats = get_system_stats( proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/') ) emitters = self._get_emitters() # Initialize service discovery if self._agentConfig.get('service_discovery'): self.sd_backend = get_sd_backend(self._agentConfig) if self.sd_backend and _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)): pipe_path = get_jmx_pipe_path() if Platform.is_windows(): pipe_name = pipe_path.format(pipename=SD_PIPE_NAME) else: pipe_name = os.path.join(pipe_path, SD_PIPE_NAME) if os.access(pipe_path, os.W_OK): if not os.path.exists(pipe_name): os.mkfifo(pipe_name) self.sd_pipe = os.open(pipe_name, os.O_RDWR) # RW to avoid blocking (will only W) # Initialize Supervisor proxy self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig) else: log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.') # Load the checks.d checks self._checksd = load_check_directory(self._agentConfig, hostname) # Load JMX configs if available if self._jmx_service_discovery_enabled: self.sd_pipe_jmx_configs(hostname) # Initialize the Collector self.collector = Collector(self._agentConfig, emitters, systemStats, hostname) # In developer mode, the number of runs to be included in a single collector profile try: self.collector_profile_interval = int( self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)) except ValueError: log.warn('collector_profile_interval is invalid. ' 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL) self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL # Configure the watchdog. self.check_frequency = int(self._agentConfig['check_freq']) watchdog = self._get_watchdog(self.check_frequency) # Initialize the auto-restarter self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() self.allow_profiling = self._agentConfig.get('allow_profiling', True) profiled = False collector_profiled_runs = 0 # Run the main loop. while self.run_forever: # Setup profiling if necessary if self.allow_profiling and self.in_developer_mode and not profiled: try: profiler = AgentProfiler() profiler.enable_profiling() profiled = True except Exception as e: log.warn("Cannot enable profiler: %s" % str(e)) if self.reload_configs_flag: if isinstance(self.reload_configs_flag, set): self.reload_configs(checks_to_reload=self.reload_configs_flag) else: self.reload_configs() # JMXFetch restarts should prompt re-piping *all* JMX configs if self._jmx_service_discovery_enabled and \ (not self.reload_configs_flag or isinstance(self.reload_configs_flag, set)): try: jmx_launch = JMXFetch._get_jmx_launchtime() if self.last_jmx_piped and self.last_jmx_piped < jmx_launch: self.sd_pipe_jmx_configs(hostname) except Exception as e: log.debug("could not stat JMX lunch file: %s", e) # Do the work. Pass `configs_reloaded` to let the collector know if it needs to # look for the AgentMetrics check and pop it out. self.collector.run(checksd=self._checksd, start_event=self.start_event, configs_reloaded=True if self.reload_configs_flag else False) self.reload_configs_flag = False # Look for change in the config template store. # The self.sd_backend.reload_check_configs flag is set # to True if a config reload is needed. if self._agentConfig.get('service_discovery') and self.sd_backend and \ not self.sd_backend.reload_check_configs: try: self.sd_backend.reload_check_configs = get_config_store( self._agentConfig).crawl_config_template() except Exception as e: log.warn('Something went wrong while looking for config template changes: %s' % str(e)) # Check if we should run service discovery # The `reload_check_configs` flag can be set through the docker_daemon check or # using ConfigStore.crawl_config_template if self._agentConfig.get('service_discovery') and self.sd_backend and \ self.sd_backend.reload_check_configs: self.reload_configs_flag = self.sd_backend.reload_check_configs self.sd_backend.reload_check_configs = False if profiled: if collector_profiled_runs >= self.collector_profile_interval: try: profiler.disable_profiling() profiled = False collector_profiled_runs = 0 except Exception as e: log.warn("Cannot disable profiler: %s" % str(e)) # Check if we should restart. if self.autorestart and self._should_restart(): self._do_restart() # Only plan for next loop if we will continue, otherwise exit quickly. if self.run_forever: if watchdog: watchdog.reset() if profiled: collector_profiled_runs += 1 log.debug("Sleeping for {0} seconds".format(self.check_frequency)) time.sleep(self.check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
print "All yaml files passed. You can now run the Datadog agent." else: print("Fix the invalid yaml files above in order to start the Datadog agent. " "A useful external tool for yaml parsing can be found at " "http://yaml-online-parser.appspot.com/") elif 'jmx' == command: from jmxfetch import JMX_LIST_COMMANDS, JMXFetch if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS: print "You have to specify one of the following command %s" % JMX_LIST_COMMANDS else: jmx_command = args[1] from config import get_confd_path, get_logging_config from util import get_os JMXFetch.init(get_confd_path(get_os()), agentConfig, get_logging_config(), 15, jmx_command) return 0 if __name__ == '__main__': try: sys.exit(main()) except StandardError: # Try our best to log the error. try: log.exception("Uncaught error running the Agent") except Exception: pass raise
checks_paths.append(glob.glob(os.path.join(checksd_path, '*.py'))) except PathNotFound, e: log.error(e.args[0]) sys.exit(3) try: confd_path = get_confd_path(osname) except PathNotFound, e: log.error("No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0]) sys.exit(3) # Migrate datadog.conf integration configurations that are not supported anymore migrate_old_style_configuration(agentConfig, confd_path, get_config_path(None, os_name=get_os())) # Start JMXFetch if needed JMXFetch.init(confd_path, agentConfig, get_logging_config(), DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND) # For backwards-compatability with old style checks, we have to load every # checks.d module and check for a corresponding config OR check if the old # config will "activate" the check. # # Once old-style checks aren't supported, we'll just read the configs and # import the corresponding check module for check in itertools.chain(*checks_paths): check_name = os.path.basename(check).split('.')[0] if check_name in initialized_checks or check_name in init_failed_checks: log.debug('Skipping check %s because it has already been loaded from another location', check) continue try: check_module = imp.load_source('checksd_%s' % check_name, check) except Exception, e:
def main(): options, args = get_parsed_args() agentConfig = get_config(options=options) autorestart = agentConfig.get('autorestart', False) hostname = get_hostname(agentConfig) COMMANDS_AGENT = [ 'start', 'stop', 'restart', 'status', 'foreground', ] COMMANDS_NO_AGENT = [ 'info', 'check', 'configcheck', 'jmx', 'flare', ] COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT if len(args) < 1: sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS))) return 2 command = args[0] if command not in COMMANDS: sys.stderr.write("Unknown command: %s\n" % command) return 3 # Deprecation notice if command not in DD_AGENT_COMMANDS: # Will become an error message and exit after deprecation period from utils.deprecations import deprecate_old_command_line_tools deprecate_old_command_line_tools() if command in COMMANDS_AGENT: agent = Agent(PidFile('dd-agent').get_path(), autorestart) if command in START_COMMANDS: log.info('Agent version %s' % get_version()) if 'start' == command: log.info('Start daemon') agent.start() elif 'stop' == command: log.info('Stop daemon') agent.stop() elif 'restart' == command: log.info('Restart daemon') agent.restart() elif 'status' == command: agent.status() elif 'info' == command: return Agent.info(verbose=options.verbose) elif 'foreground' == command: logging.info('Running in foreground') if autorestart: # Set-up the supervisor callbacks and fork it. logging.info('Running Agent with auto-restart ON') def child_func(): agent.start(foreground=True) def parent_func(): agent.start_event = False AgentSupervisor.start(parent_func, child_func) else: # Run in the standard foreground. agent.start(foreground=True) elif 'check' == command: if len(args) < 2: sys.stderr.write( "Usage: %s check <check_name> [check_rate]\n" "Add check_rate as last argument to compute rates\n" % sys.argv[0]) return 1 check_name = args[1] try: import checks.collector # Try the old-style check first print getattr(checks.collector, check_name)(log).check(agentConfig) except Exception: # If not an old-style check, try checks.d checks = load_check_directory(agentConfig, hostname) for check in checks['initialized_checks']: if check.name == check_name: check.run() print check.get_metrics() print check.get_events() print check.get_service_checks() if len(args) == 3 and args[2] == 'check_rate': print "Running 2nd iteration to capture rate metrics" time.sleep(1) check.run() print check.get_metrics() print check.get_events() print check.get_service_checks() check.stop() elif 'configcheck' == command or 'configtest' == command: configcheck() elif 'jmx' == command: from jmxfetch import JMX_LIST_COMMANDS, JMXFetch if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys(): print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) jmx_process = JMXFetch(confd_directory, agentConfig) jmx_process.configure() should_run = jmx_process.should_run() if should_run: jmx_process.run(jmx_command, checks_list, reporter="console") else: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support" elif 'flare' == command: Flare.check_user_rights() case_id = int(args[1]) if len(args) > 1 else None f = Flare(True, case_id) f.collect() try: f.upload() except Exception, e: print 'The upload failed:\n{0}'.format(str(e))
sys.exit(3) try: confd_path = get_confd_path(osname) except PathNotFound, e: log.error( "No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0]) sys.exit(3) # Migrate datadog.conf integration configurations that are not supported anymore migrate_old_style_configuration(agentConfig, confd_path, get_config_path(None, os_name=get_os())) # Start JMXFetch if needed JMXFetch.init(confd_path, agentConfig, get_logging_config(), DEFAULT_CHECK_FREQUENCY, JMX_COLLECT_COMMAND) # We don't support old style configs anymore # So we iterate over the files in the checks.d directory # If there is a matching configuration file in the conf.d directory # then we import the check for check in itertools.chain(*checks_paths): check_name = os.path.basename(check).split('.')[0] check_config = None if check_name in initialized_checks or check_name in init_failed_checks: log.debug( 'Skipping check %s because it has already been loaded from another location', check) continue # Let's see if there is a conf.d for this check
def _should_run_jmx(self): jmx_process = JMXFetch(get_confd_path(), self._config) jmx_process.configure(clean_status_file=False) return jmx_process.should_run()
class TestHbase_regionserver(unittest.TestCase): """Basic Test for hbase_regionserver integration.""" def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.path.dirname(__file__), 'ci/resources/') self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testCustomJMXMetric(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 60: raise Exception("No metrics were received in 60 seconds") metrics = self.reporter.metrics self.assertTrue(isinstance(metrics, list)) self.assertTrue(len(metrics) > 0) self.assertTrue( len([ t for t in metrics if "jvm." in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags'] ]) >= 13, metrics) # waiting for receiving metrics which appears after a while. count = 0 while True: metrics = self.reporter.metrics mutations_metrics = [ t for t in metrics if "hbase.regionserver.server.mutations" in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags'] ] slow_appned_metrics = [ t for t in metrics if "hbase.regionserver.server.slow_append" in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags'] ] # hedged_metrics = [t for t in metrics if "hbase.regionserver.server.hedged_read" in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags']] # pause_time_metrics = [t for t in metrics if "hbase.regionserver.server.pause_time" in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags']] time.sleep(1) count += 1 if len(mutations_metrics) >= 2 and len(slow_appned_metrics) >= 1: break elif count <= 60: continue else: log.info(metrics) raise Exception("enough metrics were received in 60 seconds") # hbase.regionserver.server.hedged_read* and hbase.regionserver.server.pause* is ignored here # because these metrics won't emit until these process will actually happen. metrics = [ t for t in self.reporter.metrics if "hbase." in t['metric'] and "instance:hbase_regionserver-localhost-10102" in t['tags'] ] num_total = 159 num_hedged_read = 2 num_pause = 14 num = len(metrics) log.info(num) self.assertTrue(num >= (num_total - num_hedged_read - num_pause), metrics)
def main(): options, args = get_parsed_args() agentConfig = get_config(options=options) autorestart = agentConfig.get('autorestart', False) hostname = get_hostname(agentConfig) in_developer_mode = agentConfig.get('developer_mode') COMMANDS_AGENT = [ 'start', 'stop', 'restart', 'status', 'foreground', ] COMMANDS_NO_AGENT = [ 'info', 'check', 'configcheck', 'jmx', 'flare', ] COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT if len(args) < 1: sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS))) return 2 command = args[0] if command not in COMMANDS: sys.stderr.write("Unknown command: %s\n" % command) return 3 # Deprecation notice if command not in DD_AGENT_COMMANDS: # Will become an error message and exit after deprecation period from utils.deprecations import deprecate_old_command_line_tools deprecate_old_command_line_tools() if command in COMMANDS_AGENT: agent = Agent(PidFile('dd-agent').get_path(), autorestart, in_developer_mode=in_developer_mode) if command in START_COMMANDS: log.info('Agent version %s' % get_version()) if 'start' == command: log.info('Start daemon') agent.start() elif 'stop' == command: log.info('Stop daemon') agent.stop() elif 'restart' == command: log.info('Restart daemon') agent.restart() elif 'status' == command: agent.status() elif 'info' == command: return Agent.info(verbose=options.verbose) elif 'foreground' == command: logging.info('Running in foreground') if autorestart: # Set-up the supervisor callbacks and fork it. logging.info('Running Agent with auto-restart ON') def child_func(): agent.start(foreground=True) def parent_func(): agent.start_event = False AgentSupervisor.start(parent_func, child_func) else: # Run in the standard foreground. agent.start(foreground=True) elif 'check' == command: if len(args) < 2: sys.stderr.write( "Usage: %s check <check_name> [check_rate]\n" "Add check_rate as last argument to compute rates\n" % sys.argv[0] ) return 1 check_name = args[1] try: import checks.collector # Try the old-style check first print getattr(checks.collector, check_name)(log).check(agentConfig) except Exception: # If not an old-style check, try checks.d checks = load_check_directory(agentConfig, hostname) for check in checks['initialized_checks']: if check.name == check_name: if in_developer_mode: check.run = AgentProfiler.wrap_profiling(check.run) cs = Collector.run_single_check(check, verbose=True) print CollectorStatus.render_check_status(cs) if len(args) == 3 and args[2] == 'check_rate': print "Running 2nd iteration to capture rate metrics" time.sleep(1) cs = Collector.run_single_check(check, verbose=True) print CollectorStatus.render_check_status(cs) check.stop() elif 'configcheck' == command or 'configtest' == command: configcheck() elif 'jmx' == command: if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys(): print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) jmx_process = JMXFetch(confd_directory, agentConfig) jmx_process.configure() should_run = jmx_process.should_run() if should_run: jmx_process.run(jmx_command, checks_list, reporter="console") else: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support" elif 'flare' == command: Flare.check_user_rights() case_id = int(args[1]) if len(args) > 1 else None f = Flare(True, case_id) f.collect() try: f.upload() except Exception, e: print 'The upload failed:\n{0}'.format(str(e))
print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) should_run = JMXFetch.init(confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console") if not should_run: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support" return 0 if __name__ == '__main__': try: sys.exit(main()) except StandardError:
print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) should_run = JMXFetch.init(confd_directory, agentConfig, get_logging_config(), 15, jmx_command, checks_list, reporter="console") if not should_run: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support" return 0 if __name__ == '__main__': try: sys.exit(main()) except StandardError: # Try our best to log the error. try:
class TestKafka(unittest.TestCase): """Basic Test for kafka integration.""" def setUp(self): aggregator = MetricsAggregator("test_host") self.server = Server(aggregator, "localhost", STATSD_PORT) self.reporter = DummyReporter(aggregator) self.t1 = threading.Thread(target=self.server.start) self.t1.start() confd_path = os.path.join(os.path.dirname(__file__), 'ci/resources/') self.jmx_daemon = JMXFetch(confd_path, {'dogstatsd_port': STATSD_PORT}) self.t2 = threading.Thread(target=self.jmx_daemon.run) self.t2.start() def tearDown(self): self.server.stop() self.reporter.finished = True self.jmx_daemon.terminate() def testCustomJMXMetric(self): count = 0 while self.reporter.metrics is None: time.sleep(1) count += 1 if count > 25: raise Exception("No metrics were received in 25 seconds") metrics = self.reporter.metrics # expected_tags = ['env:test', 'instance:kafka-172.17.0.1-9999', 'kafka:broker'] self.assertTrue(isinstance(metrics, ListType)) self.assertTrue(len(metrics) > 0) log.info(metrics) log.info(len(metrics)) self.assertTrue( len([ t for t in metrics if "jvm." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags'] ]) >= 13, metrics) self.assertTrue( len([ t for t in metrics if "kafka.request." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags'] ]) == 12, metrics) self.assertTrue( len([ t for t in metrics if "kafka.replication." in t['metric'] and "instance:kafka-172.17.0.1-9999" in t['tags'] ]) == 6, metrics) # CLIENT metrics. # kafka.producer.request_latency_avg self.assertTrue( len([ t for t in metrics if "kafka.producer." in t['metric'] and "instance:kafka-172.17.0.1-7777" in t['tags'] ]) == 1, metrics) # kafka.consumer.fetch_rate, kafka.consumer.max_lag self.assertTrue( len([ t for t in metrics if "kafka.consumer." in t['metric'] and "instance:kafka-172.17.0.1-7777" in t['tags'] ]) == 2, metrics)
checks_paths.append(glob.glob(os.path.join(checksd_path, '*.py'))) except PathNotFound, e: log.error(e.args[0]) sys.exit(3) try: confd_path = get_confd_path(osname) except PathNotFound, e: log.error("No conf.d folder found at '%s' or in the directory where the Agent is currently deployed.\n" % e.args[0]) sys.exit(3) # Migrate datadog.conf integration configurations that are not supported anymore migrate_old_style_configuration(agentConfig, confd_path, get_config_path(None, os_name=get_os())) # Start JMXFetch if needed JMXFetch.init(confd_path, agentConfig, get_logging_config(), DEFAULT_CHECK_FREQUENCY) # For backwards-compatability with old style checks, we have to load every # checks.d module and check for a corresponding config OR check if the old # config will "activate" the check. # # Once old-style checks aren't supported, we'll just read the configs and # import the corresponding check module for check in itertools.chain(*checks_paths): check_name = os.path.basename(check).split('.')[0] if check_name in initialized_checks or check_name in init_failed_checks: log.debug('Skipping check %s because it has already been loaded from another location', check) continue try: check_module = imp.load_source('checksd_%s' % check_name, check) except Exception, e:
def main(): options, args = get_parsed_args() agentConfig = get_config(options=options) autorestart = agentConfig.get("autorestart", False) hostname = get_hostname(agentConfig) COMMANDS = ["start", "stop", "restart", "foreground", "status", "info", "check", "configcheck", "jmx", "flare"] if len(args) < 1: sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS))) return 2 command = args[0] if command not in COMMANDS: sys.stderr.write("Unknown command: %s\n" % command) return 3 pid_file = PidFile("dd-agent") if options.clean: pid_file.clean() agent = Agent(pid_file.get_path(), autorestart) if command in START_COMMANDS: log.info("Agent version %s" % get_version()) if "start" == command: log.info("Start daemon") agent.start() elif "stop" == command: log.info("Stop daemon") agent.stop() elif "restart" == command: log.info("Restart daemon") agent.restart() elif "status" == command: agent.status() elif "info" == command: return agent.info(verbose=options.verbose) elif "foreground" == command: logging.info("Running in foreground") if autorestart: # Set-up the supervisor callbacks and fork it. logging.info("Running Agent with auto-restart ON") def child_func(): agent.start(foreground=True) def parent_func(): agent.start_event = False AgentSupervisor.start(parent_func, child_func) else: # Run in the standard foreground. agent.start(foreground=True) elif "check" == command: if len(args) < 2: sys.stderr.write( "Usage: %s check <check_name> [check_rate]\n" "Add check_rate as last argument to compute rates\n" % sys.argv[0] ) return 1 check_name = args[1] try: import checks.collector # Try the old-style check first print getattr(checks.collector, check_name)(log).check(agentConfig) except Exception: # If not an old-style check, try checks.d checks = load_check_directory(agentConfig, hostname) for check in checks["initialized_checks"]: if check.name == check_name: check.run() print check.get_metrics() print check.get_events() print check.get_service_checks() if len(args) == 3 and args[2] == "check_rate": print "Running 2nd iteration to capture rate metrics" time.sleep(1) check.run() print check.get_metrics() print check.get_events() print check.get_service_checks() check.stop() elif "configcheck" == command or "configtest" == command: configcheck() elif "jmx" == command: from jmxfetch import JMX_LIST_COMMANDS, JMXFetch if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS.keys(): print "#" * 80 print "JMX tool to be used to help configuring your JMX checks." print "See http://docs.datadoghq.com/integrations/java/ for more information" print "#" * 80 print "\n" print "You have to specify one of the following commands:" for command, desc in JMX_LIST_COMMANDS.iteritems(): print " - %s [OPTIONAL: LIST OF CHECKS]: %s" % (command, desc) print "Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr" print "\n" else: jmx_command = args[1] checks_list = args[2:] confd_directory = get_confd_path(get_os()) jmx_process = JMXFetch(confd_directory, agentConfig) should_run = jmx_process.run(jmx_command, checks_list, reporter="console") if not should_run: print "Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory print "Have you enabled any JMX check ?" print "If you think it's not normal please get in touch with Datadog Support" elif "flare" == command: Flare.check_user_rights() case_id = int(args[1]) if len(args) > 1 else None f = Flare(True, case_id) f.collect() try: f.upload() except Exception, e: print "The upload failed:\n{0}".format(str(e))