def test_persistence_fail():
    # Removing the persisted status must not blow up when there is no
    # status file: call it twice in a row so that the second call runs
    # after a removal has already happened.
    for _ in range(2):
        CollectorStatus.remove_latest_status()

    # Nothing should be loadable once the status has been removed.
    loaded = CollectorStatus.load_latest_status()
    assert not loaded
def test_persistence():
    # Round-trip a CollectorStatus holding a single check through
    # persist() / load_latest_status() and compare what comes back.
    i1 = InstanceStatus(1, STATUS_OK)
    chk1 = CheckStatus("dummy", [i1], 1, 2)
    c1 = CollectorStatus([chk1])
    c1.persist()

    c2 = CollectorStatus.load_latest_status()
    nt.assert_equal(1, len(c2.check_statuses))
    chk2 = c2.check_statuses[0]
    assert chk2.name == chk1.name
    # Compare the reloaded status with the one that was persisted; the
    # previous `chk2.status == chk2.status` could never fail.
    assert chk2.status == chk1.status
    assert chk2.metric_count == 1
    assert chk2.event_count == 2
def run(self, config=None):
    """Main loop of the collector.

    Registers the SIGTERM handler, persists a fresh ``CollectorStatus``,
    builds the ``Collector`` and then runs it every ``check_freq`` seconds
    for as long as ``self.run_forever`` is true. On the way out the
    persisted status is removed and the process is terminated with
    ``sys.exit(0)``.

    ``config`` is the agent configuration; when it is falsy the
    configuration is loaded with ``get_config(parse_args=True)``.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)
    self.collector = Collector(agentConfig, emitters, systemStats)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd)

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up. This stays best effort, but a bare `except:` would also
    # swallow SystemExit / KeyboardInterrupt, so only catch Exception.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        log.debug("Unable to remove the latest collector status", exc_info=True)

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def get(self):
    """Render ``pup/status.html`` with the latest persisted statuses."""
    # Latest status of each agent component, loaded in the same order as
    # they are handed to the template.
    dogstatsd = DogstatsdStatus.load_latest_status()
    forwarder = ForwarderStatus.load_latest_status()
    collector = CollectorStatus.load_latest_status()

    template_path = os.path.join(agent_root, "pup", "status.html")
    context = {
        "port": port,
        "platform": platform.platform(),
        "agent_version": get_version(),
        "python_version": platform.python_version(),
        "logger_info": logger_info(),
        "dogstatsd": dogstatsd.to_dict(),
        "forwarder": forwarder.to_dict(),
        "collector": collector.to_dict(),
    }
    self.render(template_path, **context)
def latest_status(self):
    """Return the generated ``status.html`` page.

    Any exception raised while loading the statuses or generating the
    template is swallowed and a fixed fallback message is returned instead.
    """
    fallback = "Unable to fetch latest status"
    try:
        loader = template.Loader(".")

        dogstatsd = DogstatsdStatus.load_latest_status()
        forwarder = ForwarderStatus.load_latest_status()
        collector = CollectorStatus.load_latest_status()

        page = loader.load("status.html")
        return page.generate(
            port=22,
            platform=platform.platform(),
            agent_version=get_version(),
            python_version=platform.python_version(),
            logger_info=logger_info(),
            dogstatsd=dogstatsd.to_dict(),
            forwarder=forwarder.to_dict(),
            collector=collector.to_dict(),
        )
    except Exception:
        return fallback
def run(self, config=None):
    """Main loop of the collector.

    Registers the signal handlers, persists a fresh ``CollectorStatus``,
    builds the ``Collector`` and then runs it every ``check_freq`` seconds
    for as long as ``self.run_forever`` is true. After each run the agent
    is restarted through ``self._do_restart()`` when ``self.autorestart``
    is set and ``self._should_restart()`` says so. On the way out the
    persisted status is removed and the process is terminated with
    ``sys.exit(0)``.

    ``config`` is the agent configuration; when it is falsy the
    configuration is loaded with ``get_config(parse_args=True)``.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    self.collector = Collector(agentConfig, emitters, systemStats)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(
        agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up. This stays best effort, but a bare `except:` would also
    # swallow SystemExit / KeyboardInterrupt, so only catch Exception.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        log.debug("Unable to remove the latest collector status", exc_info=True)

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Entry point of the agent command-line tool.

    Parses the command line, loads the agent configuration and dispatches
    on the first positional argument, which must be one of ``COMMANDS``.

    Returns the process exit status: 2 when no command is given, 3 for an
    unknown command, 1 when ``check`` is missing its check name, whatever
    ``Agent.info`` returns for ``info``, and 0 otherwise.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get("autorestart", False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get("developer_mode")

    COMMANDS_AGENT = ["start", "stop", "restart", "status", "foreground"]
    COMMANDS_NO_AGENT = ["info", "check", "configcheck", "jmx", "flare"]
    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    # Only the commands in COMMANDS_AGENT need an Agent instance.
    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info("Agent version %s" % get_version())

    if "start" == command:
        log.info("Start daemon")
        agent.start()

    elif "stop" == command:
        log.info("Stop daemon")
        agent.stop()

    elif "restart" == command:
        log.info("Restart daemon")
        agent.restart()

    elif "status" == command:
        agent.status()

    elif "info" == command:
        return Agent.info(verbose=options.verbose)

    elif "foreground" == command:
        logging.info("Running in foreground")
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info("Running Agent with auto-restart ON")

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif "check" == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks["initialized_checks"]:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == "check_rate":
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    # NOTE(review): "configtest" is not listed in COMMANDS, so it is rejected
    # as an unknown command above and only "configcheck" can reach this branch.
    elif "configcheck" == command or "configtest" == command:
        configcheck()
        if agentConfig.get("service_discovery", False):
            # set the TRACE_CONFIG flag to True to make load_check_directory return
            # the source of config objects.
            # Then call load_check_directory here and pass the result to sd_configcheck
            # to avoid circular imports
            agentConfig[TRACE_CONFIG] = True
            print("\nLoading check configurations...\n\n")
            # Per the original placeholder comment, configs has the shape
            # {check_name: (config_source, config)}.
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif "jmx" == command:
        jmx_command(args[1:], agentConfig)

    elif "flare" == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print("The upload failed:\n{0}".format(str(e)))

    return 0
def run(self, config=None):
    """Drive the collector until ``self.run_forever`` is cleared.

    Installs the signal handlers, persists a fresh ``CollectorStatus``,
    loads the checks and then repeatedly runs the collector, sleeping
    ``check_freq`` seconds between runs. In developer mode an
    ``AgentProfiler`` is enabled and later disabled once the number of
    profiled runs reaches ``collector_profile_interval``. On the way out
    the persisted status is removed (best effort) and the process exits
    with status 0.

    ``config`` is the agent configuration; when falsy it is loaded with
    ``get_config(parse_args=True)``.
    """
    # SIGTERM and SIGINT share a handler; SIGUSR1 and SIGHUP have their own.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    signal.signal(signal.SIGINT, self._handle_sigterm)
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Persist the start-up status before anything else is set up.
    CollectorStatus().persist()

    config = config or get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    system_stats = get_system_stats()
    emitters = self._get_emitters()

    self._checksd = load_check_directory(self._agentConfig, hostname)
    self.collector = Collector(self._agentConfig, emitters, system_stats, hostname)

    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    self.check_frequency = int(self._agentConfig['check_freq'])
    monitor = self._get_watchmonitor(self.check_frequency)

    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiler_on = False
    profiled_runs = 0

    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

        # Developer mode: switch the profiler on if it is not running yet.
        if self.in_developer_mode and not profiler_on:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiler_on = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # The reload flag has been handed to the collector; clear it.
        if self.configs_reloaded:
            self.configs_reloaded = False

        # Switch the profiler off once enough runs have been profiled.
        if profiler_on and profiled_runs >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
                profiler_on = False
                profiled_runs = 0
            except Exception as e:
                log.warn("Cannot disable profiler: %s" % str(e))

        if self.autorestart and self._should_restart():
            self._do_restart()

        # Skip the bookkeeping and the sleep when the loop is about to end.
        if not self.run_forever:
            continue

        if monitor:
            monitor.reset()
        if profiler_on:
            profiled_runs += 1
        log.debug("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Best-effort removal of the persisted status.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Command-line entry point of the agent.

    Dispatches on the first positional argument. Returns 2 when no command
    is given, 3 for an unknown command, the result of
    ``CollectorStatus.print_latest_status`` for ``info`` and 0 otherwise.
    """
    options, args = get_parsed_args()
    agent_config = get_config(options=options)
    autorestart = agent_config.get('autorestart', False)

    COMMANDS = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
    ]

    if not args:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    # Commands that don't need the agent to be initialized.
    if command == 'status':
        pid = pid_file.get_pid()
        if pid is None:
            sys.stdout.write('dd-agent is not running.\n')
            log.info("dd-agent is not running.")
        else:
            sys.stdout.write('dd-agent is running as pid %s.\n' % pid)
            log.info("dd-agent is running as pid %s." % pid)
        return 0

    if command == 'info':
        logging.getLogger().setLevel(logging.ERROR)
        return CollectorStatus.print_latest_status(verbose=options.verbose)

    # Everything left (start / stop / restart / foreground) drives an Agent.
    if options.clean:
        pid_file.clean()
    agent = Agent(pid_file.get_path(), autorestart)

    if command == 'start':
        log.info('Start daemon')
        agent.start()
    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()
    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()
    elif command == 'foreground':
        logging.info('Running in foreground')
        if not autorestart:
            # Run in the standard foreground.
            agent.run(config=agent_config)
        else:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.run()

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)

    return 0
def main():
    """Command-line entry point of the agent.

    Dispatches on the first positional argument, which must be one of
    ``COMMANDS``. Returns 2 when no command is given, 3 for an unknown
    command, 1 when ``check`` is missing its check name, the result of
    ``CollectorStatus.print_latest_status`` for ``info`` and 0 otherwise.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)

    COMMANDS = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
        'check',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    # Only initialize the Agent if we're starting or stopping it.
    if command in ['start', 'stop', 'restart', 'foreground', 'check']:

        if options.clean:
            pid_file.clean()

        agent = Agent(pid_file.get_path(), autorestart)

        if 'start' == command:
            log.info('Start daemon')
            agent.start()

        elif 'stop' == command:
            log.info('Stop daemon')
            agent.stop()

        elif 'restart' == command:
            log.info('Restart daemon')
            agent.restart()

        elif 'foreground' == command:
            logging.info('Running in foreground')
            if autorestart:
                # Set-up the supervisor callbacks and fork it.
                logging.info('Running Agent with auto-restart ON')

                def child_func():
                    agent.run()

                def parent_func():
                    agent.start_event = False

                AgentSupervisor.start(parent_func, child_func)
            else:
                # Run in the standard foreground.
                agent.run(config=agentConfig)

        elif 'check' == command:
            # Without this guard a bare `check` raised IndexError on args[1].
            if len(args) < 2:
                sys.stderr.write(
                    "Usage: %s check <check_name> [check_rate]\n"
                    "Add check_rate as last argument to compute rates\n"
                    % sys.argv[0]
                )
                return 1

            check_name = args[1]
            try:
                import checks.collector
                # Try the old-style check first
                print(getattr(checks.collector, check_name)(log).check(agentConfig))
            except Exception:
                # If not an old-style check, try checks.d
                checks = load_check_directory(agentConfig)
                for check in checks:
                    if check.name == check_name:
                        check.run()
                        print(check.get_metrics())
                        print(check.get_events())
                        if len(args) == 3 and args[2] == 'check_rate':
                            print("Running 2nd iteration to capture rate metrics")
                            time.sleep(1)
                            check.run()
                            print(check.get_metrics())
                            print(check.get_events())

    # Commands that don't need the agent to be initialized.
    else:
        if 'status' == command:
            pid = pid_file.get_pid()
            if pid is not None:
                sys.stdout.write('dd-agent is running as pid %s.\n' % pid)
                log.info("dd-agent is running as pid %s." % pid)
            else:
                sys.stdout.write('dd-agent is not running.\n')
                log.info("dd-agent is not running.")

        elif 'info' == command:
            logging.getLogger().setLevel(logging.ERROR)
            return CollectorStatus.print_latest_status(verbose=options.verbose)

    return 0
logger.exception("Error running check %s" % check.name) check_status = CheckStatus(check.name, instance_statuses, metric_count, event_count) check_statuses.append(check_status) # Store the metrics and events in the payload. payload['metrics'] = metrics payload['events'] = events collect_duration = timer.step() emitter_statuses = self._emit(payload) emit_duration = timer.step() # Persist the status of the collection run. try: CollectorStatus(check_statuses, emitter_statuses).persist() except Exception: logger.exception("Error persisting collector status") logger.info("Finished run #%s. Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(emit_duration, 2))) def _emit(self, payload): """ Send the payload via the emitters. """ statuses = [] for emitter in self.emitters: # Don't try to send to an emitter if we're stopping/ if not self.continue_running: return statuses name = emitter.__name__
def info(self, verbose=None):
    """Print the latest collector status and return the printer's result.

    The root logger is set to ERROR first; ``verbose`` is passed straight
    through to ``CollectorStatus.print_latest_status``.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.ERROR)
    result = CollectorStatus.print_latest_status(verbose=verbose)
    return result
payload['metrics'].extend( self._agent_metrics.check(payload, self.agentConfig, collect_duration, self.emit_duration, time.clock() - cpu_clock)) else: payload['metrics'].extend( self._agent_metrics.check(payload, self.agentConfig, collect_duration, self.emit_duration)) emitter_statuses = self._emit(payload) self.emit_duration = timer.step() # Persist the status of the collection run. try: CollectorStatus(check_statuses, emitter_statuses, self.metadata_cache).persist() except Exception: log.exception("Error persisting collector status") if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0: log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" % (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2))) if self.run_count == FLUSH_LOGGING_INITIAL: log.info( "First flushes done, next flushes will be logged every %s flushes." % FLUSH_LOGGING_PERIOD) else: log.debug( "Finished run #%s. Collection time: %ss. Emit time: %ss" %
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """
    Collect data from each check and submit their data.

    ``checksd`` is the result of loading the checks.d directory; when it is
    truthy its 'initialized_checks' and 'init_failed_checks' entries replace
    the ones stored on the collector. ``start_event`` is forwarded to
    ``_populate_payload_metadata``. ``configs_reloaded`` forces a new
    look-up of the agent-metrics check among the initialized checks.

    Returns the emitted payload, or None when ``self.continue_running``
    turns false while the checks.d checks are being processed.
    """
    # `checksd` defaults to None, so only look inside it when it is set.
    if checksd:
        log.debug("Found {num_checks} checks".format(num_checks=len(checksd['initialized_checks'])))

    timer = Timer()
    if not Platform.is_windows():
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if Platform.is_windows():
        # Win32 system checks
        for check_name in ['memory', 'cpu', 'network', 'io', 'proc', 'system']:
            try:
                metrics.extend(self._win32_system_checks[check_name].check(self.agentConfig))
            except Exception:
                log.exception('Unable to get %s metrics', check_name)
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        for check_name in ['load', 'system', 'cpu']:
            try:
                result_check = sys_checks[check_name].check(self.agentConfig)
                if result_check:
                    payload.update(result_check)
            except Exception:
                log.exception('Unable to get %s metrics', check_name)

        try:
            memory = sys_checks['memory'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get memory metrics')
        else:
            if memory:
                memstats = {
                    'memPhysUsed': memory.get('physUsed'),
                    'memPhysPctUsable': memory.get('physPctUsable'),
                    'memPhysFree': memory.get('physFree'),
                    'memPhysTotal': memory.get('physTotal'),
                    'memPhysUsable': memory.get('physUsable'),
                    'memSwapUsed': memory.get('swapUsed'),
                    'memSwapFree': memory.get('swapFree'),
                    'memSwapPctFree': memory.get('swapPctFree'),
                    'memSwapTotal': memory.get('swapTotal'),
                    'memCached': memory.get('physCached'),
                    'memBuffers': memory.get('physBuffers'),
                    'memShared': memory.get('physShared'),
                    'memSlab': memory.get('physSlab'),
                    'memPageTables': memory.get('physPageTables'),
                    'memSwapCached': memory.get('swapCached')
                }
                payload.update(memstats)

        try:
            ioStats = sys_checks['io'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get io metrics')
        else:
            if ioStats:
                payload['ioStats'] = ioStats

        try:
            processes = sys_checks['processes'].check(self.agentConfig)
        except Exception:
            log.exception('Unable to get processes metrics')
        else:
            payload.update({'processes': processes})

    # Run old-style checks
    if self._ganglia is not None:
        payload['ganglia'] = self._ganglia.check(self.agentConfig)
    if self._dogstream is not None:
        dogstreamData = self._dogstream.check(self.agentConfig)
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            del dogstreamData['dogstreamEvents']

        payload.update(dogstreamData)

    # process collector of gohai (compliant with payload of legacy "resources checks")
    if not Platform.is_windows() and self._should_send_additional_data('processes'):
        gohai_processes = self._run_gohai_processes()
        if gohai_processes:
            try:
                gohai_processes_json = json.loads(gohai_processes)
                processes_snaps = gohai_processes_json.get('processes')
                if processes_snaps:
                    processes_payload = {
                        'snaps': [processes_snaps]
                    }

                    payload['resources'] = {
                        'processes': processes_payload,
                        'meta': {
                            'host': self.hostname,
                        }
                    }
            except Exception:
                log.exception("Error running gohai processes collection")

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # Use `info` log level for some messages on the first run only, then `debug`
    log_at_first_run = log.info if self._is_first_run() else log.debug

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log_at_first_run("Running check %s", check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None
        # Reset for every check: if the check raises before its metadata is
        # collected, this must be neither unbound nor the previous check's.
        current_check_metadata = []

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
            check_version=check.check_version
        )

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        # -1 because the user doesn't care about the service check for check failure
        service_check_count = len(current_check_service_checks) - 1

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Instrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Report the checks that could not be initialized as well.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   check_version=info.get('version'),
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check('datadog.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats))
            )
        # Flush metadata for the Agent Metrics check. Otherwise they'll just accumulate and leak.
        self._agent_metrics.get_service_metadata()

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." %
                     FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, checksd=None, start_event=True):
    """
    Collect data from each check and submit their data.

    ``checksd``, when truthy, supplies the 'initialized_checks' and
    'init_failed_checks' entries stored on the collector for this run.
    ``start_event`` is forwarded to ``_build_payload``.

    Returns the emitted payload, or None when ``self.continue_running``
    turns false while the checks.d checks are being processed.
    """
    timer = Timer()
    # cpu_clock is only set (and only read, further down) off Windows.
    if self.os != 'windows':
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    payload = self._build_payload(start_event=start_event)
    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']
    if checksd:
        # NOTE(review): the loop further down iterates initialized_checks_d
        # and reads `check.name` on each item, so the "{check_name: check}"
        # description below looks stale - confirm against the loader.
        self.initialized_checks_d = checksd[
            'initialized_checks']  # is of type {check_name: check}
        self.init_failed_checks_d = checksd[
            'init_failed_checks']  # is of type {check_name: {error, traceback}}

    # Run the system checks. Checks will depend on the OS
    if self.os == 'windows':
        # Win32 system checks
        # All of them share one try block: the first failure skips the rest.
        try:
            metrics.extend(self._win32_system_checks['disk'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['memory'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(
                self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(
                self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        # None of these is guarded: an exception here propagates to the caller.
        sys_checks = self._unix_system_checks

        diskUsage = sys_checks['disk'].check(self.agentConfig)
        # A two-item result is split into disk usage and inodes.
        if diskUsage and len(diskUsage) == 2:
            payload["diskUsage"] = diskUsage[0]
            payload["inodes"] = diskUsage[1]

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)

        if memory:
            # Map the memory check's keys onto the payload's mem* keys.
            payload.update({
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            })

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            # The events now live under events['dogstream']; drop them so
            # the update below does not copy them into the payload again.
            del dogstreamData['dogstreamEvents']

        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Resources checks
    if self.os != 'windows':
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][
                    resources_check.RESOURCE_KEY] = res_value

        # The meta entry is only added when at least one check had snapshots.
        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        # Stop early, without emitting anything, when asked to shut down.
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()

            # Save them for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            log.exception("Error running check %s" % check.name)

        # Built with service_check_count still 0; it is corrected below.
        check_status = CheckStatus(
            check.name, instance_statuses, metric_count, event_count,
            service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name)

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        # NOTE(review): `status` is only assigned for STATUS_OK and
        # STATUS_ERROR - confirm no other check status can occur here.
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datadog.agent.check_status', status,
                            tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Instrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Checks that failed to initialize get a status carrying their error.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(
        create_service_check('datadog.agent.up', AgentCheck.OK,
                             hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for instance_status in check.instance_statuses:
                    agent_checks.append((
                        check.name, check.source_type_name,
                        instance_status.instance_id,
                        instance_status.status,
                        # put error message or list of warning messages in the same field
                        # it will be handled by the UI
                        instance_status.error or instance_status.warnings or ""))
            else:
                # No instance statuses: the check failed to initialize.
                agent_checks.append(
                    (check.name, check.source_type_name,
                     "initialization", check.status,
                     repr(check.init_failed_error)))
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.metadata_cache  # add hostname metadata

    collect_duration = timer.step()

    # self.emit_duration still holds the value stored by the previous run.
    if self.os != 'windows':
        payload['metrics'].extend(
            self._agent_metrics.check(payload, self.agentConfig,
                                      collect_duration, self.emit_duration,
                                      time.clock() - cpu_clock))
    else:
        payload['metrics'].extend(
            self._agent_metrics.check(payload, self.agentConfig,
                                      collect_duration, self.emit_duration))

    emitter_statuses = self._emit(payload)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # Log at info level for the first runs and then once per period.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2),
                  round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info(
                "First flushes done, next flushes will be logged every %s flushes."
                % FLUSH_LOGGING_PERIOD)
    else:
        log.debug(
            "Finished run #%s. Collection time: %ss. Emit time: %ss" %
            (self.run_count, round(collect_duration, 2),
             round(self.emit_duration, 2)))

    return payload
def _info_all(self):
    """Print the latest persisted status of each agent component, verbosely.

    The reports are emitted in a fixed order: collector, dogstatsd, forwarder.
    """
    for status_cls in (CollectorStatus, DogstatsdStatus, ForwarderStatus):
        status_cls.print_latest_status(verbose=True)
def main():
    """Command-line entry point: parse the command and dispatch it.

    Returns:
        2 when no command is given, 3 for an unknown command, 1 when the
        ``check`` command is missing its check name, the result of
        ``Agent.info`` for ``info`` and 0 in every other case.
    """
    options, args = get_parsed_args()
    agent_config = get_config(options=options)
    autorestart = agent_config.get('autorestart', False)
    hostname = get_hostname(agent_config)
    in_developer_mode = agent_config.get('developer_mode')

    # Commands that operate on an Agent instance.
    daemon_commands = ['start', 'stop', 'restart', 'status', 'foreground']
    # Commands that do not need one.
    standalone_commands = ['info', 'check', 'configcheck', 'jmx', 'flare']
    known_commands = daemon_commands + standalone_commands

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(known_commands)))
        return 2

    command = args[0]
    if command not in known_commands:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in daemon_commands:
        agent = Agent(PidFile('sd-agent').get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if command == 'start':
        log.info('Start daemon')
        agent.start()

    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()

    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()

    elif command == 'status':
        agent.status()

    elif command == 'info':
        return Agent.info(verbose=options.verbose)

    elif command == 'foreground':
        logging.info('Running in foreground')
        if not autorestart:
            # Run in the standard foreground.
            agent.start(foreground=True)
        else:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)

    elif command == 'check':
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" % sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agent_config))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agent_config, hostname)
            want_rates = len(args) == 3 and args[2] == 'check_rate'
            for check in checks['initialized_checks']:
                if check.name != check_name:
                    continue
                if in_developer_mode:
                    check.run = AgentProfiler.wrap_profiling(check.run)

                print(CollectorStatus.render_check_status(
                    Collector.run_single_check(check, verbose=True)))
                if want_rates:
                    # A second run one second later produces the rate metrics.
                    print("Running 2nd iteration to capture rate metrics")
                    time.sleep(1)
                    print(CollectorStatus.render_check_status(
                        Collector.run_single_check(check, verbose=True)))
                check.stop()

    elif command in ('configcheck', 'configtest'):
        configcheck()

    elif command == 'jmx':
        jmx_command(args[1:], agent_config)

    return 0
def run(self, config=None):
    """Main loop of the collector.

    Installs the signal handlers, persists an initial collector status,
    builds the collector and runs collection cycles until
    ``self.run_forever`` becomes false. The function never returns normally:
    it ends with ``sys.exit(0)``.

    :param config: agent configuration; when falsy it is loaded with
        ``get_config(parse_args=True)``.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer run counter below, so it has to be
    # an integer itself; a raw string from the configuration would never compare
    # as expected and the profiler would never be switched off.
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except (TypeError, ValueError):
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(self._checksd['initialized_checks'])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)
        if self.configs_reloaded:
            self.configs_reloaded = False

        # Stop profiling once enough runs have been included in the profile.
        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.info("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up. Removing the status is best effort: a failure here must
    # not prevent the process from exiting.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Command-line entry point: parse the command and dispatch it.

    Returns:
        2 when no command is given, 3 for an unknown command, 1 when the
        ``check`` command is missing its check name, the result of
        ``Agent.info`` for ``info`` and 0 once any other command has been
        handled.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile('dd-agent').get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))
                    if len(args) == 3 and args[2] == 'check_rate':
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

    elif 'jmx' == command:
        if len(args) < 2 or args[1] not in JMX_LIST_COMMANDS:
            print("#" * 80)
            print("JMX tool to be used to help configuring your JMX checks.")
            print("See http://docs.datadoghq.com/integrations/java/ for more information")
            print("#" * 80)
            print("\n")
            print("You have to specify one of the following commands:")
            # Dedicated loop names so the parsed ``command`` is not rebound.
            for jmx_name, jmx_desc in JMX_LIST_COMMANDS.iteritems():
                print(" - %s [OPTIONAL: LIST OF CHECKS]: %s" % (jmx_name, jmx_desc))
            print("Example: sudo /etc/init.d/datadog-agent jmx list_matching_attributes tomcat jmx solr")
            print("\n")

        else:
            jmx_command = args[1]
            checks_list = args[2:]
            confd_directory = get_confd_path(get_os())

            jmx_process = JMXFetch(confd_directory, agentConfig)
            jmx_process.configure()
            should_run = jmx_process.should_run()

            if should_run:
                jmx_process.run(jmx_command, checks_list, reporter="console")
            else:
                print("Couldn't find any valid JMX configuration in your conf.d directory: %s" % confd_directory)
                print("Have you enabled any JMX check ?")
                print("If you think it's not normal please get in touch with Datadog Support")

    elif 'flare' == command:
        Flare.check_user_rights()
        # The optional second argument is the numeric case id.
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print('The upload failed:\n{0}'.format(str(e)))

    # Every handled command reports success explicitly, matching the integer
    # codes returned by the error paths above.
    return 0
def main():
    """Command-line entry point of the agent.

    Returns:
        2 when no command is given, 3 for an unknown command, 0 otherwise.
    """
    options, args = get_parsed_args()
    agent_config = get_config()

    # Logging
    setup_logging(agent_config)

    known_commands = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(known_commands)))
        return 2

    command = args[0]
    if command not in known_commands:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    if command in ['start', 'stop', 'restart', 'foreground']:
        # Only these commands need an initialized Agent.
        if options.clean:
            pid_file.clean()
        agent = Agent(pid_file.get_path())

        if command == 'start':
            logging.info('Start daemon')
            agent.start()
        elif command == 'stop':
            logging.info('Stop daemon')
            agent.stop()
        elif command == 'restart':
            logging.info('Restart daemon')
            agent.restart()
        elif command == 'foreground':
            logging.info('Running in foreground')
            agent.run()

    elif command == 'status':
        # Reported straight from the pid file, no Agent required.
        pid = pid_file.get_pid()
        if pid is None:
            sys.stdout.write('dd-agent is not running.\n')
        else:
            sys.stdout.write('dd-agent is running as pid %s.\n' % pid)

    elif command == 'info':
        CollectorStatus.print_latest_status()

    return 0
def run(self, config=None):
    """Main loop of the collector.

    Installs the signal handlers, persists an initial collector status,
    sets up service discovery and the collector, then runs collection cycles
    until ``self.run_forever`` becomes false. The function never returns
    normally: it ends with ``sys.exit(0)``.

    :param config: agent configuration; when falsy it is loaded with
        ``get_config(parse_args=True)``.
    """
    register = signal.signal
    # SIGTERM and keyboard interrupts share the graceful-exit handler.
    register(signal.SIGTERM, self._handle_sigterm)
    # A SIGUSR1 signals an exit with an autorestart
    register(signal.SIGUSR1, self._handle_sigusr1)
    register(signal.SIGINT, self._handle_sigterm)
    # A SIGHUP signals a configuration reload
    register(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    procfs_path = self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    system_stats = get_system_stats(proc_path=procfs_path)
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if not os.access(pipe_path, os.W_OK):
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')
        else:
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            # RW to avoid blocking (will only W)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, system_stats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    raw_profile_interval = self._agentConfig.get('collector_profile_interval',
                                                 DEFAULT_COLLECTOR_PROFILE_INTERVAL)
    try:
        self.collector_profile_interval = int(raw_profile_interval)
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # The flag is either a plain truthy value (reload everything) or a
        # set of checks to reload.
        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=bool(self.reload_configs_flag))

        self.reload_configs_flag = False

        if self._agentConfig.get('service_discovery') and self.sd_backend:
            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn('Something went wrong while looking for config template changes: %s' % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self.sd_backend.reload_check_configs:
                self.reload_configs_flag = self.sd_backend.reload_check_configs
                self.sd_backend.reload_check_configs = False

        if profiled and collector_profiled_runs >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
                profiled = False
                collector_profiled_runs = 0
            except Exception as e:
                log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if not self.run_forever:
            break
        if watchdog:
            watchdog.reset()
        if profiled:
            collector_profiled_runs += 1
        log.debug("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Now clean-up; failing to remove the status must not block the exit.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Installs the signal handlers, persists an initial collector status,
    builds the collector and runs collection cycles until
    ``self.run_forever`` becomes false. The function never returns normally:
    it ends with ``sys.exit(0)``.

    :param config: agent configuration; when falsy it is loaded with
        ``get_config(parse_args=True)``.
    """
    register = signal.signal
    # SIGTERM and keyboard interrupts share the graceful-exit handler.
    register(signal.SIGTERM, self._handle_sigterm)
    # A SIGUSR1 signals an exit with an autorestart
    register(signal.SIGUSR1, self._handle_sigusr1)
    register(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agent_config = self._set_agent_config_hostname(config)
    system_stats = get_system_stats()
    emitters = self._get_emitters(agent_config)

    # Load the checks.d checks
    checks_d = load_check_directory(agent_config)

    self.collector = Collector(agent_config, emitters, system_stats)

    # Configure the watchdog.
    sleep_seconds = int(agent_config['check_freq'])
    watchdog = self._get_watchdog(sleep_seconds, agent_config)

    # Initialize the auto-restarter
    self.restart_interval = int(agent_config.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checks_d, start_event=self.start_event)

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if not self.run_forever:
            break
        if watchdog:
            watchdog.reset()
        time.sleep(sleep_seconds)

    # Now clean-up; failing to remove the status must not block the exit.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Command-line entry point of the agent.

    Returns:
        2 when no command is given, 3 for an unknown command, the result of
        ``CollectorStatus.print_latest_status()`` for ``info`` and 0
        otherwise.
    """
    options, args = get_parsed_args()
    agent_config = get_config()

    # Logging
    setup_logging(agent_config)

    known_commands = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(known_commands)))
        return 2

    command = args[0]
    if command not in known_commands:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    if command in ['start', 'stop', 'restart', 'foreground']:
        # Only these commands need an initialized Agent.
        if options.clean:
            pid_file.clean()
        agent = Agent(pid_file.get_path())

        if command == 'start':
            logging.info('Start daemon')
            agent.start()
        elif command == 'stop':
            logging.info('Stop daemon')
            agent.stop()
        elif command == 'restart':
            logging.info('Restart daemon')
            agent.restart()
        elif command == 'foreground':
            logging.info('Running in foreground')
            agent.run()

    elif command == 'status':
        # Reported straight from the pid file, no Agent required.
        pid = pid_file.get_pid()
        if pid is None:
            sys.stdout.write('dd-agent is not running.\n')
        else:
            sys.stdout.write('dd-agent is running as pid %s.\n' % pid)

    elif command == 'info':
        return CollectorStatus.print_latest_status()

    return 0
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """
    Collect data from each check and submit their data.

    :param checksd: optional dict with the keys ``'initialized_checks'``
        (a list of AgentCheck instances) and ``'init_failed_checks'``
        (``{check_name: {error, traceback}}``). When falsy, the checks stored
        on the collector by a previous call are reused.
    :param start_event: forwarded to ``_populate_payload_metadata``.
    :param configs_reloaded: when true, the AgentMetrics check is looked up
        again in the list of initialized checks.
    :return: the emitted payload, or ``None`` when ``self.continue_running``
        turns false while the checks.d checks are being processed.
    """
    if checksd:
        # Only inspect `checksd` when it was actually provided: the default
        # value is None and cannot be subscripted.
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks'])))

    timer = Timer()
    if not Platform.is_windows():
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']  # is a list of AgentCheck instances
        self.init_failed_checks_d = checksd['init_failed_checks']  # is of type {check_name: {error, traceback}}

    payload = AgentPayload()

    # Find the AgentMetrics check and pop it out
    # This check must run at the end of the loop to collect info on agent performance
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                # Leave immediately: the list was just mutated.
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks will depend on the OS
    if Platform.is_windows():
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)

        if memory:
            memstats = {
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsable': memory.get('physPctUsable'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared')
            }
            payload.update(memstats)

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    dogstreamData = self._dogstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # dogstream
    if dogstreamData:
        dogstreamEvents = dogstreamData.get('dogstreamEvents', None)
        if dogstreamEvents:
            if 'dogstream' in payload['events']:
                events['dogstream'].extend(dogstreamEvents)
            else:
                events['dogstream'] = dogstreamEvents
            # The events now live under payload['events'], so they are
            # removed before the remaining data is merged into the payload.
            del dogstreamData['dogstreamEvents']

        payload.update(dogstreamData)

    # metrics about the forwarder
    if ddforwarderData:
        payload['datadog'] = ddforwarderData

    # Resources checks
    if not Platform.is_windows():
        has_resource = False
        for resources_check in self._resources_checks:
            resources_check.check()
            snaps = resources_check.pop_snapshots()
            if snaps:
                has_resource = True
                res_value = {
                    'snaps': snaps,
                    'format_version': resources_check.get_format_version()
                }
                res_format = resources_check.describe_format_if_needed()
                if res_format is not None:
                    res_value['format_description'] = res_format
                payload['resources'][resources_check.RESOURCE_KEY] = res_value

        if has_resource:
            payload['resources']['meta'] = {
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
            }

    # newer-style checks (not checks.d style)
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None
        # Optional CheckStatus fields that only exist once the check ran
        # successfully. Rebuilt for every check so that a failing check can
        # neither hit an unbound name nor inherit the previous check's
        # service metadata.
        collected_status_fields = {}

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            collected_status_fields['service_metadata'] = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)

        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats,
            **collected_status_fields)

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datadog.agent.check_status', status, tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
            service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'datadog.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    # Report the checks that could not even be initialized.
    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['traceback'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(
        create_service_check('datadog.agent.up', AgentCheck.OK, hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats)))

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # The first runs, and then one run per logging period, are logged at
    # info level; every other run is logged at debug level.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info(
                "First flushes done, next flushes will be logged every %s flushes." %
                FLUSH_LOGGING_PERIOD)
    else:
        log.debug(
            "Finished run #%s. Collection time: %ss. Emit time: %ss" %
            (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, config=None):
    """Main loop of the collector.

    Installs the signal handlers, persists an initial collector status,
    builds the collector and runs collection cycles until
    ``self.run_forever`` becomes false. When the ``profile`` configuration
    entry equals ``yes`` (case-insensitive), each cycle is wrapped in a
    ``cProfile`` session whose statistics are written to the debug log.
    The function never returns normally: it ends with ``sys.exit(0)``.

    :param config: agent configuration; when falsy it is loaded with
        ``get_config(parse_args=True)``.
    """
    register = signal.signal
    # SIGTERM and keyboard interrupts share the graceful-exit handler.
    register(signal.SIGTERM, self._handle_sigterm)
    # A SIGUSR1 signals an exit with an autorestart
    register(signal.SIGUSR1, self._handle_sigusr1)
    register(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agent_config = self._set_agent_config_hostname(config)
    hostname = get_hostname(agent_config)
    system_stats = get_system_stats()
    emitters = self._get_emitters(agent_config)

    # Load the checks.d checks
    checks_d = load_check_directory(agent_config, hostname)

    self.collector = Collector(agent_config, emitters, system_stats, hostname)

    # Configure the watchdog.
    sleep_seconds = int(agent_config['check_freq'])
    watchdog = self._get_watchdog(sleep_seconds, agent_config)

    # Initialize the auto-restarter
    self.restart_interval = int(agent_config.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    def profiling_requested():
        # Evaluated on every use, so the configuration is re-read each time.
        return agent_config.get('profile', False) and \
            agent_config.get('profile').lower() == 'yes'

    # Run the main loop.
    while self.run_forever:
        # enable profiler if needed
        profiled = False
        if profiling_requested():
            try:
                import cProfile
                profiler = cProfile.Profile()
                profiled = True
                profiler.enable()
                log.debug("Agent profiling is enabled")
            except Exception:
                log.warn("Cannot enable profiler")

        # Do the work.
        self.collector.run(checksd=checks_d, start_event=self.start_event)

        # disable profiler and printout stats to stdout
        if profiling_requested() and profiled:
            try:
                profiler.disable()
                import pstats
                from cStringIO import StringIO
                stats_buffer = StringIO()
                stats = pstats.Stats(profiler, stream=stats_buffer).sort_stats("cumulative")
                stats.print_stats()
                log.debug(stats_buffer.getvalue())
            except Exception:
                log.warn("Cannot disable profiler")

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if not self.run_forever:
            break
        if watchdog:
            watchdog.reset()
        time.sleep(sleep_seconds)

    # Now clean-up; failing to remove the status must not block the exit.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Installs the signal handlers, persists an initial collector status,
    builds the collector and runs collection cycles until
    ``self.run_forever`` becomes false. The function never returns normally:
    it ends with ``sys.exit(0)``.

    :param config: agent configuration; when falsy it is loaded with
        ``get_config(parse_args=True)``.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig, hostname)

    self.collector = Collector(agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    # The value is compared against an integer run counter below, so it has to be
    # an integer itself; a raw string from the configuration would never compare
    # as expected and the profiler would never be switched off.
    try:
        collector_profile_interval = int(
            agentConfig.get('collector_profile_interval',
                            DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except (TypeError, ValueError):
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        # Stop profiling once enough runs have been included in the profile.
        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            time.sleep(check_frequency)

    # Now clean-up. Removing the status is best effort: a failure here must
    # not prevent the process from exiting.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Run the collector's main loop until `self.run_forever` turns false.

    Installs the signal handlers (including SIGHUP for configuration
    reloads), persists an initial CollectorStatus, sets up service
    discovery when configured, builds the Collector, then alternates between
    one collection run and a sleep of `check_freq` seconds. When the loop
    ends the latest status is removed and the process exits through
    `sys.exit(0)`.

    :param config: agent configuration; when falsy it is obtained from
        `get_config(parse_args=True)`.
    """
    install = signal.signal
    # SIGTERM and SIGINT (keyboard interrupt) share the graceful-exit handler.
    install(signal.SIGTERM, self._handle_sigterm)
    # SIGUSR1 signals an exit with an autorestart.
    install(signal.SIGUSR1, self._handle_sigusr1)
    install(signal.SIGINT, self._handle_sigterm)
    # SIGHUP signals a configuration reload.
    install(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    host = get_hostname(self._agentConfig)
    procfs_root = self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    system_stats = get_system_stats(proc_path=procfs_root)
    emitter_list = self._get_emitters()

    # Initialize service discovery.
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks, then build the Collector.
    self._checksd = load_check_directory(self._agentConfig, host)
    self.collector = Collector(self._agentConfig, emitter_list, system_stats, host)

    # In developer mode: number of runs covered by one collector profile.
    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter.
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    def sd_in_use():
        # Re-read on every call: the configuration attribute may be
        # replaced while the loop is running.
        return self._agentConfig.get('service_discovery') and self.sd_backend

    profiling_on = False
    profiled_run_count = 0

    while self.run_forever:
        initialized = self._checksd['initialized_checks']
        log.debug("Found {num_checks} checks".format(num_checks=len(initialized)))

        # Start a profile when developer mode is on and none is active.
        if self.in_developer_mode and not profiling_on:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
            except Exception as exc:
                log.warn("Cannot enable profiler: %s" % str(exc))
            else:
                profiling_on = True

        # Do the work.
        self.collector.run(
            checksd=self._checksd,
            start_event=self.start_event,
            configs_reloaded=self.configs_reloaded,
        )

        # The flag tells the collector whether the check configs were
        # reloaded before the run that just finished; it is passed as
        # `configs_reloaded` above and cleared here.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for a change in the config template store; the backend's
        # reload_check_configs flag becomes true when a reload is needed.
        if sd_in_use() and not self.sd_backend.reload_check_configs:
            try:
                store = get_config_store(self._agentConfig)
                self.sd_backend.reload_check_configs = store.crawl_config_template()
            except Exception as exc:
                log.warn(
                    'Something went wrong while looking for config template changes: %s'
                    % str(exc))

        # Run service discovery when requested. The flag can be set by the
        # docker_daemon check or by ConfigStore.crawl_config_template.
        if sd_in_use() and self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        # Close the profile once it has covered enough runs.
        if profiling_on and profiled_run_count >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
            except Exception as exc:
                log.warn("Cannot disable profiler: %s" % str(exc))
            else:
                profiling_on = False
                profiled_run_count = 0

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue; otherwise let
        # the loop condition end the loop right away.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        if profiling_on:
            profiled_run_count += 1
        log.debug("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Clean-up is best effort: a failure to remove the status is ignored.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def info(cls, verbose=None):
    """Print the latest collector status and return the printer's result.

    The root logger level is raised to ERROR before printing.

    :param verbose: forwarded to `CollectorStatus.print_latest_status`.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.ERROR)
    outcome = CollectorStatus.print_latest_status(verbose=verbose)
    return outcome
def main():
    """Parse the command line and dispatch the requested agent command.

    :return: an integer exit status. 2 when no command is given, 3 for an
        unknown command, 1 when `check` is called without a check name, the
        result of `Agent.info` for `info`, and 0 once any other command has
        been dispatched.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    # Commands that need an Agent instance.
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    # Commands that run without an Agent instance.
    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print(getattr(checks.collector, check_name)(log).check(agentConfig))
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print(CollectorStatus.render_check_status(cs))

                    if len(args) == 3 and args[2] == 'check_rate':
                        print("Running 2nd iteration to capture rate metrics")
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print(CollectorStatus.render_check_status(cs))

                    check.stop()

    # NOTE: 'configtest' is not listed in COMMANDS, so it is rejected by the
    # unknown-command check above and never reaches this branch.
    elif 'configcheck' == command or 'configtest' == command:
        configcheck()
        if agentConfig.get('service_discovery', False):
            # Set the TRACE_CONFIG flag to True to make load_check_directory
            # return the source of config objects, then pass the result to
            # sd_configcheck here to avoid circular imports.
            agentConfig[TRACE_CONFIG] = True
            print("\nLoading check configurations...\n\n")
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print('The upload failed:\n{0}'.format(str(e)))

    # Explicit success status, matching the other exit paths that return ints.
    return 0
def main():
    """Parse the command line and dispatch the requested agent command.

    :return: an integer exit status. 2 when no command is given, 3 for an
        unknown command, 1 when a daemon command is refused outside
        developer mode or `check` is called without a check name, the result
        of `Agent.info` for `info`, and 0 otherwise.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    # Commands that need an Agent instance / commands that do not.
    COMMANDS_AGENT = ('start', 'stop', 'restart', 'status', 'foreground')
    COMMANDS_NO_AGENT = ('info', 'check', 'configcheck', 'jmx', 'flare')
    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) == 0:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # TODO: actually kill the start/stop/restart/status command for 5.11
    daemon_commands = ('start', 'stop', 'restart', 'status')
    if command in daemon_commands and not in_developer_mode:
        logging.error('Please use supervisor to manage the agent')
        return 1

    if command in COMMANDS_AGENT:
        pid_path = PidFile(PID_NAME, PID_DIR).get_path()
        agent = Agent(pid_path, autorestart, in_developer_mode=in_developer_mode)

    if command == 'start':
        log.info('Start daemon')
        agent.start()

    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()

    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()

    elif command == 'status':
        agent.status()

    elif command == 'info':
        return Agent.info(verbose=options.verbose)

    elif command == 'foreground':
        log.info('Agent version %s' % get_version())
        if not autorestart:
            # Run in the standard foreground.
            agent.start(foreground=True)
        else:
            # Set up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def run_child():
                agent.start(foreground=True)

            def run_parent():
                agent.start_event = False

            AgentSupervisor.start(run_parent, run_child)

    elif command == 'check':
        if len(args) < 2:
            usage = (
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
            )
            sys.stderr.write(usage % sys.argv[0])
            return 1

        wanted = args[1]
        try:
            import checks.collector
            # Try the old-style check first.
            legacy_check = getattr(checks.collector, wanted)(log)
            print(legacy_check.check(agentConfig))
        except Exception:
            # Not an old-style check: look it up among the checks.d checks.
            loaded = load_check_directory(agentConfig, hostname)
            for candidate in loaded['initialized_checks']:
                if candidate.name != wanted:
                    continue
                if in_developer_mode:
                    candidate.run = AgentProfiler.wrap_profiling(candidate.run)

                result = Collector.run_single_check(candidate, verbose=True)
                print(CollectorStatus.render_check_status(result))

                if len(args) == 3 and args[2] == 'check_rate':
                    print("Running 2nd iteration to capture rate metrics")
                    time.sleep(1)
                    result = Collector.run_single_check(candidate, verbose=True)
                    print(CollectorStatus.render_check_status(result))

                candidate.stop()

    elif command in ('configcheck', 'configtest'):
        configcheck()
        sd_configcheck(agentConfig)

    elif command == 'jmx':
        jmx_command(args[1:], agentConfig)

    elif command == 'flare':
        Flare.check_user_rights()
        if len(args) > 1:
            case_id = int(args[1])
        else:
            case_id = None
        flare = Flare(True, case_id)
        flare.collect()
        try:
            flare.upload()
        except Exception as exc:
            print('The upload failed:\n{0}'.format(str(exc)))

    return 0
def run(self, checksd=None, start_event=True, configs_reloaded=False):
    """Run one collection cycle, emit the payload and persist the status.

    :param checksd: optional dict with the keys 'initialized_checks' and
        'init_failed_checks'; when falsy, the check collections already
        stored on the instance are used.
    :param start_event: forwarded to `_populate_payload_metadata`.
    :param configs_reloaded: when true, the initialized checks are searched
        again for the agent-metrics check.
    :return: the emitted payload, or None when `continue_running` became
        false while iterating over the checks.
    """
    # `checksd` defaults to None, so only index it when it was provided.
    if checksd:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(checksd['initialized_checks'])))

    timer = Timer()
    if not Platform.is_windows():
        cpu_clock = time.clock()
    self.run_count += 1
    log.debug("Starting collection run #%s" % self.run_count)

    if checksd:
        self.initialized_checks_d = checksd['initialized_checks']
        self.init_failed_checks_d = checksd['init_failed_checks']

    payload = AgentPayload()

    # Pull the agent-metrics check out of the regular checks: it is run
    # separately, after collection, further below.
    if not self._agent_metrics or configs_reloaded:
        for check in self.initialized_checks_d:
            if check.name == AGENT_METRICS_CHECK_NAME:
                self._agent_metrics = check
                self.initialized_checks_d.remove(check)
                break

    # Initialize payload
    self._build_payload(payload)

    metrics = payload['metrics']
    events = payload['events']
    service_checks = payload['service_checks']

    # Run the system checks. Checks should never run if in service discovery mode
    if Platform.is_windows():
        # Win32 system checks
        try:
            metrics.extend(self._win32_system_checks['memory'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['cpu'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['network'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['io'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['proc'].check(self.agentConfig))
            metrics.extend(self._win32_system_checks['system'].check(self.agentConfig))
        except Exception:
            log.exception('Unable to fetch Windows system metrics.')
    else:
        # Unix system checks
        sys_checks = self._unix_system_checks

        load = sys_checks['load'].check(self.agentConfig)
        payload.update(load)

        system = sys_checks['system'].check(self.agentConfig)
        payload.update(system)

        memory = sys_checks['memory'].check(self.agentConfig)
        if memory:
            memstats = {
                'memPhysUsed': memory.get('physUsed'),
                'memPhysPctUsage': memory.get('physPctUsage'),
                'memPhysFree': memory.get('physFree'),
                'memPhysTotal': memory.get('physTotal'),
                'memPhysUsable': memory.get('physUsable'),
                'memSwapUsed': memory.get('swapUsed'),
                'memSwapFree': memory.get('swapFree'),
                'memSwapPctFree': memory.get('swapPctFree'),
                'memSwapTotal': memory.get('swapTotal'),
                'memCached': memory.get('physCached'),
                'memBuffers': memory.get('physBuffers'),
                'memShared': memory.get('physShared'),
                'memSlab': memory.get('physSlab'),
                'memPageTables': memory.get('physPageTables'),
                'memSwapCached': memory.get('swapCached')
            }
            payload.update(memstats)

        ioStats = sys_checks['io'].check(self.agentConfig)
        if ioStats:
            payload['ioStats'] = ioStats

        processes = sys_checks['processes'].check(self.agentConfig)
        payload.update({'processes': processes})

        cpuStats = sys_checks['cpu'].check(self.agentConfig)
        if cpuStats:
            payload.update(cpuStats)

    # Run old-style checks
    gangliaData = self._ganglia.check(self.agentConfig)
    monitorstreamData = self._monitorstream.check(self.agentConfig)
    ddforwarderData = self._ddforwarder.check(self.agentConfig)

    if gangliaData is not False and gangliaData is not None:
        payload['ganglia'] = gangliaData

    # monitorstream
    if monitorstreamData:
        monitorstreamEvents = monitorstreamData.get('monitorstreamEvents', None)
        if monitorstreamEvents:
            if 'monitorstream' in payload['events']:
                events['monitorstream'].extend(monitorstreamEvents)
            else:
                events['monitorstream'] = monitorstreamEvents
            # The events now live under `events`; drop them so the update
            # below does not copy them into the payload a second time.
            del monitorstreamData['monitorstreamEvents']

        payload.update(monitorstreamData)

    if ddforwarderData:
        payload['datamonitor'] = ddforwarderData

    # Process the event checks.
    for metrics_check in self._metrics_checks:
        res = metrics_check.check(self.agentConfig)
        if res:
            metrics.extend(res)

    # checks.d checks
    check_statuses = []
    for check in self.initialized_checks_d:
        if not self.continue_running:
            return
        log.info("Running check %s" % check.name)
        instance_statuses = []
        metric_count = 0
        event_count = 0
        service_check_count = 0
        check_start_time = time.time()
        check_stats = None
        # Reset for every check so a failing check neither raises NameError
        # nor reports the previous check's metadata.
        # assumes CheckStatus accepts an empty list here -- TODO confirm
        current_check_metadata = []

        try:
            # Run the check.
            instance_statuses = check.run()

            # Collect the metrics and events.
            current_check_metrics = check.get_metrics()
            current_check_events = check.get_events()
            check_stats = check._get_internal_profiling_stats()

            # Collect metadata
            current_check_metadata = check.get_service_metadata()

            # Save metrics & events for the payload.
            metrics.extend(current_check_metrics)
            if current_check_events:
                if check.name not in events:
                    events[check.name] = current_check_events
                else:
                    events[check.name] += current_check_events

            # Save the status of the check.
            metric_count = len(current_check_metrics)
            event_count = len(current_check_events)
        except Exception:
            log.exception("Error running check %s" % check.name)

        check_status = CheckStatus(
            check.name, instance_statuses, metric_count,
            event_count, service_check_count,
            service_metadata=current_check_metadata,
            library_versions=check.get_library_info(),
            source_type_name=check.SOURCE_TYPE_NAME or check.name,
            check_stats=check_stats
        )

        # Service check for Agent checks failures
        service_check_tags = ["check:%s" % check.name]
        if check_status.status == STATUS_OK:
            status = AgentCheck.OK
        elif check_status.status == STATUS_ERROR:
            status = AgentCheck.CRITICAL
        check.service_check('datamonitor.agent.check_status', status,
                            tags=service_check_tags)

        # Collect the service checks and save them in the payload
        current_check_service_checks = check.get_service_checks()
        if current_check_service_checks:
            service_checks.extend(current_check_service_checks)
        service_check_count = len(current_check_service_checks)

        # Update the check status with the correct service_check_count
        check_status.service_check_count = service_check_count
        check_statuses.append(check_status)

        check_run_time = time.time() - check_start_time
        log.debug("Check %s ran in %.2f s" % (check.name, check_run_time))

        # Intrument check run timings if enabled.
        if self.check_timings:
            metric = 'datamonitor.agent.check_run_time'
            meta = {'tags': ["check:%s" % check.name]}
            metrics.append((metric, time.time(), check_run_time, meta))

    for check_name, info in self.init_failed_checks_d.iteritems():
        if not self.continue_running:
            return
        # NOTE(review): the traceback argument is filled from info['error'];
        # this looks like it was meant to read a separate traceback entry --
        # confirm against the producer of init_failed_checks.
        check_status = CheckStatus(check_name, None, None, None, None,
                                   init_failed_error=info['error'],
                                   init_failed_traceback=info['error'])
        check_statuses.append(check_status)

    # Add a service check for the agent
    service_checks.append(create_service_check('datamonitor.agent.up', AgentCheck.OK,
                                               hostname=self.hostname))

    # Store the metrics and events in the payload.
    payload['metrics'] = metrics
    payload['events'] = events
    payload['service_checks'] = service_checks

    # Populate metadata
    self._populate_payload_metadata(payload, check_statuses, start_event)

    collect_duration = timer.step()

    if self._agent_metrics:
        metric_context = {
            'collection_time': collect_duration,
            'emit_time': self.emit_duration,
        }
        if not Platform.is_windows():
            metric_context['cpu_time'] = time.clock() - cpu_clock

        self._agent_metrics.set_metric_context(payload, metric_context)
        self._agent_metrics.run()
        agent_stats = self._agent_metrics.get_metrics()
        payload['metrics'].extend(agent_stats)
        if self.agentConfig.get('developer_mode'):
            log.debug("\n Agent developer mode stats: \n {0}".format(
                Collector._stats_for_display(agent_stats))
            )
        # The return value is discarded on purpose.
        # presumably this call drains the check's pending metadata -- verify
        self._agent_metrics.get_service_metadata()

    # Let's send our payload
    emitter_statuses = payload.emit(log, self.agentConfig, self.emitters,
                                    self.continue_running)
    self.emit_duration = timer.step()

    # Persist the status of the collection run.
    try:
        CollectorStatus(check_statuses, emitter_statuses,
                        self.hostname_metadata_cache).persist()
    except Exception:
        log.exception("Error persisting collector status")

    # The first runs and every FLUSH_LOGGING_PERIOD-th run are logged at
    # info level; every other run is logged at debug level.
    if self.run_count <= FLUSH_LOGGING_INITIAL or self.run_count % FLUSH_LOGGING_PERIOD == 0:
        log.info("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                 (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))
        if self.run_count == FLUSH_LOGGING_INITIAL:
            log.info("First flushes done, next flushes will be logged every %s flushes." %
                     FLUSH_LOGGING_PERIOD)
    else:
        log.debug("Finished run #%s. Collection time: %ss. Emit time: %ss" %
                  (self.run_count, round(collect_duration, 2), round(self.emit_duration, 2)))

    return payload
def run(self, config=None):
    """Run the collector's main loop until `self.run_forever` turns false.

    Installs the signal handlers, persists an initial CollectorStatus,
    builds the Collector, then alternates between one collection run and a
    sleep of `check_freq` seconds. When the `profile` configuration value is
    'yes' (case-insensitive), each collection run is wrapped in a cProfile
    session whose statistics are written to the debug log. When the loop
    ends the latest status is removed and the process exits through
    `sys.exit(0)`.

    :param config: agent configuration; when falsy it is obtained from
        `get_config(parse_args=True)`.
    """
    install = signal.signal
    # SIGTERM and SIGINT (keyboard interrupt) share the graceful-exit handler.
    install(signal.SIGTERM, self._handle_sigterm)
    # SIGUSR1 signals an exit with an autorestart.
    install(signal.SIGUSR1, self._handle_sigusr1)
    install(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)
    agent_config = self._set_agent_config_hostname(config)
    host = get_hostname(agent_config)
    system_stats = get_system_stats()
    emitter_list = self._get_emitters(agent_config)

    # The checks.d checks are loaded before the Collector is built.
    loaded_checks = load_check_directory(agent_config, host)
    self.collector = Collector(agent_config, emitter_list, system_stats, host)

    # Configure the watchdog.
    sleep_seconds = int(agent_config['check_freq'])
    watchdog = self._get_watchdog(sleep_seconds, agent_config)

    # Initialize the auto-restarter.
    self.restart_interval = int(agent_config.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    def profiling_requested():
        # Evaluated before and after every run, exactly where it is called.
        return agent_config.get('profile', False) and agent_config.get('profile').lower() == 'yes'

    while self.run_forever:
        # Enable the profiler if needed.
        profiling_on = False
        if profiling_requested():
            try:
                import cProfile
                profiler = cProfile.Profile()
                # The flag is raised before enable() is attempted.
                profiling_on = True
                profiler.enable()
                log.debug("Agent profiling is enabled")
            except Exception:
                log.warn("Cannot enable profiler")

        # Do the work.
        self.collector.run(checksd=loaded_checks, start_event=self.start_event)

        # Disable the profiler and dump its stats to the debug log.
        if profiling_requested() and profiling_on:
            try:
                profiler.disable()
                import pstats
                from cStringIO import StringIO
                buf = StringIO()
                stats = pstats.Stats(profiler, stream=buf).sort_stats("cumulative")
                stats.print_stats()
                log.debug(buf.getvalue())
            except Exception:
                log.warn("Cannot disable profiler")

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue; otherwise let
        # the loop condition end the loop right away.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        time.sleep(sleep_seconds)

    # Clean-up is best effort: a failure to remove the status is ignored.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Run the collector's main loop until `self.run_forever` turns false.

    Installs the signal handlers (including SIGHUP for configuration
    reloads), persists an initial CollectorStatus, sets up service
    discovery when configured, builds the Collector, then alternates between
    one collection run and a sleep of `check_freq` seconds. When the loop
    ends the latest status is removed and the process exits through
    `sys.exit(0)`.

    :param config: agent configuration; when falsy it is obtained from
        `get_config(parse_args=True)`.
    """
    register = signal.signal
    # SIGTERM and SIGINT (keyboard interrupt) share the graceful-exit handler.
    register(signal.SIGTERM, self._handle_sigterm)
    # SIGUSR1 signals an exit with an autorestart.
    register(signal.SIGUSR1, self._handle_sigusr1)
    register(signal.SIGINT, self._handle_sigterm)
    # SIGHUP signals a configuration reload.
    register(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    host = get_hostname(self._agentConfig)
    system_stats = get_system_stats()
    emitter_list = self._get_emitters()

    # Initialize service discovery.
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks, then build the Collector.
    self._checksd = load_check_directory(self._agentConfig, host)
    self.collector = Collector(self._agentConfig, emitter_list, system_stats, host)

    # In developer mode: number of runs covered by one collector profile.
    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter.
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    def discovery_active():
        # Re-read on every call: the configuration attribute may be
        # replaced while the loop is running.
        return self._agentConfig.get('service_discovery') and self.sd_backend

    profiler_running = False
    runs_in_profile = 0

    while self.run_forever:
        check_total = len(self._checksd['initialized_checks'])
        log.debug("Found {num_checks} checks".format(num_checks=check_total))

        # Start a profile when developer mode is on and none is active.
        if self.in_developer_mode and not profiler_running:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
            except Exception as err:
                log.warn("Cannot enable profiler: %s" % str(err))
            else:
                profiler_running = True

        # Do the work.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # The flag tells the collector whether the check configs were
        # reloaded before the run that just finished; it is passed as
        # `configs_reloaded` above and cleared here.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for a change in the config template store; the backend's
        # reload_check_configs flag becomes true when a reload is needed.
        if discovery_active() and not self.sd_backend.reload_check_configs:
            try:
                template_store = get_config_store(self._agentConfig)
                self.sd_backend.reload_check_configs = template_store.crawl_config_template()
            except Exception as err:
                log.warn('Something went wrong while looking for config template changes: %s' % str(err))

        # Run service discovery when requested. The flag can be set by the
        # docker_daemon check or by ConfigStore.crawl_config_template.
        if discovery_active() and self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        # Close the profile once it has covered enough runs.
        if profiler_running and runs_in_profile >= self.collector_profile_interval:
            try:
                profiler.disable_profiling()
            except Exception as err:
                log.warn("Cannot disable profiler: %s" % str(err))
            else:
                profiler_running = False
                runs_in_profile = 0

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue; otherwise let
        # the loop condition end the loop right away.
        if not self.run_forever:
            continue
        if watchdog:
            watchdog.reset()
        if profiler_running:
            runs_in_profile += 1
        log.debug("Sleeping for {0} seconds".format(self.check_frequency))
        time.sleep(self.check_frequency)

    # Clean-up is best effort: a failure to remove the status is ignored.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def main():
    """Parse the command line and dispatch the requested agent command.

    :return: an integer exit status. 2 when no command is given, 3 for an
        unknown command, 1 when `check` is called without a check name, the
        result of `Agent.info` for `info`, and 0 otherwise. A command that
        is accepted but has no branch below (such as 'flare') also ends
        with 0.
    """
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    # Commands that need an Agent instance / commands that do not.
    COMMANDS_AGENT = ('start', 'stop', 'restart', 'status', 'foreground')
    COMMANDS_NO_AGENT = ('info', 'check', 'configcheck', 'jmx', 'flare')
    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) == 0:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        pid_path = PidFile('sd-agent').get_path()
        agent = Agent(pid_path, autorestart, in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if command == 'start':
        log.info('Start daemon')
        agent.start()

    elif command == 'stop':
        log.info('Stop daemon')
        agent.stop()

    elif command == 'restart':
        log.info('Restart daemon')
        agent.restart()

    elif command == 'status':
        agent.status()

    elif command == 'info':
        return Agent.info(verbose=options.verbose)

    elif command == 'foreground':
        logging.info('Running in foreground')
        if not autorestart:
            # Run in the standard foreground.
            agent.start(foreground=True)
        else:
            # Set up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def run_child():
                agent.start(foreground=True)

            def run_parent():
                agent.start_event = False

            AgentSupervisor.start(run_parent, run_child)

    elif command == 'check':
        if len(args) < 2:
            usage = (
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
            )
            sys.stderr.write(usage % sys.argv[0])
            return 1

        wanted = args[1]
        try:
            import checks.collector
            # Try the old-style check first.
            legacy_check = getattr(checks.collector, wanted)(log)
            print(legacy_check.check(agentConfig))
        except Exception:
            # Not an old-style check: look it up among the checks.d checks.
            loaded = load_check_directory(agentConfig, hostname)
            for candidate in loaded['initialized_checks']:
                if candidate.name != wanted:
                    continue
                if in_developer_mode:
                    candidate.run = AgentProfiler.wrap_profiling(candidate.run)

                result = Collector.run_single_check(candidate, verbose=True)
                print(CollectorStatus.render_check_status(result))

                if len(args) == 3 and args[2] == 'check_rate':
                    print("Running 2nd iteration to capture rate metrics")
                    time.sleep(1)
                    result = Collector.run_single_check(candidate, verbose=True)
                    print(CollectorStatus.render_check_status(result))

                candidate.stop()

    elif command in ('configcheck', 'configtest'):
        configcheck()

    elif command == 'jmx':
        jmx_command(args[1:], agentConfig)

    return 0