def testLoad(self):
    """Check that the Load check reports a 1-minute load average, and a
    normalized load only when the number of CPU cores is available."""
    global logger
    load = Load(logger)
    res = load.check({'system_stats': get_system_stats()})
    assert 'system.load.1' in res
    # FIX: 'cpuCores' is only reported by get_system_stats() on Linux, so
    # int(...) would raise elsewhere. Guard the normalized-load assertions
    # with a platform check, matching the other testLoad variants.
    if Platform.is_linux():
        cores = int(get_system_stats().get('cpuCores'))
        assert 'system.load.norm.1' in res
        # Raw load should be (approximately) cores * normalized load.
        assert abs(res['system.load.1'] - cores * res['system.load.norm.1']) <= 0.1, \
            (res['system.load.1'], cores * res['system.load.norm.1'])

    # same test but without cpu count, no normalized load sent.
    res = load.check({})
    assert 'system.load.1' in res
    assert 'system.load.norm.1' not in res
def testLoad(self):
    """Exercise the Load check with and without cpu-core information."""
    global logger
    check = Load(logger)

    result = check.check({'system_stats': get_system_stats()})
    assert 'system.load.1' in result

    if Platform.is_linux():
        # Core count is known on Linux, so a normalized load must be
        # present and consistent with the raw 1-minute load.
        n_cores = int(get_system_stats().get('cpuCores'))
        assert 'system.load.norm.1' in result
        delta = abs(result['system.load.1'] - n_cores * result['system.load.norm.1'])
        assert delta <= 0.1, (result['system.load.1'],
                              n_cores * result['system.load.norm.1'])

    # Without cpu count, no normalized load should be sent.
    result = check.check({})
    assert 'system.load.1' in result
    assert 'system.load.norm.1' not in result
def testLoad(self):
    """Verify the Load check output, both with and without system stats."""
    global logger
    load_check = Load(logger)
    stats = get_system_stats()

    first = load_check.check({"system_stats": stats})
    assert "system.load.1" in first

    if Platform.is_linux():
        core_count = int(get_system_stats().get("cpuCores"))
        normalized = first.get("system.load.norm.1")
        # The normalized load must exist and satisfy load ~= cores * norm.
        assert "system.load.norm.1" in first
        assert abs(first["system.load.1"] - core_count * normalized) <= 0.1, (
            first["system.load.1"],
            core_count * normalized,
        )

    # same test but without cpu count, no normalized load sent.
    second = load_check.check({})
    assert "system.load.1" in second
    assert "system.load.norm.1" not in second
def _build_payload(self, start_event=True):
    """
    Return a dictionary that contains all of the generic payload data.

    :param start_event: when True and this is the first run, include the
        startup system stats and post an "Agent Startup" event.
    :return: the payload dict (events, metrics, service_checks, host tags,
        and periodically host metadata), ready for the emitters.
    """
    now = time.time()
    payload = {
        'collection_timestamp': now,
        'os' : self.os,
        'python': sys.version,
        'agentVersion' : self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'service_checks': [],
        'resources': {},
        'internalHostname' : get_hostname(self.agentConfig),
        'uuid' : get_uuid(),
        'host-tags': {},
    }

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{'api_key': self.agentConfig['api_key'],
                                        'host': payload['internalHostname'],
                                        'timestamp': now,
                                        'event_type':'Agent Startup',
                                        'msg_text': 'Version %s' % get_version()
                                        }]

    # Periodically send the host metadata.
    # NOTE(review): _is_first_run() is consulted several times in this
    # method; presumably it is a pure flag check — confirm it has no
    # side effects.
    if self._is_first_run() or self._should_send_metadata():
        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_metadata()
        # Cache the metadata so later runs/logging can reuse it.
        self.metadata_cache = payload['meta']

    # Add static tags from the configuration file
    host_tags = []
    if self.agentConfig['tags'] is not None:
        host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

    if self.agentConfig['collect_ec2_tags']:
        host_tags.extend(EC2.get_tags())

    # Only attach the 'system' source when there is at least one tag.
    if host_tags:
        payload['host-tags']['system'] = host_tags

    # GCE tags are attached under their own source-type key when present.
    GCE_tags = GCE.get_tags()
    if GCE_tags is not None:
        payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

    # Log the metadata on the first run
    if self._is_first_run():
        log.info("Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload['host-tags']))

    return payload
def _build_payload(self, start_event=True):
    """
    Return a dictionary that contains all of the generic payload data.

    :param start_event: when True and this is the first run, include the
        startup system stats and post an "Agent Startup" event.
    :return: the payload dict, ready for the emitters.
    """
    now = time.time()
    payload = {
        'collection_timestamp': now,
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'resources': {},
        'internalHostname': get_hostname(self.agentConfig),
        'uuid': get_uuid(),
    }

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._is_first_run() or self._should_send_metadata():
        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_metadata()
        # Cache the metadata so the first-run logging below can use it.
        self.metadata_cache = payload['meta']

    # Add static tags from the configuration file
    # NOTE(review): here the raw config string is forwarded as-is, unlike
    # the host-tags variants of this method which split/strip it.
    if self.agentConfig['tags'] is not None:
        payload['tags'] = self.agentConfig['tags']

    # Log the metadata on the first run
    if self._is_first_run():
        if self.agentConfig['tags'] is not None:
            log.info(u"Hostnames: %s, tags: %s" \
                % (repr(self.metadata_cache), self.agentConfig['tags']))
        else:
            log.info(u"Hostnames: %s" % repr(self.metadata_cache))

    return payload
def _build_payload(self, start_event=True):
    """
    Return a dictionary that contains all of the generic payload data.

    :param start_event: when True and this is the first run, include the
        startup system stats and post an "Agent Startup" event.
    :return: the payload dict (events, metrics, host tags and, periodically,
        host metadata), ready for the emitters.
    """
    now = time.time()
    payload = {
        "collection_timestamp": now,
        "os": self.os,
        "python": sys.version,
        "agentVersion": self.agentConfig["version"],
        "apiKey": self.agentConfig["api_key"],
        "events": {},
        "metrics": [],
        "resources": {},
        "internalHostname": get_hostname(self.agentConfig),
        "uuid": get_uuid(),
        "host-tags": {},
    }

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload["systemStats"] = self.agentConfig.get("system_stats", {})
        # Also post an event in the newsfeed
        payload["events"]["System"] = [
            {
                "api_key": self.agentConfig["api_key"],
                "host": payload["internalHostname"],
                "timestamp": now,
                "event_type": "Agent Startup",
                "msg_text": "Version %s" % get_version(),
            }
        ]

    # Periodically send the host metadata.
    if self._is_first_run() or self._should_send_metadata():
        payload["systemStats"] = get_system_stats()
        payload["meta"] = self._get_metadata()
        # Cache the metadata so the first-run logging below can use it.
        self.metadata_cache = payload["meta"]

    # Add static tags from the configuration file
    host_tags = []
    if self.agentConfig["tags"] is not None:
        host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig["tags"].split(",")])

    if self.agentConfig["collect_ec2_tags"]:
        host_tags.extend(EC2.get_tags())

    # Only attach the 'system' source when there is at least one tag.
    if host_tags:
        payload["host-tags"]["system"] = host_tags

    # Log the metadata on the first run
    if self._is_first_run():
        log.info(u"Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload["host-tags"]))

    return payload
def run(self):
    """Build the collector and loop over its checks until stopped."""
    stats = get_system_stats()
    the_collector = Collector(self.config, self.get_emitters(), stats)

    # Checks are discovered once up front; the same set is reused each cycle.
    loaded_checks = load_check_directory(self.config)

    # Keep collecting until self.running is cleared externally.
    while self.running:
        the_collector.run(checksd=loaded_checks)
        time.sleep(self.config['check_freq'])
def run(self):
    """Windows service entry point: run the collector until stopped."""
    log.debug("Windows Service - Starting collector")
    emitters = self.get_emitters()
    systemStats = get_system_stats()
    self.collector = Collector(self.config, emitters, systemStats)

    # Load the checks.d checks
    checksd = load_check_directory(self.config)
    # Main agent loop will run until interrupted
    while self.running:
        # start_event is forwarded so the collector knows whether to emit
        # the "Agent Startup" event on its first run.
        self.collector.run(checksd=checksd, start_event=self.start_event)
        time.sleep(self.config['check_freq'])
def run(self):
    """Windows service entry point: collect metrics until the service stops."""
    # Import lazily so logging is configured inside the service process.
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")

    self.collector = Collector(self.config,
                               self.get_emitters(),
                               get_system_stats(),
                               self.hostname)

    # Discover the checks.d checks once, before entering the loop.
    loaded_checks = load_check_directory(self.config, self.hostname)

    while self.running:
        self.collector.run(checksd=loaded_checks, start_event=self.start_event)
        time.sleep(self.config['check_freq'])
def run(self, agentConfig=None, run_forever=True):
    """Main loop of the collector.

    :param agentConfig: optional pre-parsed configuration; loaded via
        get_config() when omitted.
    :param run_forever: when False, skip the loop entirely (used by tests,
        presumably — confirm against callers).
    """
    agentLogger = logging.getLogger('agent')
    systemStats = get_system_stats()
    if agentConfig is None:
        agentConfig = get_config()

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Try to fetch instance Id from EC2 if not hostname has been set
    # in the config file.
    # DEPRECATED
    if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
        instanceId = EC2.get_instance_id()
        if instanceId is not None:
            agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
            agentConfig['hostname'] = instanceId
        else:
            agentLogger.info('Not running on EC2, using hostname to identify this server')

    # Always emit over HTTP; user-supplied emitters are appended after it.
    emitters = [http_emitter]
    for emitter_spec in [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]:
        if len(emitter_spec) == 0:
            continue
        emitters.append(modules.load(emitter_spec, 'emitter'))

    check_freq = int(agentConfig['check_freq'])

    # Checks instance
    collector = Collector(agentConfig, emitters, systemStats)

    # Watchdog
    watchdog = None
    if agentConfig.get("watchdog", True):
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
        watchdog.reset()

    # Main loop
    while run_forever:
        collector.run(checksd=checksd)
        # Reset the watchdog after each successful run so it only fires
        # when a collection cycle hangs.
        if watchdog is not None:
            watchdog.reset()
        time.sleep(check_freq)
def run(self): emitters = self.get_emitters(); chk = checks(self.config, emitters) # Load the checks.d checks checksd = load_check_directory(self.config) # Main agent loop will run until interrupted systemStats = get_system_stats() chk.doChecks(True, systemStats, checksd) while self.running: chk.doChecks(checksd=checksd) time.sleep(self.config['check_freq'])
def run(self, agentConfig=None, run_forever=True):
    """Main loop of the collector (legacy `checks`/doChecks variant).

    :param agentConfig: optional pre-parsed configuration; loaded via
        get_config() when omitted.
    :param run_forever: when False, only the initial doChecks pass runs.
    """
    agentLogger = logging.getLogger('agent')
    systemStats = get_system_stats()
    agentLogger.debug('System Properties: ' + str(systemStats))

    if agentConfig is None:
        agentConfig = get_config()

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    # Try to fetch instance Id from EC2 if not hostname has been set
    # in the config file.
    # DEPRECATED
    if agentConfig.get('hostname') is None and agentConfig.get('use_ec2_instance_id'):
        instanceId = EC2.get_instance_id()
        if instanceId is not None:
            agentLogger.info("Running on EC2, instanceId: %s" % instanceId)
            agentConfig['hostname'] = instanceId
        else:
            agentLogger.info('Not running on EC2, using hostname to identify this server')

    # Always emit over HTTP; user-supplied emitters are appended after it.
    emitters = [http_emitter]
    for emitter_spec in [s.strip() for s in agentConfig.get('custom_emitters', '').split(',')]:
        if len(emitter_spec) == 0:
            continue
        emitters.append(modules.load(emitter_spec, 'emitter'))

    check_freq = int(agentConfig['check_freq'])

    # Checks instance
    c = checks(agentConfig, emitters)

    # Watchdog
    watchdog = None
    if agentConfig.get("watchdog", True):
        watchdog = Watchdog(check_freq * WATCHDOG_MULTIPLIER)
        watchdog.reset()

    # Run checks once, to get once-in-a-run data
    c.doChecks(True, systemStats, checksd)

    # Main loop
    # Note the order here: the watchdog is reset and we sleep BEFORE each
    # subsequent doChecks call, since the first pass already ran above.
    while run_forever:
        if watchdog is not None:
            watchdog.reset()
        time.sleep(check_freq)
        c.doChecks(checksd=checksd)
def _build_payload(self, start_event=True):
    """Assemble the generic payload dictionary sent by the emitters."""
    now = time.time()

    payload = {
        'collection_timestamp': now,
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'resources': {},
        'internalHostname': get_hostname(self.agentConfig),
        'uuid': get_uuid(),
    }

    # On the very first flush, report the startup system stats and emit an
    # "Agent Startup" event into the newsfeed.
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        startup_event = {
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version(),
        }
        payload['events']['System'] = [startup_event]

    # Refresh host metadata on the first run, then periodically after that.
    if self._is_first_run() or self._should_send_metadata():
        payload['systemStats'] = get_system_stats()
        self.metadata_cache = self._get_metadata()
        payload['meta'] = self.metadata_cache

    # Forward static tags from the configuration file, if any.
    configured_tags = self.agentConfig['tags']
    if configured_tags is not None:
        payload['tags'] = configured_tags

    # Log the metadata on the first run.
    if self._is_first_run():
        if configured_tags is not None:
            log.info(u"Hostnames: %s, tags: %s"
                     % (repr(self.metadata_cache), configured_tags))
        else:
            log.info(u"Hostnames: %s" % repr(self.metadata_cache))

    return payload
def run(self):
    """Windows service entry point with optional developer-mode profiling."""
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")
    set_win32_requests_ca_bundle_path()
    emitters = self.get_emitters()
    systemStats = get_system_stats()
    self.collector = Collector(self.config, emitters, systemStats, self.hostname)
    in_developer_mode = self.config.get('developer_mode')
    # In developer mode, the number of runs to be included in a single collector profile
    collector_profile_interval = self.config.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)
    profiled = False
    collector_profiled_runs = 0

    # Load the checks.d checks
    checksd = load_check_directory(self.config, self.hostname)
    # Main agent loop will run until interrupted
    while self.running:
        if self._heartbeat:
            self._heartbeat.send(0)

        # (Re-)enable profiling at the start of each profile window.
        if in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=checksd)

        # After enough profiled runs, flush/disable the profiler; otherwise
        # keep counting runs inside the current profile window.
        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))
            else:
                collector_profiled_runs += 1

        time.sleep(self.config['check_freq'])
def run(self):
    """Windows service entry point: heartbeat + collect until stopped."""
    # Import lazily so logging is configured inside the service process.
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")

    self.collector = Collector(self.config,
                               self.get_emitters(),
                               get_system_stats(),
                               self.hostname)

    # Discover the checks.d checks once, before entering the loop.
    loaded_checks = load_check_directory(self.config, self.hostname)

    while self.running:
        # Ping the supervisor heartbeat (when one is attached) so the
        # service is not considered hung.
        if self._heartbeat:
            self._heartbeat.send(0)
        self.collector.run(checksd=loaded_checks)
        time.sleep(self.config['check_freq'])
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) agentConfig = self._set_agent_config_hostname(config) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) self.collector = Collector(agentConfig, emitters, systemStats) # Load the checks.d checks checksd = load_check_directory(agentConfig) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Run the main loop. while self.run_forever: # Do the work. self.collector.run(checksd=checksd) # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except: pass # Explicitly kill the process, because it might be running # as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def run(self):
    """Windows service main loop with optional developer-mode profiling."""
    from config import initialize_logging
    initialize_logging('windows_collector')
    log.debug("Windows Service - Starting collector")
    set_win32_requests_ca_bundle_path()

    self.collector = Collector(self.config,
                               self.get_emitters(),
                               get_system_stats(),
                               self.hostname)

    dev_mode = self.config.get('developer_mode')
    # Number of collector runs folded into a single profile (dev mode only).
    profile_interval = self.config.get('collector_profile_interval',
                                       DEFAULT_COLLECTOR_PROFILE_INTERVAL)
    profiling_active = False
    profiled_runs = 0

    # Discover the checks.d checks once, before entering the loop.
    check_bundle = load_check_directory(self.config, self.hostname)

    while self.running:
        if self._heartbeat:
            self._heartbeat.send(0)

        # Start a new profile window when developer mode asks for one.
        if dev_mode and not profiling_active:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiling_active = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=check_bundle)

        if profiling_active:
            if profiled_runs < profile_interval:
                profiled_runs += 1
            else:
                # Enough runs captured: flush and reset the profiler.
                try:
                    profiler.disable_profiling()
                    profiling_active = False
                    profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        time.sleep(self.config['check_freq'])
def run(self): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. agentConfig = self._set_agent_config_hostname(get_config()) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) self.collector = Collector(agentConfig, emitters, systemStats) # Load the checks.d checks checksd = load_check_directory(agentConfig) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Run the main loop. while self.run_forever: # Do the work. self.collector.run(checksd=checksd) # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except: pass # Explicitly kill the process, because it might be running # as a daemon. agent_logger.info("Exiting. Bye bye.") sys.exit(0)
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) agentConfig = self._set_agent_config_hostname(config) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) # Load the checks.d checks checksd = load_check_directory(agentConfig) self.collector = Collector(agentConfig, emitters, systemStats) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Initialize the auto-restarter self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() # Run the main loop. while self.run_forever: # Do the work. self.collector.run(checksd=checksd, start_event=self.start_event) # Check if we should restart. if self.autorestart and self._should_restart(): self._do_restart() # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running # as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the system, host, and/or checks.

    :param payload: the payload dict being built; mutated in place.
    :param check_statuses: per-check status objects used to build the
        'agent_checks' section.
    :param start_event: when True and this is the first run, include startup
        system stats and an "Agent Startup" event.
    """
    now = time.time()

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload["systemStats"] = self.agentConfig.get("system_stats", {})
        # Also post an event in the newsfeed
        payload["events"]["System"] = [
            {
                "api_key": self.agentConfig["api_key"],
                "host": payload["internalHostname"],
                "timestamp": now,
                "event_type": "Agent Startup",
                "msg_text": "Version %s" % get_version(),
            }
        ]

    # Periodically send the host metadata.
    if self._should_send_additional_data("host_metadata"):
        # gather metadata with gohai
        try:
            if not Platform.is_windows():
                command = "gohai"
            else:
                # NOTE(review): "\g" is not an escape sequence, so this is a
                # literal backslash — presumably a relative Windows path.
                command = "gohai\gohai.exe"
            gohai_metadata, gohai_err, _ = get_subprocess_output([command], log)
            payload["gohai"] = gohai_metadata
            if gohai_err:
                log.warning("GOHAI LOG | {0}".format(gohai_err))
        except OSError as e:
            if e.errno == 2:
                # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                raise e
        except Exception as e:
            log.warning("gohai command failed with error %s" % str(e))

        payload["systemStats"] = get_system_stats()
        payload["meta"] = self._get_hostname_metadata()

        # Cache hostname metadata for reuse by the agent_checks section.
        self.hostname_metadata_cache = payload["meta"]

        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig["tags"] is not None:
            host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig["tags"].split(",")])

        if self.agentConfig["collect_ec2_tags"]:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload["host-tags"]["system"] = host_tags

        # If required by the user, let's create the dd_check:xxx host tags
        if self.agentConfig["create_dd_check_tags"]:
            app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
            app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])

            if "system" not in payload["host-tags"]:
                payload["host-tags"]["system"] = []

            payload["host-tags"]["system"].extend(app_tags_list)

        # GCE tags go under their own source-type key when present.
        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload["host-tags"][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload["host-tags"]))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use
    # that
    external_host_tags = []
    if self._should_send_additional_data("external_host_tags"):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, "get_external_host_tags")
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Checks without get_external_host_tags simply don't contribute.
                pass

        if external_host_tags:
            payload["external_host_tags"] = external_host_tags

    # Periodically send agent_checks metadata
    if self._should_send_additional_data("agent_checks"):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name,
                            check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i],
                        )
                    )
            else:
                # Check never initialized: report its init failure instead.
                agent_checks.append(
                    (
                        check.name,
                        check.source_type_name,
                        "initialization",
                        check.status,
                        repr(check.init_failed_error),
                    )
                )
        payload["agent_checks"] = agent_checks
        payload["meta"] = self.hostname_metadata_cache  # add hostname metadata
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) agentConfig = self._set_agent_config_hostname(config) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) # Load the checks.d checks checksd = load_check_directory(agentConfig) self.collector = Collector(agentConfig, emitters, systemStats) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Initialize the auto-restarter self.restart_interval = int( agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() # Run the main loop. while self.run_forever: # enable profiler if needed profiled = False if agentConfig.get( 'profile', False) and agentConfig.get('profile').lower() == 'yes': try: import cProfile profiler = cProfile.Profile() profiled = True profiler.enable() log.debug("Agent profiling is enabled") except Exception: log.warn("Cannot enable profiler") # Do the work. self.collector.run(checksd=checksd, start_event=self.start_event) # disable profiler and printout stats to stdout if agentConfig.get('profile', False) and agentConfig.get( 'profile').lower() == 'yes' and profiled: try: profiler.disable() import pstats from cStringIO import StringIO s = StringIO() ps = pstats.Stats(profiler, stream=s).sort_stats("cumulative") ps.print_stats() log.debug(s.getvalue()) except Exception: log.warn("Cannot disable profiler") # Check if we should restart. 
if self.autorestart and self._should_restart(): self._do_restart() # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running # as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) agentConfig = self._set_agent_config_hostname(config) hostname = get_hostname(agentConfig) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) # Load the checks.d checks checksd = load_check_directory(agentConfig, hostname) self.collector = Collector(agentConfig, emitters, systemStats, hostname) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Initialize the auto-restarter self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() # Run the main loop. while self.run_forever: # enable profiler if needed profiled = False if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes': try: import cProfile profiler = cProfile.Profile() profiled = True profiler.enable() log.debug("Agent profiling is enabled") except Exception: log.warn("Cannot enable profiler") # Do the work. self.collector.run(checksd=checksd, start_event=self.start_event) # disable profiler and printout stats to stdout if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes' and profiled: try: profiler.disable() import pstats from cStringIO import StringIO s = StringIO() ps = pstats.Stats(profiler, stream=s).sort_stats("cumulative") ps.print_stats() log.debug(s.getvalue()) except Exception: log.warn("Cannot disable profiler") # Check if we should restart. 
if self.autorestart and self._should_restart(): self._do_restart() # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running # as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def run(self, config=None):
    """Main loop of the collector (SIGHUP reload + developer-mode profiling).

    :param config: optional pre-parsed configuration; loaded via
        get_config(parse_args=True) when omitted.
    Never returns normally — calls sys.exit(0) on shutdown.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    self.collector_profile_interval = self._agentConfig.get('collector_profile_interval',
                                                            DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        # configs_reloaded tells the collector a SIGHUP reload just happened.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)
        if self.configs_reloaded:
            self.configs_reloaded = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                # Profile window finished: flush and reset the profiler.
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.info("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    # Best-effort removal of the persisted status file.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector (service discovery + JMX SD variant).

    :param config: optional pre-parsed configuration; loaded via
        get_config(parse_args=True) when omitted.
    Never returns normally — calls sys.exit(0) on shutdown.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    )
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
        pipe_path = get_jmx_pipe_path()
        # On Windows the pipe path is a template; elsewhere it is a directory.
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Apply any pending check-config reload before running the checks.
        # reload_configs_flag may be a set of specific checks to reload.
        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                # Profile window finished: flush and reset the profiler.
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    # Best-effort removal of the persisted status file.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Installs signal handlers, loads configuration and the checks.d
    checks, then repeatedly runs the Collector until `self.run_forever`
    is cleared, finally cleaning up and exiting the process.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)
    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)
    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get("service_discovery"):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    self.collector_profile_interval = self._agentConfig.get(
        "collector_profile_interval", DEFAULT_COLLECTOR_PROFILE_INTERVAL
    )

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig["check_freq"])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get("restart_interval", RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd["initialized_checks"])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(
            checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded
        )

        # This flag is used to know if the check configs have been reloaded at the current
        # run of the agent yet or not. It's used by the collector to know if it needs to
        # look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if (
            self._agentConfig.get("service_discovery")
            and self.sd_backend
            and not self.sd_backend.reload_check_configs
        ):
            try:
                self.sd_backend.reload_check_configs = get_config_store(self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn("Something went wrong while looking for config template changes: %s" % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get("service_discovery") and self.sd_backend and self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        # Stop profiling once enough runs have been captured.
        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def _build_payload(self, start_event=True):
    """
    Return a dictionary that contains all of the generic payload data.

    :param start_event: when True and this is the agent's first run,
        an 'Agent Startup' event is added to the payload.
    :return: dict ready to be serialized and shipped by the emitters.
    """
    now = time.time()
    payload = {
        'collection_timestamp': now,
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'service_checks': [],
        'resources': {},
        'internalHostname': self.hostname,
        'uuid': get_uuid(),
        'host-tags': {},
        'external_host_tags': {}
    }

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('metadata'):
        # gather metadata with gohai
        try:
            if get_os() != 'windows':
                command = "gohai"
            else:
                # Raw string: the previous literal "gohai\gohai.exe" only
                # worked because Python leaves the invalid "\g" escape
                # untouched; r"" makes the backslash explicit.
                command = r"gohai\gohai.exe"
            gohai_metadata = subprocess.Popen(
                [command], stdout=subprocess.PIPE).communicate()[0]
            payload['gohai'] = gohai_metadata
        except OSError as e:
            if e.errno == 2:  # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                # Bare `raise` preserves the original traceback
                # (`raise e` would not, on Python 2).
                raise
        except Exception as e:
            log.warning("gohai command failed with error %s" % str(e))

        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_metadata()
        self.metadata_cache = payload['meta']

        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip())
                              for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" %
                     (repr(self.metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Check does not expose external host tags; skip it.
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    return payload
def _build_payload(self, start_event=True):
    """
    Return a dictionary that contains all of the generic payload data.

    :param start_event: when True and this is the agent's first run,
        an 'Agent Startup' event is added to the payload.
    :return: dict ready to be serialized and shipped by the emitters.
    """
    now = time.time()
    payload = {
        'collection_timestamp': now,
        'os': self.os,
        'python': sys.version,
        'agentVersion': self.agentConfig['version'],
        'apiKey': self.agentConfig['api_key'],
        'events': {},
        'metrics': [],
        'service_checks': [],
        'resources': {},
        'internalHostname': self.hostname,
        'uuid': get_uuid(),
        'host-tags': {},
        'external_host_tags': {}
    }

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{'api_key': self.agentConfig['api_key'],
                                        'host': payload['internalHostname'],
                                        'timestamp': now,
                                        'event_type': 'Agent Startup',
                                        'msg_text': 'Version %s' % get_version()
                                        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('metadata'):
        # gather metadata with gohai
        try:
            if get_os() != 'windows':
                command = "gohai"
            else:
                # Raw string: the previous literal "gohai\gohai.exe" only
                # worked because Python leaves the invalid "\g" escape
                # untouched; r"" makes the backslash explicit.
                command = r"gohai\gohai.exe"
            gohai_metadata = subprocess.Popen(
                [command], stdout=subprocess.PIPE
            ).communicate()[0]
            payload['gohai'] = gohai_metadata
        except OSError as e:
            if e.errno == 2:  # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                # Bare `raise` preserves the original traceback
                # (`raise e` would not, on Python 2).
                raise
        except Exception as e:
            log.warning("gohai command failed with error %s" % str(e))

        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_metadata()
        self.metadata_cache = payload['meta']

        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" %
                     (repr(self.metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Check does not expose external host tags; skip it.
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    return payload
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # A SIGHUP signals a configuration reload signal.signal(signal.SIGHUP, self._handle_sighup) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) self._agentConfig = self._set_agent_config_hostname(config) hostname = get_hostname(self._agentConfig) systemStats = get_system_stats(proc_path=self._agentConfig.get( 'procfs_path', '/proc').rstrip('/')) emitters = self._get_emitters() # Initialize service discovery if self._agentConfig.get('service_discovery'): self.sd_backend = get_sd_backend(self._agentConfig) # Load the checks.d checks self._checksd = load_check_directory(self._agentConfig, hostname) # Initialize the Collector self.collector = Collector(self._agentConfig, emitters, systemStats, hostname) # In developer mode, the number of runs to be included in a single collector profile self.collector_profile_interval = self._agentConfig.get( 'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL) # Configure the watchdog. self.check_frequency = int(self._agentConfig['check_freq']) watchdog = self._get_watchdog(self.check_frequency) # Initialize the auto-restarter self.restart_interval = int( self._agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() profiled = False collector_profiled_runs = 0 # Run the main loop. 
while self.run_forever: log.debug("Found {num_checks} checks".format( num_checks=len(self._checksd['initialized_checks']))) # Setup profiling if necessary if self.in_developer_mode and not profiled: try: profiler = AgentProfiler() profiler.enable_profiling() profiled = True except Exception as e: log.warn("Cannot enable profiler: %s" % str(e)) # Do the work. self.collector.run(checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded) # This flag is used to know if the check configs have been reloaded at the current # run of the agent yet or not. It's used by the collector to know if it needs to # look for the AgentMetrics check and pop it out. # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272 self.configs_reloaded = False # Look for change in the config template store. # The self.sd_backend.reload_check_configs flag is set # to True if a config reload is needed. if self._agentConfig.get('service_discovery') and self.sd_backend and \ not self.sd_backend.reload_check_configs: try: self.sd_backend.reload_check_configs = get_config_store( self._agentConfig).crawl_config_template() except Exception as e: log.warn( 'Something went wrong while looking for config template changes: %s' % str(e)) # Check if we should run service discovery # The `reload_check_configs` flag can be set through the docker_daemon check or # using ConfigStore.crawl_config_template if self._agentConfig.get('service_discovery') and self.sd_backend and \ self.sd_backend.reload_check_configs: self.reload_configs() self.configs_reloaded = True self.sd_backend.reload_check_configs = False if profiled: if collector_profiled_runs >= self.collector_profile_interval: try: profiler.disable_profiling() profiled = False collector_profiled_runs = 0 except Exception as e: log.warn("Cannot disable profiler: %s" % str(e)) # Check if we should restart. 
if self.autorestart and self._should_restart(): self._do_restart() # Only plan for next loop if we will continue, otherwise exit quickly. if self.run_forever: if watchdog: watchdog.reset() if profiled: collector_profiled_runs += 1 log.debug("Sleeping for {0} seconds".format( self.check_frequency)) time.sleep(self.check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the
    system, host, and/or checks.

    Mutates `payload` in place; nothing is returned.

    :param payload: payload dict being assembled (must already contain
        'events', 'internalHostname' and 'host-tags' entries).
    :param check_statuses: iterable of per-check status objects used to
        build the 'agent_checks' section.
    :param start_event: when True and this is the agent's first run,
        an 'Agent Startup' event is added.
    """
    now = time.time()

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('host_metadata'):
        # gather metadata with gohai
        gohai_metadata = self._run_gohai_metadata()
        if gohai_metadata:
            payload['gohai'] = gohai_metadata

        payload['systemStats'] = get_system_stats(
            proc_path=self.agentConfig.get('procfs_path', '/proc').rstrip('/')
        )
        payload['meta'] = self._get_hostname_metadata()

        self.hostname_metadata_cache = payload['meta']
        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        # If required by the user, let's create the dd_check:xxx host tags
        if self.agentConfig['create_dd_check_tags']:
            app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
            app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])

            if 'system' not in payload['host-tags']:
                payload['host-tags']['system'] = []

            payload['host-tags']['system'].extend(app_tags_list)

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" %
                     (repr(self.hostname_metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use
    # that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Check does not expose external host tags; skip it.
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    # Periodically send agent_checks metadata
    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i]
                        )
                    )
            else:
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.hostname_metadata_cache  # add hostname metadata
def run(self, config=None):
    """Main loop of the collector.

    Installs signal handlers, loads configuration and checks, then
    repeatedly runs the Collector until `self.run_forever` is cleared,
    finally cleaning up and exiting the process.
    """
    # Gracefully exit on SIGTERM / Ctrl-C; SIGUSR1 triggers an exit with
    # autorestart, SIGHUP a configuration reload.
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    signal.signal(signal.SIGINT, self._handle_sigterm)
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector from the given (or freshly parsed) config.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Load the checks.d checks and build the Collector.
    self._checksd = load_check_directory(self._agentConfig, hostname)
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile.
    self.collector_profile_interval = self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchmonitor and the auto-restarter.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchmonitor = self._get_watchmonitor(self.check_frequency)
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

        # Set up profiling if necessary (developer mode only).
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded)
        # Clear the reload flag once the collector has seen it.
        if self.configs_reloaded:
            self.configs_reloaded = False

        # Stop profiling once enough runs have been captured.
        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchmonitor:
                watchmonitor.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up (best-effort).
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None): """Main loop of the collector""" # Gracefully exit on sigterm. signal.signal(signal.SIGTERM, self._handle_sigterm) # A SIGUSR1 signals an exit with an autorestart signal.signal(signal.SIGUSR1, self._handle_sigusr1) # Handle Keyboard Interrupt signal.signal(signal.SIGINT, self._handle_sigterm) # Save the agent start-up stats. CollectorStatus().persist() # Intialize the collector. if not config: config = get_config(parse_args=True) agentConfig = self._set_agent_config_hostname(config) hostname = get_hostname(agentConfig) systemStats = get_system_stats() emitters = self._get_emitters(agentConfig) # Load the checks.d checks checksd = load_check_directory(agentConfig, hostname) self.collector = Collector(agentConfig, emitters, systemStats, hostname) # In developer mode, the number of runs to be included in a single collector profile collector_profile_interval = agentConfig.get( 'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL) # Configure the watchdog. check_frequency = int(agentConfig['check_freq']) watchdog = self._get_watchdog(check_frequency, agentConfig) # Initialize the auto-restarter self.restart_interval = int( agentConfig.get('restart_interval', RESTART_INTERVAL)) self.agent_start = time.time() profiled = False collector_profiled_runs = 0 # Run the main loop. while self.run_forever: # Setup profiling if necessary if self.in_developer_mode and not profiled: try: profiler = AgentProfiler() profiler.enable_profiling() profiled = True except Exception as e: log.warn("Cannot enable profiler: %s" % str(e)) # Do the work. self.collector.run(checksd=checksd, start_event=self.start_event) if profiled: if collector_profiled_runs >= collector_profile_interval: try: profiler.disable_profiling() profiled = False collector_profiled_runs = 0 except Exception as e: log.warn("Cannot disable profiler: %s" % str(e)) # Check if we should restart. 
if self.autorestart and self._should_restart(): self._do_restart() # Only plan for the next loop if we will continue, # otherwise just exit quickly. if self.run_forever: if watchdog: watchdog.reset() if profiled: collector_profiled_runs += 1 time.sleep(check_frequency) # Now clean-up. try: CollectorStatus.remove_latest_status() except Exception: pass # Explicitly kill the process, because it might be running # as a daemon. log.info("Exiting. Bye bye.") sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Installs signal handlers, loads configuration and checks, then
    repeatedly runs the Collector until `self.run_forever` is cleared,
    finally cleaning up and exiting the process.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig)

    self.collector = Collector(agentConfig, emitters, systemStats)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(
        agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Run the main loop.
    while self.run_forever:
        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up (best-effort). Catch Exception, not a bare except:
    # a bare clause would also swallow SystemExit/KeyboardInterrupt.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector.

    Installs signal handlers, optionally sets up the JMX service
    discovery pipe, loads configuration and checks, then repeatedly runs
    the Collector until `self.run_forever` is cleared, finally cleaning
    up and exiting the process.
    """
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # SIGUSR1/SIGINT/SIGHUP are not available on Windows.
    if not Platform.is_windows():
        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Intialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    # procfs_path may point at a mounted host /proc (e.g. in a container)
    systemStats = get_system_stats(proc_path=self._agentConfig.get(
        'procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Set up the FIFO used to push JMX check configs to jmxfetch.
    if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval',
                                  DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # A set-valued flag lists the specific checks to reload;
        # any other truthy value triggers a full reload.
        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(
            checksd=self._checksd,
            start_event=self.start_event,
            configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn(
                    'Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        # Stop profiling once enough runs have been captured.
        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the
    system, host, and/or checks.

    Mutates `payload` in place; nothing is returned.

    :param payload: payload dict being assembled (must already contain
        'events', 'internalHostname' and 'host-tags' entries).
    :param check_statuses: iterable of per-check status objects used to
        build the 'agent_checks' section.
    :param start_event: when True and this is the agent's first run,
        an 'Agent Startup' event is added.
    """
    now = time.time()

    # Include system stats and a startup event on the first postback.
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type': 'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically attach host metadata: gohai output, system stats and
    # hostname metadata.
    if self._should_send_additional_data('host_metadata'):
        payload['gohai'] = get_gohai_data()
        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_hostname_metadata()
        log.info('GOHAI data: {0}'.format(payload['gohai']))
        self.hostname_metadata_cache = payload['meta']

        # Add static tags from the configuration file.
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([tag.strip() for tag in self.agentConfig['tags'].split(",")])
        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))
        if host_tags:
            payload['host-tags']['system'] = host_tags

        # If required by the user, create the dd_check:xxx host tags for
        # every initialized check and every JMX check.
        if self.agentConfig['create_dd_check_tags']:
            app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
            app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])
            if 'system' not in payload['host-tags']:
                payload['host-tags']['system'] = []
            payload['host-tags']['system'].extend(app_tags_list)

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run only.
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload['host-tags']))

    # Periodically collect tags for external hosts (e.g. vsphere) from
    # checks that expose a `get_external_host_tags` method.
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                # Check does not expose external host tags; skip it.
                pass
    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    # Periodically attach per-check statuses and error/warning messages.
    if self._should_send_additional_data('agent_checks'):
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # error message or list of warning messages share
                            # this field; the UI handles either form
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i]
                        )
                    )
            else:
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        # Refresh hostname metadata alongside the agent_checks section.
        payload['meta'] = self.hostname_metadata_cache