def run(self, command=None, check_list=None, reporter=None):
    """
    Configure (if needed) and launch JMXFetch; return its exit code when it runs.

    If no valid JMX integration is configured, sleep briefly and return
    without starting anything so the supervisor sees a clean start/stop.
    """
    # (Re)configure when a check_list is supplied or nothing was configured yet.
    if check_list or self.jmx_checks is None:
        self.configure(check_list)

    try:
        command = command or JMX_COLLECT_COMMAND

        # Surface configuration errors in the agent status before launching.
        if self.invalid_checks:
            try:
                JMXFiles.write_status_file(self.invalid_checks)
            except Exception:
                log.exception("Error while writing JMX status file")

        if self.jmx_checks:
            return self._start(
                self.java_bin_path,
                self.java_options,
                self.jmx_checks,
                command,
                reporter,
                self.tools_jar_path,
            )

        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        time.sleep(4)
        log.info("No valid JMX integration was found. Exiting ...")
    except Exception:
        log.exception("Error while initiating JMXFetch")
        raise
def configure(self, check_list=None):
    """
    Instantiate JMXFetch parameters, clean potential previous run leftovers.
    """
    # Drop any stale status file from an earlier run before reconfiguring.
    JMXFiles.clean_status_file()

    (self.jmx_checks,
     self.invalid_checks,
     self.java_bin_path,
     self.java_options,
     self.tools_jar_path) = self.get_configuration(check_list)
def terminate(self):
    """
    Override `terminate` method to properly exit JMXFetch.

    Drops the exit file (the signal JMXFetch watches for on platforms
    without POSIX signals), then waits for the thread to finish.
    """
    JMXFiles.write_exit_file()
    self.join()
def run(self):
    """Run the JMX collection daemon, if JMX is enabled."""
    if not self.is_enabled:
        return
    # Clear any leftover exit marker so the daemon doesn't stop immediately.
    JMXFiles.clean_exit_file()
    self.jmx_daemon.run()
def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
    """
    Periodically populate the payload with metadata related to the system, host, and/or checks.

    Mutates `payload` in place. Sections are added only when their
    `_should_send_additional_data(...)` interval has elapsed (or on first run
    for the startup event). `check_statuses` is an iterable of check status
    objects exposing name/source_type_name/instance_statuses/etc.
    """
    now = time.time()

    # Include system stats on first postback
    if start_event and self._is_first_run():
        payload['systemStats'] = self.agentConfig.get('system_stats', {})
        # Also post an event in the newsfeed
        payload['events']['System'] = [{
            'api_key': self.agentConfig['api_key'],
            'host': payload['internalHostname'],
            'timestamp': now,
            'event_type':'Agent Startup',
            'msg_text': 'Version %s' % get_version()
        }]

    # Periodically send the host metadata.
    if self._should_send_additional_data('host_metadata'):
        # gather metadata with gohai
        try:
            if get_os() != 'windows':
                command = "gohai"
            else:
                # NOTE(review): "\g" is not an escape sequence, so this is the
                # literal relative path gohai\gohai.exe — presumably resolved
                # against the agent's working directory; confirm on Windows.
                command = "gohai\gohai.exe"
            # gohai writes its metadata blob to stdout; its log goes to stderr.
            gohai_metadata, gohai_log = subprocess.Popen(
                [command],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE
            ).communicate()
            payload['gohai'] = gohai_metadata
            if gohai_log:
                log.warning("GOHAI LOG | {0}".format(gohai_log))
        except OSError as e:
            if e.errno == 2:
                # file not found, expected when install from source
                log.info("gohai file not found")
            else:
                raise e
        except Exception as e:
            # gohai is best-effort: any other failure is logged, not fatal.
            log.warning("gohai command failed with error %s" % str(e))

        payload['systemStats'] = get_system_stats()
        payload['meta'] = self._get_hostname_metadata()

        # Cache the hostname metadata so later 'agent_checks' payloads can
        # reuse it without re-resolving.
        self.hostname_metadata_cache = payload['meta']
        # Add static tags from the configuration file
        host_tags = []
        if self.agentConfig['tags'] is not None:
            host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

        if self.agentConfig['collect_ec2_tags']:
            host_tags.extend(EC2.get_tags(self.agentConfig))

        if host_tags:
            payload['host-tags']['system'] = host_tags

        GCE_tags = GCE.get_tags(self.agentConfig)
        if GCE_tags is not None:
            payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

        # Log the metadata on the first run
        if self._is_first_run():
            log.info("Hostnames: %s, tags: %s" % (repr(self.hostname_metadata_cache), payload['host-tags']))

    # Periodically send extra hosts metadata (vsphere)
    # Metadata of hosts that are not the host where the agent runs, not all the checks use
    # that
    external_host_tags = []
    if self._should_send_additional_data('external_host_tags'):
        for check in self.initialized_checks_d:
            try:
                # Checks without get_external_host_tags are skipped via the
                # AttributeError below. NOTE(review): this also silently
                # swallows an AttributeError raised *inside* getter().
                getter = getattr(check, 'get_external_host_tags')
                check_tags = getter()
                external_host_tags.extend(check_tags)
            except AttributeError:
                pass

    if external_host_tags:
        payload['external_host_tags'] = external_host_tags

    # Periodically send agent_checks metadata
    if self._should_send_additional_data('agent_checks'):
        # Add agent checks statuses and error/warning messages
        agent_checks = []
        for check in check_statuses:
            if check.instance_statuses is not None:
                for i, instance_status in enumerate(check.instance_statuses):
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            instance_status.instance_id,
                            instance_status.status,
                            # put error message or list of warning messages in the same field
                            # it will be handled by the UI
                            instance_status.error or instance_status.warnings or "",
                            check.service_metadata[i]
                        )
                    )
            else:
                # No instance statuses: the check failed to initialize at all.
                agent_checks.append(
                    (
                        check.name, check.source_type_name,
                        "initialization",
                        check.status, repr(check.init_failed_error)
                    )
                )
        payload['agent_checks'] = agent_checks
        payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

    # If required by the user, let's create the dd_check:xxx host tags
    if self.agentConfig['create_dd_check_tags'] and \
            self._should_send_additional_data('dd_check_tags'):
        app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
        app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname in JMXFiles.get_jmx_appnames()])

        if 'system' not in payload['host-tags']:
            payload['host-tags']['system'] = []

        payload['host-tags']['system'].extend(app_tags_list)
def get_jmx_status():
    """This function tries to read the 2 jmxfetch status file which are yaml file located
    in the temp directory.

    There are 2 files:
        - One generated by the Agent itself, for jmx checks that can't be initialized because
          there are missing stuff.
          Its format is as following:

        ###
        invalid_checks:
              jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration [You need to have at
              least one instance defined in the YAML file for this check]
        timestamp: 1391040927.136523
        ###

        - One generated by jmxfetch that return information about the collection of metrics
          its format is as following:

        ###
        timestamp: 1391037347435
        checks:
          failed_checks:
            jmx:
            - {message: Unable to create instance. Please check your yaml file, status: ERROR}
          initialized_checks:
            tomcat:
            - {message: null, status: OK, metric_count: 7, instance_name: jmx-remihakim.fr-3000}
        ###

    Returns a list of CheckStatus objects; [] when neither file exists or on
    any parse error (failures are logged, never raised).
    """
    check_statuses = []
    java_status_path = JMXFiles.get_status_file_path()
    python_status_path = JMXFiles.get_python_status_file_path()
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
        return []

    # Per-check accumulator: check_name -> {'statuses': [...], 'metric_count': [...],
    # 'service_check_count': [...]} merged across failed and initialized instances.
    check_data = defaultdict(lambda: defaultdict(list))
    try:
        if os.path.exists(java_status_path):
            # NOTE: full yaml.load (not safe_load) is required for the python
            # status file below, which carries !!python/object/apply tags (see
            # docstring); both files are agent-generated, not untrusted input.
            java_jmx_stats = yaml.load(file(java_status_path))

            status_age = time.time() - java_jmx_stats.get('timestamp')/1000  # JMX timestamp is saved in milliseconds
            jmx_checks = java_jmx_stats.get('checks', {})

            # A status file older than a minute means JMXFetch stopped reporting.
            if status_age > 60:
                check_statuses.append(
                    CheckStatus("jmx", [
                        InstanceStatus(
                            0,
                            STATUS_ERROR,
                            error="JMXfetch didn't return any metrics during the last minute"
                        )
                    ])
                )
            else:
                for check_name, instances in jmx_checks.get('failed_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        service_check_count = info.get('service_check_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)
                        check_data[check_name]['service_check_count'].append(service_check_count)

                # Same accumulation for checks that initialized successfully.
                for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        service_check_count = info.get('service_check_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)
                        check_data[check_name]['service_check_count'].append(service_check_count)

                # Collapse the per-instance data into one CheckStatus per check.
                for check_name, data in check_data.iteritems():
                    check_status = CheckStatus(check_name, data['statuses'], metric_count=sum(data['metric_count']), service_check_count=sum(data['service_check_count']))
                    check_statuses.append(check_status)

        if os.path.exists(python_status_path):
            python_jmx_stats = yaml.load(file(python_status_path))
            jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.iteritems():
                check_statuses.append(CheckStatus(check_name, [], init_failed_error=excep))
        return check_statuses

    except Exception:
        # Best-effort status reporting: never let a malformed status file
        # break the caller.
        log.exception("Couldn't load latest jmx status")
        return []
def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path):
    """
    Build the JMXFetch java command line, launch it, and block until it exits.

    Returns the JMXFetch process return code. On OSError (java binary not
    found) writes an invalid-check status for every check, then re-raises.
    The argv is assembled with positional insert()s, so the order of the
    blocks below is load-bearing — do not reorder them.
    """
    statsd_port = self.agentConfig.get('dogstatsd_port', "8125")
    if reporter is None:
        reporter = "statsd:%s" % str(statsd_port)

    log.info("Starting jmxfetch:")
    try:
        path_to_java = path_to_java or "java"
        java_run_opts = java_run_opts or ""
        path_to_jmxfetch = self._get_path_to_jmxfetch()
        path_to_status_file = JMXFiles.get_status_file_path()

        if tools_jar_path is None:
            classpath = path_to_jmxfetch
        else:
            # NOTE(review): ":" is the Unix classpath separator; Windows uses
            # ";" — confirm tools_jar_path is never set on Windows.
            classpath = r"%s:%s" % (tools_jar_path, path_to_jmxfetch)
        subprocess_args = [
            path_to_java,  # Path to the java bin
            '-classpath',
            classpath,
            JMXFETCH_MAIN_CLASS,
            '--check_period', str(self.check_frequency * 1000),  # Period of the main loop of jmxfetch in ms
            '--conf_directory', r"%s" % self.confd_path,  # Path of the conf.d directory that will be read by jmxfetch,
            '--log_level', JAVA_LOGGING_LEVEL.get(self.logging_config.get("log_level"), "INFO"),  # Log Level: Mapping from Python log level to log4j log levels
            '--log_location', r"%s" % self.logging_config.get('jmxfetch_log_file'),  # Path of the log file
            '--reporter', reporter,  # Reporter to use
            '--status_location', r"%s" % path_to_status_file,  # Path to the status file to write
            command,  # Name of the command
        ]

        if Platform.is_windows():
            # Signal handlers are not supported on Windows:
            # use a file to trigger JMXFetch exit instead
            path_to_exit_file = JMXFiles.get_python_exit_file_path()
            subprocess_args.insert(len(subprocess_args) - 1, '--exit_file_location')
            subprocess_args.insert(len(subprocess_args) - 1, path_to_exit_file)

        # Inject "--check <names...>" right after the main class (index 4).
        # Inserting each check at index 5 puts them in reverse order of
        # jmx_checks, which only affects argv ordering, not behavior.
        subprocess_args.insert(4, '--check')
        for check in jmx_checks:
            subprocess_args.insert(5, check)

        # Specify a maximum memory allocation pool for the JVM
        if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_MAX_MEMORY_ALLOCATION
        # Specify the initial memory allocation pool for the JVM
        if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts:
            java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION

        # JVM options go between the java binary and -classpath (index 1).
        for opt in java_run_opts.split():
            subprocess_args.insert(1, opt)

        log.info("Running %s" % " ".join(subprocess_args))
        # List argv, no shell: arguments are passed to the OS verbatim.
        jmx_process = subprocess.Popen(subprocess_args, close_fds=True)
        self.jmx_process = jmx_process

        # Register SIGINT and SIGTERM signal handlers
        self.register_signal_handlers()

        # Wait for JMXFetch to return
        jmx_process.wait()

        return jmx_process.returncode

    except OSError:
        # NOTE(review): message says "JMXTerm" but this process is JMXFetch —
        # looks like a copy-paste leftover; confirm before changing since the
        # string is also written to the status file below.
        java_path_msg = "Couldn't launch JMXTerm. Is Java in your PATH ?"
        log.exception(java_path_msg)
        invalid_checks = {}
        for check in jmx_checks:
            # Check names are "<name>.yaml"-style; keep only the base name.
            check_name = check.split('.')[0]
            check_name = check_name.encode('ascii', 'ignore')
            invalid_checks[check_name] = java_path_msg
        JMXFiles.write_status_file(invalid_checks)
        raise
    except Exception:
        log.exception("Couldn't launch JMXFetch")
        raise