Example #1
    def run(self, command=None, check_list=None, reporter=None):

        if check_list or self.jmx_checks is None:
            # (Re)set/(re)configure JMXFetch parameters when `check_list` is specified or
            # no configuration was found
            self.configure(check_list)

        try:
            command = command or JMX_COLLECT_COMMAND

            if len(self.invalid_checks) > 0:
                try:
                    JMXFiles.write_status_file(self.invalid_checks)
                except Exception:
                    log.exception("Error while writing JMX status file")

            if len(self.jmx_checks) > 0:
                return self._start(self.java_bin_path, self.java_options, self.jmx_checks,
                                   command, reporter, self.tools_jar_path)
            else:
                # We're exiting purposefully, so exit with zero (supervisor's expected
                # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
                # and thus can exit cleanly.
                time.sleep(4)
                log.info("No valid JMX integration was found. Exiting ...")
        except Exception:
            log.exception("Error while initiating JMXFetch")
            raise
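
Note that run() only (re)configures JMXFetch when a check_list is supplied or when no configuration has been loaded yet, and it returns the exit code of the JMXFetch process via _start(). A minimal sketch of how a caller might drive it; the wrapper function and its arguments are hypothetical, only run() and its check_list parameter come from the snippet above:

# Hypothetical caller for the run() method shown above; `jmxfetch` is assumed to be
# an already-constructed instance of the class that defines run()/configure().
def collect_jmx_metrics(jmxfetch, checks=None):
    # Passing a non-empty list of check configs forces a (re)configuration before
    # JMXFetch is (re)started; with no list, the existing configuration is reused
    # or lazily built on the first call.
    try:
        return jmxfetch.run(check_list=checks)  # exit code of the JMXFetch subprocess
    except Exception:
        # run() logs and re-raises, so the caller decides how to recover.
        return None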
Example #2
    def configure(self, check_list=None):
        """
        Instantiate JMXFetch parameters, clean potential previous run leftovers.
        """
        JMXFiles.clean_status_file()

        self.jmx_checks, self.invalid_checks, self.java_bin_path, self.java_options, self.tools_jar_path = \
            self.get_configuration(check_list)
Example #3
    def terminate(self):
        """
        Override `terminate` method to properly exit JMXFetch.
        """
        JMXFiles.write_exit_file()
        self.join()
Example #4
    def run(self):
        if self.is_enabled:
            JMXFiles.clean_exit_file()
            self.jmx_daemon.run()
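
Examples #3 and #4 together form an exit-file handshake: run() removes any stale exit file before starting the JMXFetch daemon, and terminate() asks JMXFetch to stop by writing that file, then waits for the thread to finish. A minimal sketch of the same pattern; the JMXFetchWorker class and its constructor arguments are hypothetical stand-ins for the surrounding thread class:

import threading

class JMXFetchWorker(threading.Thread):
    """Hypothetical wrapper illustrating the exit-file pattern of Examples #3 and #4."""

    def __init__(self, jmx_daemon, jmx_files, is_enabled=True):
        super(JMXFetchWorker, self).__init__()
        self.jmx_daemon = jmx_daemon    # object exposing run(), as in Example #4
        self.jmx_files = jmx_files      # stands in for the JMXFiles helper
        self.is_enabled = is_enabled

    def run(self):
        if self.is_enabled:
            self.jmx_files.clean_exit_file()   # drop leftovers from a previous run
            self.jmx_daemon.run()              # blocks until JMXFetch exits

    def terminate(self):
        self.jmx_files.write_exit_file()       # JMXFetch polls for this file and exits
        self.join()                            # wait for run() to return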
Example #5
    def _populate_payload_metadata(self, payload, check_statuses, start_event=True):
        """
        Periodically populate the payload with metadata related to the system, host, and/or checks.
        """
        now = time.time()

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{
                'api_key': self.agentConfig['api_key'],
                'host': payload['internalHostname'],
                'timestamp': now,
                'event_type': 'Agent Startup',
                'msg_text': 'Version %s' % get_version()
            }]

        # Periodically send the host metadata.
        if self._should_send_additional_data('host_metadata'):
            # gather metadata with gohai
            try:
                if get_os() != 'windows':
                    command = "gohai"
                else:
                    command = "gohai\gohai.exe"
                gohai_metadata, gohai_log = subprocess.Popen(
                    [command], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                ).communicate()
                payload['gohai'] = gohai_metadata
                if gohai_log:
                    log.warning("GOHAI LOG | {0}".format(gohai_log))
            except OSError as e:
                if e.errno == 2:  # file not found, expected when install from source
                    log.info("gohai file not found")
                else:
                    raise e
            except Exception as e:
                log.warning("gohai command failed with error %s" % str(e))

            payload['systemStats'] = get_system_stats()
            payload['meta'] = self._get_hostname_metadata()

            self.hostname_metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([unicode(tag.strip())
                                 for tag in self.agentConfig['tags'].split(",")])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags(self.agentConfig))

            if host_tags:
                payload['host-tags']['system'] = host_tags

            GCE_tags = GCE.get_tags(self.agentConfig)
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info("Hostnames: %s, tags: %s" %
                         (repr(self.hostname_metadata_cache), payload['host-tags']))

        # Periodically send extra host metadata (e.g. vSphere).
        # This is metadata for hosts other than the one the Agent runs on;
        # only some checks make use of it.
        external_host_tags = []
        if self._should_send_additional_data('external_host_tags'):
            for check in self.initialized_checks_d:
                try:
                    getter = getattr(check, 'get_external_host_tags')
                    check_tags = getter()
                    external_host_tags.extend(check_tags)
                except AttributeError:
                    pass

        if external_host_tags:
            payload['external_host_tags'] = external_host_tags

        # Periodically send agent_checks metadata
        if self._should_send_additional_data('agent_checks'):
            # Add agent checks statuses and error/warning messages
            agent_checks = []
            for check in check_statuses:
                if check.instance_statuses is not None:
                    for i, instance_status in enumerate(check.instance_statuses):
                        agent_checks.append(
                            (
                                check.name, check.source_type_name,
                                instance_status.instance_id,
                                instance_status.status,
                                # put error message or list of warning messages in the same field
                                # it will be handled by the UI
                                instance_status.error or instance_status.warnings or "",
                                check.service_metadata[i]
                            )
                        )
                else:
                    agent_checks.append(
                        (
                            check.name, check.source_type_name,
                            "initialization",
                            check.status, repr(check.init_failed_error)
                        )
                    )
            payload['agent_checks'] = agent_checks
            payload['meta'] = self.hostname_metadata_cache  # add hostname metadata

        # If required by the user, let's create the dd_check:xxx host tags
        if self.agentConfig['create_dd_check_tags'] and \
                self._should_send_additional_data('dd_check_tags'):
            app_tags_list = [DD_CHECK_TAG.format(c.name) for c in self.initialized_checks_d]
            app_tags_list.extend([DD_CHECK_TAG.format(cname) for cname
                                  in JMXFiles.get_jmx_appnames()])

            if 'system' not in payload['host-tags']:
                payload['host-tags']['system'] = []

            payload['host-tags']['system'].extend(app_tags_list)
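
To make the result easier to picture, here is a rough sketch of the payload fields that _populate_payload_metadata fills in on a run where every periodic branch fires. The key names mirror the code above; every value, as well as the nested contents of 'meta', is an illustrative placeholder rather than something taken from the snippet:

# Illustrative shape only: keys mirror Example #5, values are made-up placeholders.
example_payload = {
    'systemStats': {'machine': 'x86_64', 'cpuCores': 4},          # first run / host metadata runs
    'events': {'System': [{'event_type': 'Agent Startup',
                           'msg_text': 'Version 5.x.x'}]},        # startup event on first run
    'gohai': '{"cpu": {}, "memory": {}}',                          # raw gohai stdout (a JSON string)
    'meta': {'hostname': 'web-01', 'socket-fqdn': 'web-01.example.com'},
    'host-tags': {
        'system': ['env:prod', 'role:frontend', 'dd_check:tomcat'],  # static tags + dd_check:* tags
    },
    'external_host_tags': [('esx-host-1', {'vsphere': ['vsphere_cluster:c1']})],
    'agent_checks': [
        # (name, source_type, instance_id, status, error-or-warnings, service_metadata)
        ('tomcat', 'jmx', 0, 'OK', '', {}),
    ],
}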
Example #6
def get_jmx_status():
    """This function tries to read the 2 jmxfetch status file which are yaml file
    located in the temp directory.

    There are two files:
        - One is generated by the Agent itself, for JMX checks that could not be
        initialized because something is missing.
        Its format is as follows:

        ###
        invalid_checks:
              jmx: !!python/object/apply:jmxfetch.InvalidJMXConfiguration [You need to have at
                              least one instance defined in the YAML file for this check]
        timestamp: 1391040927.136523
        ###

        - The other is generated by JMXFetch and reports information about the metric
        collection. Its format is as follows:

        ###
        timestamp: 1391037347435
        checks:
          failed_checks:
            jmx:
            - {message: Unable to create instance. Please check your yaml file, status: ERROR}
          initialized_checks:
            tomcat:
            - {message: null, status: OK, metric_count: 7, instance_name: jmx-remihakim.fr-3000}
        ###
    """
    check_statuses = []
    java_status_path = JMXFiles.get_status_file_path()
    python_status_path = JMXFiles.get_python_status_file_path()
    if not os.path.exists(java_status_path) and not os.path.exists(python_status_path):
        log.debug("There is no jmx_status file at: %s or at: %s" % (java_status_path, python_status_path))
        return []

    check_data = defaultdict(lambda: defaultdict(list))
    try:
        if os.path.exists(java_status_path):
            java_jmx_stats = yaml.load(file(java_status_path))

            status_age = time.time() - java_jmx_stats.get('timestamp')/1000 # JMX timestamp is saved in milliseconds
            jmx_checks = java_jmx_stats.get('checks', {})

            if status_age > 60:
                check_statuses.append(
                    CheckStatus("jmx", [
                        InstanceStatus(
                            0,
                            STATUS_ERROR,
                            error="JMXfetch didn't return any metrics during the last minute"
                        )
                    ])
                )
            else:
                for check_name, instances in jmx_checks.get('failed_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        service_check_count = info.get('service_check_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)
                        check_data[check_name]['service_check_count'].append(service_check_count)

                for check_name, instances in jmx_checks.get('initialized_checks', {}).iteritems():
                    for info in instances:
                        message = info.get('message', None)
                        metric_count = info.get('metric_count', 0)
                        service_check_count = info.get('service_check_count', 0)
                        status = info.get('status')
                        instance_name = info.get('instance_name', None)
                        check_data[check_name]['statuses'].append(get_jmx_instance_status(instance_name, status, message, metric_count))
                        check_data[check_name]['metric_count'].append(metric_count)
                        check_data[check_name]['service_check_count'].append(service_check_count)

                for check_name, data in check_data.iteritems():
                    check_status = CheckStatus(check_name, data['statuses'],
                                               metric_count=sum(data['metric_count']),
                                               service_check_count=sum(data['service_check_count']))
                    check_statuses.append(check_status)

        if os.path.exists(python_status_path):
            python_jmx_stats = yaml.load(file(python_status_path))
            jmx_checks = python_jmx_stats.get('invalid_checks', {})
            for check_name, excep in jmx_checks.iteritems():
                check_statuses.append(CheckStatus(check_name, [], init_failed_error=excep))


        return check_statuses

    except Exception:
        log.exception("Couldn't load latest jmx status")
        return []
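
As a quick illustration of the Java-side format described in the docstring, the sketch below parses an equivalent document with yaml.safe_load (used here instead of the snippet's Python 2 yaml.load(file(...)) idiom) and walks the initialized_checks section the same way the code above does:

import yaml

# Sample matching the "generated by jmxfetch" format documented in the docstring above.
sample_status = """
timestamp: 1391037347435
checks:
  failed_checks:
    jmx:
    - {message: Unable to create instance. Please check your yaml file, status: ERROR}
  initialized_checks:
    tomcat:
    - {message: null, status: OK, metric_count: 7, instance_name: jmx-remihakim.fr-3000}
"""

stats = yaml.safe_load(sample_status)
# The Java-side timestamp is in milliseconds, hence the /1000 in status_age above.
age_seconds = stats['timestamp'] / 1000.0
for check_name, instances in stats['checks']['initialized_checks'].items():
    for info in instances:
        print(check_name, info.get('status'), info.get('metric_count', 0))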
Example #7
    def _start(self, path_to_java, java_run_opts, jmx_checks, command, reporter, tools_jar_path):
        statsd_port = self.agentConfig.get('dogstatsd_port', "8125")
        if reporter is None:
            reporter = "statsd:%s" % str(statsd_port)

        log.info("Starting jmxfetch:")
        try:
            path_to_java = path_to_java or "java"
            java_run_opts = java_run_opts or ""
            path_to_jmxfetch = self._get_path_to_jmxfetch()
            path_to_status_file = JMXFiles.get_status_file_path()

            if tools_jar_path is None:
                classpath = path_to_jmxfetch
            else:
                classpath = r"%s:%s" % (tools_jar_path, path_to_jmxfetch)

            subprocess_args = [
                path_to_java,  # Path to the java bin
                '-classpath',
                classpath,
                JMXFETCH_MAIN_CLASS,
                '--check_period', str(self.check_frequency * 1000),  # Period of the main loop of jmxfetch in ms
                '--conf_directory', r"%s" % self.confd_path,  # Path of the conf.d directory that will be read by jmxfetch
                '--log_level', JAVA_LOGGING_LEVEL.get(self.logging_config.get("log_level"), "INFO"),  # Log Level: Mapping from Python log level to log4j log levels
                '--log_location', r"%s" % self.logging_config.get('jmxfetch_log_file'),  # Path of the log file
                '--reporter', reporter,  # Reporter to use
                '--status_location', r"%s" % path_to_status_file,  # Path to the status file to write
                command,  # Name of the command
            ]

            if Platform.is_windows():
                # Signal handlers are not supported on Windows:
                # use a file to trigger JMXFetch exit instead
                path_to_exit_file = JMXFiles.get_python_exit_file_path()
                subprocess_args.insert(len(subprocess_args) - 1, '--exit_file_location')
                subprocess_args.insert(len(subprocess_args) - 1, path_to_exit_file)

            subprocess_args.insert(4, '--check')
            for check in jmx_checks:
                subprocess_args.insert(5, check)

            # Specify a maximum memory allocation pool for the JVM
            if "Xmx" not in java_run_opts and "XX:MaxHeapSize" not in java_run_opts:
                java_run_opts += _JVM_DEFAULT_MAX_MEMORY_ALLOCATION
            # Specify the initial memory allocation pool for the JVM
            if "Xms" not in java_run_opts and "XX:InitialHeapSize" not in java_run_opts:
                java_run_opts += _JVM_DEFAULT_INITIAL_MEMORY_ALLOCATION

            for opt in java_run_opts.split():
                subprocess_args.insert(1, opt)

            log.info("Running %s" % " ".join(subprocess_args))
            jmx_process = subprocess.Popen(subprocess_args, close_fds=True)
            self.jmx_process = jmx_process

            # Register SIGINT and SIGTERM signal handlers
            self.register_signal_handlers()

            # Wait for JMXFetch to return
            jmx_process.wait()

            return jmx_process.returncode

        except OSError:
            java_path_msg = "Couldn't launch JMXFetch. Is Java in your PATH?"
            log.exception(java_path_msg)
            invalid_checks = {}
            for check in jmx_checks:
                check_name = check.split('.')[0]
                check_name = check_name.encode('ascii', 'ignore')
                invalid_checks[check_name] = java_path_msg
            JMXFiles.write_status_file(invalid_checks)
            raise
        except Exception:
            log.exception("Couldn't launch JMXFetch")
            raise
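
Following the insertion logic in _start, the final argument list comes out roughly as sketched below. All paths, check file names, and JVM options are placeholders, and the main class and command values ('org.datadog.jmxfetch.App', 'collect') are assumptions about the surrounding constants rather than values taken from the snippet:

# Hedged reconstruction of the argv built in Example #7, with placeholder values.
subprocess_args = [
    '/usr/bin/java',                          # path_to_java
    '-classpath', '/opt/agent/jmxfetch.jar',  # classpath (tools.jar prepended when present)
    'org.datadog.jmxfetch.App',               # assumed value of JMXFETCH_MAIN_CLASS
    '--check_period', '15000',                # check frequency in ms
    '--conf_directory', '/etc/dd-agent/conf.d',
    '--log_level', 'INFO',
    '--log_location', '/var/log/datadog/jmxfetch.log',
    '--reporter', 'statsd:8125',
    'collect',                                # assumed value of JMX_COLLECT_COMMAND
]

# '--check' and the check config files land right after the main class; inserting each
# check at index 5 in a loop reverses their order relative to the original list.
subprocess_args.insert(4, '--check')
for check in ['tomcat.yaml', 'activemq.yaml']:
    subprocess_args.insert(5, check)

# Each java_run_opts token is inserted at index 1, i.e. right after the java binary.
for opt in '-Xmx200m -Xms50m'.split():
    subprocess_args.insert(1, opt)

print(' '.join(subprocess_args))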