Example #1
    def check(self, instance, create_event=True):
        if self.high_watermarks.get(instance.get('name'), None) is None:
            # On the first run of check(), prime the high_watermarks dict
            # so that we only send events that occurred after the agent
            # started.
            # (Setting high_watermarks in the next statement prevents
            #  any kind of infinite loop (assuming nothing ever sets
            #  high_watermarks to None again!))
            self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0)
            self.check(instance, create_event=False)

        jenkins_home = instance.get('jenkins_home', None)

        if not jenkins_home:
            raise Exception("No jenkins_home directory set in the config file")

        job_dirs = glob(os.path.join(jenkins_home, 'jobs', '*'))

        build_events = []

        for job_dir in job_dirs:
            for output in self._get_build_results(instance.get('name'), job_dir):
                output['api_key'] = self.agentConfig['api_key']
                output['host'] = get_hostname(self.agentConfig)
                if create_event:
                    self.log.debug("Creating event for job: %s" % output['job_name'])
                    self.event(output)
Example #2
    def test_collector(self):
        agentConfig = {
            'agent_key': 'test_agentkey',
            'check_timings': True,
            'collect_ec2_tags': True,
            'collect_instance_metadata': False,
            'create_dd_check_tags': False,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{"host": "localhost", "port": 6379}]
        }
        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })
        metrics = payload['metrics']

        # Check that we got a timing metric for all checks.
        timing_metrics = [m for m in metrics
            if m[0] == 'sd.agent.check_run_time']
        all_tags = []
        for metric in timing_metrics:
            all_tags.extend(metric[3]['tags'])
        for check in checks:
            tag = "check:%s" % check.name
            assert tag in all_tags, all_tags
Example #3
def load_check(name, config, agentConfig, is_sdk=False):
    if not is_sdk:
        checksd_path = get_checksd_path(get_os())

        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = __import__("check")

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
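    # Pick a concrete check class: skip AgentCheck itself, keep scanning while the
    # candidate inherits AgentCheck directly (a more derived class may still follow),
    # and stop at the first class that subclasses AgentCheck only indirectly.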
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
Example #4
def load_check(name, config, agentConfig):
    checksd_path = get_checksd_path(get_os())
    if checksd_path not in sys.path:
        sys.path.append(checksd_path)

    check_module = __import__(name)
    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', None)
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances)
    except:
        # Backwards compatibility for old checks that don't support the
        # instances argument.
        c = check_class(name, init_config=init_config, agentConfig=agentConfig)
        c.instances = instances
        return c
Example #5
    def test_apptags(self):
        '''
        Tests that the app tags are sent when so configured
        '''
        agentConfig = {
            'agent_key': 'test_agentkey',
            'collect_ec2_tags': False,
            'collect_instance_metadata': False,
            'create_dd_check_tags': True,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{"host": "localhost", "port": 6379}]
        }
        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })

        # We check that the redis DD_CHECK_TAG is sent in the payload
        self.assertTrue('dd_check:redisdb' in payload['host-tags']['system'])
Example #6
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuration dogstatsd")

    port      = c['dogstatsd_port']
    interval  = int(c['dogstatsd_interval'])
    normalize = c['dogstatsd_normalize']
    api_key   = c['api_key']
    non_local_traffic = c['non_local_traffic']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target'] 

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsAggregator(hostname, interval, recent_point_threshold=c.get('recent_point_threshold', None))

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = '127.0.0.1'
    # If specified, bind to all addresses
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator, server_host, port)

    return reporter, server
Example #7
    def check(self, agentConfig):
        process_exclude_args = agentConfig.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        # Get output from ps
        try:
            ps = sp.Popen(['ps', ps_arg], stdout=sp.PIPE, close_fds=True).communicate()[0]
        except StandardError:
            self.logger.exception('getProcesses')
            return False

        # Split out each process
        processLines = ps.split('\n')

        del processLines[0]  # Removes the headers
        processLines.pop()  # Removes a trailing empty line

        processes = []

        for line in processLines:
            line = line.split(None, 10)
            processes.append(map(lambda s: s.strip(), line))

        return {'processes':   processes,
                'apiKey':      agentConfig['api_key'],
                'host':        get_hostname(agentConfig)}
Example #8
    def create_event(self, state, server, agentConfig):
        """Create an event with a message describing the replication
            state of a mongo node"""

        def get_state_description(state):
            if state == 0: return 'Starting Up'
            elif state == 1: return 'Primary'
            elif state == 2: return 'Secondary'
            elif state == 3: return 'Recovering'
            elif state == 4: return 'Fatal'
            elif state == 5: return 'Starting up (initial sync)'
            elif state == 6: return 'Unknown'
            elif state == 7: return 'Arbiter'
            elif state == 8: return 'Down'
            elif state == 9: return 'Rollback'

        status = get_state_description(state)
        hostname = get_hostname(agentConfig)
        msg_title = "%s is %s" % (server, status)
        msg = "TokuMX %s just reported as %s" % (server, status)

        self.event({
            'timestamp': int(time.time()),
            'event_type': 'tokumx',
            'api_key': agentConfig['api_key'],
            'msg_title': msg_title,
            'msg_text': msg,
            'host': hostname
        })
Example #9
    def _wait_for_machine_configured(self, file_reader):
        """In the nosql and bigdata cases CMT changes the hostname; wait for that
           action to complete"""

        total_sleep_time = 0
        wait_for_conf = False
        for n in self._ctx.node_list:
            machine_type = file_reader.read_attribute(n.ip_address, 'MACHINE_TYPE')
            if machine_type == 'manager':
                wait_for_conf = True
                break
        if wait_for_conf:
            while True:
                if util.get_hostname() != self._ctx.this_node.hostname:
                    self._logger.debug("Sleep")
                    total_sleep_time += self._ctx.CMT_CONF_WAIT
                    if total_sleep_time >= self._ctx.MAX_CMT_CONF_WAIT:
                        util.log_exception("Waiting for machine configurtion took too long")
                        self.shutdown()
                    time.sleep(self._ctx.CMT_CONF_WAIT)
                else:
                    # sleep once more before exiting to make sure that the hostname
                    # change has propagated
                    time.sleep(self._ctx.CMT_CONF_WAIT)
                    break
Example #10
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator


        self.name = name
        self.init_config = init_config
        self.agentConfig = agentConfig
        self.hostname = get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None))

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
Example #11
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Dogstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    server_host = c['bind_host']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding']
    )

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size)

    # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses
    # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the
    # network settings), so it's enough to just pass an empty string '' to the library.
    # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to
    # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and
    # use the '::' meta address as `bind_host`.
    if non_local_traffic:
        server_host = '0.0.0.0'

    server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)

    return reporter, server, c
Example #12
    def check(self, logger, agentConfig):
        if self.high_watermarks is None:
            # On the first run of check(), prime the high_watermarks dict
            # so that we only send events that occurred after the agent
            # started.
            # (Setting high_watermarks in the next statement prevents
            #  any kind of infinite loop (assuming nothing ever sets
            #  high_watermarks to None again!))
            self.high_watermarks = defaultdict(lambda: 0)
            self.check(logger, agentConfig)

        hudson_home = agentConfig.get('hudson_home', None)

        if not hudson_home:
            return False

        job_dirs = glob(os.path.join(hudson_home, 'jobs', '*'))

        build_events = []

        for job_dir in job_dirs:
            for output in self._get_build_results(logger, job_dir):
                output['api_key'] = agentConfig['api_key']
                output['host'] = get_hostname(agentConfig)
                build_events.append(output)

        return build_events
Example #13
    def _get_hostname_metadata(self):
        """
        Returns a dictionary that contains hostname metadata.
        """
        metadata = EC2.get_metadata(self.agentConfig)
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')
            del metadata['hostname']

        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')
        else:
            try:
                metadata["socket-hostname"] = socket.gethostname()
            except Exception:
                pass
        try:
            metadata["socket-fqdn"] = socket.getfqdn()
        except Exception:
            pass

        metadata["hostname"] = get_hostname()

        # Add cloud provider aliases
        host_aliases = GCE.get_host_aliases(self.agentConfig)
        if host_aliases:
            metadata['host_aliases'] = host_aliases

        return metadata
Example #14
def load_check(name, config, agentConfig):
    checksd_path = get_checksd_path(get_os())
    if checksd_path not in sys.path:
        sys.path.append(checksd_path)

    check_module = __import__(name)
    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances)
    except Exception as e:
        raise Exception("Check is using old API, {0}".format(e))
Example #15
def setup_agent4(hostname=None, domain=None, pc="1", agent_conf="files/puppet-agent.conf", puppetserver=None, proxy_url=None, hosts_file=None):
    """Setup Puppet 4 agent"""
    import package, util, config

    if not hostname:
        hostname = util.get_hostname()
    if not domain:
        domain = util.get_domain()

    install_puppetlabs_release_package(pc, proxy_url=proxy_url)
    package.install("puppet-agent")

    # Use puppetserver value from setting.ini file if none is given on the
    # command-line. If that fails use the default.
    if not puppetserver:
        try:
            puppetserver = config.get("puppet", "puppetserver")
        except Exception:
            puppetserver = None

    # Add a customized puppet.conf
    util.put_and_chown(agent_conf, "/etc/puppetlabs/puppet/puppet.conf")
    if puppetserver: server = puppetserver
    else:            server = "puppet.%s" % domain
    sudo("puppet config set --section agent server %s" % server)

    util.set_hostname(hostname + "." + domain)
    util.add_host_entry(util.get_ip(), hostname, domain)

    # Optionally add hosts from a separate file. This is useful when the IP of
    # the puppetmaster as seen from the Puppet agent node does not match its
    # name in DNS.
    util.add_host_entries(hosts_file)
    util.add_to_path("/opt/puppetlabs/bin")
    run_agent(noop="True", onlychanges="False")
Example #16
    def check(self, agentConfig):
        process_exclude_args = agentConfig.get('exclude_process_args', False)
        if process_exclude_args:
            ps_arg = 'aux'
        else:
            ps_arg = 'auxww'
        # Get output from ps
        try:
            output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
            processLines = output.splitlines()  # Also removes a trailing empty line
        except StandardError:
            self.logger.exception('getProcesses')
            return False

        del processLines[0]  # Removes the headers

        processes = []

        for line in processLines:
            line = line.split(None, 10)
            processes.append(map(lambda s: s.strip(), line))

        return {'processes':   processes,
                'apiKey':      agentConfig['api_key'],
                'host':        get_hostname(agentConfig)}
Example #17
    def __init__(self, args):
        win32serviceutil.ServiceFramework.__init__(self, args)
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
        config = get_config(parse_args=False)

        # Setup the correct options so the agent will use the forwarder
        opts, args = Values({
            'autorestart': False,
            'dd_url': None,
            'use_forwarder': True,
            'disabled_dd': False,
            'profile': False
        }), []
        agentConfig = get_config(parse_args=False, options=opts)
        self.hostname = get_hostname(agentConfig)

        # Watchdog for Windows
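        # One-way pipe (duplex=False): the collector process sends heartbeats
        # on one end, the service reads them on the other.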
        self._collector_heartbeat, self._collector_send_heartbeat = multiprocessing.Pipe(False)
        self._collector_failed_heartbeats = 0
        self._max_failed_heartbeats = \
            MAX_FAILED_HEARTBEATS * agentConfig['check_freq'] / SERVICE_SLEEP_INTERVAL

        # Watch JMXFetch restarts
        self._MAX_JMXFETCH_RESTARTS = 3
        self._count_jmxfetch_restarts = 0

        # Keep a list of running processes so we can start/end as needed.
        # Processes will be started in order and stopped in reverse order.
        self.procs = {
            'forwarder': ProcessWatchDog("forwarder", DDForwarder(config, self.hostname)),
            'collector': ProcessWatchDog("collector", DDAgent(agentConfig, self.hostname,
                                         heartbeat=self._collector_send_heartbeat)),
            'dogstatsd': ProcessWatchDog("dogstatsd", DogstatsdProcess(config, self.hostname)),
            'jmxfetch': ProcessWatchDog("jmxfetch", JMXFetchProcess(config, self.hostname), 3),
        }
Example #18
    def reload_configs(self, checks_to_reload=set()):
        """Reload the agent configuration and checksd configurations.
           Can also reload only an explicit set of checks."""
        log.info("Attempting a configuration reload...")
        hostname = get_hostname(self._agentConfig)

        # if no check was given, reload them all
        if not checks_to_reload:
            log.debug("No check list was passed, reloading every check")
            # stop checks
            for check in self._checksd.get('initialized_checks', []):
                check.stop()

            self._checksd = load_check_directory(self._agentConfig, hostname)
        else:
            new_checksd = copy(self._checksd)

            self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
            # once the reload is done, replace existing checks with the new ones
            self._checksd = new_checksd

        # Logging
        num_checks = len(self._checksd['initialized_checks'])
        if num_checks > 0:
            opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''

            msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
                num_checks=num_checks, opt_msg=opt_msg)
            log.info(msg)
        else:
            log.info("No checksd configs found")
Example #19
    def test_collector(self):
        agentConfig = {
            "api_key": "test_apikey",
            "check_timings": True,
            "collect_ec2_tags": True,
            "collect_instance_metadata": False,
            "version": "test",
            "tags": "",
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {"init_config": {}, "instances": [{"host": "localhost", "port": 6379}]}
        checks = [load_check("redisdb", redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({"initialized_checks": checks, "init_failed_checks": {}})
        metrics = payload["metrics"]

        # Check that we got a timing metric for all checks.
        timing_metrics = [m for m in metrics if m[0] == "datadog.agent.check_run_time"]
        all_tags = []
        for metric in timing_metrics:
            all_tags.extend(metric[3]["tags"])
        for check in checks:
            tag = "check:%s" % check.name
            assert tag in all_tags, all_tags
Example #20
    def __init__(self, args):
        win32serviceutil.ServiceFramework.__init__(self, args)
        self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
        config = get_config(parse_args=False)

        # Setup the correct options so the agent will use the forwarder
        opts, args = Values({
            'dd_url': None,
            'clean': False,
            'use_forwarder': True,
            'disabled_dd': False
        }), []
        agentConfig = get_config(parse_args=False, options=opts)
        self.hostname = get_hostname(agentConfig)
        self.restart_interval = \
            int(agentConfig.get('autorestart_interval', RESTART_INTERVAL))
        log.info("Autorestarting the collector ever %s seconds" % self.restart_interval)

        # Keep a list of running processes so we can start/end as needed.
        # Processes will be started in order and stopped in reverse order.
        self.procs = {
            'forwarder': DDForwarder(config, self.hostname),
            'collector': DDAgent(agentConfig, self.hostname),
            'dogstatsd': DogstatsdProcess(config, self.hostname),
            'pup':       PupProcess(config),
        }
Example #21
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None
        self._internal_profiling_stats = None

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold', None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles')
        )

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        self.last_collection_time = defaultdict(int)
Example #22
    def testCheck(self):
        config = {
            'init_config': {},
            'instances': [{
                'url': 'http://localhost:3834/stats',
                'username': '******',
                'password': '******',
                'status_check': True,
                'collect_aggregates_only': False,
                'tag_service_check_by_host': True,
            }]
        }
        self.start_server(HAPROXY_CFG, config)

        # Run the check against our running server
        self.check.check(config['instances'][0])
        # Sleep for 1 second so the rate interval >=1
        time.sleep(1)
        # Run the check again so we get the rates
        self.check.check(config['instances'][0])

        # Metric assertions
        metrics = self.check.get_metrics()
        assert metrics
        self.assertTrue(type(metrics) == type([]))
        self.assertTrue(len(metrics) > 0)
        service_checks = self.check.get_service_checks()
        assert service_checks
        self.assertTrue(type(service_checks) == type([]))
        self.assertTrue(len(service_checks) > 0)

        self.assertEquals(len([t for t in metrics
            if t[0] == "haproxy.backend.bytes.in_rate"]), 3, metrics)
        self.assertEquals(len([t for t in metrics
            if t[0] == "haproxy.frontend.session.current"]), 1, metrics)
        # check was run 2 times
        #       - FRONTEND is reporting OPEN that we ignore
        #       - only the BACKEND aggregate is reporting UP -> OK
        #       - The 3 individual servers are returning no check -> UNKNOWN
        self.assertEquals(len([t for t in service_checks
            if t['status']== 0]), 2, service_checks)
        self.assertEquals(len([t for t in service_checks
            if t['status']== 3]), 6, service_checks)

        # Make sure the service checks aren't tagged with an empty hostname.
        for service_check in service_checks:
            self.assertEquals(service_check['host_name'], get_hostname())

        inst = config['instances'][0]
        data = self.check._fetch_data(inst['url'], inst['username'], inst['password'])
        new_data = [l.replace("no check", "UP") for l in data]
        self.check._process_data(new_data, False, True, inst['url'])

        assert self.check.has_events()
        assert len(self.check.get_events()) == 3 # The 3 individual backend servers were switched to UP
        service_checks = self.check.get_service_checks()
        # The 3 servers + the backend aggregate are reporting UP
        self.assertEquals(len([t for t in service_checks
            if t['status'] == 0]), 4, service_checks)
Example #23
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_dogstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Dogstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    log.debug("Configuring dogstatsd")

    port = c['dogstatsd_port']
    interval = DOGSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding']
    )

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = c['bind_host']
    # If specified, bind to all addresses
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port)

    return reporter, server, c
Example #24
    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics["uuid"] = get_uuid()
            self._metrics["internalHostname"] = get_hostname(self._agentConfig)
            self._metrics["apiKey"] = self._agentConfig["api_key"]
            MetricTransaction(json.dumps(self._metrics), headers={"Content-Type": "application/json"})
            self._metrics = {}
Example #25
    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(self._metrics, {})
            self._metrics = {}
Example #26
    def check(self, instance, create_event=True):
        """
        DEPRECATED:
        This Jenkins check is deprecated and not actively developed anymore. It will be
        removed in a future version of the Datadog Agent. Please move to using the Datadog
        plugin for Jenkins. More information can be found on the Jenkins Integration panel
        under the Configuration tab (https://app.datadoghq.com/account/settings#integrations/jenkins)
        """
        self.warning("This check is deprecated in favor of our Jenkins Datadog plugin."
                     " It will be removed in a future version of the Datadog Agent."
                     " More information can be found on the Jenkins Integration panel"
                     " under the Configuration tab"
                     " (https://app.datadoghq.com/account/settings#integrations/jenkins)")

        if self.high_watermarks.get(instance.get('name'), None) is None:
            # On the first run of check(), prime the high_watermarks dict
            # so that we only send events that occurred after the agent
            # started.
            # (Setting high_watermarks in the next statement prevents
            #  any kind of infinite loop (assuming nothing ever sets
            #  high_watermarks to None again!))
            self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0)
            self.check(instance, create_event=False)

        jenkins_home = instance.get('jenkins_home')

        if not jenkins_home:
            raise Exception("No jenkins_home directory set in the config file")

        jenkins_jobs_dir = os.path.join(jenkins_home, 'jobs', '*')
        job_dirs = glob(jenkins_jobs_dir)

        if not job_dirs:
            raise Exception('No jobs found in `%s`! '
                            'Check `jenkins_home` in your config' % (jenkins_jobs_dir))

        for job_dir in job_dirs:
            for output in self._get_build_results(instance.get('name'), job_dir):
                output['host'] = get_hostname(self.agentConfig)
                if create_event:
                    self.log.debug("Creating event for job: %s" % output['job_name'])
                    self.event(output)

                    tags = [
                        'job_name:%s' % output['job_name'],
                        'result:%s' % output['result'],
                        'build_number:%s' % output['number']
                    ]

                    if 'branch' in output:
                        tags.append('branch:%s' % output['branch'])
                    self.gauge("jenkins.job.duration", float(output['duration'])/1000.0, tags=tags)

                    if output['result'] == 'SUCCESS':
                        self.increment('jenkins.job.success', tags=tags)
                    else:
                        self.increment('jenkins.job.failure', tags=tags)
Example #27
    def _postMetrics(self):

        if len(self._metrics) > 0:
            self._metrics['uuid'] = get_uuid()
            self._metrics['internalHostname'] = get_hostname(self._agentConfig)
            self._metrics['apiKey'] = self._agentConfig['api_key']
            MetricTransaction(json.dumps(self._metrics),
                              headers={'Content-Type': 'application/json'})
            self._metrics = {}
Example #28
    def _build_payload(self, start_event=True):
        """
        Return a dictionary that contains all of the generic payload data.
        """
        now = time.time()
        payload = {
            'collection_timestamp': now,
            'os' : self.os,
            'python': sys.version,
            'agentVersion' : self.agentConfig['version'],
            'apiKey': self.agentConfig['api_key'],
            'events': {},
            'metrics': [],
            'service_checks': [],
            'resources': {},
            'internalHostname' : get_hostname(self.agentConfig),
            'uuid' : get_uuid(),
            'host-tags': {},
        }

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload['systemStats'] = self.agentConfig.get('system_stats', {})
            # Also post an event in the newsfeed
            payload['events']['System'] = [{'api_key': self.agentConfig['api_key'],
                                 'host': payload['internalHostname'],
                                 'timestamp': now,
                                 'event_type':'Agent Startup',
                                 'msg_text': 'Version %s' % get_version()
                                 }]

        # Periodically send the host metadata.
        if self._is_first_run() or self._should_send_metadata():
            payload['systemStats'] = get_system_stats()
            payload['meta'] = self._get_metadata()
            self.metadata_cache = payload['meta']
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig['tags'] is not None:
                host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")])

            if self.agentConfig['collect_ec2_tags']:
                host_tags.extend(EC2.get_tags())

            if host_tags:
                payload['host-tags']['system'] = host_tags

            GCE_tags = GCE.get_tags()
            if GCE_tags is not None:
                payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info("Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload['host-tags']))

        return payload
Example #29
def parse_log(api_key, log_file):
    import logging
    import socket
    import sys

    logger = logging.getLogger("ddagent.checks.nagios")
    nagios = Nagios(get_hostname())

    events = nagios.check(logger, {'api_key': api_key, 'nagios_log': log_file}, move_end=False)
    for e in events:
        yield e
Example #30
    def _build_payload(self, start_event=True):
        """
        Return a dictionary that contains all of the generic payload data.
        """
        now = time.time()
        payload = {
            "collection_timestamp": now,
            "os": self.os,
            "python": sys.version,
            "agentVersion": self.agentConfig["version"],
            "apiKey": self.agentConfig["api_key"],
            "events": {},
            "metrics": [],
            "resources": {},
            "internalHostname": get_hostname(self.agentConfig),
            "uuid": get_uuid(),
            "host-tags": {},
        }

        # Include system stats on first postback
        if start_event and self._is_first_run():
            payload["systemStats"] = self.agentConfig.get("system_stats", {})
            # Also post an event in the newsfeed
            payload["events"]["System"] = [
                {
                    "api_key": self.agentConfig["api_key"],
                    "host": payload["internalHostname"],
                    "timestamp": now,
                    "event_type": "Agent Startup",
                    "msg_text": "Version %s" % get_version(),
                }
            ]

        # Periodically send the host metadata.
        if self._is_first_run() or self._should_send_metadata():
            payload["systemStats"] = get_system_stats()
            payload["meta"] = self._get_metadata()
            self.metadata_cache = payload["meta"]
            # Add static tags from the configuration file
            host_tags = []
            if self.agentConfig["tags"] is not None:
                host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig["tags"].split(",")])

            if self.agentConfig["collect_ec2_tags"]:
                host_tags.extend(EC2.get_tags())

            if host_tags:
                payload["host-tags"]["system"] = host_tags

            # Log the metadata on the first run
            if self._is_first_run():
                log.info(u"Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload["host-tags"]))

        return payload
Example #31
    def test_collector(self):
        agentConfig = {
            'api_key': 'test_apikey',
            'check_timings': True,
            'collect_ec2_tags': True,
            'collect_instance_metadata': False,
            'version': 'test',
            'tags': '',
        }

        # Run a single checks.d check as part of the collector.
        redis_config = {
            "init_config": {},
            "instances": [{
                "host": "localhost",
                "port": 6379
            }]
        }
        checks = [load_check('redisdb', redis_config, agentConfig)]

        c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
        payload = c.run({
            'initialized_checks': checks,
            'init_failed_checks': {}
        })
        metrics = payload['metrics']

        # Check that we got a timing metric for all checks.
        timing_metrics = [
            m for m in metrics if m[0] == 'datadog.agent.check_run_time'
        ]
        all_tags = []
        for metric in timing_metrics:
            all_tags.extend(metric[3]['tags'])
        for check in checks:
            tag = "check:%s" % check.name
            assert tag in all_tags, all_tags
Example #32
    def __init__(self):
        GObject.Object.__init__(self)
        self.hostname = util.get_hostname()
        self.code = None
        self.ident = None

        self.cert_server = None
        self.requests_lock = threading.Lock()
        self.requests = {}

        self.clean_cert_folder()

        self.keyfile = GLib.KeyFile()

        try:
            self.keyfile.load_from_file(os.path.join(CONFIG_FOLDER, CONFIG_FILE_NAME), GLib.KeyFileFlags.NONE)
        except GLib.Error as e:
            if e.code == GLib.FileError.NOENT:
                logging.debug("Auth: No group code file, making one.")
                pass
            else:
                logging.debug("Auth: Could not load existing keyfile (%s): %s" %(CONFIG_FOLDER, e.message))

        self.code = self.get_group_code()
Example #33
    def submit_events(self, events):
        headers = {'Content-Type': 'application/json'}
        method = 'POST'

        events_len = len(events)
        event_chunk_size = self.event_chunk_size

        for chunk in chunks(events, event_chunk_size):
            payload = {
                'apiKey': self.api_key,
                'events': {
                    'api': chunk
                },
                'uuid': get_uuid(),
                'internalHostname': get_hostname()
            }
            params = {}
            if self.api_key:
                params['api_key'] = self.api_key
            url = '/intake?%s' % urlencode(params)

            status = None
            conn = self.http_conn_cls(self.api_host)
            try:
                start_time = time()
                conn.request(method, url, json.dumps(payload), headers)

                response = conn.getresponse()
                status = response.status
                response.close()
                duration = round((time() - start_time) * 1000.0, 4)
                log.debug("%s %s %s%s (%sms)" %
                          (status, method, self.api_host, url, duration))

            finally:
                conn.close()
Example #34
    def _get_hostname_metadata(self):
        """
        Returns a dictionary that contains hostname metadata.
        """
        metadata = EC2.get_metadata(self.agentConfig)
        if metadata.get('hostname'):
            metadata['ec2-hostname'] = metadata.get('hostname')
            del metadata['hostname']

        if self.agentConfig.get('hostname'):
            metadata['agent-hostname'] = self.agentConfig.get('hostname')
        else:
            try:
                metadata["socket-hostname"] = socket.gethostname()
            except Exception:
                pass
        try:
            metadata["socket-fqdn"] = socket.getfqdn()
        except Exception:
            pass

        metadata["hostname"] = get_hostname()

        return metadata
Example #35
def load_check(name, config, agentConfig):
    checksd_path = get_checksd_path(get_os())
    if checksd_path not in sys.path:
        sys.path.append(checksd_path)

    check_module = __import__(name)
    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break
    if check_class is None:
        raise Exception(
            "Unable to import check %s. Missing a class that inherits AgentCheck"
            % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name,
                           init_config=init_config,
                           agentConfig=agentConfig,
                           instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
Example #36
    def __init__(self, name, init_config, agentConfig, instances=None):
        """
        Initialize a new check.

        :param name: The name of the check
        :param init_config: The config for initializing the check
        :param agentConfig: The global configuration for the agent
        :param instances: A list of configuration objects for each instance.
        """
        from aggregator import MetricsAggregator

        self.name = name
        self.init_config = init_config or {}
        self.agentConfig = agentConfig
        self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None
        self._internal_profiling_stats = None

        self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig)
        self.log = logging.getLogger('%s.%s' % (__name__, name))

        self.aggregator = MetricsAggregator(
            self.hostname,
            formatter=agent_formatter,
            recent_point_threshold=agentConfig.get('recent_point_threshold', None),
            histogram_aggregates=agentConfig.get('histogram_aggregates'),
            histogram_percentiles=agentConfig.get('histogram_percentiles')
        )

        self.events = []
        self.service_checks = []
        self.instances = instances or []
        self.warnings = []
        self.library_versions = None
        self.last_collection_time = defaultdict(int)
        self._instance_metadata = []
        self.svc_metadata = []
Example #37
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)

    COMMANDS = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
        'check',
        'configcheck',
        'jmx',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    if options.clean:
        pid_file.clean()

    agent = Agent(pid_file.get_path(), autorestart)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')
            def child_func(): agent.run()
            def parent_func(): agent.start_event = False
            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.run(config=agentConfig)

    elif 'check' == command:
        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    check.run()
                    print check.get_metrics()
                    print check.get_events()
                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        check.run()
                        print check.get_metrics()
                        print check.get_events()

    elif 'configcheck' == command or 'configtest' == command:
        osname = get_os()
        all_valid = True
        for conf_path in glob.glob(os.path.join(get_confd_path(osname), "*.yaml")):
            basename = os.path.basename(conf_path)
            try:
                check_yaml(conf_path)
            except Exception, e:
                all_valid = False
                print "%s contains errors:\n    %s" % (basename, e)
            else:
                print "%s is valid" % basename
        if all_valid:
            print "All yaml files passed. You can now run the Datadog agent."
            return 0
        else:
            print("Fix the invalid yaml files above in order to start the Datadog agent. "
                    "A useful external tool for yaml parsing can be found at "
                    "http://yaml-online-parser.appspot.com/")
            return 1
Example #38
    def get_my_hostname(self):
        """
        Returns a best guess for the hostname registered with OpenStack for this host
        """
        return self.init_config.get("os_host") or get_hostname(self.agentConfig)
Example #39
def generate_instances(check_platform=False):
    print("Generating instances...")
    instances = []

    core_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
    instances.extend(
        get_instance_list(BenchmarkApo(),
                          PlatformCrayUPC(param_cores=[128, 256, 512, 1024]),
                          check_platform))
    instances.extend(
        get_instance_list(
            BenchmarkApo(),
            PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
            check_platform))

    return instances
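    # NOTE: the early return above makes everything below unreachable; the remaining
    # instance definitions appear to be kept only as disabled configurations.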

    instances.extend(
        get_instance_list(
            BenchmarkNPB(minClass='C', maxClass='D', kernel_list=['ft']),
            PlatformCrayUPC(param_cores=[128, 256, 512, 1024]),
            check_platform))

    instances.extend(
        get_instance_list(
            BenchmarkNPB(minClass='B', maxClass='C', kernel_list=None),
            PlatformCrayUPC(param_cores=[16, 32, 64, 128, 256, 512, 1024]),
            check_platform))

    if get_hostname() == 'bulldozer-server':
        instances.extend(
            get_instance_list(
                BenchmarkUBMatrixMultiplication(),
                PlatformBerkeley(param_cores=[1, 2, 4, 8, 16, 32],
                                 conduits=['smp']), check_platform)
        )  #   disable_optimization=True, experimental=True
        return instances

    if False:
        instances.extend(
            get_instance_list(
                BenchmarkNPB(maxClass='B', minClass='B', kernel_list=['ft']),
                PlatformBerkeleyTrace(param_cores=core_list, conduits=['ibv']),
                check_platform))
        instances.extend(
            get_instance_list(
                BenchmarkNPB(minClass='B', maxClass='B', kernel_list=['ft']),
                PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
                check_platform)
        )  #   disable_optimization=True, experimental=True
        instances.extend(
            get_instance_list(
                BenchmarkNPB(maxClass='A',
                             minClass='A',
                             kernel_list=['ft', 'cg', 'is', 'mg']),
                PlatformBerkeleyTrace(param_cores=core_list, conduits=['ibv']),
                check_platform))
        instances.extend(
            get_instance_list(
                BenchmarkNPB(minClass='A',
                             maxClass='A',
                             kernel_list=['ft', 'cg', 'is', 'mg']),
                PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
                check_platform)
        )  #   disable_optimization=True, experimental=True

    #instances.extend( get_instance_list(BenchmarkNPB(minClass='A', maxClass='A'), PlatformBerkeleyTile(), check_platform) ) #   disable_optimization=True, experimental=True
    #instances.extend( get_instance_list(BenchmarkNPB_CSEQ(minClass='A', maxClass='A'), PlatformTileCC(), check_platform) )

    #instances.extend( get_instance_list(BenchmarkSSCA3(maxScale=3), PlatformCrayUPCxt5(), check_platform) )
    #instances.extend( get_instance_list(BenchmarkSSCA3(maxScale=2, withFFTW=False), PlatformBerkeleyTile()) )
    #instances.extend( get_instance_list(BenchmarkMatrixMultiplication(), PlatformBerkeleyTile(), check_platform) )
    #instances.extend( get_instance_list(BenchmarkSobel(), PlatformBerkeleyTile(), check_platform) )
    #instances.extend( get_instance_list(BenchmarkRandomAccess2(), PlatformBerkeleyTile(), check_platform) )
    if False:
        instances.extend(
            get_instance_list(BenchmarkMatrixMultiplication(),
                              PlatformBerkeley(), check_platform))
        instances.extend(
            get_instance_list(BenchmarkSobel(), PlatformBerkeley(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccess(), PlatformBerkeley(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccess(), PlatformBerkeleyTile(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccessSeq(), PlatformTileCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccessSeq(), PlatformGCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccess2(), PlatformBerkeley(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccess2Seq(), PlatformTileCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkRandomAccess2Seq(), PlatformGCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkSobelSeq(), PlatformTileCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkSobelSeq(), PlatformGCC(),
                              check_platform))
        instances.extend(
            get_instance_list(BenchmarkMatrixMultiplicationSeq(),
                              PlatformTileCC(), check_platform))
        instances.extend(
            get_instance_list(BenchmarkMatrixMultiplicationSeq(),
                              PlatformGCC(), check_platform))

    print(len(instances), " instances available.")
    print("")

    return instances
Example #40
    def check_if_valid(self):
        hostname = util.get_hostname()
        return True
Example #41
    def testCheck(self):
        config = {
            'init_config': {},
            'instances': [{
                'url': 'http://localhost:3834/stats',
                'username': '******',
                'password': '******',
                'status_check': True,
                'collect_aggregates_only': False,
                'tag_service_check_by_host': True,
            }]
        }
        self.start_server(HAPROXY_CFG, config)

        # Run the check against our running server
        self.check.check(config['instances'][0])
        # Sleep for 1 second so the rate interval >=1
        time.sleep(1)
        # Run the check again so we get the rates
        self.check.check(config['instances'][0])

        # Metric assertions
        metrics = self.check.get_metrics()
        assert metrics
        self.assertTrue(type(metrics) == type([]))
        self.assertTrue(len(metrics) > 0)
        service_checks = self.check.get_service_checks()
        assert service_checks
        self.assertTrue(type(service_checks) == type([]))
        self.assertTrue(len(service_checks) > 0)

        self.assertEquals(
            len([
                t for t in metrics if t[0] == "haproxy.backend.bytes.in_rate"
            ]), 3, metrics)
        self.assertEquals(
            len([
                t for t in metrics
                if t[0] == "haproxy.frontend.session.current"
            ]), 1, metrics)
        # The check was run 2 times:
        #       - FRONTEND is reporting OPEN, which we ignore
        #       - only the BACKEND aggregate is reporting UP -> OK
        #       - the 3 individual servers are returning no check -> UNKNOWN
        self.assertEquals(len([t for t in service_checks if t['status'] == 0]),
                          2, service_checks)
        self.assertEquals(len([t for t in service_checks if t['status'] == 3]),
                          6, service_checks)

        # Make sure the service checks aren't tagged with an empty hostname.
        for service_check in service_checks:
            self.assertEquals(service_check['host_name'], get_hostname())

        inst = config['instances'][0]
        data = self.check._fetch_data(inst['url'], inst['username'],
                                      inst['password'])
        new_data = [l.replace("no check", "UP") for l in data]
        self.check._process_data(new_data, False, True, inst['url'])

        assert self.check.has_events()
        # The 3 individual backend servers were switched to UP
        assert len(self.check.get_events()) == 3
        service_checks = self.check.get_service_checks()
        # The 3 servers + the backend aggregate are reporting UP
        self.assertEquals(len([t for t in service_checks if t['status'] == 0]),
                          4, service_checks)
Example #42
0
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, which send it to its final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats):
        self.emit_duration = None
        self.agentConfig = agentConfig
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
        self.metadata_start = time.time()
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = []

        # Unix System Checks
        self._unix_system_checks = {
            'disk': u.Disk(log),
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'disk': w32.Disk(log),
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent Metrics
        self._agent_metrics = CollectorMetrics(log)

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0: continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to the checks.d interface as old custom checks will be removed in a next version")
            except Exception, e:
                log.exception('Unable to load custom check module %s' % module_spec)

        # Event Checks
        self._event_checks = [
            Nagios(get_hostname()),
        ]

        # Resource Checks
        self._resources_checks = [
            ResProcesses(log,self.agentConfig)
        ]
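
The loop above that handles the custom_checks option expects each listed module to expose a class named Check, constructed with the agent logger; Example #47 further down drives the same old-style interface through check(agentConfig). A minimal sketch of such a module follows; the module name, metric name, and return shape are illustrative assumptions, not part of the agent's documented interface.

# custom_uptime.py -- hypothetical old-style custom check module,
# referenced from the custom_checks config option.
import time


class Check(object):
    def __init__(self, logger):
        # The collector instantiates the class with its logger (see the loop above).
        self.logger = logger
        self.start = time.time()

    def check(self, agentConfig):
        # Invoked once per collection run (see Example #47); the returned dict
        # is only an illustration of "name -> value" metrics.
        uptime = time.time() - self.start
        self.logger.debug("custom_uptime: %.1f seconds since start" % uptime)
        return {'custom.agent_uptime': uptime}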
Example #43
0
        def flush_trs():
            if self._watchdog:
                self._watchdog.reset()
            self._postMetrics()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs, TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchdog:
            self._watchdog.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
        self.mloop.stop()
Example #44
0
    def get_value(self):
        return util.get_hostname()
Example #45
0
def init(config_path=None,
         use_watchmonitor=False,
         use_forwarder=False,
         args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_monitorstatsd']
            and (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Monitorstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly
        # and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    log.debug("Configuring monitorstatsd")

    port = c['monitorstatsd_port']
    interval = monitorSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = monitorSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    ip = c.get('ip', "unknown")

    target = c['m_url']
    if use_forwarder:
        target = c['monitorstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval

    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding'])

    # Start the reporting thread.
    reporter = Reporter(c, interval, aggregator, target, api_key,
                        use_watchmonitor, event_chunk_size)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = c['bind_host']
    # If specified, bind to all addresses
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator,
                    server_host,
                    port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, c
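
init() hands back the reporting thread, the UDP server, and the parsed config. A rough sketch of how a caller might wire them together is below; the start()/stop() methods on reporter and server are assumptions about their interfaces and are not shown in this snippet.

# Hypothetical driver for init(); the method names on reporter/server are assumptions.
reporter, server, cnf = init(use_forwarder=True)
reporter.start()        # assumed: background thread that periodically flushes the aggregator
try:
    server.start()      # assumed: blocks, feeding incoming packets into the shared aggregator
finally:
    reporter.stop()     # assumed: ask the reporting thread to flush once more and exit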
Example #46
0
        def run_secure_loop():
            logging.debug(
                "Remote: Starting a new connection loop for %s (%s:%d)" %
                (self.display_hostname, self.ip_address, self.port))

            cert = auth.get_singleton().load_cert(self.hostname,
                                                  self.ip_address)
            creds = grpc.ssl_channel_credentials(cert)

            with grpc.secure_channel("%s:%d" % (self.ip_address, self.port),
                                     creds) as channel:
                future = grpc.channel_ready_future(channel)

                try:
                    future.result(timeout=4)
                    self.stub = warp_pb2_grpc.WarpStub(channel)
                except grpc.FutureTimeoutError:
                    self.set_remote_status(RemoteStatus.UNREACHABLE)
                    future.cancel()

                    if not self.ping_timer.is_set():
                        logging.debug(
                            "Remote: Unable to establish secure connection with %s (%s:%d). Trying again in %ds"
                            % (self.display_hostname, self.ip_address,
                               self.port, CHANNEL_RETRY_WAIT_TIME))
                        self.ping_timer.wait(CHANNEL_RETRY_WAIT_TIME)
                        return True  # run_secure_loop()

                    return False  # run_secure_loop()

                duplex_fail_counter = 0
                one_ping = False  # A successful duplex response lets us finish setting things up.

                while not self.ping_timer.is_set():

                    if self.busy:
                        logging.debug(
                            "Remote Ping: Skipping keepalive ping to %s (%s:%d) (busy)"
                            % (self.display_hostname, self.ip_address,
                               self.port))
                        self.busy = False
                    else:
                        try:
                            # t = GLib.get_monotonic_time()
                            logging.debug("Remote Ping: to   %s (%s:%d)" %
                                          (self.display_hostname,
                                           self.ip_address, self.port))
                            self.stub.Ping(warp_pb2.LookupName(
                                id=self.local_ident,
                                readable_name=util.get_hostname()),
                                           timeout=5)
                            # logging.debug("Latency: %s (%s)"
                            # % (util.precise_format_time_span(GLib.get_monotonic_time() - t), self.display_hostname))
                            if not one_ping:
                                self.set_remote_status(
                                    RemoteStatus.AWAITING_DUPLEX)
                                if self.check_duplex_connection():
                                    logging.debug(
                                        "Remote: Connected to %s (%s:%d)" %
                                        (self.display_hostname,
                                         self.ip_address, self.port))

                                    self.set_remote_status(RemoteStatus.ONLINE)

                                    self.rpc_call(
                                        self.update_remote_machine_info)
                                    self.rpc_call(
                                        self.update_remote_machine_avatar)
                                    one_ping = True
                                else:
                                    duplex_fail_counter += 1
                                    if duplex_fail_counter > DUPLEX_MAX_FAILURES:
                                        logging.debug(
                                            "Remote: CheckDuplexConnection to %s (%s:%d) failed too many times"
                                            % (self.display_hostname,
                                               self.ip_address, self.port))
                                        self.ping_timer.wait(
                                            CHANNEL_RETRY_WAIT_TIME)
                                        return True
                        except grpc.RpcError as e:
                            logging.debug(
                                "Remote: Ping failed, shutting down %s (%s:%d)"
                                % (self.display_hostname, self.ip_address,
                                   self.port))
                            break

                    self.ping_timer.wait(
                        CONNECTED_PING_TIME if self.status ==
                        RemoteStatus.ONLINE else DUPLEX_WAIT_PING_TIME)

                # This is reached by the RpcError break above.  If the remote is still discoverable,
                # start the secure loop over.  This can happen after a quick disconnect/reconnect that
                # we only notice once the remote has already come back.  In that case, try a new connection.
                if self.has_zc_presence and not self.ping_timer.is_set():
                    return True  # run_secure_loop()

                # The ping timer has been triggered, this is an orderly shutdown.
                return False  # run_secure_loop()
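
The True/False returns above are the entire contract of run_secure_loop(): True asks for a fresh connection attempt, False signals an orderly stop. The caller is not shown in this snippet, but a minimal driver consistent with those comments would be:

# Hypothetical caller: keep opening new secure channels until the loop reports
# an orderly shutdown (ping timer set, or the remote no longer discoverable).
while run_secure_loop():
    pass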
Example #47
0
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    if command not in DD_AGENT_COMMANDS:
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart, in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception, e:
            print 'The upload failed:\n{0}'.format(str(e))
Example #48
0
    def run(self, config=None):

        signal.signal(signal.SIGTERM, self._handle_sigterm)

        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        signal.signal(signal.SIGINT, self._handle_sigterm)

        signal.signal(signal.SIGHUP, self._handle_sighup)

        CollectorStatus().persist()

        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats()
        emitters = self._get_emitters()

        self._checksd = load_check_directory(self._agentConfig, hostname)

        self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

        self.collector_profile_interval = self._agentConfig.get('collector_profile_interval',
                                                                DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        self.check_frequency = int(self._agentConfig['check_freq'])
        watchmonitor = self._get_watchmonitor(self.check_frequency)

        self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        collector_profiled_runs = 0

        while self.run_forever:
            log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd['initialized_checks'])))

            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=self.configs_reloaded)
            if self.configs_reloaded:
                self.configs_reloaded = False
            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            if self.autorestart and self._should_restart():
                self._do_restart()

            if self.run_forever:
                if watchmonitor:
                    watchmonitor.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(self.check_frequency))
                time.sleep(self.check_frequency)

        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        log.info("Exiting. Bye bye.")
        sys.exit(0)
Example #49
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(agentConfig)
        systemStats = get_system_stats()
        emitters = self._get_emitters(agentConfig)
        # Load the checks.d checks
        checksd = load_check_directory(agentConfig, hostname)

        self.collector = Collector(agentConfig, emitters, systemStats, hostname)

        # Configure the watchdog.
        check_frequency = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(check_frequency, agentConfig)

        # Initialize the auto-restarter
        self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        # Run the main loop.
        while self.run_forever:

            # enable profiler if needed
            profiled = False
            if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes':
                try:
                    import cProfile
                    profiler = cProfile.Profile()
                    profiled = True
                    profiler.enable()
                    log.debug("Agent profiling is enabled")
                except Exception:
                    log.warn("Cannot enable profiler")

            # Do the work.
            self.collector.run(checksd=checksd, start_event=self.start_event)

            # disable profiler and printout stats to stdout
            if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes' and profiled:
                try:
                    profiler.disable()
                    import pstats
                    from cStringIO import StringIO
                    s = StringIO()
                    ps = pstats.Stats(profiler, stream=s).sort_stats("cumulative")
                    ps.print_stats()
                    log.debug(s.getvalue())
                except Exception:
                    log.warn("Cannot disable profiler")

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for the next loop if we will continue,
            # otherwise just exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                time.sleep(check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running
        # as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
Example #50
0
    def save_server_cert(self, cert_bytes):
        path = os.path.join(CERT_FOLDER, "%s.pem" % (util.get_hostname(),))

        self._save_bytes(path, cert_bytes)
Example #51
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(agentConfig)
        systemStats = get_system_stats()
        emitters = self._get_emitters(agentConfig)
        # Load the checks.d checks
        checksd = load_check_directory(agentConfig, hostname)

        self.collector = Collector(agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        collector_profile_interval = agentConfig.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        # Configure the watchdog.
        check_frequency = int(agentConfig['check_freq'])
        watchdog = self._get_watchdog(check_frequency, agentConfig)

        # Initialize the auto-restarter
        self.restart_interval = int(
            agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            # Setup profiling if necessary
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            # Do the work.
            self.collector.run(checksd=checksd, start_event=self.start_event)
            if profiled:
                if collector_profiled_runs >= collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for the next loop if we will continue,
            # otherwise just exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                time.sleep(check_frequency)
        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running
        # as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
Example #52
0
    import sys

    logger = logging.getLogger("ddagent.checks.nagios")
    nagios = Nagios(get_hostname())

    events = nagios.check(logger, {
        'api_key': api_key,
        'nagios_log': log_file
    },
                          move_end=False)
    for e in events:
        yield e


if __name__ == "__main__":
    import logging
    import socket

    logger = logging.getLogger("ddagent.checks.nagios")
    nagios = Nagios(get_hostname())

    config = {
        'api_key': 'apikey_2',
        'nagios_log': '/var/log/nagios3/nagios.log'
    }
    events = nagios.check(logger, config, move_end=False)
    while True:
        #for e in events:
        #    print "Event:", e
        time.sleep(5)
        events = nagios.check(logger, config)
Example #53
0
    def check_if_valid(self):
        return util.get_hostname() == 'cray'
Example #54
0
#!/usr/bin/env python3

import instance
import configuration_gem5
from run_benchmarks import run_benchmarks
import argparse
import os
import sys
import util
import shutil
from jobschedulers.jobscheduler_all import *

job_scheduler = get_current_scheduler()
if util.get_hostname() == 'login':
    job_scheduler.max_cores = 24
compiled_benchmarks = set()

def parse_arguments():
    parser = argparse.ArgumentParser(description='Load GEM5 results')

    parser.add_argument('-b', '--build-gem5', action='store_true', help='Build gem5')
    parser.add_argument('-c', '--clean-disk-image', action='store_true', help='Clean the disk image (rebuild benchmarks)')
    parser.add_argument('-r', '--resubmit-all', action='store_true', help='Resubmit even running jobs')
    parser.add_argument('-m', '--max-results', default=1, help='Number of results needed per instance', type=int)

    res = parser.parse_args()
    return res

def main():
    args = parse_arguments()
    gem5_environment_check()
Example #55
0
                self._watchmonitor.reset()
            self._postMetrics()
            self._postAgentInfoToServer()
            self._tr_manager.flush()

        tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                                   TRANSACTION_FLUSH_INTERVAL,
                                                   io_loop=self.mloop)

        # Register optional Graphite listener
        gport = self._agentConfig.get("graphite_listen_port", None)
        if gport is not None:
            log.info("Starting graphite listener on port %s" % gport)
            from graphite import GraphiteServer
            gs = GraphiteServer(self,
                                get_hostname(self._agentConfig),
                                io_loop=self.mloop)
            if non_local_traffic is True:
                gs.listen(gport)
            else:
                gs.listen(gport, address="localhost")

        # Start everything
        if self._watchmonitor:
            self._watchmonitor.reset()
        tr_sched.start()

        self.mloop.start()
        log.info("Stopped")

    def stop(self):
Example #56
0
    def check(self, instance):
        host = instance.get('host', 'localhost')
        port = int(instance.get('port', 2181))
        timeout = float(instance.get('timeout', 3.0))
        expected_mode = (instance.get('expected_mode') or '').strip()
        tags = instance.get('tags', [])
        cx_args = (host, port, timeout)
        sc_tags = ["host:{0}".format(host), "port:{0}".format(port)]
        hostname = get_hostname(self.agentConfig)
        report_instance_mode = instance.get("report_instance_mode", True)

        zk_version = None # parse_stat will parse and set version string

        # Send a service check based on the `ruok` response.
        # Set instance status to down if not ok.
        try:
            ruok_out = self._send_command('ruok', *cx_args)
        except ZKConnectionFailure:
            # The server should not respond at all if it's not OK.
            status = AgentCheck.CRITICAL
            message = 'No response from `ruok` command'
            self.increment('zookeeper.timeouts')

            if report_instance_mode:
                self.report_instance_mode(hostname, 'down', tags)
            raise
        else:
            ruok_out.seek(0)
            ruok = ruok_out.readline()
            if ruok == 'imok':
                status = AgentCheck.OK
            else:
                status = AgentCheck.WARNING
            message = u'Response from the server: %s' % ruok
        finally:
            self.service_check('zookeeper.ruok', status, message=message,
                    tags=sc_tags)

        # Read metrics from the `stat` output.
        try:
            stat_out = self._send_command('stat', *cx_args)
        except ZKConnectionFailure:
            self.increment('zookeeper.timeouts')
            if report_instance_mode:
                self.report_instance_mode(hostname, 'down', tags)
            raise
        except Exception as e:
            self.warning(e)
            self.increment('zookeeper.datadog_client_exception')
            if report_instance_mode:
                self.report_instance_mode(hostname, 'unknown', tags)
            raise
        else:
            # Parse the response
            metrics, new_tags, mode, zk_version = self.parse_stat(stat_out)

            # Write the data
            if mode != 'inactive':
                for metric, value, m_type in metrics:
                    submit_metric = getattr(self, m_type)
                    submit_metric(metric, value, tags=tags + new_tags)

            if report_instance_mode:
                self.report_instance_mode(hostname, mode, tags)

            if expected_mode:
                if mode == expected_mode:
                    status = AgentCheck.OK
                    message = u"Server is in %s mode" % mode
                else:
                    status = AgentCheck.CRITICAL
                    message = u"Server is in %s mode but check expects %s mode"\
                              % (mode, expected_mode)
                self.service_check('zookeeper.mode', status, message=message,
                                   tags=sc_tags)


        # Read metrics from the `mntr` output
        if zk_version and LooseVersion(zk_version) > LooseVersion("3.4.0"):
            try:
                mntr_out = self._send_command('mntr', *cx_args)
            except ZKConnectionFailure:
                self.increment('zookeeper.timeouts')
                if report_instance_mode:
                    self.report_instance_mode(hostname, 'down', tags)
                raise
            except Exception as e:
                self.warning(e)
                self.increment('zookeeper.datadog_client_exception')
                if report_instance_mode:
                    self.report_instance_mode(hostname, 'unknown', tags)
                raise
            else:
                metrics, mode = self.parse_mntr(mntr_out)
                mode_tag = "mode:%s" % mode
                if mode != 'inactive':
                    for name in metrics:
                        self.gauge(name, metrics[name], tags=tags + [mode_tag])

                if report_instance_mode:
                    self.report_instance_mode(hostname, mode, tags)
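
Every configuration key this check reads is visible in the instance.get() calls at the top of check(). A minimal instance dictionary with placeholder values:

# Placeholder instance configuration mirroring the instance.get() calls above.
instance = {
    'host': 'localhost',
    'port': 2181,
    'timeout': 3.0,
    'expected_mode': 'follower',   # optional; drives the zookeeper.mode service check
    'tags': ['env:example'],
    'report_instance_mode': True,  # controls the report_instance_mode() calls
}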
Example #57
0
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(),
                      autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n" %
                sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

        if agentConfig.get('service_discovery', False):
            # set the TRACE_CONFIG flag to True to make load_check_directory return
            # the source of config objects.
            # Then call load_check_directory here and pass the result to sd_configcheck
            # to avoid circular imports
            agentConfig[TRACE_CONFIG] = True
            configs = {
                # check_name: (config_source, config)
            }
            print("\nLoading check configurations...\n\n")
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception, e:
            print 'The upload failed:\n{0}'.format(str(e))
Example #58
0
def start_graphite_listener(port):
    from util import get_hostname
    echo_server = GraphiteServer(None, get_hostname(None))
    echo_server.listen(port)
    IOLoop.instance().start()
Example #59
0
    def run(self, config=None):
        """Main loop of the collector"""

        # Gracefully exit on sigterm.
        signal.signal(signal.SIGTERM, self._handle_sigterm)

        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

        # Save the agent start-up stats.
        CollectorStatus().persist()

        # Initialize the collector.
        if not config:
            config = get_config(parse_args=True)

        self._agentConfig = self._set_agent_config_hostname(config)
        hostname = get_hostname(self._agentConfig)
        systemStats = get_system_stats(proc_path=self._agentConfig.get(
            'procfs_path', '/proc').rstrip('/'))
        emitters = self._get_emitters()

        # Initialize service discovery
        if self._agentConfig.get('service_discovery'):
            self.sd_backend = get_sd_backend(self._agentConfig)

        # Load the checks.d checks
        self._checksd = load_check_directory(self._agentConfig, hostname)

        # Initialize the Collector
        self.collector = Collector(self._agentConfig, emitters, systemStats,
                                   hostname)

        # In developer mode, the number of runs to be included in a single collector profile
        self.collector_profile_interval = self._agentConfig.get(
            'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

        # Configure the watchdog.
        self.check_frequency = int(self._agentConfig['check_freq'])
        watchdog = self._get_watchdog(self.check_frequency)

        # Initialize the auto-restarter
        self.restart_interval = int(
            self._agentConfig.get('restart_interval', RESTART_INTERVAL))
        self.agent_start = time.time()

        profiled = False
        collector_profiled_runs = 0

        # Run the main loop.
        while self.run_forever:
            log.debug("Found {num_checks} checks".format(
                num_checks=len(self._checksd['initialized_checks'])))

            # Setup profiling if necessary
            if self.in_developer_mode and not profiled:
                try:
                    profiler = AgentProfiler()
                    profiler.enable_profiling()
                    profiled = True
                except Exception as e:
                    log.warn("Cannot enable profiler: %s" % str(e))

            # Do the work.
            self.collector.run(checksd=self._checksd,
                               start_event=self.start_event,
                               configs_reloaded=self.configs_reloaded)

            # This flag is used to know if the check configs have been reloaded at the current
            # run of the agent yet or not. It's used by the collector to know if it needs to
            # look for the AgentMetrics check and pop it out.
            # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
            self.configs_reloaded = False

            # Look for change in the config template store.
            # The self.sd_backend.reload_check_configs flag is set
            # to True if a config reload is needed.
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               not self.sd_backend.reload_check_configs:
                try:
                    self.sd_backend.reload_check_configs = get_config_store(
                        self._agentConfig).crawl_config_template()
                except Exception as e:
                    log.warn(
                        'Something went wrong while looking for config template changes: %s'
                        % str(e))

            # Check if we should run service discovery
            # The `reload_check_configs` flag can be set through the docker_daemon check or
            # using ConfigStore.crawl_config_template
            if self._agentConfig.get('service_discovery') and self.sd_backend and \
               self.sd_backend.reload_check_configs:
                self.reload_configs()
                self.configs_reloaded = True
                self.sd_backend.reload_check_configs = False

            if profiled:
                if collector_profiled_runs >= self.collector_profile_interval:
                    try:
                        profiler.disable_profiling()
                        profiled = False
                        collector_profiled_runs = 0
                    except Exception as e:
                        log.warn("Cannot disable profiler: %s" % str(e))

            # Check if we should restart.
            if self.autorestart and self._should_restart():
                self._do_restart()

            # Only plan for next loop if we will continue, otherwise exit quickly.
            if self.run_forever:
                if watchdog:
                    watchdog.reset()
                if profiled:
                    collector_profiled_runs += 1
                log.debug("Sleeping for {0} seconds".format(
                    self.check_frequency))
                time.sleep(self.check_frequency)

        # Now clean-up.
        try:
            CollectorStatus.remove_latest_status()
        except Exception:
            pass

        # Explicitly kill the process, because it might be running as a daemon.
        log.info("Exiting. Bye bye.")
        sys.exit(0)
Example #60
0
    def load_server_cert(self):
        path = os.path.join(CERT_FOLDER, "%s.pem" % (util.get_hostname(),))
        return self._load_bytes(path)
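
Examples #50 and #60 are two halves of the same convention: the certificate file is named after util.get_hostname(), so saving and loading agree as long as the hostname stays stable. A small sketch of that shared path construction; CERT_FOLDER is a placeholder value here.

import os
import util

CERT_FOLDER = "/tmp/certs"  # placeholder; the real folder is defined elsewhere in the module

def server_cert_path():
    # Same naming scheme used by save_server_cert() and load_server_cert() above.
    return os.path.join(CERT_FOLDER, "%s.pem" % (util.get_hostname(),))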