def check(self, instance, create_event=True):
    if self.high_watermarks.get(instance.get('name'), None) is None:
        # On the first run of check(), prime the high_watermarks dict
        # so that we only send events that occurred after the agent
        # started.
        # (Setting high_watermarks in the next statement prevents
        # any kind of infinite loop (assuming nothing ever sets
        # high_watermarks to None again!))
        self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0)
        self.check(instance, create_event=False)

    jenkins_home = instance.get('jenkins_home', None)
    if not jenkins_home:
        raise Exception("No jenkins_home directory set in the config file")

    job_dirs = glob(os.path.join(jenkins_home, 'jobs', '*'))

    for job_dir in job_dirs:
        for output in self._get_build_results(instance.get('name'), job_dir):
            output['api_key'] = self.agentConfig['api_key']
            output['host'] = get_hostname(self.agentConfig)
            if create_event:
                self.log.debug("Creating event for job: %s" % output['job_name'])
                self.event(output)
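# A minimal, runnable sketch of the high-watermark priming pattern used above.
# WatermarkDemo and the (job, timestamp) input format are hypothetical stand-ins
# for the Jenkins check and its build results: the first pass records the newest
# timestamp per job without emitting anything, so later passes only emit builds
# that finished after the agent started.
from collections import defaultdict

class WatermarkDemo(object):
    def __init__(self):
        self.high_watermarks = None

    def check(self, builds, create_event=True):
        if self.high_watermarks is None:
            self.high_watermarks = defaultdict(lambda: 0)
            self.check(builds, create_event=False)  # priming pass, no events
        for job, ts in builds:
            if ts > self.high_watermarks[job]:
                self.high_watermarks[job] = ts
                if create_event:
                    print("event: %s finished at %s" % (job, ts))

demo = WatermarkDemo()
demo.check([("job-a", 100)])                   # primed silently
demo.check([("job-a", 100), ("job-a", 101)])   # only ts=101 emits an event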
def test_collector(self):
    agentConfig = {
        'agent_key': 'test_agentkey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_instance_metadata': False,
        'create_dd_check_tags': False,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })
    metrics = payload['metrics']

    # Check that we got a timing metric for all checks.
    timing_metrics = [m for m in metrics if m[0] == 'sd.agent.check_run_time']
    all_tags = []
    for metric in timing_metrics:
        all_tags.extend(metric[3]['tags'])
    for check in checks:
        tag = "check:%s" % check.name
        assert tag in all_tags, all_tags
def load_check(name, config, agentConfig, is_sdk=False):
    if not is_sdk:
        checksd_path = get_checksd_path(get_os())
        # find (in checksd_path) and load the check module
        fd, filename, desc = imp.find_module(name, [checksd_path])
        check_module = imp.load_module(name, fd, filename, desc)
    else:
        check_module = __import__("check")

    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break

    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', {})
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config,
                           agentConfig=agentConfig, instances=instances)
    except TypeError as e:
        raise Exception("Check is using old API, {0}".format(e))
    except Exception:
        raise
def load_check(name, config, agentConfig):
    checksd_path = get_checksd_path(get_os())
    if checksd_path not in sys.path:
        sys.path.append(checksd_path)

    check_module = __import__(name)
    check_class = None
    classes = inspect.getmembers(check_module, inspect.isclass)
    # Use `_` for the member name so the loop does not shadow the `name`
    # argument, which is reused below.
    for _, clsmember in classes:
        if clsmember == AgentCheck:
            continue
        if issubclass(clsmember, AgentCheck):
            check_class = clsmember
            if AgentCheck in clsmember.__bases__:
                continue
            else:
                break

    if check_class is None:
        raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name)

    init_config = config.get('init_config', None)
    instances = config.get('instances')
    agentConfig['checksd_hostname'] = get_hostname(agentConfig)

    # init the check class
    try:
        return check_class(name, init_config=init_config,
                           agentConfig=agentConfig, instances=instances)
    except Exception:
        # Backwards compatibility for old checks that don't support the
        # instances argument.
        c = check_class(name, init_config=init_config, agentConfig=agentConfig)
        c.instances = instances
        return c
def test_apptags(self):
    '''
    Tests that the app tags are sent if so configured
    '''
    agentConfig = {
        'agent_key': 'test_agentkey',
        'collect_ec2_tags': False,
        'collect_instance_metadata': False,
        'create_dd_check_tags': True,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })

    # We check that the redis DD_CHECK_TAG is sent in the payload
    self.assertTrue('dd_check:redisdb' in payload['host-tags']['system'])
def init(config_path=None, use_watchdog=False, use_forwarder=False):
    c = get_config(parse_args=False, cfg_path=config_path)
    log.debug("Configuring dogstatsd")

    port = c['dogstatsd_port']
    interval = int(c['dogstatsd_interval'])
    normalize = c['dogstatsd_normalize']
    api_key = c['api_key']
    non_local_traffic = c['non_local_traffic']

    target = c['dd_url']
    if use_forwarder:
        target = c['dogstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval
    aggregator = MetricsAggregator(hostname, interval,
                                   recent_point_threshold=c.get('recent_point_threshold', None))

    # Start the reporting thread.
    reporter = Reporter(interval, aggregator, target, api_key, use_watchdog)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = '127.0.0.1'
    # If specified, bind to all addresses
    if non_local_traffic:
        server_host = ''
    server = Server(aggregator, server_host, port)

    return reporter, server
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        ps = sp.Popen(['ps', ps_arg], stdout=sp.PIPE,
                      close_fds=True).communicate()[0]
    except StandardError:
        self.logger.exception('getProcesses')
        return False

    # Split out each process
    processLines = ps.split('\n')
    del processLines[0]  # Removes the headers
    processLines.pop()   # Removes a trailing empty line

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {'processes': processes,
            'apiKey': agentConfig['api_key'],
            'host': get_hostname(agentConfig)}
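# A small, runnable sketch (Unix only, standard library only) of the parsing
# done above: split each `ps` row on whitespace at most 10 times so the 11th
# field -- the command line -- survives its embedded spaces. No agent code is
# assumed here.
import subprocess

out = subprocess.Popen(['ps', 'auxww'],
                       stdout=subprocess.PIPE).communicate()[0]
lines = out.decode('utf-8', 'replace').splitlines()
header, rows = lines[0], lines[1:]
for row in rows[:3]:
    fields = row.split(None, 10)  # USER, PID, %CPU, ..., COMMAND
    print(fields[0], fields[1], fields[10])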
def create_event(self, state, server, agentConfig):
    """Create an event with a message describing the replication
    state of a mongo node"""

    def get_state_description(state):
        if state == 0:
            return 'Starting Up'
        elif state == 1:
            return 'Primary'
        elif state == 2:
            return 'Secondary'
        elif state == 3:
            return 'Recovering'
        elif state == 4:
            return 'Fatal'
        elif state == 5:
            return 'Starting up (initial sync)'
        elif state == 6:
            return 'Unknown'
        elif state == 7:
            return 'Arbiter'
        elif state == 8:
            return 'Down'
        elif state == 9:
            return 'Rollback'

    status = get_state_description(state)
    hostname = get_hostname(agentConfig)
    msg_title = "%s is %s" % (server, status)
    msg = "TokuMX %s just reported as %s" % (server, status)

    self.event({
        'timestamp': int(time.time()),
        'event_type': 'tokumx',
        'api_key': agentConfig['api_key'],
        'msg_title': msg_title,
        'msg_text': msg,
        'host': hostname
    })
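# The if/elif ladder above is equivalent to a dict lookup; a sketch of that
# alternative, with the state codes copied from the function above and a
# default for codes the ladder does not cover (REPL_STATES is a name chosen
# here, not part of the original source):
REPL_STATES = {
    0: 'Starting Up', 1: 'Primary', 2: 'Secondary', 3: 'Recovering',
    4: 'Fatal', 5: 'Starting up (initial sync)', 6: 'Unknown',
    7: 'Arbiter', 8: 'Down', 9: 'Rollback',
}

def get_state_description(state):
    return REPL_STATES.get(state, 'Unknown (state %s)' % state)

print(get_state_description(1))   # Primary
print(get_state_description(42))  # Unknown (state 42)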
def _wait_for_machine_configured(self, file_reader):
    """In case of nosql and bigdata, CMT changes the hostname;
    wait for that action to complete"""
    total_sleep_time = 0
    wait_for_conf = False
    for n in self._ctx.node_list:
        machine_type = file_reader.read_attribute(n.ip_address, 'MACHINE_TYPE')
        if machine_type == 'manager':
            wait_for_conf = True
            break

    if wait_for_conf:
        while True:
            if util.get_hostname() != self._ctx.this_node.hostname:
                self._logger.debug("Sleep")
                total_sleep_time += self._ctx.CMT_CONF_WAIT
                if total_sleep_time >= self._ctx.MAX_CMT_CONF_WAIT:
                    util.log_exception("Waiting for machine configuration took too long")
                    self.shutdown()
                time.sleep(self._ctx.CMT_CONF_WAIT)
            else:
                # sleep once more before the exit: to make sure that the
                # hostname change propagated
                time.sleep(self._ctx.CMT_CONF_WAIT)
                break
def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self.name = name self.init_config = init_config self.agentConfig = agentConfig self.hostname = get_hostname(agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator(self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None)) self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None): """Configure the server and the reporting thread. """ c = get_config(parse_args=False, cfg_path=config_path) if (not c['use_dogstatsd'] and (args and args[0] in ['start', 'restart'] or not args)): log.info("Dogstatsd is disabled. Exiting") # We're exiting purposefully, so exit with zero (supervisor's expected # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly # and thus can exit cleanly. sleep(4) sys.exit(0) port = c['dogstatsd_port'] interval = DOGSTATSD_FLUSH_INTERVAL api_key = c['api_key'] aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE non_local_traffic = c['non_local_traffic'] forward_to_host = c.get('statsd_forward_host') forward_to_port = c.get('statsd_forward_port') event_chunk_size = c.get('event_chunk_size') recent_point_threshold = c.get('recent_point_threshold', None) server_host = c['bind_host'] target = c['dd_url'] if use_forwarder: target = c['dogstatsd_target'] hostname = get_hostname(c) # Create the aggregator (which is the point of communication between the # server and reporting threads. assert 0 < interval aggregator = MetricsBucketAggregator( hostname, aggregator_interval, recent_point_threshold=recent_point_threshold, formatter=get_formatter(c), histogram_aggregates=c.get('histogram_aggregates'), histogram_percentiles=c.get('histogram_percentiles'), utf8_decoding=c['utf8_decoding'] ) # Start the reporting thread. reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size) # NOTICE: when `non_local_traffic` is passed we need to bind to any interface on the box. The forwarder uses # Tornado which takes care of sockets creation (more than one socket can be used at once depending on the # network settings), so it's enough to just pass an empty string '' to the library. # In Dogstatsd we use a single, fullstack socket, so passing '' as the address doesn't work and we default to # '0.0.0.0'. If someone needs to bind Dogstatsd to the IPv6 '::', they need to turn off `non_local_traffic` and # use the '::' meta address as `bind_host`. if non_local_traffic: server_host = '0.0.0.0' server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port) return reporter, server, c
def check(self, logger, agentConfig):
    if self.high_watermarks is None:
        # On the first run of check(), prime the high_watermarks dict
        # so that we only send events that occurred after the agent
        # started.
        # (Setting high_watermarks in the next statement prevents
        # any kind of infinite loop (assuming nothing ever sets
        # high_watermarks to None again!))
        self.high_watermarks = defaultdict(lambda: 0)
        self.check(logger, agentConfig)

    hudson_home = agentConfig.get('hudson_home', None)
    if not hudson_home:
        return False

    job_dirs = glob(os.path.join(hudson_home, 'jobs', '*'))

    build_events = []
    for job_dir in job_dirs:
        for output in self._get_build_results(logger, job_dir):
            output['api_key'] = agentConfig['api_key']
            output['host'] = get_hostname(agentConfig)
            build_events.append(output)

    return build_events
def _get_hostname_metadata(self): """ Returns a dictionnary that contains hostname metadata. """ metadata = EC2.get_metadata(self.agentConfig) if metadata.get('hostname'): metadata['ec2-hostname'] = metadata.get('hostname') del metadata['hostname'] if self.agentConfig.get('hostname'): metadata['agent-hostname'] = self.agentConfig.get('hostname') else: try: metadata["socket-hostname"] = socket.gethostname() except Exception: pass try: metadata["socket-fqdn"] = socket.getfqdn() except Exception: pass metadata["hostname"] = get_hostname() # Add cloud provider aliases host_aliases = GCE.get_host_aliases(self.agentConfig) if host_aliases: metadata['host_aliases'] = host_aliases return metadata
def load_check(name, config, agentConfig): checksd_path = get_checksd_path(get_os()) if checksd_path not in sys.path: sys.path.append(checksd_path) check_module = __import__(name) check_class = None classes = inspect.getmembers(check_module, inspect.isclass) for _, clsmember in classes: if clsmember == AgentCheck: continue if issubclass(clsmember, AgentCheck): check_class = clsmember if AgentCheck in clsmember.__bases__: continue else: break if check_class is None: raise Exception("Unable to import check %s. Missing a class that inherits AgentCheck" % name) init_config = config.get('init_config', {}) instances = config.get('instances') agentConfig['checksd_hostname'] = get_hostname(agentConfig) # init the check class try: return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances) except Exception as e: raise Exception("Check is using old API, {0}".format(e))
def setup_agent4(hostname=None, domain=None, pc="1",
                 agent_conf="files/puppet-agent.conf",
                 puppetserver=None, proxy_url=None, hosts_file=None):
    """Setup Puppet 4 agent"""
    import package, util, config

    if not hostname:
        hostname = util.get_hostname()
    if not domain:
        domain = util.get_domain()

    install_puppetlabs_release_package(pc, proxy_url=proxy_url)
    package.install("puppet-agent")

    # Use the puppetserver value from the setting.ini file if none is given
    # on the command-line. If that fails, use the default.
    if not puppetserver:
        try:
            puppetserver = config.get("puppet", "puppetserver")
        except Exception:
            puppetserver = None

    # Add a customized puppet.conf
    util.put_and_chown(agent_conf, "/etc/puppetlabs/puppet/puppet.conf")
    if puppetserver:
        server = puppetserver
    else:
        server = "puppet.%s" % domain
    sudo("puppet config set --section agent server %s" % server)

    util.set_hostname(hostname + "." + domain)
    util.add_host_entry(util.get_ip(), hostname, domain)

    # Optionally add hosts from a separate file. This is useful when the IP of
    # the puppetmaster as seen from the Puppet agent node does not match its
    # name in DNS.
    util.add_host_entries(hosts_file)
    util.add_to_path("/opt/puppetlabs/bin")
    run_agent(noop="True", onlychanges="False")
def check(self, agentConfig):
    process_exclude_args = agentConfig.get('exclude_process_args', False)
    if process_exclude_args:
        ps_arg = 'aux'
    else:
        ps_arg = 'auxww'

    # Get output from ps
    try:
        output, _, _ = get_subprocess_output(['ps', ps_arg], self.logger)
        processLines = output.splitlines()  # Also removes a trailing empty line
    except StandardError:
        self.logger.exception('getProcesses')
        return False

    del processLines[0]  # Removes the headers

    processes = []
    for line in processLines:
        line = line.split(None, 10)
        processes.append(map(lambda s: s.strip(), line))

    return {'processes': processes,
            'apiKey': agentConfig['api_key'],
            'host': get_hostname(agentConfig)}
def __init__(self, args):
    win32serviceutil.ServiceFramework.__init__(self, args)
    self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
    config = get_config(parse_args=False)

    # Setup the correct options so the agent will use the forwarder
    opts, args = Values({
        'autorestart': False,
        'dd_url': None,
        'use_forwarder': True,
        'disabled_dd': False,
        'profile': False
    }), []
    agentConfig = get_config(parse_args=False, options=opts)
    self.hostname = get_hostname(agentConfig)

    # Watchdog for Windows
    self._collector_heartbeat, self._collector_send_heartbeat = multiprocessing.Pipe(False)
    self._collector_failed_heartbeats = 0
    self._max_failed_heartbeats = \
        MAX_FAILED_HEARTBEATS * agentConfig['check_freq'] / SERVICE_SLEEP_INTERVAL

    # Watch JMXFetch restarts
    self._MAX_JMXFETCH_RESTARTS = 3
    self._count_jmxfetch_restarts = 0

    # Keep a list of running processes so we can start/end as needed.
    # Processes will be started in order and stopped in reverse order.
    self.procs = {
        'forwarder': ProcessWatchDog("forwarder", DDForwarder(config, self.hostname)),
        'collector': ProcessWatchDog("collector", DDAgent(agentConfig, self.hostname,
                                                          heartbeat=self._collector_send_heartbeat)),
        'dogstatsd': ProcessWatchDog("dogstatsd", DogstatsdProcess(config, self.hostname)),
        'jmxfetch': ProcessWatchDog("jmxfetch", JMXFetchProcess(config, self.hostname), 3),
    }
def reload_configs(self, checks_to_reload=set()):
    """Reload the agent configuration and checksd configurations.
    Can also reload only an explicit set of checks."""
    log.info("Attempting a configuration reload...")
    hostname = get_hostname(self._agentConfig)

    # if no check was given, reload them all
    if not checks_to_reload:
        log.debug("No check list was passed, reloading every check")
        # stop checks
        for check in self._checksd.get('initialized_checks', []):
            check.stop()
        self._checksd = load_check_directory(self._agentConfig, hostname)
    else:
        new_checksd = copy(self._checksd)
        self.refresh_specific_checks(hostname, new_checksd, checks_to_reload)
        # once the reload is done, replace existing checks with the new ones
        self._checksd = new_checksd

    # Logging
    num_checks = len(self._checksd['initialized_checks'])
    if num_checks > 0:
        opt_msg = " (refreshed %s checks)" % len(checks_to_reload) if checks_to_reload else ''
        msg = "Check reload was successful. Running {num_checks} checks{opt_msg}.".format(
            num_checks=num_checks, opt_msg=opt_msg)
        log.info(msg)
    else:
        log.info("No checksd configs found")
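# A sketch of the copy-and-swap pattern used above: build the refreshed check
# set on a copy and rebind the live reference only once the refresh succeeds,
# so a failed reload never leaves a half-updated check set. ReloadDemo and
# `refresh` are hypothetical stand-ins for the agent and for
# refresh_specific_checks(). Note copy() is shallow, so the refresh step must
# rebind keys rather than mutate the shared values in place.
from copy import copy

class ReloadDemo(object):
    def __init__(self):
        self.checks = {'initialized_checks': ['a', 'b']}

    def reload(self, refresh):
        new_checks = copy(self.checks)
        refresh(new_checks)        # may raise; self.checks is untouched
        self.checks = new_checks   # swap in only on success

def refresh(checksd):
    # rebind the key instead of appending to the shared list
    checksd['initialized_checks'] = checksd['initialized_checks'] + ['c']

d = ReloadDemo()
d.reload(refresh)
print(d.checks['initialized_checks'])  # ['a', 'b', 'c']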
def test_collector(self): agentConfig = { "api_key": "test_apikey", "check_timings": True, "collect_ec2_tags": True, "collect_instance_metadata": False, "version": "test", "tags": "", } # Run a single checks.d check as part of the collector. redis_config = {"init_config": {}, "instances": [{"host": "localhost", "port": 6379}]} checks = [load_check("redisdb", redis_config, agentConfig)] c = Collector(agentConfig, [], {}, get_hostname(agentConfig)) payload = c.run({"initialized_checks": checks, "init_failed_checks": {}}) metrics = payload["metrics"] # Check that we got a timing metric for all checks. timing_metrics = [m for m in metrics if m[0] == "datadog.agent.check_run_time"] all_tags = [] for metric in timing_metrics: all_tags.extend(metric[3]["tags"]) for check in checks: tag = "check:%s" % check.name assert tag in all_tags, all_tags
def __init__(self, args):
    win32serviceutil.ServiceFramework.__init__(self, args)
    self.hWaitStop = win32event.CreateEvent(None, 0, 0, None)
    config = get_config(parse_args=False)

    # Setup the correct options so the agent will use the forwarder
    opts, args = Values({
        'dd_url': None,
        'clean': False,
        'use_forwarder': True,
        'disabled_dd': False
    }), []
    agentConfig = get_config(parse_args=False, options=opts)
    self.hostname = get_hostname(agentConfig)
    self.restart_interval = \
        int(agentConfig.get('autorestart_interval', RESTART_INTERVAL))
    log.info("Autorestarting the collector every %s seconds" % self.restart_interval)

    # Keep a list of running processes so we can start/end as needed.
    # Processes will be started in order and stopped in reverse order.
    self.procs = {
        'forwarder': DDForwarder(config, self.hostname),
        'collector': DDAgent(agentConfig, self.hostname),
        'dogstatsd': DogstatsdProcess(config, self.hostname),
        'pup': PupProcess(config),
    }
def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles') ) self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int)
def testCheck(self): config = { 'init_config': {}, 'instances': [{ 'url': 'http://localhost:3834/stats', 'username': '******', 'password': '******', 'status_check': True, 'collect_aggregates_only': False, 'tag_service_check_by_host': True, }] } self.start_server(HAPROXY_CFG, config) # Run the check against our running server self.check.check(config['instances'][0]) # Sleep for 1 second so the rate interval >=1 time.sleep(1) # Run the check again so we get the rates self.check.check(config['instances'][0]) # Metric assertions metrics = self.check.get_metrics() assert metrics self.assertTrue(type(metrics) == type([])) self.assertTrue(len(metrics) > 0) service_checks = self.check.get_service_checks() assert service_checks self.assertTrue(type(service_checks) == type([])) self.assertTrue(len(service_checks) > 0) self.assertEquals(len([t for t in metrics if t[0] == "haproxy.backend.bytes.in_rate"]), 3, metrics) self.assertEquals(len([t for t in metrics if t[0] == "haproxy.frontend.session.current"]), 1, metrics) # check was run 2 times # - FRONTEND is reporting OPEN that we ignore # - only the BACKEND aggregate is reporting UP -> OK # - The 3 individual servers are returning no check -> UNKNOWN self.assertEquals(len([t for t in service_checks if t['status']== 0]), 2, service_checks) self.assertEquals(len([t for t in service_checks if t['status']== 3]), 6, service_checks) # Make sure the service checks aren't tagged with an empty hostname. for service_check in service_checks: self.assertEquals(service_check['host_name'], get_hostname()) inst = config['instances'][0] data = self.check._fetch_data(inst['url'], inst['username'], inst['password']) new_data = [l.replace("no check", "UP") for l in data] self.check._process_data(new_data, False, True, inst['url']), assert self.check.has_events() assert len(self.check.get_events()) == 3 # The 3 individual backend servers were switched to UP service_checks = self.check.get_service_checks() # The 3 servers + the backend aggregate are reporting UP self.assertEquals(len([t for t in service_checks if t['status'] == 0]), 4, service_checks)
def init(config_path=None, use_watchdog=False, use_forwarder=False, args=None): """Configure the server and the reporting thread. """ c = get_config(parse_args=False, cfg_path=config_path) if (not c['use_dogstatsd'] and (args and args[0] in ['start', 'restart'] or not args)): log.info("Dogstatsd is disabled. Exiting") # We're exiting purposefully, so exit with zero (supervisor's expected # code). HACK: Sleep a little bit so supervisor thinks we've started cleanly # and thus can exit cleanly. sleep(4) sys.exit(0) log.debug("Configuring dogstatsd") port = c['dogstatsd_port'] interval = DOGSTATSD_FLUSH_INTERVAL api_key = c['api_key'] aggregator_interval = DOGSTATSD_AGGREGATOR_BUCKET_SIZE non_local_traffic = c['non_local_traffic'] forward_to_host = c.get('statsd_forward_host') forward_to_port = c.get('statsd_forward_port') event_chunk_size = c.get('event_chunk_size') recent_point_threshold = c.get('recent_point_threshold', None) target = c['dd_url'] if use_forwarder: target = c['dogstatsd_target'] hostname = get_hostname(c) # Create the aggregator (which is the point of communication between the # server and reporting threads. assert 0 < interval aggregator = MetricsBucketAggregator( hostname, aggregator_interval, recent_point_threshold=recent_point_threshold, formatter=get_formatter(c), histogram_aggregates=c.get('histogram_aggregates'), histogram_percentiles=c.get('histogram_percentiles'), utf8_decoding=c['utf8_decoding'] ) # Start the reporting thread. reporter = Reporter(interval, aggregator, target, api_key, use_watchdog, event_chunk_size) # Start the server on an IPv4 stack # Default to loopback server_host = c['bind_host'] # If specified, bind to all addressses if non_local_traffic: server_host = '' server = Server(aggregator, server_host, port, forward_to_host=forward_to_host, forward_to_port=forward_to_port) return reporter, server, c
def _postMetrics(self): if len(self._metrics) > 0: self._metrics["uuid"] = get_uuid() self._metrics["internalHostname"] = get_hostname(self._agentConfig) self._metrics["apiKey"] = self._agentConfig["api_key"] MetricTransaction(json.dumps(self._metrics), headers={"Content-Type": "application/json"}) self._metrics = {}
def _postMetrics(self):
    if len(self._metrics) > 0:
        self._metrics['uuid'] = get_uuid()
        self._metrics['internalHostname'] = get_hostname(self._agentConfig)
        self._metrics['apiKey'] = self._agentConfig['api_key']
        MetricTransaction(self._metrics, {})
        self._metrics = {}
def check(self, instance, create_event=True): """ DEPRECATED: This Jenkins check is deprecated and not actively developed anymore. It will be removed in a future version of the Datadog Agent. Please move to using the Datadog plugin for Jenkins. More information can be found on the Jenkins Integration panel under the Configuration tab (https://app.datadoghq.com/account/settings#integrations/jenkins) """ self.warning("This check is deprecated in favor of our Jenkins Datadog plugin." " It will be removed in a future version of the Datadog Agent." " More information can be found on the Jenkins Integration panel" " under the Configuration tab" " (https://app.datadoghq.com/account/settings#integrations/jenkins)") if self.high_watermarks.get(instance.get('name'), None) is None: # On the first run of check(), prime the high_watermarks dict # so that we only send events that occured after the agent # started. # (Setting high_watermarks in the next statement prevents # any kind of infinite loop (assuming nothing ever sets # high_watermarks to None again!)) self.high_watermarks[instance.get('name')] = defaultdict(lambda: 0) self.check(instance, create_event=False) jenkins_home = instance.get('jenkins_home') if not jenkins_home: raise Exception("No jenkins_home directory set in the config file") jenkins_jobs_dir = os.path.join(jenkins_home, 'jobs', '*') job_dirs = glob(jenkins_jobs_dir) if not job_dirs: raise Exception('No jobs found in `%s`! ' 'Check `jenkins_home` in your config' % (jenkins_jobs_dir)) for job_dir in job_dirs: for output in self._get_build_results(instance.get('name'), job_dir): output['host'] = get_hostname(self.agentConfig) if create_event: self.log.debug("Creating event for job: %s" % output['job_name']) self.event(output) tags = [ 'job_name:%s' % output['job_name'], 'result:%s' % output['result'], 'build_number:%s' % output['number'] ] if 'branch' in output: tags.append('branch:%s' % output['branch']) self.gauge("jenkins.job.duration", float(output['duration'])/1000.0, tags=tags) if output['result'] == 'SUCCESS': self.increment('jenkins.job.success', tags=tags) else: self.increment('jenkins.job.failure', tags=tags)
def _postMetrics(self):
    if len(self._metrics) > 0:
        self._metrics['uuid'] = get_uuid()
        self._metrics['internalHostname'] = get_hostname(self._agentConfig)
        self._metrics['apiKey'] = self._agentConfig['api_key']
        MetricTransaction(json.dumps(self._metrics),
                          headers={'Content-Type': 'application/json'})
        self._metrics = {}
def _build_payload(self, start_event=True): """ Return an dictionary that contains all of the generic payload data. """ now = time.time() payload = { 'collection_timestamp': now, 'os' : self.os, 'python': sys.version, 'agentVersion' : self.agentConfig['version'], 'apiKey': self.agentConfig['api_key'], 'events': {}, 'metrics': [], 'service_checks': [], 'resources': {}, 'internalHostname' : get_hostname(self.agentConfig), 'uuid' : get_uuid(), 'host-tags': {}, } # Include system stats on first postback if start_event and self._is_first_run(): payload['systemStats'] = self.agentConfig.get('system_stats', {}) # Also post an event in the newsfeed payload['events']['System'] = [{'api_key': self.agentConfig['api_key'], 'host': payload['internalHostname'], 'timestamp': now, 'event_type':'Agent Startup', 'msg_text': 'Version %s' % get_version() }] # Periodically send the host metadata. if self._is_first_run() or self._should_send_metadata(): payload['systemStats'] = get_system_stats() payload['meta'] = self._get_metadata() self.metadata_cache = payload['meta'] # Add static tags from the configuration file host_tags = [] if self.agentConfig['tags'] is not None: host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig['tags'].split(",")]) if self.agentConfig['collect_ec2_tags']: host_tags.extend(EC2.get_tags()) if host_tags: payload['host-tags']['system'] = host_tags GCE_tags = GCE.get_tags() if GCE_tags is not None: payload['host-tags'][GCE.SOURCE_TYPE_NAME] = GCE_tags # Log the metadata on the first run if self._is_first_run(): log.info("Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload['host-tags'])) return payload
def parse_log(api_key, log_file):
    import logging
    import socket
    import sys

    logger = logging.getLogger("ddagent.checks.nagios")
    nagios = Nagios(get_hostname())

    events = nagios.check(logger, {'api_key': api_key, 'nagios_log': log_file},
                          move_end=False)
    for e in events:
        yield e
def _build_payload(self, start_event=True): """ Return an dictionary that contains all of the generic payload data. """ now = time.time() payload = { "collection_timestamp": now, "os": self.os, "python": sys.version, "agentVersion": self.agentConfig["version"], "apiKey": self.agentConfig["api_key"], "events": {}, "metrics": [], "resources": {}, "internalHostname": get_hostname(self.agentConfig), "uuid": get_uuid(), "host-tags": {}, } # Include system stats on first postback if start_event and self._is_first_run(): payload["systemStats"] = self.agentConfig.get("system_stats", {}) # Also post an event in the newsfeed payload["events"]["System"] = [ { "api_key": self.agentConfig["api_key"], "host": payload["internalHostname"], "timestamp": now, "event_type": "Agent Startup", "msg_text": "Version %s" % get_version(), } ] # Periodically send the host metadata. if self._is_first_run() or self._should_send_metadata(): payload["systemStats"] = get_system_stats() payload["meta"] = self._get_metadata() self.metadata_cache = payload["meta"] # Add static tags from the configuration file host_tags = [] if self.agentConfig["tags"] is not None: host_tags.extend([unicode(tag.strip()) for tag in self.agentConfig["tags"].split(",")]) if self.agentConfig["collect_ec2_tags"]: host_tags.extend(EC2.get_tags()) if host_tags: payload["host-tags"]["system"] = host_tags # Log the metadata on the first run if self._is_first_run(): log.info(u"Hostnames: %s, tags: %s" % (repr(self.metadata_cache), payload["host-tags"])) return payload
def test_collector(self):
    agentConfig = {
        'api_key': 'test_apikey',
        'check_timings': True,
        'collect_ec2_tags': True,
        'collect_instance_metadata': False,
        'version': 'test',
        'tags': '',
    }

    # Run a single checks.d check as part of the collector.
    redis_config = {
        "init_config": {},
        "instances": [{"host": "localhost", "port": 6379}]
    }
    checks = [load_check('redisdb', redis_config, agentConfig)]

    c = Collector(agentConfig, [], {}, get_hostname(agentConfig))
    payload = c.run({
        'initialized_checks': checks,
        'init_failed_checks': {}
    })
    metrics = payload['metrics']

    # Check that we got a timing metric for all checks.
    timing_metrics = [m for m in metrics if m[0] == 'datadog.agent.check_run_time']
    all_tags = []
    for metric in timing_metrics:
        all_tags.extend(metric[3]['tags'])
    for check in checks:
        tag = "check:%s" % check.name
        assert tag in all_tags, all_tags
def __init__(self):
    GObject.Object.__init__(self)

    self.hostname = util.get_hostname()
    self.code = None
    self.ident = None
    self.cert_server = None

    self.requests_lock = threading.Lock()
    self.requests = {}

    self.clean_cert_folder()

    self.keyfile = GLib.KeyFile()
    try:
        self.keyfile.load_from_file(os.path.join(CONFIG_FOLDER, CONFIG_FILE_NAME),
                                    GLib.KeyFileFlags.NONE)
    except GLib.Error as e:
        if e.code == GLib.FileError.NOENT:
            logging.debug("Auth: No group code file, making one.")
        else:
            logging.debug("Auth: Could not load existing keyfile (%s): %s"
                          % (CONFIG_FOLDER, e.message))

    self.code = self.get_group_code()
def submit_events(self, events):
    headers = {'Content-Type': 'application/json'}
    method = 'POST'

    events_len = len(events)
    event_chunk_size = self.event_chunk_size

    for chunk in chunks(events, event_chunk_size):
        payload = {
            'apiKey': self.api_key,
            'events': {
                'api': chunk
            },
            'uuid': get_uuid(),
            'internalHostname': get_hostname()
        }
        params = {}
        if self.api_key:
            params['api_key'] = self.api_key
        url = '/intake?%s' % urlencode(params)

        status = None
        conn = self.http_conn_cls(self.api_host)
        try:
            start_time = time()
            conn.request(method, url, json.dumps(payload), headers)

            response = conn.getresponse()
            status = response.status
            response.close()
            duration = round((time() - start_time) * 1000.0, 4)
            log.debug("%s %s %s%s (%sms)" % (status, method, self.api_host, url, duration))
        finally:
            conn.close()
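# The chunks() helper used above is assumed to behave like this minimal,
# runnable sketch: yield successive fixed-size slices of a sequence, so each
# POST carries at most event_chunk_size events.
def chunks(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

print(list(chunks([1, 2, 3, 4, 5], 2)))  # [[1, 2], [3, 4], [5]]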
def _get_hostname_metadata(self): """ Returns a dictionnary that contains hostname metadata. """ metadata = EC2.get_metadata(self.agentConfig) if metadata.get('hostname'): metadata['ec2-hostname'] = metadata.get('hostname') del metadata['hostname'] if self.agentConfig.get('hostname'): metadata['agent-hostname'] = self.agentConfig.get('hostname') else: try: metadata["socket-hostname"] = socket.gethostname() except Exception: pass try: metadata["socket-fqdn"] = socket.getfqdn() except Exception: pass metadata["hostname"] = get_hostname() return metadata
def load_check(name, config, agentConfig): checksd_path = get_checksd_path(get_os()) if checksd_path not in sys.path: sys.path.append(checksd_path) check_module = __import__(name) check_class = None classes = inspect.getmembers(check_module, inspect.isclass) for _, clsmember in classes: if clsmember == AgentCheck: continue if issubclass(clsmember, AgentCheck): check_class = clsmember if AgentCheck in clsmember.__bases__: continue else: break if check_class is None: raise Exception( "Unable to import check %s. Missing a class that inherits AgentCheck" % name) init_config = config.get('init_config', {}) instances = config.get('instances') agentConfig['checksd_hostname'] = get_hostname(agentConfig) # init the check class try: return check_class(name, init_config=init_config, agentConfig=agentConfig, instances=instances) except TypeError as e: raise Exception("Check is using old API, {0}".format(e)) except Exception: raise
def __init__(self, name, init_config, agentConfig, instances=None): """ Initialize a new check. :param name: The name of the check :param init_config: The config for initializing the check :param agentConfig: The global configuration for the agent :param instances: A list of configuration objects for each instance. """ from aggregator import MetricsAggregator self.name = name self.init_config = init_config or {} self.agentConfig = agentConfig self.in_developer_mode = agentConfig.get('developer_mode') and psutil is not None self._internal_profiling_stats = None self.hostname = agentConfig.get('checksd_hostname') or get_hostname(agentConfig) self.log = logging.getLogger('%s.%s' % (__name__, name)) self.aggregator = MetricsAggregator( self.hostname, formatter=agent_formatter, recent_point_threshold=agentConfig.get('recent_point_threshold', None), histogram_aggregates=agentConfig.get('histogram_aggregates'), histogram_percentiles=agentConfig.get('histogram_percentiles') ) self.events = [] self.service_checks = [] self.instances = instances or [] self.warnings = [] self.library_versions = None self.last_collection_time = defaultdict(int) self._instance_metadata = [] self.svc_metadata = []
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)

    COMMANDS = [
        'start',
        'stop',
        'restart',
        'foreground',
        'status',
        'info',
        'check',
        'configcheck',
        'jmx',
    ]

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    pid_file = PidFile('dd-agent')

    if options.clean:
        pid_file.clean()

    agent = Agent(pid_file.get_path(), autorestart)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set-up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.run()

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.run(config=agentConfig)

    elif 'check' == command:
        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    check.run()
                    print check.get_metrics()
                    print check.get_events()
                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        check.run()
                        print check.get_metrics()
                        print check.get_events()

    elif 'configcheck' == command or 'configtest' == command:
        osname = get_os()
        all_valid = True
        for conf_path in glob.glob(os.path.join(get_confd_path(osname), "*.yaml")):
            basename = os.path.basename(conf_path)
            try:
                check_yaml(conf_path)
            except Exception, e:
                all_valid = False
                print "%s contains errors:\n    %s" % (basename, e)
            else:
                print "%s is valid" % basename
        if all_valid:
            print "All yaml files passed. You can now run the Datadog agent."
            return 0
        else:
            print("Fix the invalid yaml files above in order to start the Datadog agent. "
                  "A useful external tool for yaml parsing can be found at "
                  "http://yaml-online-parser.appspot.com/")
            return 1
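# A sketch of the per-file validation loop in the configcheck branch above,
# using plain PyYAML. The agent's own check_yaml() does more (it also verifies
# the instances layout), so treat this as an approximation; the conf.d path is
# a placeholder.
import glob
import yaml  # PyYAML

all_valid = True
for conf_path in glob.glob('conf.d/*.yaml'):
    try:
        with open(conf_path) as f:
            yaml.safe_load(f)
    except Exception as e:
        all_valid = False
        print("%s contains errors:\n    %s" % (conf_path, e))
    else:
        print("%s is valid" % conf_path)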
def get_my_hostname(self):
    """
    Returns a best guess for the hostname registered with OpenStack for this host
    """
    return self.init_config.get("os_host") or get_hostname(self.agentConfig)
def generate_instances(check_platform=False):
    print("Generating instances...")
    instances = []
    core_list = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

    instances.extend(get_instance_list(
        BenchmarkApo(),
        PlatformCrayUPC(param_cores=[128, 256, 512, 1024]),
        check_platform))
    instances.extend(get_instance_list(
        BenchmarkApo(),
        PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
        check_platform))
    return instances

    # NOTE: everything below this early return is unreachable; it is kept as a
    # pool of disabled benchmark configurations.
    instances.extend(get_instance_list(
        BenchmarkNPB(minClass='C', maxClass='D', kernel_list=['ft']),
        PlatformCrayUPC(param_cores=[128, 256, 512, 1024]),
        check_platform))
    instances.extend(get_instance_list(
        BenchmarkNPB(minClass='B', maxClass='C', kernel_list=None),
        PlatformCrayUPC(param_cores=[16, 32, 64, 128, 256, 512, 1024]),
        check_platform))

    if get_hostname() == 'bulldozer-server':
        instances.extend(get_instance_list(
            BenchmarkUBMatrixMultiplication(),
            PlatformBerkeley(param_cores=[1, 2, 4, 8, 16, 32], conduits=['smp']),
            check_platform))  # disable_optimization=True, experimental=True
    return instances

    if False:
        instances.extend(get_instance_list(
            BenchmarkNPB(maxClass='B', minClass='B', kernel_list=['ft']),
            PlatformBerkeleyTrace(param_cores=core_list, conduits=['ibv']),
            check_platform))
        instances.extend(get_instance_list(
            BenchmarkNPB(minClass='B', maxClass='B', kernel_list=['ft']),
            PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
            check_platform))  # disable_optimization=True, experimental=True
        instances.extend(get_instance_list(
            BenchmarkNPB(maxClass='A', minClass='A', kernel_list=['ft', 'cg', 'is', 'mg']),
            PlatformBerkeleyTrace(param_cores=core_list, conduits=['ibv']),
            check_platform))
        instances.extend(get_instance_list(
            BenchmarkNPB(minClass='A', maxClass='A', kernel_list=['ft', 'cg', 'is', 'mg']),
            PlatformBerkeley(param_cores=core_list, conduits=['ibv']),
            check_platform))  # disable_optimization=True, experimental=True
        #instances.extend( get_instance_list(BenchmarkNPB(minClass='A', maxClass='A'), PlatformBerkeleyTile(), check_platform) )  # disable_optimization=True, experimental=True
        #instances.extend( get_instance_list(BenchmarkNPB_CSEQ(minClass='A', maxClass='A'), PlatformTileCC(), check_platform) )
        #instances.extend( get_instance_list(BenchmarkSSCA3(maxScale=3), PlatformCrayUPCxt5(), check_platform) )
        #instances.extend( get_instance_list(BenchmarkSSCA3(maxScale=2, withFFTW=False), PlatformBerkeleyTile()) )
        #instances.extend( get_instance_list(BenchmarkMatrixMultiplication(), PlatformBerkeleyTile(), check_platform) )
        #instances.extend( get_instance_list(BenchmarkSobel(), PlatformBerkeleyTile(), check_platform) )
        #instances.extend( get_instance_list(BenchmarkRandomAccess2(), PlatformBerkeleyTile(), check_platform) )

    if False:
        instances.extend(get_instance_list(BenchmarkMatrixMultiplication(), PlatformBerkeley(), check_platform))
        instances.extend(get_instance_list(BenchmarkSobel(), PlatformBerkeley(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccess(), PlatformBerkeley(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccess(), PlatformBerkeleyTile(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccessSeq(), PlatformTileCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccessSeq(), PlatformGCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccess2(), PlatformBerkeley(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccess2Seq(), PlatformTileCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkRandomAccess2Seq(), PlatformGCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkSobelSeq(), PlatformTileCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkSobelSeq(), PlatformGCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkMatrixMultiplicationSeq(), PlatformTileCC(), check_platform))
        instances.extend(get_instance_list(BenchmarkMatrixMultiplicationSeq(), PlatformGCC(), check_platform))

    print(len(instances), " instances available.")
    print("")
    return instances
def check_if_valid(self):
    hostname = util.get_hostname()
    return True
class Collector(object):
    """
    The collector is responsible for collecting data from each check and
    passing it along to the emitters, who send it to their final destination.
    """

    def __init__(self, agentConfig, emitters, systemStats):
        self.emit_duration = None
        self.agentConfig = agentConfig
        # system stats is generated by config.get_system_stats
        self.agentConfig['system_stats'] = systemStats
        # agent config is used during checks, system_stats can be accessed through the config
        self.os = get_os()
        self.plugins = None
        self.emitters = emitters
        self.metadata_interval = int(agentConfig.get('metadata_interval', 10 * 60))
        self.metadata_start = time.time()
        socket.setdefaulttimeout(15)
        self.run_count = 0
        self.continue_running = True
        self.metadata_cache = None
        self.initialized_checks_d = []
        self.init_failed_checks_d = []

        # Unix System Checks
        self._unix_system_checks = {
            'disk': u.Disk(log),
            'io': u.IO(log),
            'load': u.Load(log),
            'memory': u.Memory(log),
            'processes': u.Processes(log),
            'cpu': u.Cpu(log)
        }

        # Win32 System Checks
        self._win32_system_checks = {
            'disk': w32.Disk(log),
            'io': w32.IO(log),
            'proc': w32.Processes(log),
            'memory': w32.Memory(log),
            'network': w32.Network(log),
            'cpu': w32.Cpu(log)
        }

        # Old-style metric checks
        self._ganglia = Ganglia(log)
        self._dogstream = Dogstreams.init(log, self.agentConfig)
        self._ddforwarder = DdForwarder(log, self.agentConfig)

        # Agent Metrics
        self._agent_metrics = CollectorMetrics(log)

        self._metrics_checks = []

        # Custom metric checks
        for module_spec in [s.strip() for s in self.agentConfig.get('custom_checks', '').split(',')]:
            if len(module_spec) == 0:
                continue
            try:
                self._metrics_checks.append(modules.load(module_spec, 'Check')(log))
                log.info("Registered custom check %s" % module_spec)
                log.warning("Old format custom checks are deprecated. They should be moved to "
                            "the checks.d interface as old custom checks will be removed in a "
                            "next version")
            except Exception, e:
                log.exception('Unable to load custom check module %s' % module_spec)

        # Event Checks
        self._event_checks = [
            Nagios(get_hostname()),
        ]

        # Resource Checks
        self._resources_checks = [
            ResProcesses(log, self.agentConfig)
        ]
def flush_trs():
    if self._watchdog:
        self._watchdog.reset()
    self._postMetrics()
    self._tr_manager.flush()

tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                           TRANSACTION_FLUSH_INTERVAL,
                                           io_loop=self.mloop)

# Register optional Graphite listener
gport = self._agentConfig.get("graphite_listen_port", None)
if gport is not None:
    log.info("Starting graphite listener on port %s" % gport)
    from graphite import GraphiteServer
    gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
    if non_local_traffic is True:
        gs.listen(gport)
    else:
        gs.listen(gport, address="localhost")

# Start everything
if self._watchdog:
    self._watchdog.reset()
tr_sched.start()
self.mloop.start()
log.info("Stopped")

def stop(self):
    self.mloop.stop()
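# A minimal, runnable sketch of the PeriodicCallback pattern used above,
# written against modern Tornado (where the io_loop argument is gone and
# callbacks run on the current IOLoop). The flush is simulated with a print,
# and the loop stops itself after 3.5 seconds.
import tornado.ioloop

def flush():
    print("flush!")

sched = tornado.ioloop.PeriodicCallback(flush, 1000)  # period in milliseconds
sched.start()
loop = tornado.ioloop.IOLoop.current()
loop.call_later(3.5, loop.stop)
loop.start()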
def get_value(self):
    return util.get_hostname()
def init(config_path=None, use_watchmonitor=False, use_forwarder=False, args=None):
    """Configure the server and the reporting thread.
    """
    c = get_config(parse_args=False, cfg_path=config_path)

    if (not c['use_monitorstatsd'] and
            (args and args[0] in ['start', 'restart'] or not args)):
        log.info("Monitorstatsd is disabled. Exiting")
        # We're exiting purposefully, so exit with zero (supervisor's expected
        # code). HACK: Sleep a little bit so supervisor thinks we've started
        # cleanly and thus can exit cleanly.
        sleep(4)
        sys.exit(0)

    log.debug("Configuring monitorstatsd")

    port = c['monitorstatsd_port']
    interval = monitorSTATSD_FLUSH_INTERVAL
    api_key = c['api_key']
    aggregator_interval = monitorSTATSD_AGGREGATOR_BUCKET_SIZE
    non_local_traffic = c['non_local_traffic']
    forward_to_host = c.get('statsd_forward_host')
    forward_to_port = c.get('statsd_forward_port')
    event_chunk_size = c.get('event_chunk_size')
    recent_point_threshold = c.get('recent_point_threshold', None)
    ip = c.get('ip', "unknown")

    target = c['m_url']
    if use_forwarder:
        target = c['monitorstatsd_target']

    hostname = get_hostname(c)

    # Create the aggregator (which is the point of communication between the
    # server and reporting threads).
    assert 0 < interval
    aggregator = MetricsBucketAggregator(
        hostname,
        aggregator_interval,
        recent_point_threshold=recent_point_threshold,
        formatter=get_formatter(c),
        histogram_aggregates=c.get('histogram_aggregates'),
        histogram_percentiles=c.get('histogram_percentiles'),
        utf8_decoding=c['utf8_decoding'])

    # Start the reporting thread.
    reporter = Reporter(c, interval, aggregator, target, api_key,
                        use_watchmonitor, event_chunk_size)

    # Start the server on an IPv4 stack
    # Default to loopback
    server_host = c['bind_host']
    # If specified, bind to all addresses
    if non_local_traffic:
        server_host = ''

    server = Server(aggregator, server_host, port,
                    forward_to_host=forward_to_host,
                    forward_to_port=forward_to_port)

    return reporter, server, c
def run_secure_loop():
    logging.debug("Remote: Starting a new connection loop for %s (%s:%d)"
                  % (self.display_hostname, self.ip_address, self.port))

    cert = auth.get_singleton().load_cert(self.hostname, self.ip_address)
    creds = grpc.ssl_channel_credentials(cert)

    with grpc.secure_channel("%s:%d" % (self.ip_address, self.port), creds) as channel:
        future = grpc.channel_ready_future(channel)

        try:
            future.result(timeout=4)
            self.stub = warp_pb2_grpc.WarpStub(channel)
        except grpc.FutureTimeoutError:
            self.set_remote_status(RemoteStatus.UNREACHABLE)
            future.cancel()

            if not self.ping_timer.is_set():
                logging.debug("Remote: Unable to establish secure connection with %s (%s:%d). Trying again in %ds"
                              % (self.display_hostname, self.ip_address, self.port, CHANNEL_RETRY_WAIT_TIME))
                self.ping_timer.wait(CHANNEL_RETRY_WAIT_TIME)
                return True  # run_secure_loop()

            return False  # run_secure_loop()

        duplex_fail_counter = 0
        one_ping = False

        # A successful duplex response lets us finish setting things up.
        while not self.ping_timer.is_set():
            if self.busy:
                logging.debug("Remote Ping: Skipping keepalive ping to %s (%s:%d) (busy)"
                              % (self.display_hostname, self.ip_address, self.port))
                self.busy = False
            else:
                try:
                    # t = GLib.get_monotonic_time()
                    logging.debug("Remote Ping: to %s (%s:%d)"
                                  % (self.display_hostname, self.ip_address, self.port))
                    self.stub.Ping(warp_pb2.LookupName(id=self.local_ident,
                                                       readable_name=util.get_hostname()),
                                   timeout=5)
                    # logging.debug("Latency: %s (%s)"
                    #               % (util.precise_format_time_span(GLib.get_monotonic_time() - t), self.display_hostname))

                    if not one_ping:
                        self.set_remote_status(RemoteStatus.AWAITING_DUPLEX)
                        if self.check_duplex_connection():
                            logging.debug("Remote: Connected to %s (%s:%d)"
                                          % (self.display_hostname, self.ip_address, self.port))
                            self.set_remote_status(RemoteStatus.ONLINE)
                            self.rpc_call(self.update_remote_machine_info)
                            self.rpc_call(self.update_remote_machine_avatar)
                            one_ping = True
                        else:
                            duplex_fail_counter += 1
                            if duplex_fail_counter > DUPLEX_MAX_FAILURES:
                                logging.debug("Remote: CheckDuplexConnection to %s (%s:%d) failed too many times"
                                              % (self.display_hostname, self.ip_address, self.port))
                                self.ping_timer.wait(CHANNEL_RETRY_WAIT_TIME)
                                return True
                except grpc.RpcError as e:
                    logging.debug("Remote: Ping failed, shutting down %s (%s:%d)"
                                  % (self.display_hostname, self.ip_address, self.port))
                    break

            self.ping_timer.wait(CONNECTED_PING_TIME if self.status == RemoteStatus.ONLINE
                                 else DUPLEX_WAIT_PING_TIME)

        # This is reached by the RpcError break above. If the remote is still discoverable, start
        # the secure loop over. This could have happened as a result of a quick disco/reconnect,
        # and we don't notice until it has already come back. In this case, try a new connection.
        if self.has_zc_presence and not self.ping_timer.is_set():
            return True  # run_secure_loop()

        # The ping timer has been triggered; this is an orderly shutdown.
        return False  # run_secure_loop()
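# A minimal, runnable sketch of the connection probe used above:
# grpc.channel_ready_future() returns a Future that completes once the channel
# is connected, and result() raises grpc.FutureTimeoutError if that does not
# happen in time. The address is a placeholder; an insecure channel stands in
# for the TLS channel to keep the sketch self-contained.
import grpc

channel = grpc.insecure_channel("localhost:50051")
try:
    grpc.channel_ready_future(channel).result(timeout=4)
    print("channel ready")
except grpc.FutureTimeoutError:
    print("no server reachable within 4s")
finally:
    channel.close()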
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')

    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]

    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]

    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    if command not in DD_AGENT_COMMANDS:
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0]
            )
            return 1

        check_name = args[1]
        try:
            import checks.collector
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)

                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)

                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)

                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception, e:
            print 'The upload failed:\n{0}'.format(str(e))
def run(self, config=None):
    signal.signal(signal.SIGTERM, self._handle_sigterm)
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)
    signal.signal(signal.SIGINT, self._handle_sigterm)
    signal.signal(signal.SIGHUP, self._handle_sighup)

    CollectorStatus().persist()

    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    self._checksd = load_check_directory(self._agentConfig, hostname)

    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)
    self.collector_profile_interval = self._agentConfig.get('collector_profile_interval',
                                                            DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    self.check_frequency = int(self._agentConfig['check_freq'])
    watchmonitor = self._get_watchmonitor(self.check_frequency)

    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    while self.run_forever:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(self._checksd['initialized_checks'])))

        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)
        if self.configs_reloaded:
            self.configs_reloaded = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        if self.autorestart and self._should_restart():
            self._do_restart()

        if self.run_forever:
            if watchmonitor:
                watchmonitor.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    log.info("Exiting. Bye bye.")
    sys.exit(0)
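# A runnable sketch of the signal wiring above, standard library only: register
# handlers that flip a flag, loop until it flips, then exit cleanly. MiniAgent
# is a hypothetical stand-in; send SIGTERM (or press Ctrl-C) to stop it.
import signal
import time

class MiniAgent(object):
    def __init__(self):
        self.run_forever = True

    def _handle_sigterm(self, signum, frame):
        print("caught signal %s, stopping" % signum)
        self.run_forever = False

agent = MiniAgent()
signal.signal(signal.SIGTERM, agent._handle_sigterm)
signal.signal(signal.SIGINT, agent._handle_sigterm)
while agent.run_forever:
    time.sleep(1)
print("Exiting. Bye bye.")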
def run(self, config=None):
    """Main loop of the collector"""

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig, hostname)

    self.collector = Collector(agentConfig, emitters, systemStats, hostname)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    # Run the main loop.
    while self.run_forever:
        # Enable the profiler if needed
        profiled = False
        if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes':
            try:
                import cProfile
                profiler = cProfile.Profile()
                profiled = True
                profiler.enable()
                log.debug("Agent profiling is enabled")
            except Exception:
                log.warn("Cannot enable profiler")

        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        # Disable the profiler and log its stats
        if agentConfig.get('profile', False) and agentConfig.get('profile').lower() == 'yes' and profiled:
            try:
                profiler.disable()
                import pstats
                from cStringIO import StringIO
                s = StringIO()
                ps = pstats.Stats(profiler, stream=s).sort_stats("cumulative")
                ps.print_stats()
                log.debug(s.getvalue())
            except Exception:
                log.warn("Cannot disable profiler")

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            time.sleep(check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def save_server_cert(self, cert_bytes):
    path = os.path.join(CERT_FOLDER, "%s.pem" % (util.get_hostname(),))
    self._save_bytes(path, cert_bytes)
def run(self, config=None):
    """Main loop of the collector"""

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters(agentConfig)

    # Load the checks.d checks
    checksd = load_check_directory(agentConfig, hostname)

    self.collector = Collector(agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    collector_profile_interval = agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    check_frequency = int(agentConfig['check_freq'])
    watchdog = self._get_watchdog(check_frequency, agentConfig)

    # Initialize the auto-restarter
    self.restart_interval = int(
        agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=checksd, start_event=self.start_event)

        if profiled:
            if collector_profiled_runs >= collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue,
        # otherwise just exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            time.sleep(check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running
    # as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
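All of these run() loops install handlers for SIGTERM, SIGUSR1 (and, in some variants, SIGHUP), but the handlers themselves are not shown. Given that the loop exits by re-checking self.run_forever, a hedged sketch of what the two main handlers plausibly do follows; only the names come from the snippets, the bodies are assumptions:

# Plausible handler bodies (assumed, not taken from the snippets above).
def _handle_sigterm(self, signum, frame):
    log.debug("Caught sigterm. Stopping run loop.")
    self.run_forever = False  # the while loop drains and the clean-up code runs

def _handle_sigusr1(self, signum, frame):
    # SIGUSR1 is described above as "an exit with an autorestart":
    # stop the loop, then trigger the restart path.
    self._handle_sigterm(signum, frame)
    self._do_restart()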
import sys

# Fragment from inside a generator that polls the Nagios log; the enclosing
# function header is not included in the snippet.
logger = logging.getLogger("ddagent.checks.nagios")
nagios = Nagios(get_hostname())
events = nagios.check(logger, {
    'api_key': api_key,
    'nagios_log': log_file
}, move_end=False)
for e in events:
    yield e


if __name__ == "__main__":
    import logging
    import socket

    logger = logging.getLogger("ddagent.checks.nagios")
    nagios = Nagios(get_hostname())
    config = {
        'api_key': 'apikey_2',
        'nagios_log': '/var/log/nagios3/nagios.log'
    }
    events = nagios.check(logger, config, move_end=False)
    while True:
        # for e in events:
        #     print "Event:", e
        time.sleep(5)
        events = nagios.check(logger, config)
def check_if_valid(self):
    return util.get_hostname() == 'cray'
#!/usr/bin/env python3
import instance
import configuration_gem5
from run_benchmarks import run_benchmarks
import argparse
import os
import sys
import util
import shutil
from jobschedulers.jobscheduler_all import *

job_scheduler = get_current_scheduler()
if util.get_hostname() == 'login':
    job_scheduler.max_cores = 24

compiled_benchmarks = set()


def parse_arguments():
    parser = argparse.ArgumentParser(description='Load GEM5 results')
    parser.add_argument('-b', '--build-gem5', action='store_true',
                        help='Build gem5')
    parser.add_argument('-c', '--clean-disk-image', action='store_true',
                        help='Clean the disk image (rebuild benchmarks)')
    parser.add_argument('-r', '--resubmit-all', action='store_true',
                        help='Resubmit even running jobs')
    parser.add_argument('-m', '--max-results', default=1,
                        help='Number of results needed per instance', type=int)
    res = parser.parse_args()
    return res


def main():
    args = parse_arguments()
    gem5_environment_check()
# Fragment from inside a method of the forwarder application; the enclosing
# method header is not included in the snippet. The flush_trs() header is
# reconstructed here because the PeriodicCallback below references it.
def flush_trs():
    self._watchmonitor.reset()
    self._postMetrics()
    self._postAgentInfoToServer()
    self._tr_manager.flush()

tr_sched = tornado.ioloop.PeriodicCallback(flush_trs,
                                           TRANSACTION_FLUSH_INTERVAL,
                                           io_loop=self.mloop)

# Register optional Graphite listener
gport = self._agentConfig.get("graphite_listen_port", None)
if gport is not None:
    log.info("Starting graphite listener on port %s" % gport)
    from graphite import GraphiteServer
    gs = GraphiteServer(self, get_hostname(self._agentConfig), io_loop=self.mloop)
    if non_local_traffic is True:
        gs.listen(gport)
    else:
        gs.listen(gport, address="localhost")

# Start everything
if self._watchmonitor:
    self._watchmonitor.reset()
tr_sched.start()
self.mloop.start()
log.info("Stopped")

# (The snippet is truncated at the start of the next method, def stop(self).)
def check(self, instance):
    host = instance.get('host', 'localhost')
    port = int(instance.get('port', 2181))
    timeout = float(instance.get('timeout', 3.0))
    expected_mode = (instance.get('expected_mode') or '').strip()
    tags = instance.get('tags', [])
    cx_args = (host, port, timeout)
    sc_tags = ["host:{0}".format(host), "port:{0}".format(port)]
    hostname = get_hostname(self.agentConfig)
    report_instance_mode = instance.get("report_instance_mode", True)

    zk_version = None  # parse_stat will parse and set version string

    # Send a service check based on the `ruok` response.
    # Set instance status to down if not ok.
    try:
        ruok_out = self._send_command('ruok', *cx_args)
    except ZKConnectionFailure:
        # The server should not respond at all if it's not OK.
        status = AgentCheck.CRITICAL
        message = 'No response from `ruok` command'
        self.increment('zookeeper.timeouts')
        if report_instance_mode:
            self.report_instance_mode(hostname, 'down', tags)
        raise
    else:
        ruok_out.seek(0)
        ruok = ruok_out.readline()
        if ruok == 'imok':
            status = AgentCheck.OK
            message = None  # `message` is read in the finally block, so define it here too
        else:
            status = AgentCheck.WARNING
            message = u'Response from the server: %s' % ruok
    finally:
        self.service_check('zookeeper.ruok', status, message=message, tags=sc_tags)

    # Read metrics from the `stat` output.
    try:
        stat_out = self._send_command('stat', *cx_args)
    except ZKConnectionFailure:
        self.increment('zookeeper.timeouts')
        if report_instance_mode:
            self.report_instance_mode(hostname, 'down', tags)
        raise
    except Exception as e:
        self.warning(e)
        self.increment('zookeeper.datadog_client_exception')
        if report_instance_mode:
            self.report_instance_mode(hostname, 'unknown', tags)
        raise
    else:
        # Parse the response
        metrics, new_tags, mode, zk_version = self.parse_stat(stat_out)

        # Write the data
        if mode != 'inactive':
            for metric, value, m_type in metrics:
                submit_metric = getattr(self, m_type)
                submit_metric(metric, value, tags=tags + new_tags)

        if report_instance_mode:
            self.report_instance_mode(hostname, mode, tags)

        if expected_mode:
            if mode == expected_mode:
                status = AgentCheck.OK
                message = u"Server is in %s mode" % mode
            else:
                status = AgentCheck.CRITICAL
                message = u"Server is in %s mode but check expects %s mode" \
                    % (mode, expected_mode)
            self.service_check('zookeeper.mode', status, message=message, tags=sc_tags)

    # Read metrics from the `mntr` output
    if zk_version and LooseVersion(zk_version) > LooseVersion("3.4.0"):
        try:
            mntr_out = self._send_command('mntr', *cx_args)
        except ZKConnectionFailure:
            self.increment('zookeeper.timeouts')
            if report_instance_mode:
                self.report_instance_mode(hostname, 'down', tags)
            raise
        except Exception as e:
            self.warning(e)
            self.increment('zookeeper.datadog_client_exception')
            if report_instance_mode:
                self.report_instance_mode(hostname, 'unknown', tags)
            raise
        else:
            metrics, mode = self.parse_mntr(mntr_out)
            mode_tag = "mode:%s" % mode
            if mode != 'inactive':
                for name in metrics:
                    self.gauge(name, metrics[name], tags=tags + [mode_tag])

            if report_instance_mode:
                self.report_instance_mode(hostname, mode, tags)
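The check relies on a _send_command helper that is not shown. ZooKeeper's four-letter-word commands (ruok, stat, mntr) are plain TCP: connect to the client port, send the word, and read until the server closes the connection. The seek(0)/readline() calls above imply the helper returns a file-like buffer, so a sketch under those assumptions could look like this (Python 2, to match the check; the body is illustrative, not the project's code):

import socket
from StringIO import StringIO  # Python 2, matching the check above


def _send_command(self, command, host, port, timeout):
    """Assumed shape of the helper: send a ZooKeeper four-letter word and
    return the full response as a seekable buffer."""
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(timeout)
    buf = StringIO()
    try:
        sock.connect((host, port))
        sock.sendall(command)
        # The server writes its reply and then closes the connection.
        chunk = sock.recv(4096)
        while chunk:
            buf.write(chunk)
            chunk = sock.recv(4096)
    except (socket.timeout, socket.error):
        raise ZKConnectionFailure()
    finally:
        sock.close()
    return buf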
def main():
    options, args = get_parsed_args()
    agentConfig = get_config(options=options)
    autorestart = agentConfig.get('autorestart', False)
    hostname = get_hostname(agentConfig)
    in_developer_mode = agentConfig.get('developer_mode')
    COMMANDS_AGENT = [
        'start',
        'stop',
        'restart',
        'status',
        'foreground',
    ]
    COMMANDS_NO_AGENT = [
        'info',
        'check',
        'configcheck',
        'jmx',
        'flare',
    ]
    COMMANDS = COMMANDS_AGENT + COMMANDS_NO_AGENT

    if len(args) < 1:
        sys.stderr.write("Usage: %s %s\n" % (sys.argv[0], "|".join(COMMANDS)))
        return 2

    command = args[0]
    if command not in COMMANDS:
        sys.stderr.write("Unknown command: %s\n" % command)
        return 3

    # Deprecation notice
    if command not in DD_AGENT_COMMANDS:
        # Will become an error message and exit after deprecation period
        from utils.deprecations import deprecate_old_command_line_tools
        deprecate_old_command_line_tools()

    if command in COMMANDS_AGENT:
        agent = Agent(PidFile(PID_NAME, PID_DIR).get_path(), autorestart,
                      in_developer_mode=in_developer_mode)

    if command in START_COMMANDS:
        log.info('Agent version %s' % get_version())

    if 'start' == command:
        log.info('Start daemon')
        agent.start()

    elif 'stop' == command:
        log.info('Stop daemon')
        agent.stop()

    elif 'restart' == command:
        log.info('Restart daemon')
        agent.restart()

    elif 'status' == command:
        agent.status()

    elif 'info' == command:
        return Agent.info(verbose=options.verbose)

    elif 'foreground' == command:
        logging.info('Running in foreground')
        if autorestart:
            # Set up the supervisor callbacks and fork it.
            logging.info('Running Agent with auto-restart ON')

            def child_func():
                agent.start(foreground=True)

            def parent_func():
                agent.start_event = False

            AgentSupervisor.start(parent_func, child_func)
        else:
            # Run in the standard foreground.
            agent.start(foreground=True)

    elif 'check' == command:
        if len(args) < 2:
            sys.stderr.write(
                "Usage: %s check <check_name> [check_rate]\n"
                "Add check_rate as last argument to compute rates\n"
                % sys.argv[0])
            return 1

        check_name = args[1]
        try:
            import checks.collector
            # Try the old-style check first
            print getattr(checks.collector, check_name)(log).check(agentConfig)
        except Exception:
            # If not an old-style check, try checks.d
            checks = load_check_directory(agentConfig, hostname)
            for check in checks['initialized_checks']:
                if check.name == check_name:
                    if in_developer_mode:
                        check.run = AgentProfiler.wrap_profiling(check.run)
                    cs = Collector.run_single_check(check, verbose=True)
                    print CollectorStatus.render_check_status(cs)
                    if len(args) == 3 and args[2] == 'check_rate':
                        print "Running 2nd iteration to capture rate metrics"
                        time.sleep(1)
                        cs = Collector.run_single_check(check, verbose=True)
                        print CollectorStatus.render_check_status(cs)
                    check.stop()

    elif 'configcheck' == command or 'configtest' == command:
        configcheck()
        if agentConfig.get('service_discovery', False):
            # Set the TRACE_CONFIG flag to True to make load_check_directory
            # return the source of the config objects. Then call
            # load_check_directory here and pass the result to sd_configcheck
            # to avoid circular imports.
            agentConfig[TRACE_CONFIG] = True
            configs = {
                # check_name: (config_source, config)
            }
            print("\nLoading check configurations...\n\n")
            configs = load_check_directory(agentConfig, hostname)
            sd_configcheck(agentConfig, configs)

    elif 'jmx' == command:
        jmx_command(args[1:], agentConfig)

    elif 'flare' == command:
        Flare.check_user_rights()
        case_id = int(args[1]) if len(args) > 1 else None
        f = Flare(True, case_id)
        f.collect()
        try:
            f.upload()
        except Exception as e:
            print 'The upload failed:\n{0}'.format(str(e))
def start_graphite_listener(port):
    from util import get_hostname
    echo_server = GraphiteServer(None, get_hostname(None))
    echo_server.listen(port)
    IOLoop.instance().start()
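Since the final line blocks on the Tornado IOLoop, this helper never returns; a minimal usage sketch would run it as a dedicated process. The port number below is arbitrary, chosen only for illustration:

# Illustrative usage: the call blocks on IOLoop.instance().start(),
# so it is normally the last thing a listener process does.
if __name__ == '__main__':
    start_graphite_listener(17124)  # port chosen arbitrarily for this sketch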
def run(self, config=None):
    """Main loop of the collector"""

    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(
        self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(self._checksd['initialized_checks'])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # This flag records whether the check configs have already been reloaded
        # during the current run of the agent. The collector uses it to decide
        # whether it needs to look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for changes in the config template store. The
        # self.sd_backend.reload_check_configs flag is set to True if a
        # config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery. The `reload_check_configs`
        # flag can be set through the docker_daemon check or by
        # ConfigStore.crawl_config_template above.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
                self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for the next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def load_server_cert(self):
    path = os.path.join(CERT_FOLDER, "%s.pem" % (util.get_hostname(),))
    return self._load_bytes(path)
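save_server_cert and load_server_cert delegate to _save_bytes/_load_bytes helpers that are not included in the snippets. A minimal sketch of what they might look like, assuming they are thin wrappers over binary file I/O; only the names come from the snippets, the bodies are guesses:

# Assumed helper bodies; only the method names appear in the snippets above.
def _save_bytes(self, path, data):
    # Write the PEM bytes for this host's certificate.
    with open(path, "wb") as f:
        f.write(data)

def _load_bytes(self, path):
    # Read the PEM bytes back as a single bytestring.
    with open(path, "rb") as f:
        return f.read()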