def __init__(self, agentConfig):
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)
    self.dockerutil = DockerUtil(config_store=self.config_store)
    self.docker_client = self.dockerutil.client
    if Platform.is_k8s():
        try:
            self.kubeutil = KubeUtil()
        except Exception as ex:
            self.kubeutil = None
            log.error("Couldn't instantiate the kubernetes client, "
                      "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))
    if Platform.is_nomad():
        self.nomadutil = NomadUtil()
    elif Platform.is_ecs_instance():
        self.ecsutil = ECSUtil()
    self.VAR_MAPPING = {
        'host': self._get_host_address,
        'port': self._get_port,
        'tags': self._get_additional_tags,
    }
    AbstractSDBackend.__init__(self, agentConfig)
def __init__(self, agentConfig):
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)
    self.dockerutil = DockerUtil(config_store=self.config_store)
    self.docker_client = self.dockerutil.client
    if Platform.is_k8s():
        try:
            self.kubeutil = KubeUtil()
        except Exception as ex:
            self.kubeutil = None
            log.error(
                "Couldn't instantiate the kubernetes client, "
                "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))
    if Platform.is_nomad():
        self.nomadutil = NomadUtil()
    elif Platform.is_ecs_instance():
        self.ecsutil = ECSUtil()
    self.VAR_MAPPING = {
        'host': self._get_host_address,
        'port': self._get_port,
        'tags': self._get_additional_tags,
    }
    AbstractSDBackend.__init__(self, agentConfig)
def __init__(self, agentConfig):
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)
    self.dockerutil = DockerUtil(config_store=self.config_store)
    self.kubeutil = None
    if Platform.is_k8s():
        try:
            self.kubeutil = KubeUtil()
        except Exception as ex:
            log.error(
                "Couldn't instantiate the kubernetes client, "
                "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))
    self.metadata_collector = MetadataCollector()
    self.VAR_MAPPING = {
        'host': self._get_host_address,
        'pid': self._get_container_pid,
        'port': self._get_port,
        'container-name': self._get_container_name,
        'tags': self._get_additional_tags,
    }
    AbstractSDBackend.__init__(self, agentConfig)
def test_get_check_tpls_kube(self, *args):
    """Test get_check_tpls for kubernetes annotations"""
    valid_config = ['image_0', 'image_1', 'image_2']
    invalid_config = ['bad_image_0']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config + invalid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        tpl = [(CONFIG_FROM_KUBE, t[1]) for t in tpl]
        if tpl:
            self.assertNotEquals(
                tpl,
                config_store.get_check_tpls('k8s-' + image, auto_conf=True))
        self.assertEquals(
            tpl,
            config_store.get_check_tpls(
                'k8s-' + image, auto_conf=True,
                kube_pod_name=image,
                kube_container_name='foo',
                kube_annotations=dict(zip(
                    ['service-discovery.datadoghq.com/foo.check_names',
                     'service-discovery.datadoghq.com/foo.init_configs',
                     'service-discovery.datadoghq.com/foo.instances'],
                    self.mock_raw_templates[image][0]))))
def __init__(self, **kwargs):
    self._docker_root = None
    self.events = []
    if 'init_config' in kwargs and 'instance' in kwargs:
        init_config = kwargs.get('init_config')
        instance = kwargs.get('instance')
    else:
        init_config, instance = self.get_check_config()
    self.set_docker_settings(init_config, instance)

    # At first run we'll just collect the events from the latest 60 secs
    self._latest_event_collection_ts = int(time.time()) - 60

    # if agentConfig is passed it means service discovery is enabled
    # and we need the config store
    if 'agentConfig' in kwargs:
        self.config_store = get_config_store(kwargs['agentConfig'])
    else:
        self.config_store = None

    # Try to detect if we are on ECS
    self._is_ecs = False
    try:
        containers = self.client.containers()
        for co in containers:
            if '/ecs-agent' in co.get('Names', ''):
                self._is_ecs = True
    except Exception:
        pass
def __init__(self, agentConfig):
    self.docker_client = DockerUtil().client
    if is_k8s():
        self.kubeutil = KubeUtil()
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)
    self.VAR_MAPPING = {
        'host': self._get_host,
        'port': self._get_ports,
        'tags': self._get_additional_tags,
    }
    AbstractSDBackend.__init__(self, agentConfig)
def test_get_check_tpls(self, *args):
    """Test get_check_tpls"""
    valid_config = ['image_0', 'image_1', 'image_2']
    invalid_config = ['bad_image_0', 'bad_image_1']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        self.assertEquals(tpl, config_store.get_check_tpls(image))
    for image in invalid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        self.assertEquals(tpl, config_store.get_check_tpls(image))
def test_get_check_tpls(self, mock_client_read):
    """Test get_check_tpls"""
    valid_config = ['image_0', 'image_1', 'image_2']
    invalid_config = ['bad_image_0', 'bad_image_1']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config:
        tpl = self.mock_tpls.get(image)[1]
        self.assertEquals(tpl, config_store.get_check_tpls(image))
    for image in invalid_config:
        tpl = self.mock_tpls.get(image)[1]
        self.assertEquals(tpl, config_store.get_check_tpls(image))
def test_get_auto_config(self):
    """Test _get_auto_config"""
    expected_tpl = {
        'redis': ('redisdb', None, {"host": "%%host%%", "port": "%%port%%"}),
        'consul': ('consul', None, {
            "url": "http://%%host%%:%%port%%",
            "catalog_checks": True,
            "new_leader_checks": True
        }),
        'foobar': None
    }
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in expected_tpl.keys():
        config = config_store._get_auto_config(image)
        self.assertEquals(config, expected_tpl.get(image))
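# The test above pins down the contract of _get_auto_config: a known image
# resolves to a (check_name, init_config, instance) tuple, an unknown one to
# None. A minimal sketch of such a lookup, assuming auto-conf templates live
# as YAML files in an auto_conf.d directory; the directory name, file layout
# and helper name are illustrative, not the store's actual implementation.
import os
import yaml

def load_auto_conf_template(image, auto_conf_dir='auto_conf.d'):
    # Map an image name (e.g. 'redis') to a (check_name, init_config,
    # instance) tuple mirroring the expected_tpl fixtures above.
    path = os.path.join(auto_conf_dir, '%s.yaml' % image)
    if not os.path.exists(path):
        return None  # unknown image, like 'foobar' in the test
    with open(path) as f:
        tpl = yaml.safe_load(f)
    # e.g. {'check_name': 'redisdb', 'init_config': None,
    #       'instances': [{'host': '%%host%%', 'port': '%%port%%'}]}
    return (tpl['check_name'], tpl.get('init_config'), tpl['instances'][0])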
def test_get_auto_config(self, mock_get_auto_confd_path):
    """Test _get_auto_config"""
    expected_tpl = {
        'disk': [('disk', None, {"host": "%%host%%", "port": "%%port%%"})],
        'consul': [('consul', None, {
            "url": "http://%%host%%:%%port%%",
            "catalog_checks": True,
            "new_leader_checks": True
        })],
        'disk:v1': [('disk', None, {"host": "%%host%%", "port": "%%port%%"})],
        'foobar': []
    }
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in expected_tpl.keys():
        config = config_store._get_auto_config(image)
        self.assertEquals(config, expected_tpl.get(image))
def test_read_config_from_store(self, issue_read):
    """Test read_config_from_store"""
    valid_idents = [('nginx', 'nginx'), ('nginx:latest', 'nginx:latest'),
                    ('custom-nginx', 'custom-nginx'), ('custom-nginx:latest', 'custom-nginx'),
                    ('repo/custom-nginx:latest', 'custom-nginx'),
                    ('repo/dir:5000/custom-nginx:latest', 'repo/dir:5000/custom-nginx:latest')]
    invalid_idents = ['foo']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for ident, expected_key in valid_idents:
        tpl = config_store.read_config_from_store(ident)
        # source is added after reading from the store
        self.assertEquals(tpl, ('template',) + self.mock_tpls.get(expected_key))
    for ident in invalid_idents:
        self.assertEquals(config_store.read_config_from_store(ident), [])
def __init__(self, agentConfig):
    try:
        self.config_store = get_config_store(agentConfig=agentConfig)
    except Exception as e:
        log.error('Failed to instantiate the config store client. '
                  'Auto-config only will be used. %s' % str(e))
        agentConfig['sd_config_backend'] = None
        self.config_store = get_config_store(agentConfig=agentConfig)
    self.dockerutil = DockerUtil(config_store=self.config_store)
    self.kubeutil = None
    if Platform.is_k8s():
        try:
            self.kubeutil = KubeUtil()
        except Exception as ex:
            log.error("Couldn't instantiate the kubernetes client, "
                      "subsequent kubernetes calls will fail as well. Error: %s" % str(ex))
    self.metadata_collector = MetadataCollector()
    self.VAR_MAPPING = {
        'host': self._get_host_address,
        'pid': self._get_container_pid,
        'port': self._get_port,
        'container-name': self._get_container_name,
        'tags': self._get_additional_tags,
    }
    # docker labels we'll add as tags to all instances SD configures
    self.docker_labels_as_tags = agentConfig.get('docker_labels_as_tags', '')
    if self.docker_labels_as_tags:
        self.docker_labels_as_tags = [label.strip() for label in self.docker_labels_as_tags.split(',')]
    else:
        self.docker_labels_as_tags = []
    AbstractSDBackend.__init__(self, agentConfig)
def print_templates(agentConfig):
    if agentConfig.get('sd_config_backend') in SD_CONFIG_BACKENDS:
        print("Configuration templates:\n")
        templates = {}
        sd_template_dir = agentConfig.get('sd_template_dir')
        config_store = get_config_store(agentConfig)
        try:
            templates = config_store.dump_directory(sd_template_dir)
        except Exception as ex:
            print("Failed to extract configuration templates from the backend:\n%s" % str(ex))
        for ident, tpl in templates.iteritems():
            print("- Identifier %s:\n\tcheck names: %s\n\tinit_configs: %s\n\tinstances: %s" % (
                ident,
                json.dumps(json.loads(tpl.get('check_names')), indent=2),
                json.dumps(json.loads(tpl.get('init_configs')), indent=2),
                json.dumps(json.loads(tpl.get('instances')), indent=2),
            ))
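# The json.dumps(json.loads(...)) round-trip above implies dump_directory
# returns raw JSON strings for each template field. A hypothetical entry of
# that shape (the nginx values are illustrative, not from the source):
templates = {
    'nginx': {
        'check_names': '["nginx"]',
        'init_configs': '[{}]',
        'instances': '[{"nginx_status_url": "http://%%host%%:%%port%%/nginx_status"}]',
    },
}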
def test_get_check_tpls_kube(self, mock_client_read):
    """Test get_check_tpls for kubernetes annotations"""
    valid_config = ['image_0', 'image_1', 'image_2']
    invalid_config = ['bad_image_0']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config + invalid_config:
        tpl = self.mock_tpls.get(image)[1]
        if tpl:
            self.assertNotEquals(
                tpl,
                config_store.get_check_tpls('k8s-' + image, auto_conf=True))
        self.assertEquals(
            tpl,
            config_store.get_check_tpls(
                'k8s-' + image, auto_conf=True,
                kube_annotations=dict(zip(
                    ['com.datadoghq.sd/check_names',
                     'com.datadoghq.sd/init_configs',
                     'com.datadoghq.sd/instances'],
                    self.mock_tpls[image][0]))))
def print_templates(agentConfig):
    if agentConfig.get('sd_config_backend') in SD_CONFIG_BACKENDS:
        print("Configuration templates:\n")
        templates = {}
        sd_template_dir = agentConfig.get('sd_template_dir')
        config_store = get_config_store(agentConfig)
        try:
            templates = config_store.dump_directory(sd_template_dir)
        except Exception as ex:
            print("Failed to extract configuration templates from the backend:\n%s" % str(ex))
        for ident, tpl in templates.iteritems():
            print("- Identifier %s:\n\tcheck names: %s\n\tinit_configs: %s\n\tinstances: %s" % (
                ident,
                tpl.get('check_names'),
                tpl.get('init_configs'),
                tpl.get('instances'),
            ))
def test_read_config_from_store(self, *args):
    """Test read_config_from_store"""
    valid_idents = [('nginx', 'nginx'), ('nginx:latest', 'nginx:latest'),
                    ('custom-nginx', 'custom-nginx'), ('custom-nginx:latest', 'custom-nginx'),
                    ('repo/custom-nginx:latest', 'custom-nginx'),
                    ('repo/dir:5000/custom-nginx:latest', 'repo/dir:5000/custom-nginx:latest')]
    invalid_idents = ['foo']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for ident, expected_key in valid_idents:
        tpl = config_store.read_config_from_store(ident)
        # source is added after reading from the store
        self.assertEquals(
            tpl,
            {
                CONFIG_FROM_AUTOCONF: None,
                CONFIG_FROM_TEMPLATE: self.mock_raw_templates.get(expected_key)
            }
        )
    for ident in invalid_idents:
        self.assertEquals(config_store.read_config_from_store(ident), [])
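# The valid_idents fixtures above imply a fallback order when resolving an
# image identifier to a template key: the full identifier wins if a template
# exists for it, otherwise less specific forms are tried. One plausible
# candidate-generation order consistent with those fixtures (a sketch, not
# the store's actual algorithm):
def candidate_keys(ident):
    # 'repo/custom-nginx:latest' -> ['repo/custom-nginx:latest',
    #                                'repo/custom-nginx', 'custom-nginx']
    keys = [ident]
    if ':' in ident.rsplit('/', 1)[-1]:   # strip the tag, if any
        keys.append(ident.rsplit(':', 1)[0])
    if '/' in keys[-1]:                   # strip the repository prefix
        keys.append(keys[-1].rsplit('/', 1)[-1])
    return keys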
def test_get_check_tpls_labels(self, *args):
    """Test get_check_tpls from docker labels"""
    valid_config = ['image_0', 'image_1', 'image_2', 'image_3', 'image_4']
    invalid_config = ['bad_image_0']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config + invalid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        tpl = [(CONFIG_FROM_LABELS, t[1]) for t in tpl]
        if tpl:
            self.assertNotEquals(
                tpl,
                config_store.get_check_tpls(image, auto_conf=True))
        self.assertEquals(
            tpl,
            config_store.get_check_tpls(
                image, auto_conf=True,
                docker_labels=dict(zip(
                    ['com.serverdensity.ad.check_names',
                     'com.serverdensity.ad.init_configs',
                     'com.serverdensity.ad.instances'],
                    self.mock_raw_templates[image][0]))))
def test_get_check_tpls_labels(self, *args):
    """Test get_check_tpls from docker labels"""
    valid_config = ['image_0', 'image_1', 'image_2', 'image_3', 'image_4']
    invalid_config = ['bad_image_0']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config + invalid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        tpl = [(CONFIG_FROM_LABELS, t[1]) for t in tpl]
        if tpl:
            self.assertNotEquals(
                tpl,
                config_store.get_check_tpls(image, auto_conf=True))
        self.assertEquals(
            tpl,
            config_store.get_check_tpls(
                image, auto_conf=True,
                docker_labels=dict(zip(
                    ['com.datadoghq.ad.check_names',
                     'com.datadoghq.ad.init_configs',
                     'com.datadoghq.ad.instances'],
                    self.mock_raw_templates[image][0]))))
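# The docker label names exercised above follow the com.datadoghq.ad.*
# convention; each value is a JSON list so that one container can declare
# several checks at once. A hypothetical label set for an nginx container
# (the instance body is illustrative):
docker_labels = {
    'com.datadoghq.ad.check_names': '["nginx"]',
    'com.datadoghq.ad.init_configs': '[{}]',
    'com.datadoghq.ad.instances': '[{"nginx_status_url": "http://%%host%%:%%port%%/nginx_status"}]',
}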
def test_get_check_tpls_kube(self, *args):
    """Test get_check_tpls for kubernetes annotations"""
    valid_config = ['image_0', 'image_1', 'image_2', 'image_3', 'image_4']
    invalid_config = ['bad_image_0']
    config_store = get_config_store(self.auto_conf_agentConfig)
    for image in valid_config + invalid_config:
        tpl = self.mock_raw_templates.get(image)[1]
        tpl = [(CONFIG_FROM_KUBE, t[1]) for t in tpl]
        if tpl:
            self.assertNotEquals(
                tpl,
                config_store.get_check_tpls('k8s-' + image, auto_conf=True))
        self.assertEquals(
            tpl,
            config_store.get_check_tpls(
                'k8s-' + image, auto_conf=True,
                kube_pod_name=image,
                kube_container_name='foo',
                kube_annotations=dict(zip(
                    ['service-discovery.datadoghq.com/foo.check_names',
                     'service-discovery.datadoghq.com/foo.init_configs',
                     'service-discovery.datadoghq.com/foo.instances'],
                    self.mock_raw_templates[image][0]))))
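# In the kubernetes variant the annotation key embeds the target container
# name ('foo' above), so a single pod can carry templates for several
# containers. A hypothetical annotation set matching what the test builds
# with dict(zip(...)); the instance body is illustrative:
kube_annotations = {
    'service-discovery.datadoghq.com/foo.check_names': '["nginx"]',
    'service-discovery.datadoghq.com/foo.init_configs': '[{}]',
    'service-discovery.datadoghq.com/foo.instances': '[{"nginx_status_url": "http://%%host%%:%%port%%/nginx_status"}]',
}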
def run(self, config=None):
    """Main loop of the collector"""
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    if not Platform.is_windows():
        # A SIGUSR1 signals an exit with an autorestart
        signal.signal(signal.SIGUSR1, self._handle_sigusr1)

        # Handle Keyboard Interrupt
        signal.signal(signal.SIGINT, self._handle_sigterm)

        # A SIGHUP signals a configuration reload
        signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable', False)):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
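# The template-watch logic above reduces to a small handshake: the loop polls
# crawl_config_template() only while the backend flag is clear, then consumes
# the flag into reload_configs_flag for the next iteration. A stripped-down,
# self-contained sketch of that handshake (the backend and the crawl result
# are stubbed; names follow the loop above):
class _FakeBackend(object):
    reload_check_configs = False

def poll_once(backend, store_changed):
    """One iteration of the template-watch logic."""
    reload_flag = False
    if not backend.reload_check_configs:
        backend.reload_check_configs = store_changed  # crawl_config_template() stub
    if backend.reload_check_configs:
        reload_flag = backend.reload_check_configs    # schedule a reload...
        backend.reload_check_configs = False          # ...and reset the flag
    return reload_flag

backend = _FakeBackend()
assert poll_once(backend, store_changed=False) is False
assert poll_once(backend, store_changed=True) is True    # change detected
assert poll_once(backend, store_changed=False) is False  # flag was consumed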
def init(self):
    try:
        instance = self.instances[0]

        # if service discovery is enabled dockerutil will need a reference to the config store
        if self._service_discovery:
            self.docker_util = DockerUtil(
                agentConfig=self.agentConfig,
                config_store=get_config_store(self.agentConfig))
        else:
            self.docker_util = DockerUtil()

        self.docker_client = self.docker_util.client
        self.docker_gateway = DockerUtil.get_gateway()

        if Platform.is_k8s():
            self.kubeutil = KubeUtil()

        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}

    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
def run(self, config=None):
    """Main loop of the collector"""
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/'))
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    self.collector_profile_interval = self._agentConfig.get(
        'collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL)

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(
            num_checks=len(self._checksd['initialized_checks'])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=self.configs_reloaded)

        # This flag is used to know if the check configs have been reloaded at the current
        # run of the agent yet or not. It's used by the collector to know if it needs to
        # look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def clear_singletons(agentConfig):
    get_config_store(agentConfig)._drop()
    get_sd_backend(agentConfig)._drop()
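# _drop() only makes sense if get_config_store and get_sd_backend hand back
# process-wide singletons; tests call clear_singletons so each case starts
# from a clean slate. A minimal sketch of that pattern, assuming a module-
# level cache (names and layout are illustrative, not the agent's exact
# implementation):
_store = None

def get_store(agentConfig):
    # Build the store once and reuse it for the rest of the process.
    global _store
    if _store is None:
        _store = _Store(agentConfig)
    return _store

class _Store(object):
    def __init__(self, agentConfig):
        self.agentConfig = agentConfig

    def _drop(self):
        # Forget the cached instance so the next get_store() rebuilds it.
        global _store
        _store = None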
def run(self, config=None):
    """Main loop of the collector"""
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats()
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get("service_discovery"):
        self.sd_backend = get_sd_backend(self._agentConfig)

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    self.collector_profile_interval = self._agentConfig.get(
        "collector_profile_interval", DEFAULT_COLLECTOR_PROFILE_INTERVAL
    )

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig["check_freq"])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get("restart_interval", RESTART_INTERVAL))
    self.agent_start = time.time()

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        log.debug("Found {num_checks} checks".format(num_checks=len(self._checksd["initialized_checks"])))

        # Setup profiling if necessary
        if self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        # Do the work.
        self.collector.run(
            checksd=self._checksd, start_event=self.start_event, configs_reloaded=self.configs_reloaded
        )

        # This flag is used to know if the check configs have been reloaded at the current
        # run of the agent yet or not. It's used by the collector to know if it needs to
        # look for the AgentMetrics check and pop it out.
        # See: https://github.com/DataDog/dd-agent/blob/5.6.x/checks/collector.py#L265-L272
        self.configs_reloaded = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if (
            self._agentConfig.get("service_discovery")
            and self.sd_backend
            and not self.sd_backend.reload_check_configs
        ):
            try:
                self.sd_backend.reload_check_configs = get_config_store(self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn("Something went wrong while looking for config template changes: %s" % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get("service_discovery") and self.sd_backend and self.sd_backend.reload_check_configs:
            self.reload_configs()
            self.configs_reloaded = True
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def run(self, config=None):
    """Main loop of the collector"""
    # Gracefully exit on sigterm.
    signal.signal(signal.SIGTERM, self._handle_sigterm)

    # A SIGUSR1 signals an exit with an autorestart
    signal.signal(signal.SIGUSR1, self._handle_sigusr1)

    # Handle Keyboard Interrupt
    signal.signal(signal.SIGINT, self._handle_sigterm)

    # A SIGHUP signals a configuration reload
    signal.signal(signal.SIGHUP, self._handle_sighup)

    # Save the agent start-up stats.
    CollectorStatus().persist()

    # Initialize the collector.
    if not config:
        config = get_config(parse_args=True)

    self._agentConfig = self._set_agent_config_hostname(config)
    hostname = get_hostname(self._agentConfig)
    systemStats = get_system_stats(
        proc_path=self._agentConfig.get('procfs_path', '/proc').rstrip('/')
    )
    emitters = self._get_emitters()

    # Initialize service discovery
    if self._agentConfig.get('service_discovery'):
        self.sd_backend = get_sd_backend(self._agentConfig)

    if _is_affirmative(self._agentConfig.get('sd_jmx_enable')):
        pipe_path = get_jmx_pipe_path()
        if Platform.is_windows():
            pipe_name = pipe_path.format(pipename=SD_PIPE_NAME)
        else:
            pipe_name = os.path.join(pipe_path, SD_PIPE_NAME)

        if os.access(pipe_path, os.W_OK):
            if not os.path.exists(pipe_name):
                os.mkfifo(pipe_name)
            self.sd_pipe = os.open(pipe_name, os.O_RDWR)  # RW to avoid blocking (will only W)

            # Initialize Supervisor proxy
            self.supervisor_proxy = self._get_supervisor_socket(self._agentConfig)
        else:
            log.debug('Unable to create pipe in temporary directory. JMX service discovery disabled.')

    # Load the checks.d checks
    self._checksd = load_check_directory(self._agentConfig, hostname)

    # Load JMX configs if available
    if self._jmx_service_discovery_enabled:
        jmx_sd_configs = generate_jmx_configs(self._agentConfig, hostname)
        if jmx_sd_configs:
            self._submit_jmx_service_discovery(jmx_sd_configs)

    # Initialize the Collector
    self.collector = Collector(self._agentConfig, emitters, systemStats, hostname)

    # In developer mode, the number of runs to be included in a single collector profile
    try:
        self.collector_profile_interval = int(
            self._agentConfig.get('collector_profile_interval', DEFAULT_COLLECTOR_PROFILE_INTERVAL))
    except ValueError:
        log.warn('collector_profile_interval is invalid. '
                 'Using default value instead (%s).' % DEFAULT_COLLECTOR_PROFILE_INTERVAL)
        self.collector_profile_interval = DEFAULT_COLLECTOR_PROFILE_INTERVAL

    # Configure the watchdog.
    self.check_frequency = int(self._agentConfig['check_freq'])
    watchdog = self._get_watchdog(self.check_frequency)

    # Initialize the auto-restarter
    self.restart_interval = int(self._agentConfig.get('restart_interval', RESTART_INTERVAL))
    self.agent_start = time.time()

    self.allow_profiling = self._agentConfig.get('allow_profiling', True)

    profiled = False
    collector_profiled_runs = 0

    # Run the main loop.
    while self.run_forever:
        # Setup profiling if necessary
        if self.allow_profiling and self.in_developer_mode and not profiled:
            try:
                profiler = AgentProfiler()
                profiler.enable_profiling()
                profiled = True
            except Exception as e:
                log.warn("Cannot enable profiler: %s" % str(e))

        if self.reload_configs_flag:
            if isinstance(self.reload_configs_flag, set):
                self.reload_configs(checks_to_reload=self.reload_configs_flag)
            else:
                self.reload_configs()

        # Do the work. Pass `configs_reloaded` to let the collector know if it needs to
        # look for the AgentMetrics check and pop it out.
        self.collector.run(checksd=self._checksd,
                           start_event=self.start_event,
                           configs_reloaded=True if self.reload_configs_flag else False)

        self.reload_configs_flag = False

        # Look for change in the config template store.
        # The self.sd_backend.reload_check_configs flag is set
        # to True if a config reload is needed.
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           not self.sd_backend.reload_check_configs:
            try:
                self.sd_backend.reload_check_configs = get_config_store(
                    self._agentConfig).crawl_config_template()
            except Exception as e:
                log.warn('Something went wrong while looking for config template changes: %s' % str(e))

        # Check if we should run service discovery
        # The `reload_check_configs` flag can be set through the docker_daemon check or
        # using ConfigStore.crawl_config_template
        if self._agentConfig.get('service_discovery') and self.sd_backend and \
           self.sd_backend.reload_check_configs:
            self.reload_configs_flag = self.sd_backend.reload_check_configs
            self.sd_backend.reload_check_configs = False

        if profiled:
            if collector_profiled_runs >= self.collector_profile_interval:
                try:
                    profiler.disable_profiling()
                    profiled = False
                    collector_profiled_runs = 0
                except Exception as e:
                    log.warn("Cannot disable profiler: %s" % str(e))

        # Check if we should restart.
        if self.autorestart and self._should_restart():
            self._do_restart()

        # Only plan for next loop if we will continue, otherwise exit quickly.
        if self.run_forever:
            if watchdog:
                watchdog.reset()
            if profiled:
                collector_profiled_runs += 1
            log.debug("Sleeping for {0} seconds".format(self.check_frequency))
            time.sleep(self.check_frequency)

    # Now clean-up.
    try:
        CollectorStatus.remove_latest_status()
    except Exception:
        pass

    # Explicitly kill the process, because it might be running as a daemon.
    log.info("Exiting. Bye bye.")
    sys.exit(0)
def init(self):
    try:
        instance = self.instances[0]

        # if service discovery is enabled dockerutil will need a reference to the config store
        if self._service_discovery:
            self.docker_util = DockerUtil(
                agentConfig=self.agentConfig,
                config_store=get_config_store(self.agentConfig)
            )
        else:
            self.docker_util = DockerUtil()

        self.docker_client = self.docker_util.client
        self.docker_gateway = DockerUtil.get_gateway()

        if self.is_k8s():
            self.kubeutil = KubeUtil()

        # We configure the check with the right cgroup settings for this host
        # Just needs to be done once
        self._mountpoints = self.docker_util.get_mountpoints(CGROUP_METRICS)
        self.cgroup_listing_retries = 0
        self._latest_size_query = 0
        self._filtered_containers = set()
        self._disable_net_metrics = False

        # Set tagging options
        self.custom_tags = instance.get("tags", [])
        self.collect_labels_as_tags = instance.get("collect_labels_as_tags", [])
        self.kube_labels = {}

        self.use_histogram = _is_affirmative(instance.get('use_histogram', False))
        performance_tags = instance.get("performance_tags", DEFAULT_PERFORMANCE_TAGS)

        self.tag_names = {
            CONTAINER: instance.get("container_tags", DEFAULT_CONTAINER_TAGS),
            PERFORMANCE: performance_tags,
            IMAGE: instance.get('image_tags', DEFAULT_IMAGE_TAGS)
        }

        # Set filtering settings
        if not instance.get("exclude"):
            self._filtering_enabled = False
            if instance.get("include"):
                self.log.warning("You must specify an exclude section to enable filtering")
        else:
            self._filtering_enabled = True
            include = instance.get("include", [])
            exclude = instance.get("exclude", [])
            self._exclude_patterns, self._include_patterns, _filtered_tag_names = get_filters(include, exclude)
            self.tag_names[FILTERED] = _filtered_tag_names

        # Other options
        self.collect_image_stats = _is_affirmative(instance.get('collect_images_stats', False))
        self.collect_container_size = _is_affirmative(instance.get('collect_container_size', False))
        self.collect_events = _is_affirmative(instance.get('collect_events', True))
        self.collect_image_size = _is_affirmative(instance.get('collect_image_size', False))
        self.collect_disk_stats = _is_affirmative(instance.get('collect_disk_stats', False))
        self.collect_ecs_tags = _is_affirmative(instance.get('ecs_tags', True)) and Platform.is_ecs_instance()
        self.ecs_tags = {}

    except Exception as e:
        self.log.critical(e)
        self.warning("Initialization failed. Will retry at next iteration")
    else:
        self.init_success = True
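# get_filters turns the include/exclude lists into pattern sets, and the
# branch above enforces that include without exclude is a no-op. A sketch of
# how such tag-based filtering might be applied, assuming regex semantics
# where include patterns act as exceptions to exclude patterns (get_filters
# itself is not shown in this section):
import re

def is_filtered(tags, include_patterns, exclude_patterns):
    # Keep the container if any tag matches an include pattern; otherwise
    # drop it if any tag matches an exclude pattern.
    if any(re.match(p, t) for p in include_patterns for t in tags):
        return False
    return any(re.match(p, t) for p in exclude_patterns for t in tags)

# e.g. exclude everything except ubuntu images:
tags = ['docker_image:ubuntu:14.04', 'image_name:ubuntu']
print(is_filtered(tags, include_patterns=[r'image_name:ubuntu'],
                  exclude_patterns=[r'.*']))  # -> False (kept)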