def check(self, instance):
    """
    Run one Dynatrace topology collection cycle.

    Copies the connection settings from ``instance`` onto the check object,
    processes the topology between ``start_snapshot`` and ``stop_snapshot``
    and reports the outcome as a service check.

    :param instance: instance configuration mapping; ``url`` and ``token``
        are mandatory, every other key falls back to a default.
    :raises ConfigurationError: when ``url`` or ``token`` is absent.
    """
    if 'url' not in instance:
        raise ConfigurationError('Missing URL in configuration.')
    if 'token' not in instance:
        raise ConfigurationError('Missing API Token in configuration.')

    # Mandatory settings; their presence was verified above.
    self.url = instance['url']
    self.token = instance['token']

    # Optional settings, each paired with the default used when it is absent.
    optional_settings = (
        ('tags', []),
        ('domain', 'dynatrace'),
        ('environment', 'production'),
        ('verify', True),
        ('cert', ''),
        ('keyfile', ''),
    )
    for setting, fallback in optional_settings:
        setattr(self, setting, instance.get(setting, fallback))

    try:
        self.start_snapshot()
        self.process_topology()
        # Only reached when topology processing did not raise.
        self.stop_snapshot()
        self.service_check(
            self.SERVICE_CHECK_NAME,
            AgentCheck.OK,
            tags=self.tags,
            message="Dynatrace topology processed successfully",
        )
    except Exception as err:
        # Any failure is logged with its traceback and reported as CRITICAL
        # instead of being propagated to the caller.
        self.log.exception(str(err))
        self.service_check(
            self.SERVICE_CHECK_NAME,
            AgentCheck.CRITICAL,
            tags=self.tags,
            message=str(err),
        )
def get_connection(self, key, host, port, user, password, dbname, ssl, connect_fct, tags, use_cached=True):
    """
    Return the connection registered under ``key``, opening it if needed.

    A connection already stored in ``self.dbs`` is returned as-is when
    ``use_cached`` is true. Otherwise ``connect_fct`` is called with
    arguments chosen from the host/port/password combination, and the
    result is stored in ``self.dbs[key]`` before being returned.

    :param connect_fct: callable used to open the connection.
    :param tags: tags forwarded to the CRITICAL service check on failure.
    :raises ConfigurationError: when ``host`` or ``user`` is empty.
    :raises Exception: any error from ``connect_fct`` is re-raised after a
        CRITICAL service check has been emitted.
    """
    if key in self.dbs and use_cached:
        return self.dbs[key]
    elif host != "" and user != "":
        try:
            if host == 'localhost' and password == '':
                # Use ident method
                new_connection = connect_fct("user=%s dbname=%s" % (user, dbname))
            else:
                if port != '':
                    connect_kwargs = dict(
                        host=host, port=port, user=user, password=password, database=dbname, ssl=ssl
                    )
                elif host.startswith('/'):
                    # A hostname starting with / is probably a path to a
                    # UNIX socket. This is similar behaviour to psql.
                    connect_kwargs = dict(unix_sock=host, user=user, password=password, database=dbname)
                else:
                    connect_kwargs = dict(host=host, user=user, password=password, database=dbname, ssl=ssl)
                new_connection = connect_fct(**connect_kwargs)

            self.dbs[key] = new_connection
            return new_connection
        except Exception as err:
            failure_text = u'Error establishing postgres connection: %s' % (str(err))
            failure_tags = self._get_service_check_tags(host, port, tags)
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=failure_tags, message=failure_text
            )
            raise
    else:
        # At least one of host / user is the empty string here.
        if not host:
            raise ConfigurationError('Please specify a Postgres host to connect to.')
        elif not user:
            raise ConfigurationError('Please specify a user to connect to Postgres as.')
def _get_custom_metrics(self, custom_metrics, key): # Pre-processed cached custom_metrics if key in self.custom_metrics: return self.custom_metrics[key] # Otherwise pre-process custom metrics and verify definition required_parameters = ("descriptors", "metrics", "query", "relation") for m in custom_metrics: for param in required_parameters: if param not in m: raise ConfigurationError( 'Missing {} parameter in custom metric'.format(param)) self.log.debug("Metric: {0}".format(m)) try: for ref, (_, mtype) in iteritems(m['metrics']): cap_mtype = mtype.upper() if cap_mtype not in ('RATE', 'GAUGE', 'MONOTONIC'): raise ConfigurationError( 'Collector method {} is not known. ' 'Known methods are RATE, GAUGE, MONOTONIC'.format( cap_mtype)) m['metrics'][ref][1] = getattr(PostgreSql, cap_mtype) self.log.debug("Method: %s" % (str(mtype))) except Exception as e: raise Exception( 'Error processing custom metric `{}`: {}'.format(m, e)) self.custom_metrics[key] = custom_metrics return custom_metrics
def get_instance_key(self, instance):
    """
    Build the topology instance key from the instance's ``name``.

    :param instance: instance configuration; must contain ``name`` and ``location``.
    :returns: ``TopologyInstance(self.INSTANCE_TYPE, <name>)``.
    :raises ConfigurationError: when ``name`` or ``location`` is missing.
    """
    # Checked in this order so a missing name is reported before a missing location.
    required_fields = (
        ("name", "Missing name in topology instance configuration."),
        ("location", "Missing location in topology instance configuration."),
    )
    for field, error_text in required_fields:
        if field not in instance:
            raise ConfigurationError(error_text)

    return TopologyInstance(self.INSTANCE_TYPE, instance["name"])
def check(self, instance):
    """
    Collect topology inside a snapshot and report the result.

    :param instance: instance configuration passed to ``self._get_config``.
    :raises ConfigurationError: when url, user or password is empty.
    """
    # Only url, user and password are used here; host and tags are discarded.
    _, url, user, password, _ = self._get_config(instance)
    if not url or not user or not password:
        raise ConfigurationError("Missing 'url', 'user' or 'password' in instance configuration.")

    try:
        self.start_snapshot()
        self._collect_topology()
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, message="OK", tags=self.tags)
    except Exception as err:
        # Failures are logged and reported as CRITICAL rather than propagated.
        self.log.exception(str(err))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=str(err), tags=self.tags)
    finally:
        # The snapshot is closed on success and on failure alike.
        self.stop_snapshot()
def get_instance_key(self, instance):
    """
    Build the topology instance key from the instance's ``url``.

    :returns: ``TopologyInstance`` for ``self.INSTANCE_TYPE`` and the url,
        created with ``with_snapshots=False``.
    :raises ConfigurationError: when ``url`` is missing.
    """
    if 'url' in instance:
        return TopologyInstance(self.INSTANCE_TYPE, instance["url"], with_snapshots=False)

    raise ConfigurationError('Missing url in configuration.')
def get_instance_key(self, instance):
    """
    Build the topology instance key from the instance's ``hostip``.

    :param instance: instance configuration; must contain ``hostip``.
    :returns: ``TopologyInstance(self.INSTANCE_TYPE, <hostip>)``.
    :raises ConfigurationError: when ``hostip`` is missing.
    """
    if 'hostip' not in instance:
        # Name the key that is actually required; the message used to say "url".
        raise ConfigurationError('Missing hostip in topology instance configuration.')

    return TopologyInstance(self.INSTANCE_TYPE, instance['hostip'])
def check(self, instance):
    """
    Collect topology inside a snapshot and report the result.

    ``self.organization_id`` is taken from the instance before the
    credentials are validated and is used to tag both service checks.

    :param instance: instance configuration; ``username`` and ``password``
        must be non-empty.
    :raises ConfigurationError: when ``username`` or ``password`` is empty.
    """
    username = instance.get("username", "")
    password = instance.get("password", "")
    self.organization_id = instance.get("organization_id", None)
    tags = instance.get("tags", [])  # read from the configuration but not used below

    if not username or not password:
        raise ConfigurationError("Missing 'username' or 'password' in instance configuration.")

    try:
        self.start_snapshot()
        # NOTE: the explicit self._authenticate(username, password) call is disabled.
        self._collect_topology()
        ok_tags = ["organization_id:%s" % self.organization_id]
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, message="OK", tags=ok_tags)
    except Exception as err:
        # Failures are logged and reported as CRITICAL rather than propagated.
        self.log.exception(str(err))
        critical_tags = ["organization_id:%s" % self.organization_id]
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=str(err), tags=critical_tags)
    finally:
        # The snapshot is closed on success and on failure alike.
        self.stop_snapshot()
def get_instance_key(self, instance):
    """
    Build the topology instance key from the network location of ``url``.

    :returns: ``TopologyInstance(self.INSTANCE_TYPE, <netloc of url>)``.
    :raises ConfigurationError: when ``url`` is missing.
    """
    if 'url' in instance:
        # Only the netloc part of the URL identifies the instance.
        parsed_url = urlparse(instance['url'])
        return TopologyInstance(self.INSTANCE_TYPE, parsed_url.netloc)

    raise ConfigurationError('Missing url in topology instance configuration.')
def handle_health_csv(self, filelocation, delimiter):
    """
    Read a health CSV file and submit one check state per valid row.

    The first row is the header. It must contain ``check_state_id``,
    ``name``, ``health`` and ``topology_element_identifier``; ``message``
    is optional and an empty message is submitted as ``None``.

    Rows whose field count differs from the header, shorter or longer,
    are skipped with a warning. The count is taken from the raw row,
    because ``zip`` would silently drop surplus fields and let a
    misaligned row through.

    :param filelocation: path of the CSV file, opened as ``utf-8-sig``.
    :param delimiter: field delimiter handed to ``csv.reader``.
    :raises ConfigurationError: when the file is empty or a mandatory
        header field is missing.
    """
    self.log.debug("Processing health CSV file %s." % filelocation)

    CHECK_STATE_ID_FIELD = 'check_state_id'
    NAME_FIELD = 'name'
    HEALTH_FIELD = 'health'
    TOPOLOGY_ELEMENT_IDENTIFIER_FIELD = 'topology_element_identifier'
    MESSAGE_FIELD = 'message'

    with codecs.open(filelocation, mode='r', encoding="utf-8-sig") as csvfile:
        reader = csv.reader(csvfile, delimiter=delimiter, quotechar='"')

        header_row = next(reader, None)
        if header_row is None:
            raise ConfigurationError("Health CSV file is empty.")
        self.log.debug("Detected health header: %s" % str(header_row))

        if len(header_row) == 1:
            # A single header field usually means the delimiter is wrong.
            # ``warning`` is used because ``warn`` is a deprecated alias.
            self.log.warning("Detected one field in header, is the delimiter set properly?")
            self.log.warning("Detected health header: %s" % str(header_row))

        # mandatory fields
        for field in (CHECK_STATE_ID_FIELD, NAME_FIELD, HEALTH_FIELD, TOPOLOGY_ELEMENT_IDENTIFIER_FIELD):
            if field not in header_row:
                raise ConfigurationError('CSV header %s not found in health csv.' % field)

        header_row_number_of_fields = len(header_row)

        for row in reader:
            # Compare the raw row, not the zipped dict, so rows with too
            # many fields are rejected as well as rows with too few.
            if len(row) != header_row_number_of_fields:
                self.log.warning("Skipping row because number of fields do not match header row, got: %s" % row)
                continue

            data = dict(zip(header_row, row))

            check_state_id = data.get(CHECK_STATE_ID_FIELD)
            name = data.get(NAME_FIELD)
            health = HealthType().convert(data.get(HEALTH_FIELD), None)
            topology_element_identifier = data.get(TOPOLOGY_ELEMENT_IDENTIFIER_FIELD)
            message = data.get(MESSAGE_FIELD, None)

            self.health.check_state(
                check_state_id,
                name,
                health,
                topology_element_identifier,
                message if message != "" else None,
            )
def get_instance_key(self, instance):
    """
    Build the topology instance key from the instance's ``host``.

    :returns: ``TopologyInstance(self.INSTANCE_TYPE, <host>)``.
    :raises ConfigurationError: when ``host`` is missing.
    """
    if "host" in instance:
        return TopologyInstance(self.INSTANCE_TYPE, instance["host"])

    raise ConfigurationError("Missing 'host' in instance configuration.")
def check(self, instance):
    """
    Collect Zabbix host topology and send one event per host.

    Inside a snapshot, every host returned by ``retrieve_hosts`` is
    processed as topology. The current problems and their events are then
    retrieved and rolled up per host, and one event is sent for every
    host carrying its most severe trigger priority (0 when the host has
    no events) and the descriptions of its triggers.

    Both the OK and the CRITICAL service check carry the ``url:<url>`` tag
    so they describe the same tagged check.

    :param instance: instance configuration; ``user``, ``password`` and
        ``url`` are required.
    :raises ConfigurationError: when ``user``, ``password`` or ``url`` is
        missing.
    """
    if 'user' not in instance:
        raise ConfigurationError('Missing API user in configuration.')
    if 'password' not in instance:
        raise ConfigurationError('Missing API password in configuration.')
    if 'url' not in instance:
        # Raise a configuration error instead of a bare KeyError below.
        raise ConfigurationError('Missing API url in configuration.')

    stackstate_environment = instance.get('stackstate_environment', 'Production')
    self.ssl_verify = instance.get('ssl_verify', True)

    url = instance['url']
    topology_instance = {"type": self.SERVICE_CHECK_NAME, "url": url}
    service_check_tags = ["url:%s" % url]

    try:
        self.start_snapshot()
        self.check_connection(url)
        auth = self.login(url, instance['user'], instance['password'])

        hosts = {}  # key: host_id, value: ZabbixHost

        # Topology, get all hosts
        for zabbix_host in self.retrieve_hosts(url, auth):
            self.process_host_topology(topology_instance, zabbix_host, stackstate_environment)
            hosts[zabbix_host.host_id] = zabbix_host

        # Telemetry, get all problems.
        zabbix_problems = self.retrieve_problems(url, auth)
        event_ids = [problem.event_id for problem in zabbix_problems]
        # Events are only requested when there is at least one problem.
        zabbix_events = self.retrieve_events(url, auth, event_ids) if event_ids else []

        rolled_up_events_per_host = {}  # host_id -> [ZabbixEvent]
        most_severe_severity_per_host = {}  # host_id -> severity int

        for zabbix_event in zabbix_events:
            priority = zabbix_event.trigger.priority
            for host_id in zabbix_event.host_ids:
                if host_id not in rolled_up_events_per_host:
                    # First event seen for this host.
                    rolled_up_events_per_host[host_id] = [zabbix_event]
                    most_severe_severity_per_host[host_id] = priority
                    continue

                rolled_up_events_per_host[host_id].append(zabbix_event)
                if most_severe_severity_per_host[host_id] < priority:
                    most_severe_severity_per_host[host_id] = priority

        self.log.debug('rolled_up_events_per_host:' + str(rolled_up_events_per_host))
        self.log.debug('most_severe_severity_per_host:' + str(most_severe_severity_per_host))

        # iterate all hosts to send an event per host, either in OK/PROBLEM state
        for host_id, zabbix_host in hosts.items():
            severity = 0
            triggers = []

            if host_id in rolled_up_events_per_host:
                triggers = [event.trigger.description for event in rolled_up_events_per_host[host_id]]
                severity = most_severe_severity_per_host[host_id]

            # Title and text of the event are the same sentence.
            summary = "Zabbix event on host '{}': severity: {}".format(zabbix_host.name, severity)
            self.event({
                'timestamp': int(time.time()),
                'msg_title': summary,
                'msg_text': summary,
                'source_type_name': self.INSTANCE_TYPE,
                'host': self.hostname,
                'tags': [
                    'host_id:%s' % host_id,
                    'host:%s' % zabbix_host.host,
                    'host_name:%s' % zabbix_host.name,
                    'severity:%s' % severity,
                    'triggers:%s' % triggers,
                ],
            })

        # Only reached when everything above succeeded.
        self.stop_snapshot()

        msg = "Zabbix instance detected at %s " % url
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags, message=msg)
    except Exception as e:
        self.log.exception(str(e))
        # Same tags as the OK status, so this failure updates the same tagged check.
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, message=str(e))
def get_instance_key(self, instance):
    """
    Build the StackPack instance key from the instance's ``url``.

    :returns: ``StackPackInstance(self.INSTANCE_TYPE, <url>)``.
    :raises ConfigurationError: when ``url`` is missing.
    """
    if 'url' in instance:
        return StackPackInstance(self.INSTANCE_TYPE, instance["url"])

    raise ConfigurationError('Missing API url in configuration.')
def check(self, instance):
    """
    Collect nginx metrics and submit them.

    Metrics come either from the status URL (parsed as JSON or as text,
    depending on the response content type) or, when the Plus API is
    enabled, from every endpoint in ``PLUS_API_ENDPOINTS`` and
    ``PLUS_API_STREAM_ENDPOINTS``. Each metric row is then submitted
    through the gauge / rate / monotonic_count function named by its type.
    When ``use_vts`` is set on the instance, additional rate metrics are
    derived and names are translated through ``VTS_METRIC_MAP``; names
    without a mapping are dropped.

    :param instance: instance configuration; ``nginx_status_url`` is required.
    :raises ConfigurationError: when ``nginx_status_url`` is missing.
    """
    if 'nginx_status_url' not in instance:
        raise ConfigurationError('NginX instance missing "nginx_status_url" value.')

    tags = instance.get('tags', [])
    url, use_plus_api, plus_api_version = self._get_instance_params(instance)

    if use_plus_api:
        metrics = []
        self._perform_service_check(instance, '{}/{}'.format(url, plus_api_version))

        # These are all the endpoints we have to call to get the same data as we did with the old API
        # since we can't get everything in one place anymore.
        all_endpoints = chain(iteritems(PLUS_API_ENDPOINTS), iteritems(PLUS_API_STREAM_ENDPOINTS))
        for endpoint, nest in all_endpoints:
            response = self._get_plus_api_data(url, plus_api_version, endpoint, nest)
            self.log.debug("Nginx Plus API version %s `response`: %s", plus_api_version, response)
            metrics.extend(self.parse_json(response, tags))
    else:
        response, content_type, version = self._get_data(instance, url)
        # for unpaid versions
        self._set_version_metadata(version)

        self.log.debug("Nginx status `response`: %s", response)
        self.log.debug("Nginx status `content_type`: %s", content_type)

        if content_type.startswith('application/json'):
            metrics = self.parse_json(response, tags)
        else:
            metrics = self.parse_text(response, tags)

    submitters = {'gauge': self.gauge, 'rate': self.rate, 'count': self.monotonic_count}

    # Last seen accepted / handled connection counts (VTS only); both are
    # reset once the dropped-connections rate has been emitted.
    accepted = None
    handled = None

    for row in metrics:
        try:
            name, value, row_tags, metric_type = row

            # Translate metrics received from VTS
            if instance.get('use_vts', False):
                # Requests per second
                if name == 'nginx.connections.handled':
                    handled = value
                if name == 'nginx.connections.accepted':
                    accepted = value
                    self.rate('nginx.net.conn_opened_per_s', accepted, row_tags)
                if handled is not None and accepted is not None:
                    self.rate('nginx.net.conn_dropped_per_s', accepted - handled, row_tags)
                    handled = None
                    accepted = None
                if name == 'nginx.connections.requests':
                    self.rate('nginx.net.request_per_s', value, row_tags)

                name = VTS_METRIC_MAP.get(name)
                if name is None:
                    continue

            if name in METRICS_SEND_AS_COUNT:
                submit_count = submitters['count']
                submit_count(name + "_count", value, row_tags)

            submit = submitters[metric_type]
            submit(name, value, row_tags)

            # for vts and plus versions
            if name == 'nginx.version':
                self._set_version_metadata(value)
        except Exception as err:
            # A bad row is logged and skipped; the remaining rows are still submitted.
            self.log.error('Could not submit metric: %s: %s', repr(row), err)
def get_instance_key(self, instance):
    """
    Build the topology instance key from the instance's ``organization_id``.

    The id is read from ``instance`` rather than from
    ``self.organization_id``, so the key does not depend on that
    attribute having been assigned before this method is called.

    :param instance: instance configuration; must contain ``organization_id``.
    :returns: ``TopologyInstance(self.INSTANCE_TYPE, str(<organization_id>))``.
    :raises ConfigurationError: when ``organization_id`` is missing.
    """
    if "organization_id" not in instance:
        # Name the key that is actually required; the message used to say 'organization'.
        raise ConfigurationError("Missing 'organization_id' in instance configuration.")

    return TopologyInstance(self.INSTANCE_TYPE, str(instance["organization_id"]))
def check(self, instance):
    """
    Run a timed check: start worker threads, collect topology and metrics.

    Each phase (start_snapshot, _collect_topology, _collect_metrics,
    service_check) is timed and its duration logged at info level. In all
    cases the host's queue, if present, is joined and the snapshot is
    stopped before the total run time is logged.

    :param instance: instance configuration passed to ``self._get_config``.
    :raises ConfigurationError: when url, user or password is empty.
    """
    url, user, password = self._get_config(instance)
    if not url or not user or not password:
        raise ConfigurationError("Missing 'url', 'user' or 'password' in instance configuration.")

    start_time = time.time()
    # Stays None unless the OK service check was reached.
    service_time = None

    try:
        self.start_threads(self.thread_count, self.thread_timeout)
        self.log.info("{0}: Checks started".format(self.host))

        self.start_snapshot()
        snapshot1_time = time.time()
        elapsed = snapshot1_time - start_time
        self.log.info("{0}: start_snapshot run time is {1}Seconds".format(self.host, elapsed))

        self._collect_topology()
        topology_time = time.time()
        elapsed = topology_time - snapshot1_time
        self.log.info("{0}: _collect_topology run time is {1}Seconds".format(self.host, elapsed))

        self._collect_metrics()
        metrics_time = time.time()
        elapsed = metrics_time - topology_time
        self.log.info("{0}: _collect_metrics run time is {1}Seconds".format(self.host, elapsed))

        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, message="OK", tags=self.tags)
        service_time = time.time()
        elapsed = service_time - metrics_time
        self.log.info("{0}: service_check run time is {1}Seconds".format(self.host, elapsed))
    except Exception as err:
        # Failures are logged and reported as CRITICAL rather than propagated.
        self.log.exception(str(err))
        self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, message=str(err), tags=self.tags)
    finally:
        if self.host in self.queue.keys():
            # Waits/Blocks until self.queue is empty...
            self.queue[self.host].join()
        self.stop_snapshot()
        snapshot2_time = time.time()

        if service_time is None:
            # The OK service check was never reached, so there is no lap to measure from.
            self.log.info("{0}: Exception occured, measuments unavailable".format(self.host))
        else:
            elapsed = snapshot2_time - service_time
            self.log.info("{0}: Drain queue & stop_snapshot run time is {1}Seconds".format(self.host, elapsed))

        stop_time = time.time()
        run_time = stop_time - start_time
        self.log.info("{0}: Check total run time is {1}Seconds".format(self.host, run_time))