def _build_web_query(self, alert_uri):
    """
    Builds a URL out of the URI structure. If the URI is already a URL of
    the form http[s]:// then this will return the URI as the URL; otherwise,
    it will build the URL from the URI structure's elements

    :param alert_uri: object exposing ``uri`` and ``is_ssl_enabled``
    :return: ``alert_uri.uri`` unchanged when it already is a URL, otherwise
      a string of the form ``scheme://host:port``
    """
    # shortcut if the supplied URI starts with the information needed
    string_uri = str(alert_uri.uri)
    if string_uri.startswith(('http://', 'https://')):
        return alert_uri.uri

    # start building the URL manually
    host = BaseAlert.get_host_from_url(alert_uri.uri)
    if host is None:
        host = self.host_name

    # SSL decides both the scheme and the port used when the URI carries none
    if alert_uri.is_ssl_enabled is True:
        scheme, port = 'https', 443
    else:
        scheme, port = 'http', 80

    # prefer the port embedded in the URI; deliberately keep the scheme
    # default when it is missing or cannot be parsed
    try:
        port = int(get_port_from_url(alert_uri.uri))
    except Exception:
        pass

    if OSCheck.is_windows_family():
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        host = resolve_address(host)

    return "{0}://{1}:{2}".format(scheme, host, str(port))
def _build_web_query(self, alert_uri):
    """
    Builds a URL out of the URI structure. If the URI is already a URL of
    the form http[s]:// then this will return the URI as the URL; otherwise,
    it will build the URL from the URI structure's elements

    :param alert_uri: object exposing ``uri`` and ``is_ssl_enabled``
    :return: ``alert_uri.uri`` unchanged when it already is a URL, otherwise
      ``scheme://host:port`` with ``/path`` appended when the URI has a path
    """
    # shortcut if the supplied URI starts with the information needed
    string_uri = str(alert_uri.uri)
    if string_uri.startswith(('http://', 'https://')):
        return alert_uri.uri

    # only look for a path when a URI was actually supplied
    # (str(None) is what a missing URI stringifies to)
    uri_path = None
    if string_uri and string_uri != str(None):
        uri_path = get_path_from_url(string_uri)

    # start building the URL manually
    host = BaseAlert.get_host_from_url(alert_uri.uri)
    if host is None:
        host = self.host_name

    # SSL decides both the scheme and the port used when the URI carries none
    if alert_uri.is_ssl_enabled is True:
        scheme, port = 'https', 443
    else:
        scheme, port = 'http', 80

    # prefer the port embedded in the URI; deliberately keep the scheme
    # default when it is missing or cannot be parsed
    try:
        port = int(get_port_from_url(alert_uri.uri))
    except Exception:
        pass

    if OSCheck.is_windows_family():
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        host = resolve_address(host)

    if uri_path:
        return "{0}://{1}:{2}/{3}".format(scheme, host, str(port), uri_path)

    return "{0}://{1}:{2}".format(scheme, host, str(port))
def execute(parameters=None, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    parameters (dictionary): a mapping of parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if parameters is None:
        return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.'])

    http_uri = parameters.get(NODEMANAGER_HTTP_ADDRESS_KEY)
    https_uri = parameters.get(NODEMANAGER_HTTPS_ADDRESS_KEY)
    http_policy = parameters.get(YARN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # determine the right URI and whether to use SSL
    scheme = 'http'
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    # execute the query for the JSON that includes the node health flag;
    # any failure to connect is reported as CRITICAL
    try:
        url_response = urllib2.urlopen(query)
    except Exception:
        label = CRITICAL_CONNECTION_MESSAGE.format(uri)
        return (RESULT_CODE_CRITICAL, [label])

    # URL response received, parse it; always release the connection
    try:
        json_response = json.loads(url_response.read())
        # convert boolean to string
        node_healthy = str(json_response['nodeInfo']['nodeHealthy'])
    except Exception:
        # unreadable or unexpected JSON: the queried URL is used as the label
        return (RESULT_CODE_CRITICAL, [query])
    finally:
        url_response.close()

    # proper JSON received, compare against known value
    if node_healthy.lower() == 'true':
        return (RESULT_CODE_OK, [OK_MESSAGE])

    label = CRITICAL_NODEMANAGER_STATUS_MESSAGE.format(node_healthy)
    return (RESULT_CODE_CRITICAL, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    NOTE(review): in the code visible here only the error paths return a
    tuple; after a successful query the function ends and returns None.
    """
    # the dictionary defaults are kept for interface compatibility; they are
    # only read, never mutated
    if configurations is None:
        return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # NOTE(review): None stands in for a missing smoke user so the kerberos
    # branch no longer fails on an unbound local; confirm that
    # curl_krb_request accepts it
    smokeuser = configurations.get(SMOKEUSER_KEY)

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # the _HOST placeholder needs a real host name; use the same fallback
        # that is applied below when no host name was supplied
        if host_name is None:
            host_name = socket.getfqdn()
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    http_uri = configurations.get(NODEMANAGER_HTTP_ADDRESS_KEY)
    https_uri = configurations.get(NODEMANAGER_HTTPS_ADDRESS_KEY)
    http_policy = configurations.get(YARN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if parameters is not None and CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    scheme = 'http'
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, query,
                "nm_health_alert", None, False, "NodeManager Health", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that includes the node info
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            try:
                json_response = json.loads(url_response.read())
            finally:
                url_response.close()
    except urllib2.HTTPError as http_error:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(http_error.code), query, str(http_error))
        return (RESULT_CODE_CRITICAL, [label])
def _collect(self):
    """
    Opens a TCP connection to the configured host and port and reports how
    long the connect took.

    :return: a tuple of (result state, list of label arguments):
      (RESULT_UNKNOWN, [message]) when no port can be determined,
      (RESULT_CRITICAL, [reason, host, port]) on failure or timeout, and
      (RESULT_OK or RESULT_WARNING, [seconds, port]) otherwise
    """
    # can be parameterized or static
    # if not parameterized, this will return the static value
    uri_value = self._get_configuration_value(self.uri)

    if uri_value is None:
        uri_value = self.host_name
        logger.debug("[Alert][{0}] Setting the URI to this host since it wasn't specified".format(
            self.get_name()))

    # in some cases, a single property is a comma-separated list like
    # host1:8080,host2:8081,host3:8083
    uri_value_array = uri_value.split(',')
    if len(uri_value_array) > 1:
        for item in uri_value_array:
            if self.host_name in item:
                uri_value = item
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug("[Alert][{0}] Extracted {1} as the host name while parsing the CSV URI {2}".format(
                        self.get_name(), uri_value, str(uri_value_array)))
                break

    host = BaseAlert.get_host_from_url(uri_value)
    if host is None:
        host = self.host_name

    try:
        port = int(get_port_from_url(uri_value))
    except Exception:
        # no usable port in the URI: fall back to the default, if there is one
        if self.default_port is None:
            label = 'Unable to determine port from URI {0}'.format(uri_value)
            return (self.RESULT_UNKNOWN, [label])

        port = self.default_port

    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("[Alert][{0}] Checking {1} on port {2}".format(
            self.get_name(), host, str(port)))

    # bound before the try so the finally clause can always inspect it, even
    # when creating the socket itself fails
    s = None
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.settimeout(self.critical_timeout)

        if OSCheck.is_windows_family():
            # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
            host = resolve_address(host)

        start_time = time.time()
        s.connect((host, port))
        # time.time() is expressed in seconds, so the difference already is
        # the elapsed time in seconds and must not be scaled again
        seconds = time.time() - start_time

        # not sure why this happens sometimes, but we don't always get a
        # socket exception if the connect() is > than the critical threshold
        if seconds >= self.critical_timeout:
            return (self.RESULT_CRITICAL, ['Socket Timeout', host, port])

        result = self.RESULT_OK
        if seconds >= self.warning_timeout:
            result = self.RESULT_WARNING

        return (result, [seconds, port])
    except Exception as e:
        return (self.RESULT_CRITICAL, [str(e), host, port])
    finally:
        if s is not None:
            try:
                s.close()
            except Exception:
                # no need to log a close failure
                pass
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    NOTE(review): in the code visible here only the error paths return a
    tuple; after a successful query the response is neither read nor closed
    and the function returns None.
    """
    # the dictionary defaults are kept for interface compatibility; they are
    # only read, never mutated
    if configurations is None:
        return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.'])

    http_uri = configurations.get(NODEMANAGER_HTTP_ADDRESS_KEY)
    https_uri = configurations.get(NODEMANAGER_HTTPS_ADDRESS_KEY)
    http_policy = configurations.get(YARN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if parameters is not None and CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    scheme = 'http'
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        # execute the query for the JSON that includes the node info
        url_response = urllib2.urlopen(query, timeout=connection_timeout)
    except urllib2.HTTPError as http_error:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(http_error.code), query, str(http_error))
        return (RESULT_CODE_CRITICAL, [label])
def _collect(self):
    """
    Opens a TCP connection to the configured port, trying each candidate
    host in turn, optionally exchanges a command/response pair, and reports
    how long the check took.

    :return: a tuple of (result state, list of label arguments):
      (RESULT_UNKNOWN, [message]) when no port can be determined,
      (RESULT_CRITICAL, [reason, host, port]) on failure or timeout, and
      (RESULT_OK or RESULT_WARNING, [seconds, port]) otherwise
    """
    # can be parameterized or static
    # if not parameterized, this will return the static value
    uri_value = self._get_configuration_value(self.uri)

    host_not_specified = False
    if uri_value is None:
        host_not_specified = True
        uri_value = self.host_name
        logger.debug("[Alert][{0}] Setting the URI to this host since it wasn't specified".format(
            self.get_name()))

    # in some cases, a single property is a comma-separated list like
    # host1:8080,host2:8081,host3:8083
    uri_value_array = uri_value.split(',')
    if len(uri_value_array) > 1:
        for item in uri_value_array:
            if self.host_name in item:
                uri_value = item
                if logger.isEnabledFor(logging.DEBUG):
                    logger.debug("[Alert][{0}] Extracted {1} as the host name while parsing the CSV URI {2}".format(
                        self.get_name(), uri_value, str(uri_value_array)))
                break

    host = BaseAlert.get_host_from_url(uri_value)
    if host is None or host == "localhost" or host == "0.0.0.0":
        host = self.host_name
        host_not_specified = True

    hosts = [host]
    # If host is not specified in the uri, hence we are using current host name
    # then also add public host name as a fallback.
    # A missing public host name simply means there is no fallback to add.
    if host_not_specified and self.public_host_name is not None \
            and host.lower() == self.host_name.lower() \
            and self.host_name.lower() != self.public_host_name.lower():
        hosts.append(self.public_host_name)
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("[Alert][{0}] List of hosts = {1}".format(self.get_name(), hosts))

    try:
        port = int(get_port_from_url(uri_value))
    except Exception:
        # no usable port in the URI: fall back to the default, if there is one
        if self.default_port is None:
            label = 'Unable to determine port from URI {0}'.format(uri_value)
            return (self.RESULT_UNKNOWN, [label])

        port = self.default_port

    # failures are collected so that every candidate host gets a chance
    # before the check is reported as critical
    exceptions = []

    for host in hosts:
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("[Alert][{0}] Checking {1} on port {2}".format(
                self.get_name(), host, str(port)))

        s = None
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(self.critical_timeout)

            if OSCheck.is_windows_family():
                # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
                host = resolve_address(host)

            start_time = time.time()
            s.connect((host, port))
            if self.socket_command is not None:
                s.sendall(self.socket_command)
                data = s.recv(1024)
                if self.socket_command_response is not None and data != self.socket_command_response:
                    raise Exception("Expected response {0}, Actual response {1}".format(
                        self.socket_command_response, data))
            # time.time() is expressed in seconds, so the difference already
            # is the elapsed time in seconds and must not be scaled again
            seconds = time.time() - start_time

            # not sure why this happens sometimes, but we don't always get a
            # socket exception if the connect() is > than the critical threshold
            if seconds >= self.critical_timeout:
                return (self.RESULT_CRITICAL, ['Socket Timeout', host, port])

            result = self.RESULT_OK
            if seconds >= self.warning_timeout:
                result = self.RESULT_WARNING

            return (result, [seconds, port])
        except Exception as e:
            exceptions.append(e)
        finally:
            if s is not None:
                try:
                    s.close()
                except Exception:
                    # no need to log a close failure
                    pass

    # every host failed: report the first failure against the primary host
    if exceptions:
        return (self.RESULT_CRITICAL, [str(exceptions[0]), hosts[0], port])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    NOTE(review): in the code visible here only the error paths return a
    tuple; after a successful query the response is neither read nor closed
    and the function returns None.
    """
    # the dictionary defaults are kept for interface compatibility; they are
    # only read, never mutated
    if configurations is None:
        return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.'])

    http_uri = configurations.get(NODEMANAGER_HTTP_ADDRESS_KEY)
    https_uri = configurations.get(NODEMANAGER_HTTPS_ADDRESS_KEY)
    http_policy = configurations.get(YARN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if parameters is not None and CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    scheme = 'http'
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        # execute the query for the JSON that includes the node info
        url_response = urllib2.urlopen(query, timeout=connection_timeout)
    except urllib2.HTTPError as http_error:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(http_error.code), query, str(http_error))
        return (RESULT_CODE_CRITICAL, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    NOTE(review): in the code visible here only the error paths return a
    tuple; after a successful query the function ends and returns None.
    """
    # the dictionary defaults are kept for interface compatibility; they are
    # only read, never mutated
    if configurations is None:
        return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # NOTE(review): None stands in for a missing smoke user so the kerberos
    # branch no longer fails on an unbound local; confirm that
    # curl_krb_request accepts it
    smokeuser = configurations.get(SMOKEUSER_KEY)

    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # the _HOST placeholder needs a real host name; use the same fallback
        # that is applied below when no host name was supplied
        if host_name is None:
            host_name = socket.getfqdn()
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    http_uri = configurations.get(NODEMANAGER_HTTP_ADDRESS_KEY)
    https_uri = configurations.get(NODEMANAGER_HTTPS_ADDRESS_KEY)
    http_policy = configurations.get(YARN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if parameters is not None and CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    scheme = 'http'
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, query,
                "nm_health_alert", executable_paths, False, "NodeManager Health", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that includes the node info
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            try:
                json_response = json.loads(url_response.read())
            finally:
                url_response.close()
    except urllib2.HTTPError as http_error:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(http_error.code), query,
                                                    str(http_error), traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])