def __init__(self, jsonvalue):
    if isinstance(jsonvalue, dict):
        json_dict = jsonvalue
    elif isinstance(jsonvalue, basestring):
        json_dict = json.loads(jsonvalue)
    else:
        # anything else cannot be deserialized
        json_dict = None

    if json_dict is None:
        raise Fail("Cannot deserialize command repository {0}".format(str(jsonvalue)))

    # version_id is the primary id of the repo_version table in the database
    self.version_id = _find_value(json_dict, 'repoVersionId')
    self.stack_name = _find_value(json_dict, 'stackName')
    self.version_string = _find_value(json_dict, 'repoVersion')
    self.repositories = []

    repos_def = _find_value(json_dict, 'repositories')
    if repos_def is not None:
        if not isinstance(repos_def, list):
            repos_def = [repos_def]

        for repo_def in repos_def:
            self.repositories.append(_CommandRepositoryEntry(repo_def))
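
# Hedged example: an illustrative input for the constructor above. This is a
# hypothetical 'repositoryFile' payload; only the keys the constructor reads
# are shown, and the per-entry keys inside "repositories" are assumptions
# about what _CommandRepositoryEntry expects.
SAMPLE_REPO_JSON = '''
{
  "repoVersionId": 1,
  "stackName": "HDP",
  "repoVersion": "2.6.5.0-292",
  "repositories": [
    {"repoId": "HDP-2.6", "repoName": "HDP", "baseUrl": "http://repo.example.com/hdp"}
  ]
}
'''
# repo = CommandRepository(SAMPLE_REPO_JSON)  # a dict with the same keys works too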
def update_ranger_policy(self, policyId, data, usernamepassword):
    """
    :param policyId: id of the policy to update
    :param data: policy data to send with the update
    :param usernamepassword: user credentials used to authenticate the update
    :return: the response code on success, otherwise None
    """
    try:
        searchRepoURL = self.urlPolicies + "/" + str(policyId)
        base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
        headers = {
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        }
        request = urllib2.Request(searchRepoURL, data, headers)
        request.add_header("Authorization", "Basic {0}".format(base64string))
        request.get_method = lambda: 'PUT'
        result = openurl(request, timeout=20)
        response_code = result.getcode()

        if response_code == 200:
            Logger.info('Policy updated successfully')
            return response_code
        else:
            Logger.error('Update policy failed')
            return None
    except urllib2.URLError as e:
        if isinstance(e, urllib2.HTTPError):
            raise Fail("Error updating policy. Http status code - {0}. \n {1}".format(e.code, e.read()))
        else:
            raise Fail("Error updating policy. Reason - {0}.".format(e.reason))
def create_repo(url, data, usernamepassword):
    try:
        base_url = url + '/service/public/v2/api/service'
        base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
        headers = {
            'Accept': 'application/json',
            'Content-Type': 'application/json'
        }
        request = urllib2.Request(base_url, data, headers)
        request.add_header("Authorization", "Basic {0}".format(base64string))
        result = urllib2.urlopen(request, timeout=20)
        response_code = result.getcode()

        if response_code == 200:
            Logger.info('Repository created successfully')
            return True
        else:
            Logger.info('Repository not created')
            return False
    except urllib2.URLError as e:
        if isinstance(e, urllib2.HTTPError):
            raise Fail("Error creating service. Http status code - {0}. \n {1}".format(e.code, e.read()))
        else:
            raise Fail("Error creating service. Reason - {0}.".format(e.reason))
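
# A minimal, self-contained sketch of the basic-auth JSON request pattern the
# two Ranger helpers above share (Python 2 / urllib2). The endpoint and
# credentials in the usage comment are placeholders, not real values.
import base64
import json
import urllib2

def post_json(url, payload, usernamepassword, timeout=20):
    # encode "user:password" for the Authorization header, as the helpers above do
    base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
    headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
    request = urllib2.Request(url, json.dumps(payload), headers)
    request.add_header("Authorization", "Basic {0}".format(base64string))
    result = urllib2.urlopen(request, timeout=timeout)
    return result.getcode(), result.read()

# e.g. (hypothetical Ranger admin endpoint):
# code, body = post_json('http://ranger-host:6080/service/public/v2/api/service',
#                        {'name': 'c1_hadoop', 'type': 'hdfs'}, 'admin:admin')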
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    label = ''
    url_response = None
    node_healthy = 'false'
    total_time = 0

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is an invalid address to connect to, but on linux it resolves to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, query,
                "nm_health_alert", None, False, "NodeManager Health", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that includes the node health report
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query, str(httpError))
        return (RESULT_CODE_CRITICAL, [label])
def create_ams_datasource():
    """
    Create the AMS datasource in Grafana; if it already exists, make sure the
    collector url is accurate.
    """
    import params
    server = Server(protocol=params.ams_grafana_protocol.strip(),
                    host=params.ams_grafana_host.strip(),
                    port=params.ams_grafana_port,
                    user=params.ams_grafana_admin_user,
                    password=params.ams_grafana_admin_pwd)

    ams_datasource_json = Template('metrics_grafana_datasource.json.j2',
                                   ams_datasource_name=METRICS_GRAFANA_DATASOURCE_NAME).get_content()

    Logger.info("Checking if AMS Grafana datasource already exists")

    response = perform_grafana_get_call(GRAFANA_DATASOURCE_URL, server)
    create_datasource = True

    if response and response.status == 200:
        datasources = response.read()
        datasources_json = json.loads(datasources)
        for i in xrange(0, len(datasources_json)):
            datasource_name = datasources_json[i]["name"]
            if datasource_name == METRICS_GRAFANA_DATASOURCE_NAME:
                create_datasource = False  # datasource already exists
                Logger.info("Ambari Metrics Grafana datasource already present. Checking Metrics Collector URL")
                datasource_url = datasources_json[i]["url"]

                if is_unchanged_datasource_url(datasource_url):
                    Logger.info("Metrics Collector URL validation succeeded.")
                    return
                else:
                    # Metrics datasource present, but the collector host is wrong.
                    datasource_id = datasources_json[i]["id"]
                    Logger.info("Metrics Collector URL validation failed. Updating "
                                "datasource, id = %s" % datasource_id)

                    (response, data) = perform_grafana_put_call(GRAFANA_DATASOURCE_URL, datasource_id,
                                                                ams_datasource_json, server)

                    if response.status == 200:
                        Logger.info("Ambari Metrics Grafana data source updated.")
                    elif response.status == 500:
                        Logger.info("Ambari Metrics Grafana data source update failed. Not retrying.")
                        raise Fail("Ambari Metrics Grafana data source update failed. PUT request status: %s %s \n%s" %
                                   (response.status, response.reason, data))
                    else:
                        raise Fail("Ambari Metrics Grafana data source update failed. "
                                   "PUT request status: %s %s \n%s" % (response.status, response.reason, data))
    else:
        Logger.info("Error checking for Ambari Metrics Grafana datasource. Will attempt to create.")

    if not create_datasource:
        return
    else:
        Logger.info("Generating datasource:\n%s" % ams_datasource_json)

        (response, data) = perform_grafana_post_call(GRAFANA_DATASOURCE_URL, ams_datasource_json, server)

        if response.status == 200:
            Logger.info("Ambari Metrics Grafana data source created.")
        elif response.status == 500:
            Logger.info("Ambari Metrics Grafana data source creation failed. Not retrying.")
            raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                       (response.status, response.reason, data))
        else:
            Logger.info("Ambari Metrics Grafana data source creation failed.")
            raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                       (response.status, response.reason, data))
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    uri = None
    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN,
                ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    location_quota = LOCATION_QUOTA_DEFAULT
    if LOCATION_QUOTA_KEY in parameters:
        location_quota = str(parameters[LOCATION_QUOTA_KEY])

    quota_warning = QUOTA_WARN_DEFAULT
    if QUOTA_WARN_KEY in parameters:
        quota_warning = float(parameters[QUOTA_WARN_KEY])

    quota_critical = QUOTA_CRIT_DEFAULT
    if QUOTA_CRIT_KEY in parameters:
        quota_critical = float(parameters[QUOTA_CRIT_KEY])

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    hdfs_site = configurations[HDFS_SITE_KEY]
    scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

    nn_addresses = get_all_namenode_addresses(hdfs_site)
    for nn_address in nn_addresses:
        if nn_address.startswith(host_name + ":"):
            uri = nn_address
            break
    if not uri:
        return (RESULT_STATE_SKIPPED,
                ['NameNode on host {0} not found (namenode addresses = {1})'.format(
                    host_name, ', '.join(nn_addresses))])

    current_time = int(round(time.time() * 1000))

    critical = []
    warning = []
    ok = []

    for location in location_quota.split(','):
        all_users_qry = "{0}://{1}/webhdfs/v1".format(scheme, uri) + location + "?op=LISTSTATUS"

        # start out assuming an OK status
        label = None
        result_code = "OK"

        try:
            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            all_users_response, error_msg, time_millis = curl_krb_request(
                "/tmp", kerberos_keytab, kerberos_principal, all_users_qry,
                "hdfs_space_quota_alert", executable_paths, False,
                "HDFS Space Quota", smokeuser,
                connection_timeout=curl_connection_timeout,
                kinit_timer_ms=kinit_timer_ms)

            # if the path does not exist then error
            if "FileNotFoundException" in all_users_response:
                return (RESULT_STATE_UNKNOWN, ['Path {p} does not exist'.format(p=location)])

            all_users_response_json = json.loads(all_users_response)

            # if the namenode is not active then skip
            if 'FileStatuses' not in all_users_response_json:
                return (RESULT_STATE_SKIPPED, ['NameNode is not active'])

            subdirectories = []
            for filestatus in all_users_response_json['FileStatuses']['FileStatus']:
                subdirectories.append(filestatus.get("pathSuffix"))

            for subdirectory in subdirectories:
                current_quota_qry = "{0}://{1}/webhdfs/v1".format(scheme, uri) + location + "/" + subdirectory + "?op=GETCONTENTSUMMARY"

                current_quota_response, error_msg, time_millis = curl_krb_request(
                    "/tmp", kerberos_keytab, kerberos_principal, current_quota_qry,
                    "hdfs_space_quota_alert", executable_paths, False,
                    "HDFS Space Quota", smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)

                current_quota_response_json = json.loads(current_quota_response)

                result_in_percent = int(
                    float(current_quota_response_json["ContentSummary"]["spaceConsumed"]) /
                    float(current_quota_response_json["ContentSummary"]["spaceQuota"]) * 100)

                if result_in_percent >= int(quota_critical):
                    critical.append(location + "/" + subdirectory)
                elif result_in_percent >= int(quota_warning):
                    warning.append(location + "/" + subdirectory)
                else:
                    ok.append(location + "/" + subdirectory)
        except:
            label = traceback.format_exc()
            result_code = 'UNKNOWN'

    if len(critical) > 0:
        result_code = 'CRITICAL'
        criticaldirectories = ",".join([str(x) for x in critical])
        warningdirectories = ",".join([str(x) for x in warning])
        if len(warning) > 0:
            label = 'The following directories are beyond the space quota CRITICAL threshold of {c}%: "{d}" \n' \
                    'The following directories are beyond the space quota WARNING threshold of {w}%: "{r}"'.format(
                        c=quota_critical, w=quota_warning, d=criticaldirectories, r=warningdirectories)
        else:
            label = 'The following directories are beyond the space quota CRITICAL threshold of {c}%: "{d}"'.format(
                c=quota_critical, d=criticaldirectories)
    elif len(warning) > 0:
        result_code = 'WARNING'
        warningdirectories = ",".join([str(x) for x in warning])
        label = 'The following directories are beyond the space quota WARNING threshold of {w}%: "{r}"'.format(
            w=quota_warning, r=warningdirectories)
    else:
        result_code = "OK"
        label = 'All top-level subdirectories "{l}" are within configured quota capacity threshold'.format(
            l=location_quota)

    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (RESULT_CODE_UNKNOWN, ['There were no configurations supplied to the script.'])

    if SOLR_PORT in configurations:
        solr_port = configurations[SOLR_PORT]
    else:
        return (RESULT_CODE_UNKNOWN, ['No Solr port specified'])

    # parse script arguments
    solr_memory_usage_warning = SOLR_MEMORY_USAGE_WARNING_DEFAULT
    if SOLR_MEMORY_USAGE_WARNING_KEY in parameters:
        solr_memory_usage_warning = float(parameters[SOLR_MEMORY_USAGE_WARNING_KEY])

    solr_memory_usage_critical = SOLR_MEMORY_USAGE_CRITICAL_DEFAULT
    if SOLR_MEMORY_USAGE_CRITICAL_KEY in parameters:
        solr_memory_usage_critical = float(parameters[SOLR_MEMORY_USAGE_CRITICAL_KEY])

    try:
        # ask the local Solr instance which cores it hosts
        query = "http://localhost:" + str(solr_port) + "/solr/admin/cores?action=STATUS&indexInfo=false&wt=json"
        shard_response = urllib2.urlopen(query)
        shard_raw_data = shard_response.read()
        shard_json_data = json.loads(shard_raw_data)
        shard_name = shard_json_data["status"].keys()[0]

        # query that core's system endpoint for JVM memory usage
        query = "http://localhost:" + str(solr_port) + "/solr/" + shard_name + "/admin/system?wt=json"
        shard_details_response = urllib2.urlopen(query)
        shard_details_raw_data = shard_details_response.read()
        shard_details_json_data = json.loads(shard_details_raw_data)
        memory_percent = shard_details_json_data["jvm"]["memory"]["raw"]["used%"]
    except:
        label = CRITICAL_CONNECTION_MESSAGE.format(query, traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

    memory_load = memory_percent / 100.0
    label = MESSAGE.format(memory_load)

    if memory_percent <= solr_memory_usage_warning:
        result_code = RESULT_CODE_OK
    elif memory_percent <= solr_memory_usage_critical:
        result_code = RESULT_CODE_WARNING
    else:
        result_code = RESULT_CODE_CRITICAL

    return (result_code, [label])
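
# Hedged sketch of the threshold logic the Solr alert above applies. The
# default thresholds here are made up; the real ones come from
# SOLR_MEMORY_USAGE_WARNING_DEFAULT / SOLR_MEMORY_USAGE_CRITICAL_DEFAULT.
def classify_memory_usage(memory_percent, warning=50.0, critical=80.0):
    # at or below the warning threshold everything is fine
    if memory_percent <= warning:
        return 'OK'
    elif memory_percent <= critical:
        return 'WARNING'
    return 'CRITICAL'

assert classify_memory_usage(42.0) == 'OK'
assert classify_memory_usage(65.0) == 'WARNING'
assert classify_memory_usage(95.0) == 'CRITICAL'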
def create_ams_datasource():
    """
    Create the AMS datasource in Grafana; if it already exists, make sure the
    collector url is accurate.
    """
    import params
    server = Server(protocol=params.ams_grafana_protocol.strip(),
                    host=params.ams_grafana_host.strip(),
                    port=params.ams_grafana_port,
                    user=params.ams_grafana_admin_user,
                    password=params.ams_grafana_admin_pwd)

    Logger.info("Trying to find working metric collector")
    results = execute_in_parallel(do_ams_collector_post, params.ams_collector_hosts.split(','), params)
    new_datasource_host = ""

    for host in params.ams_collector_hosts.split(','):
        if host in results:
            if results[host].status == SUCCESS:
                new_datasource_host = host
                Logger.info("Found working collector on host %s" % new_datasource_host)
                break
            else:
                Logger.warning(results[host].result)

    if new_datasource_host == "":
        Logger.warning("All metric collectors are unavailable. Will use random collector as datasource host.")
        new_datasource_host = params.metric_collector_host

    Logger.info("New datasource host will be %s" % new_datasource_host)

    ams_datasource_json = Template('metrics_grafana_datasource.json.j2',
                                   ams_datasource_name=METRICS_GRAFANA_DATASOURCE_NAME,
                                   ams_datasource_host=new_datasource_host).get_content()

    Logger.info("Checking if AMS Grafana datasource already exists")

    response = perform_grafana_get_call(GRAFANA_DATASOURCE_URL, server)
    create_datasource = True

    if response and response.status == 200:
        datasources = response.read()
        datasources_json = json.loads(datasources)
        for i in xrange(0, len(datasources_json)):
            datasource_name = datasources_json[i]["name"]
            if datasource_name == METRICS_GRAFANA_DATASOURCE_NAME:
                create_datasource = False  # datasource already exists
                Logger.info("Ambari Metrics Grafana datasource already present. Checking Metrics Collector URL")
                datasource_url = datasources_json[i]["url"]
                update_datasource = False

                if is_unchanged_datasource_url(datasource_url, new_datasource_host):
                    Logger.info("Metrics Collector URL validation succeeded.")
                else:
                    Logger.info("Metrics Collector URL validation failed.")
                    update_datasource = True

                datasource_type = datasources_json[i]["type"]
                new_datasource_def = json.loads(ams_datasource_json)
                new_datasource_type = new_datasource_def["type"]

                if datasource_type == new_datasource_type:
                    Logger.info("Grafana datasource type validation succeeded.")
                else:
                    Logger.info("Grafana datasource type validation failed. Old type = %s, New type = %s" %
                                (datasource_type, new_datasource_type))
                    update_datasource = True

                if update_datasource:
                    # Metrics datasource present, but the collector host is wrong
                    # or the datasource type is outdated.
                    datasource_id = datasources_json[i]["id"]
                    Logger.info("Updating datasource, id = %s" % datasource_id)

                    (response, data) = perform_grafana_put_call(GRAFANA_DATASOURCE_URL, datasource_id,
                                                                ams_datasource_json, server)

                    if response.status == 200:
                        Logger.info("Ambari Metrics Grafana data source updated.")
                    elif response.status == 500:
                        Logger.info("Ambari Metrics Grafana data source update failed. Not retrying.")
                        raise Fail("Ambari Metrics Grafana data source update failed. PUT request status: %s %s \n%s" %
                                   (response.status, response.reason, data))
                    else:
                        raise Fail("Ambari Metrics Grafana data source update failed. "
                                   "PUT request status: %s %s \n%s" % (response.status, response.reason, data))
    else:
        Logger.info("Error checking for Ambari Metrics Grafana datasource. Will attempt to create.")

    if not create_datasource:
        return
    else:
        Logger.info("Generating datasource:\n%s" % ams_datasource_json)

        (response, data) = perform_grafana_post_call(GRAFANA_DATASOURCE_URL, ams_datasource_json, server)

        if response.status == 200:
            Logger.info("Ambari Metrics Grafana data source created.")
        elif response.status == 500:
            Logger.info("Ambari Metrics Grafana data source creation failed. Not retrying.")
            raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                       (response.status, response.reason, data))
        else:
            Logger.info("Ambari Metrics Grafana data source creation failed.")
            raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                       (response.status, response.reason, data))
def service_check_for_single_host(self, metric_collector_host, params):
    random_value1 = random.random()
    headers = {"Content-type": "application/json"}
    ca_certs = os.path.join(params.ams_monitor_conf_dir, params.metric_truststore_ca_certs)

    current_time = int(time.time()) * 1000
    metric_json = Template('smoketest_metrics.json.j2',
                           hostname=params.hostname,
                           random1=random_value1,
                           current_time=current_time).get_content()
    try:
        post_metrics_to_collector(self.AMS_METRICS_POST_URL, metric_collector_host,
                                  params.metric_collector_port,
                                  params.metric_collector_https_enabled,
                                  metric_json, headers, ca_certs,
                                  self.AMS_CONNECT_TRIES, self.AMS_CONNECT_TIMEOUT)

        get_metrics_parameters = {
            "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
            "appId": "amssmoketestfake",
            "hostname": params.hostname,
            "startTime": current_time - 60000,
            "endTime": current_time + 61000,
            "precision": "seconds",
            "grouped": "false",
        }
        encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

        Logger.info("Connecting (GET) to %s:%s%s" %
                    (metric_collector_host, params.metric_collector_port,
                     self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters))

        for i in xrange(0, self.AMS_READ_TRIES):
            conn = network.get_http_connection(
                metric_collector_host,
                int(params.metric_collector_port),
                params.metric_collector_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())
            conn.request("GET", self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
            response = conn.getresponse()
            Logger.info("Http response for host %s : %s %s" %
                        (metric_collector_host, response.status, response.reason))

            data = response.read()
            Logger.info("Http data: %s" % data)
            conn.close()

            if response.status == 200:
                Logger.info("Metrics were retrieved from host %s" % metric_collector_host)
            else:
                raise Fail("Metrics were not retrieved from host %s. GET request status: %s %s \n%s" %
                           (metric_collector_host, response.status, response.reason, data))
            data_json = json.loads(data)

            def floats_eq(f1, f2, delta):
                return abs(f1 - f2) < delta

            values_are_present = False
            for metrics_data in data_json["metrics"]:
                if (str(current_time) in metrics_data["metrics"]
                        and str(current_time + 1000) in metrics_data["metrics"]
                        and floats_eq(metrics_data["metrics"][str(current_time)], random_value1, 0.0000001)
                        and floats_eq(metrics_data["metrics"][str(current_time + 1000)], current_time, 1)):
                    Logger.info("Values %s and %s were found in the response from host %s." %
                                (random_value1, current_time, metric_collector_host))
                    values_are_present = True
                    break

            if not values_are_present:
                if i < self.AMS_READ_TRIES - 1:  # range/xrange returns items from start to end-1
                    Logger.info("Values weren't stored yet. Retrying in %s seconds." % self.AMS_READ_TIMEOUT)
                    time.sleep(self.AMS_READ_TIMEOUT)
                else:
                    raise Fail("Values %s and %s were not found in the response." %
                               (random_value1, current_time))
            else:
                break
    except Fail as ex:
        Logger.warning("Ambari Metrics service check failed on collector host %s. Reason : %s" %
                       (metric_collector_host, str(ex)))
        raise Fail("Ambari Metrics service check failed on collector host %s. Reason : %s" %
                   (metric_collector_host, str(ex)))
def service_check(self, env):
    import params
    env.set_params(params)

    params.HdfsResource(format("/user/{smokeuser}"),
                        type="directory",
                        action="create_on_execute",
                        owner=params.smokeuser,
                        mode=params.smoke_hdfs_user_mode,
                        )

    path_to_distributed_shell_jar = params.install_dir + "/share/hadoop/yarn/hadoop-yarn-applications-distributedshell*.jar"

    yarn_distributed_shell_check_params = [
        "yarn org.apache.hadoop.yarn.applications.distributedshell.Client",
        "-shell_command", "ls", "-num_containers", "{number_of_nm}",
        "-jar", "{path_to_distributed_shell_jar}", "-timeout", "300000",
        "--queue", "{service_check_queue_name}"
    ]
    yarn_distributed_shell_check_cmd = format(" ".join(yarn_distributed_shell_check_params))

    if params.security_enabled:
        kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser_principal};")
        smoke_cmd = format("{kinit_cmd} {yarn_distributed_shell_check_cmd}")
    else:
        smoke_cmd = yarn_distributed_shell_check_cmd

    return_code, out = shell.checked_call(smoke_cmd,
                                          path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
                                          user=params.smokeuser,
                                          )

    m = re.search("appTrackingUrl=(.*),\s", out)
    app_url = m.group(1)
    splitted_app_url = str(app_url).split('/')

    for item in splitted_app_url:
        if "application" in item:
            application_name = item

    # Find out the active RM from the RM list;
    # raise an exception if the active RM cannot be determined
    active_rm_webapp_address = self.get_active_rm_webapp_address()
    Logger.info("Active Resource Manager web app address is : " + active_rm_webapp_address)

    # Verify the job state from the active resource manager via the REST API
    info_app_url = params.scheme + "://" + active_rm_webapp_address + "/ws/v1/cluster/apps/" + application_name
    get_app_info_cmd = "curl --negotiate -u : -ks --location-trusted --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url

    return_code, stdout, _ = get_user_call_output(get_app_info_cmd,
                                                  user=params.smokeuser,
                                                  path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
                                                  )

    try:
        json_response = json.loads(stdout)
    except Exception as e:
        raise Fail(format("Response from YARN API was not a valid JSON. Response: {stdout}"))

    if json_response is None or 'app' not in json_response or \
            'state' not in json_response['app'] or 'finalStatus' not in json_response['app']:
        raise Fail("Application " + app_url + " returns invalid data.")

    if json_response['app']['state'] != "FINISHED" or json_response['app']['finalStatus'] != "SUCCEEDED":
        raise Fail("Application " + app_url + " state/status is not valid. Should be FINISHED/SUCCEEDED.")
def actionexecute(self, env):
    num_errors = 0

    # Parse parameters
    config = Script.get_config()

    repo_rhel_suse = config['configurations']['cluster-env']['repo_suse_rhel_template']
    repo_ubuntu = config['configurations']['cluster-env']['repo_ubuntu_template']
    template = repo_rhel_suse if OSCheck.is_redhat_family() or OSCheck.is_suse_family() else repo_ubuntu

    # Handle a SIGTERM and SIGINT gracefully
    signal.signal(signal.SIGTERM, self.abort_handler)
    signal.signal(signal.SIGINT, self.abort_handler)

    # Select the dict that contains the parameters
    try:
        self.repository_version = config['roleParams']['repository_version']
        base_urls = json.loads(config['roleParams']['base_urls'])
        package_list = json.loads(config['roleParams']['package_list'])
        stack_id = config['roleParams']['stack_id']
    except KeyError:
        # Last try
        self.repository_version = config['commandParams']['repository_version']
        base_urls = json.loads(config['commandParams']['base_urls'])
        package_list = json.loads(config['commandParams']['package_list'])
        stack_id = config['commandParams']['stack_id']

    # current stack information
    self.current_hdp_stack_version = None
    if 'stack_version' in config['hostLevelParams']:
        current_stack_version_unformatted = str(config['hostLevelParams']['stack_version'])
        self.current_hdp_stack_version = format_hdp_stack_version(current_stack_version_unformatted)

    stack_name = None
    self.stack_root_folder = None
    if stack_id and "-" in stack_id:
        stack_split = stack_id.split("-")
        if len(stack_split) == 2:
            stack_name = stack_split[0].upper()
            if stack_name in self.STACK_TO_ROOT_FOLDER:
                self.stack_root_folder = self.STACK_TO_ROOT_FOLDER[stack_name]
    if self.stack_root_folder is None:
        raise Fail("Cannot determine the stack's root directory by parsing the stack_id property, {0}".format(str(stack_id)))
    if self.repository_version is None:
        raise Fail("Cannot determine the repository version to install")

    self.repository_version = self.repository_version.strip()

    # Install/update repositories
    installed_repositories = []
    self.current_repositories = []
    self.current_repo_files = set()

    # Enable base system repositories.
    # We don't need that for the RHEL family, because we leave all repos enabled
    # except the disabled HDP* ones
    if OSCheck.is_suse_family():
        self.current_repositories.append('base')
    elif OSCheck.is_ubuntu_family():
        self.current_repo_files.add('base')

    Logger.info("Will install packages for repository version {0}".format(self.repository_version))

    try:
        append_to_file = False
        for url_info in base_urls:
            repo_name, repo_file = self.install_repository(url_info, append_to_file, template)
            self.current_repositories.append(repo_name)
            self.current_repo_files.add(repo_file)
            append_to_file = True

        installed_repositories = list_ambari_managed_repos()
    except Exception as err:
        Logger.logger.exception("Cannot distribute repositories. Error: {0}".format(str(err)))
        num_errors += 1
def service_check(self, env):
    import params

    Logger.info("Ambari Metrics service check was started.")
    env.set_params(params)

    random_value1 = random.random()
    headers = {"Content-type": "application/json"}

    for i in xrange(0, self.AMS_CONNECT_TRIES):
        try:
            current_time = int(time.time()) * 1000
            metric_json = Template('smoketest_metrics.json.j2',
                                   hostname=params.hostname,
                                   random1=random_value1,
                                   current_time=current_time).get_content()
            Logger.info("Generated metrics:\n%s" % metric_json)

            Logger.info("Connecting (POST) to %s:%s%s" %
                        (params.metric_collector_host, params.metric_collector_port,
                         self.AMS_METRICS_POST_URL))
            conn = self.get_http_connection(params.metric_collector_host,
                                            int(params.metric_collector_port),
                                            params.metric_collector_https_enabled)
            conn.request("POST", self.AMS_METRICS_POST_URL, metric_json, headers)

            response = conn.getresponse()
            Logger.info("Http response: %s %s" % (response.status, response.reason))
        except (httplib.HTTPException, socket.error) as ex:
            if i < self.AMS_CONNECT_TRIES - 1:  # range/xrange returns items from start to end-1
                time.sleep(self.AMS_CONNECT_TIMEOUT)
                Logger.info("Connection failed. Next retry in %s seconds." % self.AMS_CONNECT_TIMEOUT)
                continue
            else:
                raise Fail("Metrics were not saved. Service check has failed. "
                           "\nConnection failed.")

        data = response.read()
        Logger.info("Http data: %s" % data)
        conn.close()

        if response.status == 200:
            Logger.info("Metrics were saved.")
            break
        else:
            Logger.info("Metrics were not saved. Service check has failed.")
            if i < self.AMS_CONNECT_TRIES - 1:  # range/xrange returns items from start to end-1
                time.sleep(self.AMS_CONNECT_TIMEOUT)
                Logger.info("Next retry in %s seconds." % self.AMS_CONNECT_TIMEOUT)
            else:
                raise Fail("Metrics were not saved. Service check has failed. POST request status: %s %s \n%s" %
                           (response.status, response.reason, data))

    get_metrics_parameters = {
        "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
        "appId": "amssmoketestfake",
        "hostname": params.hostname,
        "startTime": current_time - 60000,
        "endTime": current_time + 61000,
        "precision": "seconds",
        "grouped": "false",
    }
    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    Logger.info("Connecting (GET) to %s:%s%s" %
                (params.metric_collector_host, params.metric_collector_port,
                 self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters))

    conn = self.get_http_connection(params.metric_collector_host,
                                    int(params.metric_collector_port),
                                    params.metric_collector_https_enabled)
    conn.request("GET", self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters)

    response = conn.getresponse()
    Logger.info("Http response: %s %s" % (response.status, response.reason))

    data = response.read()
    Logger.info("Http data: %s" % data)
    conn.close()

    if response.status == 200:
        Logger.info("Metrics were retrieved.")
    else:
        Logger.info("Metrics were not retrieved. Service check has failed.")
        raise Fail("Metrics were not retrieved. Service check has failed. GET request status: %s %s \n%s" %
                   (response.status, response.reason, data))
    data_json = json.loads(data)

    def floats_eq(f1, f2, delta):
        return abs(f1 - f2) < delta

    for metrics_data in data_json["metrics"]:
        if (str(current_time) in metrics_data["metrics"]
                and str(current_time + 1000) in metrics_data["metrics"]
                and floats_eq(metrics_data["metrics"][str(current_time)], random_value1, 0.0000001)
                and floats_eq(metrics_data["metrics"][str(current_time + 1000)], current_time, 1)):
            Logger.info("Values %s and %s were found in the response." % (random_value1, current_time))
            break
    else:
        Logger.info("Values %s and %s were not found in the response." % (random_value1, current_time))
        raise Fail("Values %s and %s were not found in the response." % (random_value1, current_time))

    Logger.info("Ambari Metrics service check is finished.")
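
# Hedged sketch of the smoke-test payload the 'smoketest_metrics.json.j2'
# template above presumably renders: a fake metric carrying values at
# current_time and current_time + 1000, which is exactly what the GET
# verification loop looks for. Field names follow the AMS timeline API as
# used in these checks ("metricname", "metrics"); treat the exact shape as
# an assumption.
import json
import random
import time

current_time = int(time.time()) * 1000
random_value1 = random.random()

smoketest_payload = json.dumps({
    "metrics": [{
        "metricname": "AMBARI_METRICS.SmokeTest.FakeMetric",
        "appid": "amssmoketestfake",
        "hostname": "host1.example.com",
        "starttime": current_time,
        "metrics": {
            str(current_time): random_value1,          # checked against random1
            str(current_time + 1000): current_time,    # checked against current_time
        },
    }]
})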
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if not NAMESERVICE_KEY in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN,
                ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
        dfs_policy = configurations[DFS_POLICY_KEY]
        if dfs_policy == "HTTPS_ONLY":
            is_ssl_enabled = True

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if not nn_unique_ids_key in hdfs_site:
        return (RESULT_STATE_UNKNOWN,
                ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        if key in hdfs_site:
            # use str() to ensure that unicode strings do not have the u' in them
            value = str(hdfs_site[key])

            try:
                jmx_uri = jmx_uri_fragment.format(value)
                if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                    env = Environment.get_instance()
                    state_response, error_msg, time_millis = curl_krb_request(
                        env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                        "ha_nn_health", None, False, "NameNode High Availability Health")
                    state_response_json = json.loads(state_response)
                    state = state_response_json["beans"][0]['State']
                else:
                    state = get_value_from_jmx(jmx_uri, 'State', connection_timeout)

                if state == HDFS_NN_STATE_ACTIVE:
                    active_namenodes.append(value)
                elif state == HDFS_NN_STATE_STANDBY:
                    standby_namenodes.append(value)
                else:
                    unknown_namenodes.append(value)
            except:
                unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = False
    for active_namenode in active_namenodes:
        if active_namenode.startswith(host_name):
            is_active_namenode = True

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Report the alert if not the first named host, but the other host
    #     could not report its status
    if is_topology_healthy:
        if is_active_namenode is True:
            return (RESULT_STATE_OK, [result_label])
        else:
            return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
    else:
        # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
        first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(name_service, nn_unique_ids[0])

        first_listed_host = ''
        if first_listed_host_key in hdfs_site:
            first_listed_host = hdfs_site[first_listed_host_key]

        is_first_listed_host = False
        if first_listed_host.startswith(host_name):
            is_first_listed_host = True

        if is_first_listed_host:
            return (RESULT_STATE_CRITICAL, [result_label])
        else:
            # not the first listed host, but the first host might be in the unknown
            return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
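
# A minimal sketch of reading one attribute from a Hadoop /jmx endpoint, the
# same pattern get_value_from_jmx() above relies on for the unsecured case.
# The host and port in the usage comment are illustrative.
import json
import urllib2

def jmx_attribute(jmx_uri, attribute, timeout=5.0):
    response = urllib2.urlopen(jmx_uri, timeout=timeout)
    data = json.loads(response.read())
    # Hadoop's /jmx returns {"beans": [{...}]} for a single-bean query
    return data['beans'][0][attribute]

# e.g. state = jmx_attribute(
#     'http://nn-host:50070/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus',
#     'State')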
def actionexecute(self, env):
    resolve_ambari_config()

    # Parse parameters from the command json file.
    config = Script.get_config()

    host_name = socket.gethostname()
    version = default('/roleParams/version', None)

    # These 2 variables are optional
    service_package_folder = default('/commandParams/service_package_folder', None)
    if service_package_folder is None:
        service_package_folder = default('/serviceLevelParams/service_package_folder', None)
    hooks_folder = default('/commandParams/hooks_folder', None)

    tasks = json.loads(config['roleParams']['tasks'])
    if tasks:
        for t in tasks:
            task = ExecuteTask(t)
            Logger.info(str(task))

            # If a (script, function) pair exists, it overwrites the command.
            if task.script and task.function:
                file_cache = FileCache(agent_config)

                if service_package_folder and hooks_folder:
                    command_paths = {
                        "commandParams": {
                            "service_package_folder": service_package_folder,
                        },
                        "clusterLevelParams": {
                            "hooks_folder": hooks_folder
                        },
                        "ambariLevelParams": {
                            "jdk_location": default('/ambariLevelParams/jdk_location', "")
                        }
                    }

                    base_dir = file_cache.get_service_base_dir(command_paths)
                else:
                    base_dir = file_cache.get_custom_actions_base_dir({
                        "ambariLevelParams": {
                            "jdk_location": default('/ambariLevelParams/jdk_location', "")
                        }
                    })

                script_path = os.path.join(base_dir, task.script)
                if not os.path.exists(script_path):
                    message = "Script %s does not exist" % str(script_path)
                    raise Fail(message)

                # Notice that the script_path is now the fully qualified path, and the
                # same command-#.json file is used.
                # Also, the python wrapper is used, since it sets up the correct environment variables
                command_params = ["/usr/bin/ambari-python-wrap",
                                  script_path,
                                  task.function,
                                  self.command_data_file,
                                  self.basedir,
                                  self.stroutfile,
                                  self.logging_level,
                                  Script.get_tmp_dir()]
                task.command = "source /var/lib/ambari-agent/ambari-env.sh ; " + " ".join(command_params)
                # Replace redundant whitespace to make the unit tests easier to validate
                task.command = re.sub("\s+", " ", task.command).strip()

            if task.command:
                task.command = replace_variables(task.command, host_name, version)
                shell.checked_call(task.command, logoutput=True, quiet=True)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    uri = str(host_name) + ":" + uri.split(":")[1]
    live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
    convert_to_json_failed = False
    response_code = None

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                live_nodemanagers_qry, "nm_health_summary_alert", None, False,
                "NodeManager Health Summary", smokeuser)

            try:
                url_response_json = json.loads(url_response)
                live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
            except ValueError as error:
                convert_to_json_failed = True
                if logger.isEnabledFor(logging.DEBUG):
                    logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}"
                                     .format("NodeManager Health Summary", str(error)))

            if convert_to_json_failed:
                response_code, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal,
                    live_nodemanagers_qry, "nm_health_summary_alert", None, True,
                    "NodeManager Health Summary", smokeuser)
        else:
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    uri = None
    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY])

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

    checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
    if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
        checkpoint_txn_multiplier_warning = float(parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

    checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
    if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
        checkpoint_txn_multiplier_critical = float(parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'
        if https_uri is not None:
            uri = https_uri

    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, last_checkpoint_time_qry,
                "checkpoint_time_alert", executable_paths, False,
                "NameNode Last Checkpoint", smokeuser,
                connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms)

            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, journal_transaction_info_qry,
                "checkpoint_time_alert", executable_paths, False,
                "NameNode Last Checkpoint", smokeuser,
                connection_timeout=curl_connection_timeout, kinit_timer_ms=kinit_timer_ms)

            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
                                                          "LastCheckpointTime", connection_timeout))
            journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
                                                          "JournalTransactionInfo", connection_timeout)

        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        delta = (current_time - last_checkpoint_time) / 1000

        label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference)

        is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(checkpoint_tx)
        is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(checkpoint_tx)

        # Alert on either too many uncommitted transactions, or check-pointing
        # missed for too long, as decided by the thresholds
        if is_checkpoint_txn_critical or (float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)):
            logger.debug('Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(
                transaction_difference, checkpoint_tx))
            result_code = 'CRITICAL'
        elif is_checkpoint_txn_warning or (float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)):
            logger.debug('Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(
                transaction_difference, checkpoint_tx))
            result_code = 'WARNING'
    except:
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
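
# Worked example of the checkpoint-staleness math above; all numbers are made
# up. The time-based branch fires when the elapsed time since the last
# checkpoint exceeds the configured percentage of the checkpoint period.
current_time = 1500000000000            # "now" in ms
last_checkpoint_time = 1499956800000    # LastCheckpointTime from JMX, in ms
checkpoint_period = 21600               # dfs.namenode.checkpoint.period, seconds
percent_warning = 200.0                 # alert parameter

delta = (current_time - last_checkpoint_time) / 1000         # 43200 seconds since the last checkpoint
staleness_pct = float(delta) / int(checkpoint_period) * 100  # 43200 / 21600 * 100 = 200.0
assert staleness_pct >= percent_warning                      # -> at least a WARNING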
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("""
        AMS request parameters - {0}
        AMS response - {1}
        """.format(encoded_get_metrics_parameters, data))

    # explicitly close the connection as we've seen python hold onto these
    if conn is not None:
        try:
            conn.close()
        except:
            logger.debug("[Alert][{0}] Unable to close URL connection to {1}".format(self.get_name(), url))

    json_is_valid = True
    try:
        data_json = json.loads(data)
    except Exception as exception:
        json_is_valid = False
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}"
                             .format(self.get_name(), str(exception)))

    metrics = []

    if json_is_valid:
        metric_dict = {}
        for metrics_data in data_json["metrics"]:
            metric_dict[metrics_data["metricname"]] = metrics_data["metrics"]
def actionexecute(self, env):
    num_errors = 0

    # Parse parameters
    config = Script.get_config()

    try:
        command_repository = CommandRepository(config['repositoryFile'])
    except KeyError:
        raise Fail("The command repository indicated by 'repositoryFile' was not found")

    # Handle a SIGTERM and SIGINT gracefully
    signal.signal(signal.SIGTERM, self.abort_handler)
    signal.signal(signal.SIGINT, self.abort_handler)

    self.repository_version = command_repository.version_string

    # Select the dict that contains the parameters
    try:
        package_list = json.loads(config['roleParams']['package_list'])
        stack_id = config['roleParams']['stack_id']
    except KeyError:
        pass

    self.stack_name = Script.get_stack_name()
    if self.stack_name is None:
        raise Fail("Cannot determine the stack name")

    self.stack_root_folder = Script.get_stack_root()
    if self.stack_root_folder is None:
        raise Fail("Cannot determine the stack's root directory")

    if self.repository_version is None:
        raise Fail("Cannot determine the repository version to install")

    self.repository_version = self.repository_version.strip()

    try:
        if not command_repository.items:
            Logger.warning(
                "Repository list is empty. Ambari may not be managing the repositories for {0}.".format(
                    self.repository_version))
        else:
            Logger.info(
                "Will install packages for repository version {0}".format(self.repository_version))
            new_repo_files = Script.repository_util.create_repo_files()
            self.repo_files.update(new_repo_files)
    except Exception as err:
        Logger.logger.exception("Cannot install repository files. Error: {0}".format(str(err)))
        num_errors += 1

    # Build the structured output with initial values
    self.structured_output = {
        'package_installation_result': 'FAIL',
        'repository_version_id': command_repository.version_id
    }
    self.put_structured_out(self.structured_output)

    try:
        # check the package manager for non-completed transactions
        if self.repo_mgr.check_uncompleted_transactions():
            self.repo_mgr.print_uncompleted_transaction_hint()
            num_errors += 1
    except Exception as e:
        # we need to ignore any exception
        Logger.warning("Failed to check for uncompleted package manager transactions: " + str(e))

    if num_errors > 0:
        raise Fail("Failed to distribute repositories/install packages")

    # Initial list of versions, used to compute the new version installed
    self.old_versions = get_stack_versions(self.stack_root_folder)

    try:
        is_package_install_successful = False
        ret_code = self.install_packages(package_list)
        if ret_code == 0:
            self.structured_output['package_installation_result'] = 'SUCCESS'
            self.put_structured_out(self.structured_output)
            is_package_install_successful = True
        else:
            num_errors += 1
    except Exception as err:
        num_errors += 1
        Logger.logger.exception("Could not install packages. Error: {0}".format(str(err)))

    # Provide the correct exit code
    if num_errors > 0:
        raise Fail("Failed to distribute repositories/install packages")

    self._fix_default_links_for_current()

    # if installing a version of HDP that needs some symlink love, then create them
    if is_package_install_successful and 'actual_version' in self.structured_output:
        self._relink_configurations_with_conf_select(stack_id, self.structured_output['actual_version'])
        content = response.read()
    except Exception as exception:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
    finally:
        # explicitly close the connection as we've seen python hold onto these
        if response is not None:
            try:
                response.close()
            except:
                logger.debug("[Alert][{0}] Unable to close JMX URL connection to {1}".format(self.get_name(), url))

    json_is_valid = True
    try:
        json_response = json.loads(content)
        json_data = json_response['beans'][0]
    except Exception as exception:
        json_is_valid = False
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".format(
                self.get_name(), str(exception)))

    if json_is_valid:
        for attr in jmx_property_value:
            if attr not in json_data:
                beans = json_response['beans']
                for jmx_prop_list_item in beans:
                    if "name" in jmx_prop_list_item and jmx_prop_list_item["name"] == jmx_property_key:
                        if attr not in jmx_prop_list_item:
                            raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
def service_check(self, env):
    import params
    env.set_params(params)

    params.HdfsResource(format("/user/{smokeuser}"),
                        type="directory",
                        action="create_on_execute",
                        owner=params.smokeuser,
                        mode=params.smoke_hdfs_user_mode,
                        )

    if params.stack_version_formatted_major and check_stack_feature(
            StackFeature.ROLLING_UPGRADE, params.stack_version_formatted_major):
        path_to_distributed_shell_jar = format(
            "{stack_root}/current/hadoop-yarn-client/hadoop-yarn-applications-distributedshell.jar")
    else:
        path_to_distributed_shell_jar = "/usr/lib/hadoop-yarn/hadoop-yarn-applications-distributedshell*.jar"

    yarn_distributed_shell_check_params = [
        "yarn org.apache.hadoop.yarn.applications.distributedshell.Client",
        "-shell_command", "ls", "-num_containers", "{number_of_nm}",
        "-jar", "{path_to_distributed_shell_jar}", "-timeout", "300000",
        "--queue", "{service_check_queue_name}"
    ]
    yarn_distributed_shell_check_cmd = format(" ".join(yarn_distributed_shell_check_params))

    if params.security_enabled:
        kinit_cmd = format("{kinit_path_local} -kt {smoke_user_keytab} {smokeuser_principal};")
        smoke_cmd = format("{kinit_cmd} {yarn_distributed_shell_check_cmd}")
    else:
        smoke_cmd = yarn_distributed_shell_check_cmd

    return_code, out = shell.checked_call(smoke_cmd,
                                          path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
                                          user=params.smokeuser,
                                          )

    m = re.search("appTrackingUrl=(.*),\s", out)
    app_url = m.group(1)
    splitted_app_url = str(app_url).split('/')

    for item in splitted_app_url:
        if "application" in item:
            application_name = item

    for rm_webapp_address in params.rm_webapp_addresses_list:
        info_app_url = params.scheme + "://" + rm_webapp_address + "/ws/v1/cluster/apps/" + application_name

        get_app_info_cmd = "curl --negotiate -u : -ksL --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url

        return_code, stdout, _ = get_user_call_output(get_app_info_cmd,
                                                      user=params.smokeuser,
                                                      path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
                                                      )

        # Handle HDP<2.2.8.1 where the RM doesn't do automatic redirection from standby to active
        if stdout.startswith("This is standby RM. Redirecting to the current active RM:"):
            Logger.info(format("Skipped checking of {rm_webapp_address} since returned '{stdout}'"))
            continue

        try:
            json_response = json.loads(stdout)
        except Exception as e:
            raise Fail(format("Response from YARN API was not a valid JSON. Response: {stdout}"))

        if json_response is None or 'app' not in json_response or \
                'state' not in json_response['app'] or 'finalStatus' not in json_response['app']:
            raise Fail("Application " + app_url + " returns invalid data.")

        if json_response['app']['state'] != "FINISHED" or json_response['app']['finalStatus'] != "SUCCEEDED":
            raise Fail("Application " + app_url + " state/status is not valid. Should be FINISHED/SUCCEEDED.")
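
# Self-contained sketch of how the application id is extracted from the
# distributed-shell output in both YARN service checks above; the sample
# output line is fabricated.
import re

out = "... appTrackingUrl=http://rm-host:8088/proxy/application_1510000000000_0001/, ..."
m = re.search("appTrackingUrl=(.*),\s", out)
app_url = m.group(1)   # http://rm-host:8088/proxy/application_1510000000000_0001/
application_name = next(item for item in str(app_url).split('/') if "application" in item)
assert application_name == "application_1510000000000_0001"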
#repo params
repo_info = config['hostLevelParams']['repo_info']
service_repo_info = default("/hostLevelParams/service_repo_info", None)

user_to_groups_dict = {}

#Append new user-group mapping to the dict
try:
  user_group_map = ast.literal_eval(config['hostLevelParams']['user_groups'])
  for key in user_group_map.iterkeys():
    user_to_groups_dict[key] = user_group_map[key]
except ValueError:
  print('User Group mapping (user_groups) is missing in the hostLevelParams')

user_to_gid_dict = collections.defaultdict(lambda: user_group)

user_list = json.loads(config['hostLevelParams']['user_list'])
group_list = json.loads(config['hostLevelParams']['group_list'])
host_sys_prepped = default("/hostLevelParams/host_sys_prepped", False)

tez_am_view_acls = config['configurations']['tez-site']["tez.am.view-acls"]
override_uid = str(default("/configurations/cluster-env/override_uid", "true")).lower()

# if NN HA on secure cluster, access ZooKeeper securely
if stack_supports_zk_security and dfs_ha_enabled and security_enabled:
  hadoop_zkfc_opts = format(
    "-Dzookeeper.sasl.client=true -Dzookeeper.sasl.client.username=zookeeper "
    "-Djava.security.auth.login.config={hadoop_conf_secure_dir}/hdfs_jaas.conf "
    "-Dzookeeper.sasl.clientconfig=Client")
def sync_ldap(options):
  if not is_root():
    err = 'Ambari-server sync-ldap should be run with root-level privileges'
    raise FatalException(4, err)

  server_status, pid = is_server_runing()
  if not server_status:
    err = 'Ambari Server is not running.'
    raise FatalException(1, err)

  properties = get_ambari_properties()
  if properties == -1:
    raise FatalException(1, "Failed to read properties file.")

  ldap_configured = properties.get_property(IS_LDAP_CONFIGURED)
  if ldap_configured != 'true':
    err = "LDAP is not configured. Run 'ambari-server setup-ldap' first."
    raise FatalException(1, err)

  # set ldap sync options
  ldap_sync_options = LdapSyncOptions(options)

  if ldap_sync_options.no_ldap_sync_options_set():
    err = 'Must specify a sync option (all, existing, users or groups). Please invoke ambari-server.py --help to print the options.'
    raise FatalException(1, err)

  admin_login = get_validated_string_input(prompt="Enter Ambari Admin login: ", default=None,
                                           pattern=None, description=None,
                                           is_pass=False, allowEmpty=False)
  admin_password = get_validated_string_input(prompt="Enter Ambari Admin password: ", default=None,
                                              pattern=None, description=None,
                                              is_pass=True, allowEmpty=False)

  url = get_ambari_server_api_base(properties) + SERVER_API_LDAP_URL
  admin_auth = base64.encodestring('%s:%s' % (admin_login, admin_password)).replace('\n', '')
  request = urllib2.Request(url)
  request.add_header('Authorization', 'Basic %s' % admin_auth)
  request.add_header('X-Requested-By', 'ambari')

  if ldap_sync_options.ldap_sync_all:
    sys.stdout.write('Syncing all.')
    bodies = [{"Event":{"specs":[{"principal_type":"users","sync_type":"all"},{"principal_type":"groups","sync_type":"all"}]}}]
  elif ldap_sync_options.ldap_sync_existing:
    sys.stdout.write('Syncing existing.')
    bodies = [{"Event":{"specs":[{"principal_type":"users","sync_type":"existing"},{"principal_type":"groups","sync_type":"existing"}]}}]
  else:
    sys.stdout.write('Syncing specified users and groups.')
    bodies = [{"Event":{"specs":[]}}]
    body = bodies[0]
    events = body['Event']
    specs = events['specs']

    if ldap_sync_options.ldap_sync_users is not None:
      new_specs = [{"principal_type":"users","sync_type":"specific","names":""}]
      get_ldap_event_spec_names(ldap_sync_options.ldap_sync_users, specs, new_specs)
    if ldap_sync_options.ldap_sync_groups is not None:
      new_specs = [{"principal_type":"groups","sync_type":"specific","names":""}]
      get_ldap_event_spec_names(ldap_sync_options.ldap_sync_groups, specs, new_specs)

  if get_verbose():
    sys.stdout.write('\nCalling API ' + url + ' : ' + str(bodies) + '\n')

  request.add_data(json.dumps(bodies))
  request.get_method = lambda: 'POST'

  try:
    response = urllib2.urlopen(request)
  except Exception as e:
    err = 'Sync event creation failed. Error details: %s' % e
    raise FatalException(1, err)

  response_status_code = response.getcode()
  if response_status_code != 201:
    err = 'Error during syncing. Http status code - ' + str(response_status_code)
    raise FatalException(1, err)

  response_body = json.loads(response.read())

  url = response_body['resources'][0]['href']
  request = urllib2.Request(url)
  request.add_header('Authorization', 'Basic %s' % admin_auth)
  request.add_header('X-Requested-By', 'ambari')
  body = [{"LDAP":{"synced_groups":"*","synced_users":"*"}}]
  request.add_data(json.dumps(body))
  request.get_method = lambda: 'GET'
  request_in_progress = True

  while request_in_progress:
    sys.stdout.write('.')
    sys.stdout.flush()

    try:
      response = urllib2.urlopen(request)
    except Exception as e:
      request_in_progress = False
      err = 'Sync event check failed. Error details: %s' % e
      raise FatalException(1, err)

    response_status_code = response.getcode()
    if response_status_code != 200:
      err = 'Error during syncing. Http status code - ' + str(response_status_code)
      raise FatalException(1, err)

    response_body = json.loads(response.read())
    sync_info = response_body['Event']

    if sync_info['status'] == 'ERROR':
      raise FatalException(1, str(sync_info['status_detail']))
    elif sync_info['status'] == 'COMPLETE':
      print '\n\nCompleted LDAP Sync.'
print 'Summary:' for principal_type, summary in sync_info['summary'].iteritems(): print ' {0}:'.format(principal_type) for action, amount in summary.iteritems(): print ' {0} = {1!s}'.format(action, amount) request_in_progress = False else: time.sleep(1) sys.stdout.write('\n') sys.stdout.flush()
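# Illustrative sketch of the REST bodies sync_ldap() builds; the user and group
# names below are made up, but the JSON shapes mirror the specs assembled above.
# The POST creates a sync event (HTTP 201), whose href is then polled until the
# Event status becomes COMPLETE or ERROR.
import json

bodies = [{"Event": {"specs": [
  {"principal_type": "users", "sync_type": "specific", "names": "alice,bob"},
  {"principal_type": "groups", "sync_type": "specific", "names": "hadoop-admins"}
]}}]
print json.dumps(bodies)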
def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: configurations (dictionary): a mapping of configuration key to value parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ if configurations is None: return (('UNKNOWN', ['There were no configurations supplied to the script.'])) uri = None scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' checkpoint_tx = CHECKPOINT_TX_DEFAULT checkpoint_period = CHECKPOINT_PERIOD_DEFAULT if NN_HTTP_ADDRESS_KEY in configurations: http_uri = configurations[NN_HTTP_ADDRESS_KEY] if NN_HTTPS_ADDRESS_KEY in configurations: https_uri = configurations[NN_HTTPS_ADDRESS_KEY] if NN_HTTP_POLICY_KEY in configurations: http_policy = configurations[NN_HTTP_POLICY_KEY] if NN_CHECKPOINT_TX_KEY in configurations: checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY] if NN_CHECKPOINT_PERIOD_KEY in configurations: checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY] security_enabled = False if SECURITY_ENABLED_KEY in configurations: security_enabled = str( configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' kerberos_keytab = None if KERBEROS_KEYTAB in configurations: kerberos_keytab = configurations[KERBEROS_KEYTAB] kerberos_principal = None if KERBEROS_PRINCIPAL in configurations: kerberos_principal = configurations[KERBEROS_PRINCIPAL] kerberos_principal = kerberos_principal.replace('_HOST', host_name) # parse script arguments connection_timeout = CONNECTION_TIMEOUT_DEFAULT if CONNECTION_TIMEOUT_KEY in parameters: connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) percent_warning = PERCENT_WARNING_DEFAULT if PERCENT_WARNING_KEY in parameters: percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100 percent_critical = PERCENT_CRITICAL_DEFAULT if PERCENT_CRITICAL_KEY in parameters: percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100 # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': scheme = 'https' if https_uri is not None: uri = https_uri current_time = int(round(time.time() * 1000)) last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format( scheme, uri) journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format( scheme, uri) # start out assuming an OK status label = None result_code = "OK" try: if kerberos_principal is not None and kerberos_keytab is not None and security_enabled: env = Environment.get_instance() last_checkpoint_time_response, error_msg, time_millis = curl_krb_request( env.tmp_dir, kerberos_keytab, kerberos_principal, last_checkpoint_time_qry, "checkpoint_time_alert", None, False, "NameNode Last Checkpoint") last_checkpoint_time_response_json = json.loads( last_checkpoint_time_response) last_checkpoint_time = int( last_checkpoint_time_response_json["beans"][0] ["LastCheckpointTime"]) journal_transaction_info_response, error_msg, time_millis = curl_krb_request( env.tmp_dir, kerberos_keytab, kerberos_principal, journal_transaction_info_qry, "checkpoint_time_alert", None, False, "NameNode Last Checkpoint") journal_transaction_info_response_json = json.loads( journal_transaction_info_response) journal_transaction_info = journal_transaction_info_response_json[ "beans"][0]["JournalTransactionInfo"] else: last_checkpoint_time = int( get_value_from_jmx(last_checkpoint_time_qry, 
"LastCheckpointTime", connection_timeout)) journal_transaction_info = get_value_from_jmx( journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout) journal_transaction_info_dict = json.loads(journal_transaction_info) last_tx = int( journal_transaction_info_dict['LastAppliedOrWrittenTxId']) most_recent_tx = int( journal_transaction_info_dict['MostRecentCheckpointTxId']) transaction_difference = last_tx - most_recent_tx delta = (current_time - last_checkpoint_time) / 1000 label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference) if (transaction_difference > int(checkpoint_tx)) and ( float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)): result_code = 'CRITICAL' elif (transaction_difference > int(checkpoint_tx)) and ( float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)): result_code = 'WARNING' except Exception, e: label = str(e) result_code = 'UNKNOWN'
def run_schema_upgrade(args): db_title = get_db_type(get_ambari_properties()).title confirm = get_YN_input( "Ambari Server configured for %s. Confirm " "you have made a backup of the Ambari Server database [y/n] (y)? " % db_title, True) if not confirm: print_error_msg("Database backup is not confirmed") return 1 jdk_path = get_java_exe_path() if jdk_path is None: print_error_msg( "No JDK found, please run the \"setup\" " "command to install a JDK automatically or install any " "JDK manually to " + configDefaults.JDK_INSTALL_DIR) return 1 ensure_jdbc_driver_is_installed(args, get_ambari_properties()) print_info_msg('Upgrading database schema', True) serverClassPath = ServerClassPath(get_ambari_properties(), args) class_path = serverClassPath.get_full_ambari_classpath_escaped_for_shell( validate_classpath=True) set_debug_mode_from_options(args) debug_mode = get_debug_mode() debug_start = (debug_mode & 1) or SCHEMA_UPGRADE_DEBUG suspend_start = (debug_mode & 2) or SUSPEND_START_MODE suspend_mode = 'y' if suspend_start else 'n' command = SCHEMA_UPGRADE_HELPER_CMD_DEBUG.format( jdk_path, class_path, suspend_mode) if debug_start else SCHEMA_UPGRADE_HELPER_CMD.format( jdk_path, class_path) ambari_user = read_ambari_user() current_user = ensure_can_start_under_current_user(ambari_user) environ = generate_env(args, ambari_user, current_user) (retcode, stdout, stderr) = run_os_command(command, env=environ) upgrade_response = json.loads(stdout) check_gpl_license_approved(upgrade_response) print_info_msg( "Return code from schema upgrade command, retcode = {0}".format( str(retcode)), True) if stdout: print_info_msg("Console output from schema upgrade command:", True) print_info_msg(stdout, True) print if retcode > 0: print_error_msg( "Error executing schema upgrade, please check the server logs.") if stderr: print_error_msg("Error output from schema upgrade command:") print_error_msg(stderr) print else: print_info_msg('Schema upgrade completed', True) return retcode
def _run_command(self, target, operation, method='POST', assertable_result=True,
                 file_to_put=None, ignore_status_codes=[], **kwargs):
  """
  assertable_result - some POST requests return '{"boolean":false}' or '{"boolean":true}'
  depending on if query was successful or not, we can assert this for them
  """
  target = HdfsResourceProvider.parse_path(target)
  if not target:
    raise Fail("Target cannot be empty")

  url = format("{address}/webhdfs/v1{target}?op={operation}", address=self.address)
  request_args = kwargs

  if not self.security_enabled:
    request_args['user.name'] = self.run_user

  for k, v in request_args.iteritems():
    url = format("{url}&{k}={v}")

  cmd = ["curl", "-sS", "-L", "-w", "%{http_code}", "-X", method]

  # When operation is "OPEN" the target is actually the DFS file to download
  # and the file_to_put is actually the target, see _download_file
  if operation == "OPEN":
    cmd += ["-o", file_to_put]
  else:
    if file_to_put and not os.path.exists(file_to_put):
      raise Fail(format("File {file_to_put} is not found."))

    if file_to_put:
      cmd += ["--data-binary", "@" + file_to_put, "-H", "Content-Type: application/octet-stream"]

  if self.security_enabled:
    cmd += ["--negotiate", "-u", ":"]
  if self.is_https_enabled:
    cmd += ["-k"]

  cmd.append(url)
  _, out, err = get_user_call_output(cmd, user=self.run_user, logoutput=self.logoutput, quiet=False)
  status_code = out[-3:]
  out = out[:-3]  # remove last line from output which is status code

  try:
    result_dict = json.loads(out)
  except ValueError:
    result_dict = out

  # fail when curl returned an unexpected HTTP code, or when the caller asked us
  # to assert on the {"boolean": ...} payload and it came back false
  if status_code not in WebHDFSUtil.valid_status_codes + ignore_status_codes or \
      (assertable_result and result_dict and not result_dict['boolean']):
    formatted_output = json.dumps(result_dict, indent=2) if isinstance(result_dict, dict) else result_dict
    formatted_output = err + "\n" + formatted_output
    err_msg = "Execution of '%s' returned status_code=%s. %s" % (
      shell.string_cmd_from_args_list(cmd), status_code, formatted_output)
    raise WebHDFSCallException(err_msg, result_dict)

  return result_dict
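# Hedged sketch of the WebHDFS exchange _run_command() drives; the host, port,
# path and user below are illustrative. A MKDIRS call answers {"boolean":true}
# on success, which is exactly what assertable_result checks, and the trailing
# three characters of the output come from curl's -w '%{http_code}'.
import json

raw_curl_output = '{"boolean":true}200'
status_code = raw_curl_output[-3:]
body = raw_curl_output[:-3]
result_dict = json.loads(body)
assert status_code == '200' and result_dict['boolean']
# equivalent request:
#   curl -sS -L -w '%{http_code}' -X PUT \
#     'http://nn-host:50070/webhdfs/v1/tmp/example?op=MKDIRS&user.name=hdfs'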
def create_ambari_admin_user(self, ambari_admin_username, ambari_admin_password, usernamepassword):
  """
  :param ambari_admin_username: username of the user to be created
  :param ambari_admin_password: password of the user to be created
  :param usernamepassword: credentials used to authenticate against the Ranger Admin API
  :return: response code on successful user creation, otherwise None
  """
  flag_ambari_admin_present = False
  match = re.match('[a-zA-Z0-9_\S]+$', ambari_admin_password)
  if match is None:
    raise Fail('Invalid password given for Ranger Admin user for Ambari')
  try:
    url = self.urlUsers + '?name=' + str(ambari_admin_username)
    request = urllib2.Request(url)
    base64string = base64.encodestring(usernamepassword).replace('\n', '')
    request.add_header("Content-Type", "application/json")
    request.add_header("Accept", "application/json")
    request.add_header("Authorization", "Basic {0}".format(base64string))
    result = openurl(request, timeout=20)
    response_code = result.getcode()
    response = json.loads(result.read())
    if response_code == 200 and len(response['vXUsers']) >= 0:
      for vxuser in response['vXUsers']:
        if vxuser['name'] == ambari_admin_username:
          flag_ambari_admin_present = True
          break
      else:
        # for/else: the loop finished without a break, so no matching user was found
        flag_ambari_admin_present = False

      if flag_ambari_admin_present:
        Logger.info(ambari_admin_username + ' user already exists.')
        return response_code
      else:
        Logger.info(ambari_admin_username + ' user is not present, creating user using given configurations')
        url = self.urlSecUsers
        admin_user = dict()
        admin_user['status'] = 1
        admin_user['userRoleList'] = ['ROLE_SYS_ADMIN']
        admin_user['name'] = ambari_admin_username
        admin_user['password'] = ambari_admin_password
        admin_user['description'] = ambari_admin_username
        admin_user['firstName'] = ambari_admin_username
        data = json.dumps(admin_user)
        base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
        headers = {'Accept': 'application/json', "Content-Type": "application/json"}
        request = urllib2.Request(url, data, headers)
        request.add_header("Authorization", "Basic {0}".format(base64string))
        result = openurl(request, timeout=20)
        response_code = result.getcode()
        response = json.loads(json.JSONEncoder().encode(result.read()))
        if response_code == 200 and response is not None:
          Logger.info('Ambari admin user creation successful.')
          return response_code
        else:
          Logger.info('Ambari admin user creation failed.')
          return None
    else:
      return None
  except urllib2.URLError, e:
    if isinstance(e, urllib2.HTTPError):
      raise Fail("Error creating ambari admin user. Http status code - {0}. \n {1}".format(e.code, e.read()))
    else:
      raise Fail("Error creating ambari admin user. Reason - {0}.".format(e.reason))
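# Illustrative shape of the Ranger user-search response inspected above (the
# field values are made up); the existence check boils down to scanning the
# vXUsers list for a matching name.
response = {"vXUsers": [{"name": "amb_ranger_admin", "userRoleList": ["ROLE_SYS_ADMIN"]}]}
flag_ambari_admin_present = any(u['name'] == 'amb_ranger_admin' for u in response['vXUsers'])
print flag_ambari_admin_present  # True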
def create_ams_dashboards():
  """
  Create dashboards in grafana from the json files
  """
  import params
  server = Server(protocol = params.ams_grafana_protocol.strip(),
                  host = params.ams_grafana_host.strip(),
                  port = params.ams_grafana_port,
                  user = params.ams_grafana_admin_user,
                  password = params.ams_grafana_admin_pwd)

  dashboard_files = params.get_grafana_dashboard_defs()
  version = params.get_ambari_version()
  Logger.info("Checking dashboards to update for Ambari version : %s" % version)

  # Friendly representation of dashboard
  Dashboard = namedtuple('Dashboard', ['uri', 'id', 'title', 'tags'])

  existing_dashboards = []
  response = perform_grafana_get_call(GRAFANA_SEARCH_BULTIN_DASHBOARDS, server)
  if response and response.status == 200:
    data = response.read()
    try:
      dashboards = json.loads(data)
    except:
      Logger.error("Unable to parse JSON response from grafana request: %s" %
                   GRAFANA_SEARCH_BULTIN_DASHBOARDS)
      Logger.info(data)
      return

    for dashboard in dashboards:
      if dashboard['title'] == 'HBase - Performance':
        perform_grafana_delete_call("/api/dashboards/" + dashboard['uri'], server)
      else:
        existing_dashboards.append(
          Dashboard(uri = dashboard['uri'], id = dashboard['id'],
                    title = dashboard['title'], tags = dashboard['tags']))
  else:
    Logger.error("Failed to execute search query on Grafana dashboards. "
                 "query = %s\n statuscode = %s\n reason = %s\n data = %s\n" %
                 (GRAFANA_SEARCH_BULTIN_DASHBOARDS, response.status,
                  response.reason, response.read()))
    return

  Logger.debug('Dashboard definitions found = %s' % str(dashboard_files))

  if dashboard_files:
    for dashboard_file in dashboard_files:
      try:
        with open(dashboard_file, 'r') as file:
          dashboard_def = json.load(file)
      except Exception, e:
        Logger.error('Unable to load dashboard json file %s' % dashboard_file)
        Logger.error(str(e))
        continue

      if dashboard_def:
        update_def = True
        # Make sure static json does not have id
        if "id" in dashboard_def:
          dashboard_def['id'] = None
        # Set correct tags
        if 'tags' in dashboard_def:
          dashboard_def['tags'].append('builtin')
          dashboard_def['tags'].append(version)
        else:
          dashboard_def['tags'] = ['builtin', version]

        for dashboard in existing_dashboards:
          if dashboard.title == dashboard_def['title']:
            if version not in dashboard.tags:
              # Found existing dashboard with wrong version - update dashboard
              update_def = True
            else:
              update_def = False # Skip update

        if update_def:
          Logger.info("Updating dashboard definition for %s with tags: %s" %
                      (dashboard_def['title'], dashboard_def['tags']))

          # Discrepancy in grafana export vs import format
          dashboard_def_payload = {"dashboard": dashboard_def, 'overwrite': True}
          payload = json.dumps(dashboard_def_payload).strip()

          (response, data) = perform_grafana_post_call(GRAFANA_DASHBOARDS_URL, payload, server)

          if response and response.status == 200:
            Logger.info("Dashboard created successfully.\n %s" % str(data))
          else:
            Logger.error("Failed creating dashboard: %s" % dashboard_def['title'])
        else:
          Logger.info('No update needed for dashboard = %s' % dashboard_def['title'])
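# Sketch of the import payload assembled above. Grafana's dashboard import API
# expects the definition wrapped under "dashboard" with an "overwrite" flag,
# unlike the export format, which is the bare definition; the title and tags
# here are examples only.
import json

dashboard_def = {"id": None, "title": "HDFS - Home", "tags": ["builtin", "2.6.0.0"]}
dashboard_def_payload = {"dashboard": dashboard_def, "overwrite": True}
payload = json.dumps(dashboard_def_payload).strip()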
def get_restricted_packages():
  """
  Gets the list of conf-select 'package' names that need to be invoked on the command.
  When the server passes down the list of packages to install, check the service names
  and use the information in the stack_packages json to determine the list of packages
  that should be executed. This applies only to PATCH or MAINT upgrades; STANDARD
  upgrades should conf-select everything they can find.
  """
  package_names = []

  # shortcut the common case if we are not patching
  cluster_version_summary = default("/roleParameters/cluster_version_summary/services", None)

  if cluster_version_summary is None:
    Logger.info("Cluster Summary is not available, there are no restrictions for conf-select")
    return package_names

  service_names = []

  # pick out the services that are targeted
  for servicename, servicedetail in cluster_version_summary.iteritems():
    if servicedetail['upgrade']:
      service_names.append(servicename)

  if 0 == len(service_names):
    Logger.info("No services found, there are no restrictions for conf-select")
    return package_names

  stack_name = default("/clusterLevelParams/stack_name", None)
  if stack_name is None:
    Logger.info("The stack name is not present in the command. Restricted names skipped.")
    return package_names

  stack_packages_config = default("/configurations/cluster-env/stack_packages", None)
  if stack_packages_config is None:
    Logger.info("The stack packages are not defined on the command. Restricted names skipped.")
    return package_names

  data = json.loads(stack_packages_config)

  if stack_name not in data:
    Logger.info("Cannot find conf-select packages for the {0} stack".format(stack_name))
    return package_names

  conf_select_key = "conf-select-patching"
  if conf_select_key not in data[stack_name]:
    Logger.info("There are no conf-select-patching elements defined for this command for the {0} stack".format(stack_name))
    return package_names

  service_dict = data[stack_name][conf_select_key]

  for servicename in service_names:
    if servicename in service_dict and 'packages' in service_dict[servicename]:
      package_names.extend(service_dict[servicename]['packages'])

  return package_names
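# Hypothetical fragment of the stack_packages JSON consulted above; the stack,
# service and package names are illustrative only.
stack_packages = {
  "HDP": {
    "conf-select-patching": {
      "HIVE": {"packages": ["hive", "hive-hcatalog"]},
      "ZOOKEEPER": {"packages": ["zookeeper"]}
    }
  }
}
# a PATCH upgrade that targets only HIVE would restrict conf-select to
# ["hive", "hive-hcatalog"]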
#repo params
repo_info = config['hostLevelParams']['repoInfo']
service_repo_info = default("/hostLevelParams/service_repo_info", None)

user_to_groups_dict = {}

#Append new user-group mapping to the dict
try:
  user_group_map = ast.literal_eval(config['clusterLevelParams']['user_groups'])
  for key in user_group_map.iterkeys():
    user_to_groups_dict[key] = user_group_map[key]
except ValueError:
  print('User Group mapping (user_groups) is missing in the clusterLevelParams')

user_to_gid_dict = collections.defaultdict(lambda: user_group)

user_list = json.loads(config['clusterLevelParams']['user_list'])
group_list = json.loads(config['clusterLevelParams']['group_list'])
host_sys_prepped = default("/ambariLevelParams/host_sys_prepped", False)

tez_am_view_acls = config['configurations']['tez-site']["tez.am.view-acls"]
override_uid = str(default("/configurations/cluster-env/override_uid", "true")).lower()

# if NN HA on secure cluster, access ZooKeeper securely
if stack_supports_zk_security and dfs_ha_enabled and security_enabled:
  hadoop_zkfc_opts = format(
    "-Dzookeeper.sasl.client=true -Dzookeeper.sasl.client.username=zookeeper "
    "-Djava.security.auth.login.config={hadoop_conf_secure_dir}/hdfs_jaas.conf "
    "-Dzookeeper.sasl.clientconfig=Client")
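# Example of the serialized mapping consumed above: the command ships user_groups
# as a Python-literal string that ast.literal_eval() turns back into a dict.
# The users and groups shown are illustrative.
import ast

user_groups_raw = "{'hdfs': ['hadoop', 'hdfs'], 'ambari-qa': ['hadoop', 'users']}"
user_group_map = ast.literal_eval(user_groups_raw)
print user_group_map['hdfs']  # ['hadoop', 'hdfs']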
def get_packages(scope, service_name = None, component_name = None):
  """
  Gets the packages which should be used with the stack's stack-select tool for the
  specified service/component. Not all services/components are used with the
  stack-select tools, so those will return no packages.

  :param scope: the scope of the command
  :param service_name: the service name, such as ZOOKEEPER
  :param component_name: the component name, such as ZOOKEEPER_SERVER
  :return: the packages to use with stack-select or None
  """
  from resource_management.libraries.functions.default import default

  if scope not in _PACKAGE_SCOPES:
    raise Fail("The specified scope of {0} is not valid".format(scope))

  config = Script.get_config()

  if service_name is None or component_name is None:
    if 'role' not in config or 'serviceName' not in config:
      raise Fail("Both the role and the service name must be included in the command in order to determine which packages to use with the stack-select tool")

    service_name = config['serviceName']
    component_name = config['role']

  stack_name = default("/clusterLevelParams/stack_name", None)
  if stack_name is None:
    raise Fail("The stack name is not present in the command. Packages for stack-select tool cannot be loaded.")

  stack_packages_config = default("/configurations/cluster-env/stack_packages", None)
  if stack_packages_config is None:
    raise Fail("The stack packages are not defined on the command. Unable to load packages for the stack-select tool")

  data = json.loads(stack_packages_config)

  if stack_name not in data:
    raise Fail("Cannot find stack-select packages for the {0} stack".format(stack_name))

  stack_select_key = "stack-select"
  data = data[stack_name]
  if stack_select_key not in data:
    raise Fail("There are no stack-select packages defined for this command for the {0} stack".format(stack_name))

  # this should now be the dictionary of role name to package name
  data = data[stack_select_key]
  service_name = service_name.upper()
  component_name = component_name.upper()

  if service_name not in data:
    Logger.info("Skipping stack-select on {0} because it does not exist in the stack-select package structure.".format(service_name))
    return None

  data = data[service_name]

  if component_name not in data:
    Logger.info("Skipping stack-select on {0} because it does not exist in the stack-select package structure.".format(component_name))
    return None

  # this one scope is not an array, so transform it into one for now so we can
  # use the same code below
  packages = data[component_name][scope]
  if scope == PACKAGE_SCOPE_STACK_SELECT:
    packages = [packages]

  # grab the package name from the JSON and validate it against the packages
  # that the stack-select tool supports - if it doesn't support it, then try
  # to find the legacy package name if it exists
  supported_packages = get_supported_packages()
  for index, package in enumerate(packages):
    if not is_package_supported(package, supported_packages=supported_packages):
      if _PACKAGE_SCOPE_LEGACY in data[component_name]:
        legacy_package = data[component_name][_PACKAGE_SCOPE_LEGACY]
        Logger.info(
          "The package {0} is not supported by this version of the stack-select tool, defaulting to the legacy package of {1}".format(package, legacy_package))

        # use the legacy package
        packages[index] = legacy_package
      else:
        raise Fail("The package {0} is not supported by this version of the stack-select tool.".format(package))

  # transform the array back to a single element
  if scope == PACKAGE_SCOPE_STACK_SELECT:
    packages = packages[0]

  return packages
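# Hypothetical fragment of the "stack-select" section walked by get_packages();
# the stack, service, component, scope and package names are examples only and
# are not taken from a real stack definition.
stack_packages = {
  "HDP": {
    "stack-select": {
      "HADOOP": {
        "NAMENODE": {
          "STACK-SELECT-PACKAGE": "hadoop-hdfs-namenode",
          "INSTALL": ["hadoop-hdfs-namenode"],
          "PATCH": ["hadoop-hdfs-namenode"],
          "LEGACY": "hadoop-namenode"
        }
      }
    }
  }
}
# a call shaped like get_packages(PACKAGE_SCOPE_STACK_SELECT, "HADOOP", "NAMENODE")
# would return "hadoop-hdfs-namenode", falling back to the LEGACY name when the
# installed stack-select tool does not know the newer package.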