Example #1
    def __init__(self, jsonvalue):

        # default to None so inputs of an unsupported type fail the check below
        json_dict = None
        if isinstance(jsonvalue, dict):
            json_dict = jsonvalue
        elif isinstance(jsonvalue, basestring):
            json_dict = json.loads(jsonvalue)

        if json_dict is None:
            raise Fail("Cannot deserialize command repository {0}".format(
                str(jsonvalue)))

        # version_id is the primary id of the repo_version table in the database
        self.version_id = _find_value(json_dict, 'repoVersionId')
        self.stack_name = _find_value(json_dict, 'stackName')
        self.version_string = _find_value(json_dict, 'repoVersion')
        self.repositories = []

        repos_def = _find_value(json_dict, 'repositories')
        if repos_def is not None:
            if not isinstance(repos_def, list):
                repos_def = [repos_def]

            for repo_def in repos_def:
                self.repositories.append(_CommandRepositoryEntry(repo_def))
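A minimal usage sketch for this constructor. The enclosing class name is not shown in the snippet; `_CommandRepository` below is an assumption inferred from the `_CommandRepositoryEntry` entries it builds, and the payload keys simply mirror what the constructor reads:

import json

# hypothetical payload mirroring the keys the constructor reads
payload = {
    "repoVersionId": 1,
    "stackName": "HDP",
    "repoVersion": "2.6.5.0-292",
    "repositories": [{"repoName": "HDP", "baseUrl": "http://repo.example.com/hdp"}]
}

# a dict works directly; a JSON string is parsed with json.loads()
repo = _CommandRepository(json.dumps(payload))
print("%s %s" % (repo.stack_name, repo.version_string))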
Example #2
    def update_ranger_policy(self, policyId, data, usernamepassword):
        """
        :param policyId: id of the policy that needs to be updated
        :param data: policy data to apply
        :param usernamepassword: user credentials used to update the policy
        :return: the response code on success, otherwise None
        """
        try:
            searchRepoURL = self.urlPolicies + "/" + str(policyId)
            base64string = base64.encodestring(
                '{0}'.format(usernamepassword)).replace('\n', '')
            headers = {
                'Accept': 'application/json',
                "Content-Type": "application/json"
            }
            request = urllib2.Request(searchRepoURL, data, headers)
            request.add_header("Authorization",
                               "Basic {0}".format(base64string))
            request.get_method = lambda: 'PUT'
            result = openurl(request, timeout=20)
            response_code = result.getcode()
            response = json.loads(json.JSONEncoder().encode(result.read()))
            if response_code == 200:
                Logger.info('Policy updated successfully')
                return response_code
            else:
                Logger.error('Update policy failed')
                return None
        except urllib2.URLError, e:
            if isinstance(e, urllib2.HTTPError):
                raise Fail(
                    "Error updating policy. Http status code - {0}. \n {1}".
                    format(e.code, e.read()))
            else:
                raise Fail("Error updating policy. Reason - {0}.".format(
                    e.reason))
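A hedged usage sketch. Both the client object and the policy id are hypothetical; the only assumption is that self.urlPolicies points at the Ranger public policy API (e.g. http://ranger-host:6080/service/public/api/policy):

policy_update = json.dumps({"policyName": "hdfs-etc-policy", "isEnabled": True})
# returns 200 on success, None otherwise
if ranger_client.update_ranger_policy(42, policy_update, "admin:admin") == 200:
    Logger.info("Policy 42 is up to date")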
Example #3
def create_repo(url, data, usernamepassword):
  try:
    base_url = url + '/service/public/v2/api/service'
    base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
    headers = {
      'Accept': 'application/json',
      "Content-Type": "application/json"
    }
    request = urllib2.Request(base_url, data, headers)
    request.add_header("Authorization", "Basic {0}".format(base64string))
    result = urllib2.urlopen(request, timeout=20)
    response_code = result.getcode()
    response = json.loads(json.JSONEncoder().encode(result.read()))
    if response_code == 200:
      Logger.info('Repository created Successfully')
      return True
    else:
      Logger.info('Repository not created')
      return False
  except urllib2.URLError, e:
    if isinstance(e, urllib2.HTTPError):
      raise Fail("Error creating service. Http status code - {0}. \n {1}".format(e.code, e.read()))
    else:
      raise Fail("Error creating service. Reason - {0}.".format(e.reason))
Example #4
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  result_code = RESULT_CODE_UNKNOWN

  if configurations is None:
    return (result_code, ['There were no configurations supplied to the script.'])

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'

  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]
    
  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

  if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

  if YARN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[YARN_HTTP_POLICY_KEY]


  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])


  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  label = ''
  url_response = None
  node_healthy = 'false'
  total_time = 0

  # some yarn-site structures don't have the web ui address
  if uri is None:
    if host_name is None:
      host_name = socket.getfqdn()

    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
    
  if OSCheck.is_windows_family():
    uri_host, uri_port = uri.split(':')
    # on Windows 0.0.0.0 is an invalid address to connect to, but on Linux it resolves to 127.0.0.1
    uri_host = resolve_address(uri_host)
    uri = '{0}:{1}'.format(uri_host, uri_port)

  query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      url_response, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
        query, "nm_health_alert", None, False, "NodeManager Health", smokeuser,
        connection_timeout=curl_connection_timeout)

      json_response = json.loads(url_response)
    else:
      # query the NodeManager REST endpoint for its health JSON
      url_response = urllib2.urlopen(query, timeout=connection_timeout)
      json_response = json.loads(url_response.read())
  except urllib2.HTTPError, httpError:
    label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
      str(httpError))

    return (RESULT_CODE_CRITICAL, [label])
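The check above boils down to a single GET against the NodeManager's /ws/v1/node/info endpoint; a standalone sketch of that request on an unsecured cluster (the host name is hypothetical):

import json
import urllib2  # Python 2, matching the snippets in this section

query = "http://nm-host-1:8042/ws/v1/node/info"
url_response = urllib2.urlopen(query, timeout=5.0)
node_info = json.loads(url_response.read())["nodeInfo"]
print("%s: %s" % (node_info.get("nodeHealthy"), node_info.get("healthReport")))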
Example #5
def create_ams_datasource():
  import params
  server = Server(protocol = params.ams_grafana_protocol.strip(),
                  host = params.ams_grafana_host.strip(),
                  port = params.ams_grafana_port,
                  user = params.ams_grafana_admin_user,
                  password = params.ams_grafana_admin_pwd)

  """
  Create AMS datasource in Grafana, if exsists make sure the collector url is accurate
  """
  ams_datasource_json = Template('metrics_grafana_datasource.json.j2',
                                 ams_datasource_name=METRICS_GRAFANA_DATASOURCE_NAME).get_content()

  Logger.info("Checking if AMS Grafana datasource already exists")


  response = perform_grafana_get_call(GRAFANA_DATASOURCE_URL, server)
  create_datasource = True

  if response and response.status == 200:
    datasources = response.read()
    datasources_json = json.loads(datasources)
    for i in xrange(0, len(datasources_json)):
      datasource_name = datasources_json[i]["name"]
      if datasource_name == METRICS_GRAFANA_DATASOURCE_NAME:
        create_datasource = False # datasource already exists
        Logger.info("Ambari Metrics Grafana datasource already present. Checking Metrics Collector URL")
        datasource_url = datasources_json[i]["url"]

        if is_unchanged_datasource_url(datasource_url):
          Logger.info("Metrics Collector URL validation succeeded.")
          return
        else: # Metrics datasource present, but collector host is wrong.
          datasource_id = datasources_json[i]["id"]
          Logger.info("Metrics Collector URL validation failed. Updating "
                      "datasource, id = %s" % datasource_id)

          (response, data) = perform_grafana_put_call(GRAFANA_DATASOURCE_URL, datasource_id,
                                                      ams_datasource_json, server)

          if response.status == 200:
            Logger.info("Ambari Metrics Grafana data source updated.")

          elif response.status == 500:
            Logger.info("Ambari Metrics Grafana data source update failed. Not retrying.")
            raise Fail("Ambari Metrics Grafana data source update failed. PUT request status: %s %s \n%s" %
                       (response.status, response.reason, data))
          else:
            raise Fail("Ambari Metrics Grafana data source creation failed. "
                       "PUT request status: %s %s \n%s" % (response.status, response.reason, data))
        pass
      pass
    pass
  else:
    Logger.info("Error checking for Ambari Metrics Grafana datasource. Will attempt to create.")

  if not create_datasource:
    return
  else:
    Logger.info("Generating datasource:\n%s" % ams_datasource_json)

    (response, data) = perform_grafana_post_call(GRAFANA_DATASOURCE_URL, ams_datasource_json, server)

    if response.status == 200:
      Logger.info("Ambari Metrics Grafana data source created.")
    elif response.status == 500:
      Logger.info("Ambari Metrics Grafana data source creation failed. Not retrying.")
      raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                 (response.status, response.reason, data))
    else:
      Logger.info("Ambari Metrics Grafana data source creation failed.")
      raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                 (response.status, response.reason, data))
  pass
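perform_grafana_get_call and its PUT/POST siblings are defined elsewhere in the service scripts; a plausible minimal sketch of the GET helper under that assumption, using httplib and the basic-auth scheme the Server fields imply (it assumes Server exposes its constructor arguments as attributes):

import base64
import httplib  # Python 2

def grafana_get(url, server):
    # server carries protocol/host/port/user/password as constructed above
    if server.protocol == 'https':
        conn = httplib.HTTPSConnection(server.host, int(server.port))
    else:
        conn = httplib.HTTPConnection(server.host, int(server.port))
    auth = base64.encodestring('%s:%s' % (server.user, server.password)).replace('\n', '')
    conn.request("GET", url, headers={"Authorization": "Basic %s" % auth})
    return conn.getresponse()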
Example #6
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    uri = None
    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    location_quota = LOCATION_QUOTA_DEFAULT
    if LOCATION_QUOTA_KEY in parameters:
        location_quota = str(parameters[LOCATION_QUOTA_KEY])

    quota_warning = QUOTA_WARN_DEFAULT
    if QUOTA_WARN_KEY in parameters:
        quota_warning = float(parameters[QUOTA_WARN_KEY])

    quota_critical = QUOTA_CRIT_DEFAULT
    if QUOTA_CRIT_KEY in parameters:
        quota_critical = float(parameters[QUOTA_CRIT_KEY])

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                    DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # determine the right URI and whether to use SSL
    hdfs_site = configurations[HDFS_SITE_KEY]

    scheme = "https" if http_policy == "HTTPS_ONLY" else "http"

    nn_addresses = get_all_namenode_addresses(hdfs_site)
    for nn_address in nn_addresses:
        if nn_address.startswith(host_name + ":"):
            uri = nn_address
            break
    if not uri:
        return (RESULT_STATE_SKIPPED, [
            'NameNode on host {0} not found (namenode addresses = {1})'.format(
                host_name, ', '.join(nn_addresses))
        ])

    current_time = int(round(time.time() * 1000))

    critical = []
    warning = []
    ok = []
    for location in location_quota.split(','):

        all_users_qry = "{0}://{1}/webhdfs/v1".format(
            scheme, uri) + location + "?op=LISTSTATUS"

        # start out assuming an OK status
        label = None
        result_code = "OK"

        try:
            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            all_users_response, error_msg, time_millis = curl_krb_request(
                "/tmp",
                kerberos_keytab,
                kerberos_principal,
                all_users_qry,
                "hdfs_space_quota_alert",
                executable_paths,
                False,
                "HDFS Space Quota",
                smokeuser,
                connection_timeout=curl_connection_timeout,
                kinit_timer_ms=kinit_timer_ms)

            # if path does not exist then error
            if "FileNotFoundException" in all_users_response:
                return (RESULT_STATE_UNKNOWN,
                        ['Path {p} does not exist'.format(p=location)])

            all_users_response_json = json.loads(all_users_response)

            # if namenode is not active then skip
            if 'FileStatuses' not in all_users_response_json:
                return (RESULT_STATE_SKIPPED, ['NameNode is not active'])

            subdirectories = []
            for filestatus in all_users_response_json['FileStatuses'][
                    'FileStatus']:
                subdirectories.append(filestatus.get("pathSuffix"))

            for subdirectory in subdirectories:

                current_quota_qry = "{0}://{1}/webhdfs/v1".format(
                    scheme, uri
                ) + location + "/" + subdirectory + "?op=GETCONTENTSUMMARY"
                current_quota_response, error_msg, time_millis = curl_krb_request(
                    "/tmp",
                    kerberos_keytab,
                    kerberos_principal,
                    current_quota_qry,
                    "hdfs_space_quota_alert",
                    executable_paths,
                    False,
                    "HDFS Space Quota",
                    smokeuser,
                    connection_timeout=curl_connection_timeout,
                    kinit_timer_ms=kinit_timer_ms)

                current_quota_response_json = json.loads(
                    current_quota_response)
                result_in_percent = int(
                    float(current_quota_response_json["ContentSummary"]
                          ["spaceConsumed"]) /
                    float(current_quota_response_json["ContentSummary"]
                          ["spaceQuota"]) * 100)

                if (result_in_percent >= int(quota_critical)):
                    critical.append(location + "/" + subdirectory)
                elif (result_in_percent >= int(quota_warning)):
                    warning.append(location + "/" + subdirectory)
                else:
                    ok.append(location + "/" + subdirectory)

        except:
            # fail fast: otherwise the summary built below would report OK
            # even though the quota query itself failed
            label = traceback.format_exc()
            return ('UNKNOWN', [label])

    if len(critical) > 0:
        result_code = 'CRITICAL'
        criticaldirectories = ",".join([str(x) for x in critical])
        warningdirectories = ",".join([str(x) for x in warning])
        if len(warning) > 0:
            label = 'The following directories are beyond the space quota CRITICAL threshold of {c}%: "{d}" \n' \
                'The following directories are beyond the space quota WARNING threshold of {w}%: "{r}"'.format(
                    c=quota_critical, w=quota_warning, d=criticaldirectories, r=warningdirectories)
        else:
            label = 'The following directories are beyond the space quota CRITICAL threshold of {c}%: "{d}"'.format(
                c=quota_critical, d=criticaldirectories)
    elif len(warning) > 0:
        result_code = 'WARNING'
        warningdirectories = ",".join([str(x) for x in warning])
        label = 'The following directories are beyond the space quota WARNING threshold of {w}%: "{r}"'.format(
            w=quota_warning, r=warningdirectories)
    else:
        result_code = "OK"
        label = 'All top-level subdirectories "{l}" are within configured quota capacity threshold'.format(
            l=location_quota)

    return ((result_code, [label]))
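The percentage math above reads two fields from the WebHDFS GETCONTENTSUMMARY response; a worked example with round numbers:

import json

current_quota_response = '{"ContentSummary": {"spaceConsumed": 805306368, "spaceQuota": 1073741824}}'
summary = json.loads(current_quota_response)["ContentSummary"]
result_in_percent = int(float(summary["spaceConsumed"]) / float(summary["spaceQuota"]) * 100)
print(result_in_percent)  # 75, i.e. 768 MB consumed of a 1 GB quota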
Example #7
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

    if configurations is None:
        return (RESULT_CODE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    if SOLR_PORT in configurations:
        solr_port = configurations[SOLR_PORT]
    else:
        return (RESULT_CODE_UNKNOWN, ['No Solr port specified'])

    # parse script arguments
    solr_memory_usage_warning = SOLR_MEMORY_USAGE_WARNING_DEFAULT
    if SOLR_MEMORY_USAGE_WARNING_KEY in parameters:
        solr_memory_usage_warning = float(
            parameters[SOLR_MEMORY_USAGE_WARNING_KEY])

    solr_memory_usage_critical = SOLR_MEMORY_USAGE_CRITICAL_DEFAULT
    if SOLR_MEMORY_USAGE_CRITICAL_KEY in parameters:
        solr_memory_usage_critical = float(
            parameters[SOLR_MEMORY_USAGE_CRITICAL_KEY])

    try:
        query = "http://localhost:" + str(
            solr_port
        ) + "/solr/admin/cores?action=STATUS&indexInfo=false&wt=json"
        shard_response = urllib2.urlopen(query)
        shard_raw_data = shard_response.read()
        shard_json_data = json.loads(shard_raw_data)

        shard_name = shard_json_data["status"].keys()[0]
        query = "http://localhost:" + str(
            solr_port) + "/solr/" + shard_name + "/admin/system?wt=json"
        shard_details_response = urllib2.urlopen(query)
        shard_details_raw_data = shard_details_response.read()
        shard_details_json_data = json.loads(shard_details_raw_data)
        memory_percent = shard_details_json_data["jvm"]["memory"]["raw"][
            "used%"]
    except:
        label = CRITICAL_CONNECTION_MESSAGE.format(query,
                                                   traceback.format_exc())
        return (RESULT_CODE_CRITICAL, [label])

    memory_load = memory_percent / 100.0
    label = MESSAGE.format(memory_load)
    if memory_percent <= solr_memory_usage_warning:
        result_code = RESULT_CODE_OK
    elif memory_percent <= solr_memory_usage_critical:
        result_code = RESULT_CODE_WARNING
    else:
        result_code = RESULT_CODE_CRITICAL

    return (result_code, [label])
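The comparison chain maps the JVM used-heap percentage onto the three alert states; a quick illustration, assuming a warning threshold of 50 and a critical threshold of 80:

for memory_percent in (35.2, 61.0, 92.5):
    if memory_percent <= 50:      # solr_memory_usage_warning
        print("%s -> OK" % memory_percent)
    elif memory_percent <= 80:    # solr_memory_usage_critical
        print("%s -> WARNING" % memory_percent)
    else:
        print("%s -> CRITICAL" % memory_percent)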
Example #8
def create_ams_datasource():
  import params
  server = Server(protocol = params.ams_grafana_protocol.strip(),
                  host = params.ams_grafana_host.strip(),
                  port = params.ams_grafana_port,
                  user = params.ams_grafana_admin_user,
                  password = params.ams_grafana_admin_pwd)

  """
  Create AMS datasource in Grafana, if exsists make sure the collector url is accurate
  """
  Logger.info("Trying to find working metric collector")
  results = execute_in_parallel(do_ams_collector_post, params.ams_collector_hosts.split(','), params)
  new_datasource_host = ""

  for host in params.ams_collector_hosts.split(','):
    if host in results:
      if results[host].status == SUCCESS:
        new_datasource_host = host
        Logger.info("Found working collector on host %s" % new_datasource_host)
        break
      else:
        Logger.warning(results[host].result)

  if new_datasource_host == "":
    Logger.warning("All metric collectors are unavailable. Will use random collector as datasource host.")
    new_datasource_host = params.metric_collector_host

  Logger.info("New datasource host will be %s" % new_datasource_host)

  ams_datasource_json = Template('metrics_grafana_datasource.json.j2',
                                 ams_datasource_name=METRICS_GRAFANA_DATASOURCE_NAME,
                                 ams_datasource_host=new_datasource_host).get_content()
  Logger.info("Checking if AMS Grafana datasource already exists")

  response = perform_grafana_get_call(GRAFANA_DATASOURCE_URL, server)
  create_datasource = True

  if response and response.status == 200:
    datasources = response.read()
    datasources_json = json.loads(datasources)
    for i in xrange(0, len(datasources_json)):
      datasource_name = datasources_json[i]["name"]
      if datasource_name == METRICS_GRAFANA_DATASOURCE_NAME:
        create_datasource = False # datasource already exists
        Logger.info("Ambari Metrics Grafana datasource already present. Checking Metrics Collector URL")
        datasource_url = datasources_json[i]["url"]

        update_datasource = False
        if is_unchanged_datasource_url(datasource_url, new_datasource_host):
          Logger.info("Metrics Collector URL validation succeeded.")
        else:
          Logger.info("Metrics Collector URL validation failed.")
          update_datasource = True

        datasource_type = datasources_json[i]["type"]
        new_datasource_def = json.loads(ams_datasource_json)
        new_datasource_type = new_datasource_def["type"]

        if datasource_type == new_datasource_type:
          Logger.info("Grafana datasource type validation succeeded.")
        else:
          Logger.info("Grafana datasource type validation failed. Old type = %s, New type = %s" % (datasource_type, new_datasource_type))
          update_datasource = True

        if update_datasource: # Metrics datasource present, but collector host is wrong or the datasource type is outdated.
          datasource_id = datasources_json[i]["id"]
          Logger.info("Updating datasource, id = %s" % datasource_id)

          (response, data) = perform_grafana_put_call(GRAFANA_DATASOURCE_URL, datasource_id,
                                                      ams_datasource_json, server)

          if response.status == 200:
            Logger.info("Ambari Metrics Grafana data source updated.")

          elif response.status == 500:
            Logger.info("Ambari Metrics Grafana data source update failed. Not retrying.")
            raise Fail("Ambari Metrics Grafana data source update failed. PUT request status: %s %s \n%s" %
                       (response.status, response.reason, data))
          else:
            raise Fail("Ambari Metrics Grafana data source creation failed. "
                       "PUT request status: %s %s \n%s" % (response.status, response.reason, data))
        pass
      pass
    pass
  else:
    Logger.info("Error checking for Ambari Metrics Grafana datasource. Will attempt to create.")

  if not create_datasource:
    return
  else:
    Logger.info("Generating datasource:\n%s" % ams_datasource_json)

    (response, data) = perform_grafana_post_call(GRAFANA_DATASOURCE_URL, ams_datasource_json, server)

    if response.status == 200:
      Logger.info("Ambari Metrics Grafana data source created.")
    elif response.status == 500:
      Logger.info("Ambari Metrics Grafana data source creation failed. Not retrying.")
      raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                 (response.status, response.reason, data))
    else:
      Logger.info("Ambari Metrics Grafana data source creation failed.")
      raise Fail("Ambari Metrics Grafana data source creation failed. POST request status: %s %s \n%s" %
                 (response.status, response.reason, data))
  pass
Example #9
    def service_check_for_single_host(self, metric_collector_host, params):
        random_value1 = random.random()
        headers = {"Content-type": "application/json"}
        ca_certs = os.path.join(params.ams_monitor_conf_dir,
                                params.metric_truststore_ca_certs)

        current_time = int(time.time()) * 1000
        metric_json = Template('smoketest_metrics.json.j2',
                               hostname=params.hostname,
                               random1=random_value1,
                               current_time=current_time).get_content()
        try:
            post_metrics_to_collector(
                self.AMS_METRICS_POST_URL, metric_collector_host,
                params.metric_collector_port,
                params.metric_collector_https_enabled, metric_json, headers,
                ca_certs, self.AMS_CONNECT_TRIES, self.AMS_CONNECT_TIMEOUT)

            get_metrics_parameters = {
                "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
                "appId": "amssmoketestfake",
                "hostname": params.hostname,
                "startTime": current_time - 60000,
                "endTime": current_time + 61000,
                "precision": "seconds",
                "grouped": "false",
            }
            encoded_get_metrics_parameters = urllib.urlencode(
                get_metrics_parameters)

            Logger.info(
                "Connecting (GET) to %s:%s%s" %
                (metric_collector_host, params.metric_collector_port,
                 self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters))
            for i in xrange(0, self.AMS_READ_TRIES):
                conn = network.get_http_connection(
                    metric_collector_host,
                    int(params.metric_collector_port),
                    params.metric_collector_https_enabled,
                    ca_certs,
                    ssl_version=Script.get_force_https_protocol_value())
                conn.request(
                    "GET",
                    self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
                response = conn.getresponse()
                Logger.info(
                    "Http response for host %s : %s %s" %
                    (metric_collector_host, response.status, response.reason))

                data = response.read()
                Logger.info("Http data: %s" % data)
                conn.close()

                if response.status == 200:
                    Logger.info("Metrics were retrieved from host %s" %
                                metric_collector_host)
                else:
                    raise Fail(
                        "Metrics were not retrieved from host %s. GET request status: %s %s \n%s"
                        % (metric_collector_host, response.status,
                           response.reason, data))
                data_json = json.loads(data)

                def floats_eq(f1, f2, delta):
                    return abs(f1 - f2) < delta

                values_are_present = False
                for metrics_data in data_json["metrics"]:
                    if (str(current_time) in metrics_data["metrics"] and
                            str(current_time + 1000) in metrics_data["metrics"]
                            and floats_eq(
                                metrics_data["metrics"][str(current_time)],
                                random_value1, 0.0000001) and floats_eq(
                                    metrics_data["metrics"][str(current_time +
                                                                1000)],
                                    current_time, 1)):
                        Logger.info(
                            "Values %s and %s were found in the response from host %s."
                            % (random_value1, current_time,
                               metric_collector_host))
                        values_are_present = True
                        break

                if not values_are_present:
                    if i < self.AMS_READ_TRIES - 1:  #range/xrange returns items from start to end-1
                        Logger.info(
                            "Values weren't stored yet. Retrying in %s seconds."
                            % (self.AMS_READ_TIMEOUT))
                        time.sleep(self.AMS_READ_TIMEOUT)
                    else:
                        raise Fail(
                            "Values %s and %s were not found in the response."
                            % (random_value1, current_time))
                else:
                    break
        except Fail as ex:
            Logger.warning(
                "Ambari Metrics service check failed on collector host %s. Reason : %s"
                % (metric_collector_host, str(ex)))
            raise Fail(
                "Ambari Metrics service check failed on collector host %s. Reason : %s"
                % (metric_collector_host, str(ex)))
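The GET phase above looks for two specific datapoints, so the smoketest_metrics.json.j2 template presumably renders something close to the following AMS timeline payload (a sketch reusing the names from the method above, not the actual template):

metric_json = json.dumps({
    "metrics": [{
        "metricname": "AMBARI_METRICS.SmokeTest.FakeMetric",
        "appid": "amssmoketestfake",
        "hostname": params.hostname,
        "starttime": current_time,
        "metrics": {
            str(current_time): random_value1,       # checked with floats_eq(..., 0.0000001)
            str(current_time + 1000): current_time  # checked with floats_eq(..., 1)
        }
    }]
})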
Example #10
    def service_check(self, env):
        import params
        env.set_params(params)

        params.HdfsResource(
            format("/user/{smokeuser}"),
            type="directory",
            action="create_on_execute",
            owner=params.smokeuser,
            mode=params.smoke_hdfs_user_mode,
        )

        path_to_distributed_shell_jar = params.install_dir + "/share/hadoop/yarn/hadoop-yarn-applications-distributedshell*.jar"

        yarn_distrubuted_shell_check_params = [
            "yarn org.apache.hadoop.yarn.applications.distributedshell.Client",
            "-shell_command", "ls", "-num_containers", "{number_of_nm}",
            "-jar", "{path_to_distributed_shell_jar}", "-timeout", "300000",
            "--queue", "{service_check_queue_name}"
        ]
        yarn_distrubuted_shell_check_cmd = format(
            " ".join(yarn_distrubuted_shell_check_params))

        if params.security_enabled:
            kinit_cmd = format(
                "{kinit_path_local} -kt {smoke_user_keytab} {smokeuser_principal};"
            )
            smoke_cmd = format(
                "{kinit_cmd} {yarn_distrubuted_shell_check_cmd}")
        else:
            smoke_cmd = yarn_distrubuted_shell_check_cmd

        return_code, out = shell.checked_call(
            smoke_cmd,
            path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
            user=params.smokeuser,
        )

        m = re.search(r"appTrackingUrl=(.*),\s", out)
        app_url = m.group(1)

        splitted_app_url = str(app_url).split('/')

        application_name = None
        for item in splitted_app_url:
            if "application" in item:
                application_name = item

        if application_name is None:
            raise Fail("Could not find an application id in " + str(app_url))

        # Find out the active RM from RM list
        # Raise an exception if the active rm cannot be determined
        active_rm_webapp_address = self.get_active_rm_webapp_address()
        Logger.info("Active Resource Manager web app address is : " +
                    active_rm_webapp_address)

        # Verify job state from active resource manager via rest api
        info_app_url = params.scheme + "://" + active_rm_webapp_address + "/ws/v1/cluster/apps/" + application_name
        get_app_info_cmd = "curl --negotiate -u : -ks --location-trusted --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url

        return_code, stdout, _ = get_user_call_output(
            get_app_info_cmd,
            user=params.smokeuser,
            path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
        )

        try:
            json_response = json.loads(stdout)
        except Exception as e:
            raise Fail(
                format(
                    "Response from YARN API was not a valid JSON. Response: {stdout}"
                ))

        if json_response is None or 'app' not in json_response or \
                'state' not in json_response['app'] or 'finalStatus' not in json_response['app']:
            raise Fail("Application " + app_url + " returns invalid data.")

        if json_response['app']['state'] != "FINISHED" or json_response['app'][
                'finalStatus'] != "SUCCEEDED":
            raise Fail(
                "Application " + app_url +
                " state/status is not valid. Should be FINISHED/SUCCEEDED.")
Example #11
    def actionexecute(self, env):
        num_errors = 0

        # Parse parameters
        config = Script.get_config()

        repo_rhel_suse = config['configurations']['cluster-env'][
            'repo_suse_rhel_template']
        repo_ubuntu = config['configurations']['cluster-env'][
            'repo_ubuntu_template']
        template = repo_rhel_suse if OSCheck.is_redhat_family() or \
            OSCheck.is_suse_family() else repo_ubuntu

        # Handle a SIGTERM and SIGINT gracefully
        signal.signal(signal.SIGTERM, self.abort_handler)
        signal.signal(signal.SIGINT, self.abort_handler)

        # Select dict that contains parameters
        try:
            self.repository_version = config['roleParams'][
                'repository_version']
            base_urls = json.loads(config['roleParams']['base_urls'])
            package_list = json.loads(config['roleParams']['package_list'])
            stack_id = config['roleParams']['stack_id']
        except KeyError:
            # Last try
            self.repository_version = config['commandParams'][
                'repository_version']
            base_urls = json.loads(config['commandParams']['base_urls'])
            package_list = json.loads(config['commandParams']['package_list'])
            stack_id = config['commandParams']['stack_id']

        # current stack information
        self.current_hdp_stack_version = None
        if 'stack_version' in config['hostLevelParams']:
            current_stack_version_unformatted = str(
                config['hostLevelParams']['stack_version'])
            self.current_hdp_stack_version = format_hdp_stack_version(
                current_stack_version_unformatted)

        stack_name = None
        self.stack_root_folder = None
        if stack_id and "-" in stack_id:
            stack_split = stack_id.split("-")
            if len(stack_split) == 2:
                stack_name = stack_split[0].upper()
                if stack_name in self.STACK_TO_ROOT_FOLDER:
                    self.stack_root_folder = self.STACK_TO_ROOT_FOLDER[
                        stack_name]
        if self.stack_root_folder is None:
            raise Fail(
                "Cannot determine the stack's root directory by parsing the stack_id property, {0}"
                .format(str(stack_id)))
        if self.repository_version is None:
            raise Fail("Cannot determine the repository version to install")

        self.repository_version = self.repository_version.strip()

        # Install/update repositories
        installed_repositories = []
        self.current_repositories = []
        self.current_repo_files = set()

        # Enable base system repositories
        # We don't need that for RHEL family, because we leave all repos enabled
        # except disabled HDP* ones
        if OSCheck.is_suse_family():
            self.current_repositories.append('base')
        elif OSCheck.is_ubuntu_family():
            self.current_repo_files.add('base')

        Logger.info("Will install packages for repository version {0}".format(
            self.repository_version))
        try:
            append_to_file = False
            for url_info in base_urls:
                repo_name, repo_file = self.install_repository(
                    url_info, append_to_file, template)
                self.current_repositories.append(repo_name)
                self.current_repo_files.add(repo_file)
                append_to_file = True

            installed_repositories = list_ambari_managed_repos()
        except Exception, err:
            Logger.logger.exception(
                "Cannot distribute repositories. Error: {0}".format(str(err)))
            num_errors += 1
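repo_suse_rhel_template comes from cluster-env; its usual shape is a yum .repo stanza with Jinja placeholders along these lines (illustrative, not the exact cluster default):

repo_rhel_suse_example = (
    "[{{repo_id}}]\n"
    "name={{repo_id}}\n"
    "{% if mirror_list %}mirrorlist={{mirror_list}}"
    "{% else %}baseurl={{base_url}}{% endif %}\n"
    "path=/\n"
    "enabled=1\n"
    "gpgcheck=0"
)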
Example #12
    def service_check(self, env):
        import params

        Logger.info("Ambari Metrics service check was started.")
        env.set_params(params)

        random_value1 = random.random()
        headers = {"Content-type": "application/json"}

        for i in xrange(0, self.AMS_CONNECT_TRIES):
            try:
                current_time = int(time.time()) * 1000
                metric_json = Template(
                    'smoketest_metrics.json.j2',
                    hostname=params.hostname,
                    random1=random_value1,
                    current_time=current_time).get_content()
                Logger.info("Generated metrics:\n%s" % metric_json)

                Logger.info(
                    "Connecting (POST) to %s:%s%s" %
                    (params.metric_collector_host,
                     params.metric_collector_port, self.AMS_METRICS_POST_URL))
                conn = self.get_http_connection(
                    params.metric_collector_host,
                    int(params.metric_collector_port),
                    params.metric_collector_https_enabled)
                conn.request("POST", self.AMS_METRICS_POST_URL, metric_json,
                             headers)

                response = conn.getresponse()
                Logger.info("Http response: %s %s" %
                            (response.status, response.reason))
            except (httplib.HTTPException, socket.error) as ex:
                if i < self.AMS_CONNECT_TRIES - 1:  #range/xrange returns items from start to end-1
                    time.sleep(self.AMS_CONNECT_TIMEOUT)
                    Logger.info(
                        "Connection failed. Next retry in %s seconds." %
                        (self.AMS_CONNECT_TIMEOUT))
                    continue
                else:
                    raise Fail(
                        "Metrics were not saved. Service check has failed. "
                        "\nConnection failed.")

            data = response.read()
            Logger.info("Http data: %s" % data)
            conn.close()

            if response.status == 200:
                Logger.info("Metrics were saved.")
                break
            else:
                Logger.info(
                    "Metrics were not saved. Service check has failed.")
                if i < self.AMS_CONNECT_TRIES - 1:  #range/xrange returns items from start to end-1
                    time.sleep(self.AMS_CONNECT_TIMEOUT)
                    Logger.info("Next retry in %s seconds." %
                                (self.AMS_CONNECT_TIMEOUT))
                else:
                    raise Fail(
                        "Metrics were not saved. Service check has failed. POST request status: %s %s \n%s"
                        % (response.status, response.reason, data))

        get_metrics_parameters = {
            "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
            "appId": "amssmoketestfake",
            "hostname": params.hostname,
            "startTime": current_time - 60000,
            "endTime": current_time + 61000,
            "precision": "seconds",
            "grouped": "false",
        }
        encoded_get_metrics_parameters = urllib.urlencode(
            get_metrics_parameters)

        Logger.info(
            "Connecting (GET) to %s:%s%s" %
            (params.metric_collector_host, params.metric_collector_port,
             self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters))

        conn = self.get_http_connection(params.metric_collector_host,
                                        int(params.metric_collector_port),
                                        params.metric_collector_https_enabled)
        conn.request("GET",
                     self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        Logger.info("Http response: %s %s" %
                    (response.status, response.reason))

        data = response.read()
        Logger.info("Http data: %s" % data)
        conn.close()

        if response.status == 200:
            Logger.info("Metrics were retrieved.")
        else:
            Logger.info(
                "Metrics were not retrieved. Service check has failed.")
            raise Fail(
                "Metrics were not retrieved. Service check has failed. GET request status: %s %s \n%s"
                % (response.status, response.reason, data))
        data_json = json.loads(data)

        def floats_eq(f1, f2, delta):
            return abs(f1 - f2) < delta

        for metrics_data in data_json["metrics"]:
            if (str(current_time) in metrics_data["metrics"]
                    and str(current_time + 1000) in metrics_data["metrics"]
                    and floats_eq(metrics_data["metrics"][str(current_time)],
                                  random_value1, 0.0000001)
                    and floats_eq(
                        metrics_data["metrics"][str(current_time + 1000)],
                        current_time, 1)):
                Logger.info("Values %s and %s were found in the response." %
                            (random_value1, current_time))
                break
            pass
        else:
            Logger.info("Values %s and %s were not found in the response." %
                        (random_value1, current_time))
            raise Fail("Values %s and %s were not found in the response." %
                       (random_value1, current_time))

        Logger.info("Ambari Metrics service check is finished.")
Example #13
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if not NAMESERVICE_KEY in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
        dfs_policy = configurations[DFS_POLICY_KEY]
        if dfs_policy == "HTTPS_ONLY":
            is_ssl_enabled = True

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if not nn_unique_ids_key in hdfs_site:
        return (RESULT_STATE_UNKNOWN, [
            'Unable to find unique namenode alias key {0}'.format(
                nn_unique_ids_key)
        ])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=NameNodeStatus"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        if key in hdfs_site:
            # use str() to ensure that unicode strings do not have the u' in them
            value = str(hdfs_site[key])

            try:
                jmx_uri = jmx_uri_fragment.format(value)
                if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                    env = Environment.get_instance()
                    state_response, error_msg, time_millis = curl_krb_request(
                        env.tmp_dir, kerberos_keytab, kerberos_principal,
                        jmx_uri, "ha_nn_health", None, False,
                        "NameNode High Availability Health")
                    state_response_json = json.loads(state_response)
                    state = state_response_json["beans"][0]['State']
                else:
                    state = get_value_from_jmx(jmx_uri, 'State',
                                               connection_timeout)

                if state == HDFS_NN_STATE_ACTIVE:
                    active_namenodes.append(value)
                elif state == HDFS_NN_STATE_STANDBY:
                    standby_namenodes.append(value)
                else:
                    unknown_namenodes.append(value)
            except:
                unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = False
    for active_namenode in active_namenodes:
        if active_namenode.startswith(host_name):
            is_active_namenode = True

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(
        standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Report the alert if not the first named host, but the other host
    #   could not report its status
    if is_topology_healthy:
        if is_active_namenode is True:
            return (RESULT_STATE_OK, [result_label])
        else:
            return (RESULT_STATE_SKIPPED,
                    ['Another host will report this alert'])
    else:
        # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
        first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
            name_service, nn_unique_ids[0])

        first_listed_host = ''
        if first_listed_host_key in hdfs_site:
            first_listed_host = hdfs_site[first_listed_host_key]

        is_first_listed_host = False
        if first_listed_host.startswith(host_name):
            is_first_listed_host = True

        if is_first_listed_host:
            return (RESULT_STATE_CRITICAL, [result_label])
        else:
            # not the first listed host, but the first host might be in the unknown
            return (RESULT_STATE_SKIPPED,
                    ['Another host will report this alert'])
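The health probe above reads the State attribute from the NameNodeStatus JMX bean; the relevant slice of a response looks like this, which is why beans[0]['State'] can be compared against the active/standby constants:

import json

state_response = '{"beans": [{"name": "Hadoop:service=NameNode,name=NameNodeStatus", "State": "active"}]}'
state = json.loads(state_response)["beans"][0]["State"]
print(state)  # active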
Example #14
    def actionexecute(self, env):
        resolve_ambari_config()

        # Parse parameters from command json file.
        config = Script.get_config()

        host_name = socket.gethostname()
        version = default('/roleParams/version', None)

        # These 2 variables are optional
        service_package_folder = default(
            '/commandParams/service_package_folder', None)
        if service_package_folder is None:
            service_package_folder = default(
                '/serviceLevelParams/service_package_folder', None)
        hooks_folder = default('/commandParams/hooks_folder', None)

        tasks = json.loads(config['roleParams']['tasks'])
        if tasks:
            for t in tasks:
                task = ExecuteTask(t)
                Logger.info(str(task))

                # If a (script, function) exists, it overwrites the command.
                if task.script and task.function:
                    file_cache = FileCache(agent_config)

                    if service_package_folder and hooks_folder:
                        command_paths = {
                            "commandParams": {
                                "service_package_folder":
                                service_package_folder,
                            },
                            "clusterLevelParams": {
                                "hooks_folder": hooks_folder
                            },
                            "ambariLevelParams": {
                                "jdk_location":
                                default('/ambariLevelParams/jdk_location', "")
                            }
                        }

                        base_dir = file_cache.get_service_base_dir(
                            command_paths)
                    else:
                        base_dir = file_cache.get_custom_actions_base_dir({
                            "ambariLevelParams": {
                                "jdk_location":
                                default('/ambariLevelParams/jdk_location', "")
                            }
                        })

                    script_path = os.path.join(base_dir, task.script)
                    if not os.path.exists(script_path):
                        message = "Script %s does not exist" % str(script_path)
                        raise Fail(message)

                    # Notice that the script_path is now the fully qualified path, and the
                    # same command-#.json file is used.
                    # Also, the python wrapper is used, since it sets up the correct environment variables
                    command_params = [
                        "/usr/bin/ambari-python-wrap", script_path,
                        task.function, self.command_data_file, self.basedir,
                        self.stroutfile, self.logging_level,
                        Script.get_tmp_dir()
                    ]

                    task.command = "source /var/lib/ambari-agent/ambari-env.sh ; " + " ".join(
                        command_params)
                    # Replace redundant whitespace to make the unit tests easier to validate
                    task.command = re.sub(r"\s+", " ", task.command).strip()

                if task.command:
                    task.command = replace_variables(task.command, host_name,
                                                     version)
                    shell.checked_call(task.command,
                                       logoutput=True,
                                       quiet=True)
Example #15
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    uri = str(host_name) + ":" + uri.split(":")[1]
    live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(
        scheme, uri)
    convert_to_json_failed = False
    response_code = None
    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()
            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                live_nodemanagers_qry, "nm_health_summary_alert", None, False,
                "NodeManager Health Summary", smokeuser)
            try:
                url_response_json = json.loads(url_response)
                live_nodemanagers = json.loads(
                    url_response_json["beans"][0]["LiveNodeManagers"])
            except ValueError, error:
                convert_to_json_failed = True
                if logger.isEnabledFor(logging.DEBUG):
                    logger.exception(
                        "[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}"
                        .format("NodeManager Health Summary", str(error)))

            if convert_to_json_failed:
                response_code, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal,
                    live_nodemanagers_qry, "nm_health_summary_alert", None,
                    True, "NodeManager Health Summary", smokeuser)
        else:
Example #16
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))
  
  uri = None
  scheme = 'http'  
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'
  checkpoint_tx = CHECKPOINT_TX_DEFAULT
  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT
  
  if NN_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NN_HTTP_ADDRESS_KEY]

  if NN_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  if NN_CHECKPOINT_TX_KEY in configurations:
    checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

  if NN_CHECKPOINT_PERIOD_KEY in configurations:
    checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]
    
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  percent_warning = PERCENT_WARNING_DEFAULT
  if PERCENT_WARNING_KEY in parameters:
    percent_warning = float(parameters[PERCENT_WARNING_KEY])

  percent_critical = PERCENT_CRITICAL_DEFAULT
  if PERCENT_CRITICAL_KEY in parameters:
    percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

  checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
    checkpoint_txn_multiplier_warning = float(parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

  checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
    checkpoint_txn_multiplier_critical = float(parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'
    
    if https_uri is not None:
      uri = https_uri 
  
  current_time = int(round(time.time() * 1000))

  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri)
  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, last_checkpoint_time_qry,"checkpoint_time_alert", executable_paths, False,
        "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, journal_transaction_info_qry,"checkpoint_time_alert", executable_paths,
        False, "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
    else:
      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
        "LastCheckpointTime", connection_timeout))

      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
        "JournalTransactionInfo", connection_timeout)

    journal_transaction_info_dict = json.loads(journal_transaction_info)
  
    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
    transaction_difference = last_tx - most_recent_tx
    
    delta = (current_time - last_checkpoint_time)/1000

    label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference)

    is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(checkpoint_tx)
    is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(checkpoint_tx)

    # Either too many uncommitted transactions or missed check-pointing for
    # long time decided by the thresholds
    if is_checkpoint_txn_critical or (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)):
      logger.debug('Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'CRITICAL'
    elif is_checkpoint_txn_warning or (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)):
      logger.debug('Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'WARNING'

  except:
    label = traceback.format_exc()
    result_code = 'UNKNOWN'
        
  return ((result_code, [label]))
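
The alert above trips on either of two signals: the NameNode has accumulated too many uncommitted transactions relative to dfs.namenode.checkpoint.txns, or too much wall-clock time has passed relative to dfs.namenode.checkpoint.period. A small worked sketch of the same arithmetic; every value below is illustrative, not taken from the source:

# Illustrative numbers only; mirrors the threshold logic in execute() above.
checkpoint_period = 21600              # seconds allowed between checkpoints
checkpoint_tx = 1000000                # transactions allowed between checkpoints
checkpoint_txn_multiplier_critical = 2.0
percent_critical = 200.0

delta = 48 * 3600                      # 48 hours since the last checkpoint
transaction_difference = 1500000       # uncommitted transactions

is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * checkpoint_tx
is_time_critical = float(delta) / checkpoint_period * 100 >= percent_critical  # 800% >= 200%
print 'CRITICAL' if (is_checkpoint_txn_critical or is_time_critical) else 'OK'  # prints CRITICAL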
예제 #17
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug("""
        AMS request parameters - {0}
        AMS response - {1}
        """.format(encoded_get_metrics_parameters, data))
            # explicitly close the connection as we've seen python hold onto these
            if conn is not None:
                try:
                    conn.close()
                except:
                    logger.debug(
                        "[Alert][{0}] Unable to close URL connection to {1}".
                        format(self.get_name(), url))
        json_is_valid = True
        try:
            data_json = json.loads(data)
        except Exception, exception:
            json_is_valid = False
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception(
                    "[Alert][{0}] Converting the response to JSON failed or the JSON doesn't contain the needed data: {1}"
                    .format(self.get_name(), str(exception)))

        metrics = []

        if json_is_valid:
            metric_dict = {}
            for metrics_data in data_json["metrics"]:
                metric_dict[
                    metrics_data["metricname"]] = metrics_data["metrics"]
예제 #18
    def actionexecute(self, env):
        num_errors = 0

        # Parse parameters
        config = Script.get_config()

        try:
            command_repository = CommandRepository(config['repositoryFile'])
        except KeyError:
            raise Fail(
                "The command repository indicated by 'repositoryFile' was not found"
            )

        # Handle a SIGTERM and SIGINT gracefully
        signal.signal(signal.SIGTERM, self.abort_handler)
        signal.signal(signal.SIGINT, self.abort_handler)

        self.repository_version = command_repository.version_string

        # Select dict that contains parameters
        try:
            package_list = json.loads(config['roleParams']['package_list'])
            stack_id = config['roleParams']['stack_id']
        except KeyError:
            # without these, install_packages() below would hit a NameError
            raise Fail("'roleParams' must supply both 'package_list' and 'stack_id'")

        self.stack_name = Script.get_stack_name()
        if self.stack_name is None:
            raise Fail("Cannot determine the stack name")

        self.stack_root_folder = Script.get_stack_root()
        if self.stack_root_folder is None:
            raise Fail("Cannot determine the stack's root directory")

        if self.repository_version is None:
            raise Fail("Cannot determine the repository version to install")

        self.repository_version = self.repository_version.strip()

        try:
            if not command_repository.items:
                Logger.warning(
                    "Repository list is empty. Ambari may not be managing the repositories for {0}."
                    .format(self.repository_version))
            else:
                Logger.info(
                    "Will install packages for repository version {0}".format(
                        self.repository_version))
                new_repo_files = Script.repository_util.create_repo_files()
                self.repo_files.update(new_repo_files)
        except Exception as err:
            Logger.logger.exception(
                "Cannot install repository files. Error: {0}".format(str(err)))
            num_errors += 1

        # Build structured output with initial values
        self.structured_output = {
            'package_installation_result': 'FAIL',
            'repository_version_id': command_repository.version_id
        }

        self.put_structured_out(self.structured_output)

        try:
            # check package manager non-completed transactions
            if self.repo_mgr.check_uncompleted_transactions():
                self.repo_mgr.print_uncompleted_transaction_hint()
                num_errors += 1
        except Exception as e:  # we need to ignore any exception
            Logger.warning(
                "Failed to check for uncompleted package manager transactions: "
                + str(e))

        if num_errors > 0:
            raise Fail("Failed to distribute repositories/install packages")

        # Initial list of versions, used to compute the new version installed
        self.old_versions = get_stack_versions(self.stack_root_folder)

        try:
            is_package_install_successful = False
            ret_code = self.install_packages(package_list)
            if ret_code == 0:
                self.structured_output[
                    'package_installation_result'] = 'SUCCESS'
                self.put_structured_out(self.structured_output)
                is_package_install_successful = True
            else:
                num_errors += 1
        except Exception as err:
            num_errors += 1
            Logger.logger.exception(
                "Could not install packages. Error: {0}".format(str(err)))

        # Provide correct exit code
        if num_errors > 0:
            raise Fail("Failed to distribute repositories/install packages")

        self._fix_default_links_for_current()
        # if installing a version of HDP that needs some symlink love, then create them
        if is_package_install_successful and 'actual_version' in self.structured_output:
            self._relink_configurations_with_conf_select(
                stack_id, self.structured_output['actual_version'])
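
abort_handler is registered for SIGTERM and SIGINT above, but its body is not part of the snippet. A minimal sketch of such a handler, assuming it only needs to fail the command cleanly:

from resource_management.core.exceptions import Fail
from resource_management.core.logger import Logger

# Hypothetical handler matching the signal.signal() registrations above.
def abort_handler(self, signum, frame):
    Logger.error("Caught signal {0}; aborting the package installation".format(signum))
    raise Fail("Installation aborted by signal {0}".format(signum))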
예제 #19
          content = response.read()
      except Exception, exception:
        if logger.isEnabledFor(logging.DEBUG):
          logger.exception("[Alert][{0}] Unable to make a web request: {1}".format(self.get_name(), str(exception)))
      finally:
        # explicitly close the connection as we've seen python hold onto these
        if response is not None:
          try:
            response.close()
          except:
            logger.debug("[Alert][{0}] Unable to close JMX URL connection to {1}".format
              (self.get_name(), url))

      json_is_valid = True
      try:
        json_response = json.loads(content)
        json_data = json_response['beans'][0]
      except Exception, exception:
        json_is_valid = False
        if logger.isEnabledFor(logging.DEBUG):
          logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
                         format(self.get_name(), str(exception)))

      if json_is_valid:
        for attr in jmx_property_value:
          if attr not in json_data:
            beans = json_response['beans']
            for jmx_prop_list_item in beans:
              if "name" in jmx_prop_list_item and jmx_prop_list_item["name"] == jmx_property_key:
                if attr not in jmx_prop_list_item:
                  raise Exception("Unable to find {0} in JSON from {1} ".format(attr, url))
예제 #20
    def service_check(self, env):
        import params
        env.set_params(params)

        params.HdfsResource(
            format("/user/{smokeuser}"),
            type="directory",
            action="create_on_execute",
            owner=params.smokeuser,
            mode=params.smoke_hdfs_user_mode,
        )

        if params.stack_version_formatted_major and check_stack_feature(
                StackFeature.ROLLING_UPGRADE,
                params.stack_version_formatted_major):
            path_to_distributed_shell_jar = format(
                "{stack_root}/current/hadoop-yarn-client/hadoop-yarn-applications-distributedshell.jar"
            )
        else:
            path_to_distributed_shell_jar = "/usr/lib/hadoop-yarn/hadoop-yarn-applications-distributedshell*.jar"

        yarn_distributed_shell_check_params = [
            "yarn org.apache.hadoop.yarn.applications.distributedshell.Client",
            "-shell_command", "ls", "-num_containers", "{number_of_nm}",
            "-jar", "{path_to_distributed_shell_jar}", "-timeout", "300000",
            "--queue", "{service_check_queue_name}"
        ]
        yarn_distributed_shell_check_cmd = format(
            " ".join(yarn_distributed_shell_check_params))

        if params.security_enabled:
            kinit_cmd = format(
                "{kinit_path_local} -kt {smoke_user_keytab} {smokeuser_principal};"
            )
            smoke_cmd = format(
                "{kinit_cmd} {yarn_distributed_shell_check_cmd}")
        else:
            smoke_cmd = yarn_distributed_shell_check_cmd

        return_code, out = shell.checked_call(
            smoke_cmd,
            path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
            user=params.smokeuser,
        )

        m = re.search(r"appTrackingUrl=(.*),\s", out)
        if m is None:
            raise Fail("Could not find appTrackingUrl in the distributed shell output")
        app_url = m.group(1)

        splitted_app_url = str(app_url).split('/')

        for item in splitted_app_url:
            if "application" in item:
                application_name = item

        for rm_webapp_address in params.rm_webapp_addresses_list:
            info_app_url = params.scheme + "://" + rm_webapp_address + "/ws/v1/cluster/apps/" + application_name

            get_app_info_cmd = "curl --negotiate -u : -ksL --connect-timeout " + CURL_CONNECTION_TIMEOUT + " " + info_app_url

            return_code, stdout, _ = get_user_call_output(
                get_app_info_cmd,
                user=params.smokeuser,
                path='/usr/sbin:/sbin:/usr/local/bin:/bin:/usr/bin',
            )

            # Handle HDP<2.2.8.1 where RM doesn't do automatic redirection from standby to active
            if stdout.startswith(
                    "This is standby RM. Redirecting to the current active RM:"
            ):
                Logger.info(
                    format(
                        "Skipped checking of {rm_webapp_address} since returned '{stdout}'"
                    ))
                continue

            try:
                json_response = json.loads(stdout)
            except Exception as e:
                raise Fail(
                    format(
                        "Response from YARN API was not a valid JSON. Response: {stdout}"
                    ))

            if json_response is None or 'app' not in json_response or \
                    'state' not in json_response['app'] or 'finalStatus' not in json_response['app']:
                raise Fail("Application " + app_url + " returns invalid data.")

            if json_response['app']['state'] != "FINISHED" or json_response[
                    'app']['finalStatus'] != "SUCCEEDED":
                raise Fail(
                    "Application " + app_url +
                    " state/status is not valid. Should be FINISHED/SUCCEEDED."
                )
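
The validation at the end of the service check expects the ResourceManager's per-application REST payload to carry at least the keys below; the shape is inferred from the checks, and the values are illustrative:

# Minimal payload that would pass the checks above; values are made up.
sample_rm_app_response = {
    "app": {
        "id": "application_1479000000000_0001",
        "state": "FINISHED",
        "finalStatus": "SUCCEEDED"
    }
}
assert sample_rm_app_response['app']['state'] == "FINISHED"
assert sample_rm_app_response['app']['finalStatus'] == "SUCCEEDED"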
예제 #21
    )
#repo params
repo_info = config['hostLevelParams']['repo_info']
service_repo_info = default("/hostLevelParams/service_repo_info", None)

user_to_groups_dict = {}

#Append new user-group mapping to the dict
try:
    user_group_map = ast.literal_eval(config['hostLevelParams']['user_groups'])
    for key in user_group_map.iterkeys():
        user_to_groups_dict[key] = user_group_map[key]
except ValueError:
    print('User Group mapping (user_groups) is missing in the hostLevelParams')

user_to_gid_dict = collections.defaultdict(lambda: user_group)

user_list = json.loads(config['hostLevelParams']['user_list'])
group_list = json.loads(config['hostLevelParams']['group_list'])
host_sys_prepped = default("/hostLevelParams/host_sys_prepped", False)

tez_am_view_acls = config['configurations']['tez-site']["tez.am.view-acls"]
override_uid = str(default("/configurations/cluster-env/override_uid",
                           "true")).lower()

# if NN HA on a secure cluster, access ZooKeeper securely
if stack_supports_zk_security and dfs_ha_enabled and security_enabled:
    hadoop_zkfc_opts = format(
        "-Dzookeeper.sasl.client=true -Dzookeeper.sasl.client.username=zookeeper -Djava.security.auth.login.config={hadoop_conf_secure_dir}/hdfs_jaas.conf -Dzookeeper.sasl.clientconfig=Client"
    )
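
user_to_gid_dict above returns the cluster's primary group for any user that has no explicit mapping. A quick illustration of that defaultdict behavior with made-up names:

import collections

user_group = 'hadoop'                                 # illustrative default group
user_to_gid_dict = collections.defaultdict(lambda: user_group)
user_to_gid_dict['ams'] = 'ams-group'                 # explicit mapping wins
print user_to_gid_dict['ams']     # ams-group
print user_to_gid_dict['falcon']  # hadoop (default for unmapped users)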
예제 #22
def sync_ldap(options):
  if not is_root():
    err = 'Ambari-server sync-ldap should be run with ' \
          'root-level privileges'
    raise FatalException(4, err)

  server_status, pid = is_server_runing()
  if not server_status:
    err = 'Ambari Server is not running.'
    raise FatalException(1, err)

  properties = get_ambari_properties()
  if properties == -1:
    raise FatalException(1, "Failed to read properties file.")

  ldap_configured = properties.get_property(IS_LDAP_CONFIGURED)
  if ldap_configured != 'true':
    err = "LDAP is not configured. Run 'ambari-server setup-ldap' first."
    raise FatalException(1, err)

  # set ldap sync options
  ldap_sync_options = LdapSyncOptions(options)

  if ldap_sync_options.no_ldap_sync_options_set():
    err = 'Must specify a sync option (all, existing, users or groups).  Please invoke ambari-server.py --help to print the options.'
    raise FatalException(1, err)

  admin_login = get_validated_string_input(prompt="Enter Ambari Admin login: ", default=None,
                                           pattern=None, description=None,
                                           is_pass=False, allowEmpty=False)
  admin_password = get_validated_string_input(prompt="Enter Ambari Admin password: ", default=None,
                                              pattern=None, description=None,
                                              is_pass=True, allowEmpty=False)

  admin_auth = base64.encodestring('%s:%s' % (admin_login, admin_password)).replace('\n', '')
  url = get_ambari_server_api_base(properties) + SERVER_API_LDAP_URL
  request = urllib2.Request(url)
  request.add_header('Authorization', 'Basic %s' % admin_auth)
  request.add_header('X-Requested-By', 'ambari')

  if ldap_sync_options.ldap_sync_all:
    sys.stdout.write('Syncing all.')
    bodies = [{"Event":{"specs":[{"principal_type":"users","sync_type":"all"},{"principal_type":"groups","sync_type":"all"}]}}]
  elif ldap_sync_options.ldap_sync_existing:
    sys.stdout.write('Syncing existing.')
    bodies = [{"Event":{"specs":[{"principal_type":"users","sync_type":"existing"},{"principal_type":"groups","sync_type":"existing"}]}}]
  else:
    sys.stdout.write('Syncing specified users and groups.')
    bodies = [{"Event":{"specs":[]}}]
    body = bodies[0]
    events = body['Event']
    specs = events['specs']

    if ldap_sync_options.ldap_sync_users is not None:
      new_specs = [{"principal_type":"users","sync_type":"specific","names":""}]
      get_ldap_event_spec_names(ldap_sync_options.ldap_sync_users, specs, new_specs)
    if ldap_sync_options.ldap_sync_groups is not None:
      new_specs = [{"principal_type":"groups","sync_type":"specific","names":""}]
      get_ldap_event_spec_names(ldap_sync_options.ldap_sync_groups, specs, new_specs)

  if get_verbose():
    sys.stdout.write('\nCalling API ' + url + ' : ' + str(bodies) + '\n')

  request.add_data(json.dumps(bodies))
  request.get_method = lambda: 'POST'

  try:
    response = urllib2.urlopen(request)
  except Exception as e:
    err = 'Sync event creation failed. Error details: %s' % e
    raise FatalException(1, err)

  response_status_code = response.getcode()
  if response_status_code != 201:
    err = 'Error during syncing. Http status code - ' + str(response_status_code)
    raise FatalException(1, err)
  response_body = json.loads(response.read())

  url = response_body['resources'][0]['href']
  request = urllib2.Request(url)
  request.add_header('Authorization', 'Basic %s' % admin_auth)
  request.add_header('X-Requested-By', 'ambari')
  body = [{"LDAP":{"synced_groups":"*","synced_users":"*"}}]
  request.add_data(json.dumps(body))
  request.get_method = lambda: 'GET'
  request_in_progress = True

  while request_in_progress:
    sys.stdout.write('.')
    sys.stdout.flush()

    try:
      response = urllib2.urlopen(request)
    except Exception as e:
      request_in_progress = False
      err = 'Sync event check failed. Error details: %s' % e
      raise FatalException(1, err)

    response_status_code = response.getcode()
    if response_status_code != 200:
      err = 'Error during syncing. Http status code - ' + str(response_status_code)
      raise FatalException(1, err)
    response_body = json.loads(response.read())
    sync_info = response_body['Event']

    if sync_info['status'] == 'ERROR':
      raise FatalException(1, str(sync_info['status_detail']))
    elif sync_info['status'] == 'COMPLETE':
      print '\n\nCompleted LDAP Sync.'
      print 'Summary:'
      for principal_type, summary in sync_info['summary'].iteritems():
        print '  {0}:'.format(principal_type)
        for action, amount in summary.iteritems():
          print '    {0} = {1!s}'.format(action, amount)
      request_in_progress = False
    else:
      time.sleep(1)

  sys.stdout.write('\n')
  sys.stdout.flush()
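
For the "specified users and groups" branch, the POST body ends up as a list of specific-sync specs. Assuming get_ldap_event_spec_names copies the comma-separated names from the CLI option into the "names" field, a run with --users=alice,bob would post roughly:

# Approximate body for --users=alice,bob; the exact behavior of
# get_ldap_event_spec_names is assumed, not shown in the snippet.
bodies = [{
    "Event": {
        "specs": [
            {"principal_type": "users", "sync_type": "specific", "names": "alice,bob"}
        ]
    }
}]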
예제 #23
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """

    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    uri = None
    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(
        scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()
            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                last_checkpoint_time_qry, "checkpoint_time_alert", None, False,
                "NameNode Last Checkpoint")
            last_checkpoint_time_response_json = json.loads(
                last_checkpoint_time_response)
            last_checkpoint_time = int(
                last_checkpoint_time_response_json["beans"][0]
                ["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                journal_transaction_info_qry, "checkpoint_time_alert", None,
                False, "NameNode Last Checkpoint")
            journal_transaction_info_response_json = json.loads(
                journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json[
                "beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry,
                                   "LastCheckpointTime", connection_timeout))

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo",
                connection_timeout)

        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(
            journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(
            journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        delta = (current_time - last_checkpoint_time) / 1000

        label = LABEL.format(h=get_time(delta)['h'],
                             m=get_time(delta)['m'],
                             tx=transaction_difference)

        if (transaction_difference > int(checkpoint_tx)) and (
                float(delta) / int(checkpoint_period) * 100 >=
                int(percent_critical)):
            result_code = 'CRITICAL'
        elif (transaction_difference > int(checkpoint_tx)) and (
                float(delta) / int(checkpoint_period) * 100 >=
                int(percent_warning)):
            result_code = 'WARNING'

    except Exception, e:
        label = str(e)
        result_code = 'UNKNOWN'
예제 #24
def run_schema_upgrade(args):
    db_title = get_db_type(get_ambari_properties()).title
    confirm = get_YN_input(
        "Ambari Server configured for %s. Confirm "
        "you have made a backup of the Ambari Server database [y/n] (y)? " %
        db_title, True)

    if not confirm:
        print_error_msg("Database backup is not confirmed")
        return 1

    jdk_path = get_java_exe_path()
    if jdk_path is None:
        print_error_msg(
            "No JDK found, please run the \"setup\" "
            "command to install a JDK automatically or install any "
            "JDK manually to " + configDefaults.JDK_INSTALL_DIR)
        return 1

    ensure_jdbc_driver_is_installed(args, get_ambari_properties())

    print_info_msg('Upgrading database schema', True)

    serverClassPath = ServerClassPath(get_ambari_properties(), args)
    class_path = serverClassPath.get_full_ambari_classpath_escaped_for_shell(
        validate_classpath=True)

    set_debug_mode_from_options(args)
    debug_mode = get_debug_mode()
    debug_start = (debug_mode & 1) or SCHEMA_UPGRADE_DEBUG
    suspend_start = (debug_mode & 2) or SUSPEND_START_MODE
    suspend_mode = 'y' if suspend_start else 'n'
    command = SCHEMA_UPGRADE_HELPER_CMD_DEBUG.format(
        jdk_path, class_path,
        suspend_mode) if debug_start else SCHEMA_UPGRADE_HELPER_CMD.format(
            jdk_path, class_path)

    ambari_user = read_ambari_user()
    current_user = ensure_can_start_under_current_user(ambari_user)
    environ = generate_env(args, ambari_user, current_user)

    (retcode, stdout, stderr) = run_os_command(command, env=environ)
    upgrade_response = json.loads(stdout)

    check_gpl_license_approved(upgrade_response)

    print_info_msg(
        "Return code from schema upgrade command, retcode = {0}".format(
            str(retcode)), True)
    if stdout:
        print_info_msg("Console output from schema upgrade command:", True)
        print_info_msg(stdout, True)
        print
    if retcode > 0:
        print_error_msg(
            "Error executing schema upgrade, please check the server logs.")
        if stderr:
            print_error_msg("Error output from schema upgrade command:")
            print_error_msg(stderr)
            print
    else:
        print_info_msg('Schema upgrade completed', True)
    return retcode
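
debug_mode above is read as a small bit field: bit 0 turns on remote debugging for the schema-upgrade JVM and bit 1 makes it suspend until a debugger attaches. The meanings are inferred from the masks in the code:

# Bit semantics inferred from the (debug_mode & 1) and (debug_mode & 2) masks above.
debug_mode = 0b11                       # both flags set
debug_start = bool(debug_mode & 1)      # launch the helper with debug options
suspend_start = bool(debug_mode & 2)    # suspend=y so a debugger can attach first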
예제 #25
    def _run_command(self,
                     target,
                     operation,
                     method='POST',
                     assertable_result=True,
                     file_to_put=None,
                     ignore_status_codes=[],
                     **kwargs):
        """
    assertable_result - some POST requests return '{"boolean":false}' or '{"boolean":true}'
    depending on if query was successful or not, we can assert this for them
    """
        target = HdfsResourceProvider.parse_path(target)
        if not target:
            raise Fail("Target cannot be empty")

        url = format("{address}/webhdfs/v1{target}?op={operation}",
                     address=self.address)
        request_args = kwargs

        if not self.security_enabled:
            request_args['user.name'] = self.run_user

        for k, v in request_args.iteritems():
            url = format("{url}&{k}={v}")

        cmd = ["curl", "-sS", "-L", "-w", "%{http_code}", "-X", method]

        # When operation is "OPEN" the target is actually the DFS file to download and the file_to_put is actually the target see _download_file
        if operation == "OPEN":
            cmd += ["-o", file_to_put]
        else:
            if file_to_put and not os.path.exists(file_to_put):
                raise Fail(format("File {file_to_put} is not found."))

            if file_to_put:
                cmd += [
                    "--data-binary", "@" + file_to_put, "-H",
                    "Content-Type: application/octet-stream"
                ]

        if self.security_enabled:
            cmd += ["--negotiate", "-u", ":"]
        if self.is_https_enabled:
            cmd += ["-k"]

        cmd.append(url)
        _, out, err = get_user_call_output(cmd,
                                           user=self.run_user,
                                           logoutput=self.logoutput,
                                           quiet=False)
        status_code = out[-3:]
        out = out[:-3]  # remove last line from output which is status code

        try:
            result_dict = json.loads(out)
        except ValueError:
            result_dict = out

        if status_code not in WebHDFSUtil.valid_status_codes + ignore_status_codes or \
                (assertable_result and result_dict and not result_dict['boolean']):
            formatted_output = json.dumps(result_dict, indent=2) if isinstance(
                result_dict, dict) else result_dict
            formatted_output = err + "\n" + formatted_output
            err_msg = "Execution of '%s' returned status_code=%s. %s" % (
                shell.string_cmd_from_args_list(cmd), status_code,
                formatted_output)
            raise WebHDFSCallException(err_msg, result_dict)

        return result_dict
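
A hedged usage sketch of _run_command for a directory-creation call; 'provider' stands for an already-initialized instance of the class above, and the WebHDFS MKDIRS operation returns the {"boolean": true} body that assertable_result verifies:

# Hypothetical call site; 'provider' is an initialized instance of the class above.
result = provider._run_command('/tmp/test_dir', 'MKDIRS', method='PUT')
# On success WebHDFS answers {"boolean": true}, so result['boolean'] is True here.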
예제 #26
  def create_ambari_admin_user(self,ambari_admin_username, ambari_admin_password,usernamepassword):
    """
    :param ambari_admin_username: username of user to be created
    :param ambari_admin_password: password of user to be created
    :param usernamepassword: credentials used to authenticate the request
    :return Returns response code for successful user creation else None
    """
    flag_ambari_admin_present = False
    match = re.match('[a-zA-Z0-9_\S]+$', ambari_admin_password)
    if match is None:
      raise Fail('Invalid password given for Ranger Admin user for Ambari')
    try:
      url =  self.urlUsers + '?name=' + str(ambari_admin_username)
      request = urllib2.Request(url)
      base64string = base64.encodestring(usernamepassword).replace('\n', '')
      request.add_header("Content-Type", "application/json")
      request.add_header("Accept", "application/json")
      request.add_header("Authorization", "Basic {0}".format(base64string))
      result = openurl(request, timeout=20)
      response_code = result.getcode()
      response = json.loads(result.read())
      if response_code == 200 and len(response['vXUsers']) >= 0:
        for vxuser in response['vXUsers']:
          if vxuser['name'] == ambari_admin_username:
            flag_ambari_admin_present = True
            break
          else:
            flag_ambari_admin_present = False

        if flag_ambari_admin_present:
          Logger.info(ambari_admin_username + ' user already exists.')
          return response_code
        else:
          Logger.info(ambari_admin_username + ' user is not present, creating user using given configurations')
          url = self.urlSecUsers
          admin_user = dict()
          admin_user['status'] = 1
          admin_user['userRoleList'] = ['ROLE_SYS_ADMIN']
          admin_user['name'] = ambari_admin_username
          admin_user['password'] = ambari_admin_password
          admin_user['description'] = ambari_admin_username
          admin_user['firstName'] = ambari_admin_username
          data = json.dumps(admin_user)
          base64string = base64.encodestring('{0}'.format(usernamepassword)).replace('\n', '')
          headers = {
            'Accept': 'application/json',
            "Content-Type": "application/json"
          }
          request = urllib2.Request(url, data, headers)
          request.add_header("Authorization", "Basic {0}".format(base64string))
          result = openurl(request, timeout=20)
          response_code = result.getcode()
          response = json.loads(json.JSONEncoder().encode(result.read()))
          if response_code == 200 and response is not None:
            Logger.info('Ambari admin user creation successful.')
            return response_code
          else:
            Logger.info('Ambari admin user creation failed.')
            return None
      else:
        return None
    except urllib2.URLError, e:
      if isinstance(e, urllib2.HTTPError):
        raise Fail("Error creating ambari admin user. Http status code - {0}. \n {1}".format(e.code, e.read()))
      else:
        raise Fail("Error creating ambari admin user. Reason - {0}.".format(e.reason))
예제 #27
def create_ams_dashboards():
  """
  Create dashboards in grafana from the json files
  """
  import params
  server = Server(protocol = params.ams_grafana_protocol.strip(),
                  host = params.ams_grafana_host.strip(),
                  port = params.ams_grafana_port,
                  user = params.ams_grafana_admin_user,
                  password = params.ams_grafana_admin_pwd)

  dashboard_files = params.get_grafana_dashboard_defs()
  version = params.get_ambari_version()
  Logger.info("Checking dashboards to update for Ambari version : %s" % version)
  # Friendly representation of dashboard
  Dashboard = namedtuple('Dashboard', ['uri', 'id', 'title', 'tags'])

  existing_dashboards = []
  response = perform_grafana_get_call(GRAFANA_SEARCH_BULTIN_DASHBOARDS, server)
  if response and response.status == 200:
    data = response.read()
    try:
      dashboards = json.loads(data)
    except:
      Logger.error("Unable to parse JSON response from grafana request: %s" %
                   GRAFANA_SEARCH_BULTIN_DASHBOARDS)
      Logger.info(data)
      return

    for dashboard in dashboards:
      if dashboard['title'] == 'HBase - Performance':
        perform_grafana_delete_call("/api/dashboards/" + dashboard['uri'], server)
      else:
        existing_dashboards.append(
            Dashboard(uri = dashboard['uri'], id = dashboard['id'],
                    title = dashboard['title'], tags = dashboard['tags'])
          )
    pass
  else:
    Logger.error("Failed to execute search query on Grafana dashboards. "
                 "query = %s\n statuscode = %s\n reason = %s\n data = %s\n" %
                 (GRAFANA_SEARCH_BULTIN_DASHBOARDS, response.status, response.reason, response.read()))
    return

  Logger.debug('Dashboard definitions found = %s' % str(dashboard_files))

  if dashboard_files:
    for dashboard_file in dashboard_files:
      try:
        with open(dashboard_file, 'r') as file:
          dashboard_def = json.load(file)
      except Exception, e:
        Logger.error('Unable to load dashboard json file %s' % dashboard_file)
        Logger.error(str(e))
        continue

      if dashboard_def:
        update_def = True
        # Make sure static json does not have id
        if "id" in dashboard_def:
          dashboard_def['id'] = None
        # Set correct tags
        if 'tags' in dashboard_def:
          dashboard_def['tags'].append('builtin')
          dashboard_def['tags'].append(version)
        else:
          dashboard_def['tags'] = [ 'builtin', version ]
        
        for dashboard in existing_dashboards:
          if dashboard.title == dashboard_def['title']:
            if version not in dashboard.tags:
              # Found existing dashboard with wrong version - update dashboard
              update_def = True
            else:
              update_def = False # Skip update
        pass

        if update_def:
          Logger.info("Updating dashboard definition for %s with tags: %s" %
                      (dashboard_def['title'], dashboard_def['tags']))

          # Discrepancy in grafana export vs import format
          dashboard_def_payload = { "dashboard" : dashboard_def, 'overwrite': True }
          payload = json.dumps(dashboard_def_payload).strip()

          (response, data) = perform_grafana_post_call(GRAFANA_DASHBOARDS_URL, payload, server)

          if response and response.status == 200:
            Logger.info("Dashboard created successfully.\n %s" % str(data))
          else:
            Logger.error("Failed creating dashboard: %s" % dashboard_def['title'])
          pass
        else:
          Logger.info('No update needed for dashboard = %s' % dashboard_def['title'])
      pass
    pass
예제 #28
def get_restricted_packages():
    """
  Gets the list of conf-select 'package' names that need to be invoked on the command.
  When the server passes down the list of packages to install, check the service names
  and use the information in stack_packages json to determine the list of packages that should
  be executed.  That is valid only for PATCH or MAINT upgrades.  STANDARD upgrades should be
  conf-select'ing everything it can find.
  """
    package_names = []

    # shortcut the common case if we are not patching
    cluster_version_summary = default(
        "/roleParameters/cluster_version_summary/services", None)

    if cluster_version_summary is None:
        Logger.info(
            "Cluster Summary is not available, there are no restrictions for conf-select"
        )
        return package_names

    service_names = []

    # pick out the services that are targeted
    for servicename, servicedetail in cluster_version_summary.iteritems():
        if servicedetail['upgrade']:
            service_names.append(servicename)

    if 0 == len(service_names):
        Logger.info(
            "No services found, there are no restrictions for conf-select")
        return package_names

    stack_name = default("/clusterLevelParams/stack_name", None)
    if stack_name is None:
        Logger.info(
            "The stack name is not present in the command. Restricted names skipped."
        )
        return package_names

    stack_packages_config = default(
        "/configurations/cluster-env/stack_packages", None)
    if stack_packages_config is None:
        Logger.info(
            "The stack packages are not defined on the command. Restricted names skipped."
        )
        return package_names

    data = json.loads(stack_packages_config)

    if stack_name not in data:
        Logger.info(
            "Cannot find conf-select packages for the {0} stack".format(
                stack_name))
        return package_names

    conf_select_key = "conf-select-patching"
    if conf_select_key not in data[stack_name]:
        Logger.info(
            "There are no conf-select-patching elements defined for this command for the {0} stack"
            .format(stack_name))
        return package_names

    service_dict = data[stack_name][conf_select_key]

    for servicename in service_names:
        if servicename in service_dict and 'packages' in service_dict[
                servicename]:
            package_names.extend(service_dict[servicename]['packages'])

    return package_names
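
get_restricted_packages walks a cluster-env/stack_packages document shaped roughly like this; the structure follows the key lookups above, while the stack, service, and package names are illustrative:

# Structure inferred from the lookups above; names are made up.
sample_stack_packages = {
    "HDP": {
        "conf-select-patching": {
            "HIVE": {"packages": ["hive", "hive-hcatalog"]},
            "ZOOKEEPER": {"packages": ["zookeeper"]}
        }
    }
}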
예제 #29
#repo params
repo_info = config['hostLevelParams']['repoInfo']
service_repo_info = default("/hostLevelParams/service_repo_info", None)

user_to_groups_dict = {}

#Append new user-group mapping to the dict
try:
    user_group_map = ast.literal_eval(
        config['clusterLevelParams']['user_groups'])
    for key in user_group_map.iterkeys():
        user_to_groups_dict[key] = user_group_map[key]
except ValueError:
    print('User Group mapping (user_groups) is missing in the clusterLevelParams')

user_to_gid_dict = collections.defaultdict(lambda: user_group)

user_list = json.loads(config['clusterLevelParams']['user_list'])
group_list = json.loads(config['clusterLevelParams']['group_list'])
host_sys_prepped = default("/ambariLevelParams/host_sys_prepped", False)

tez_am_view_acls = config['configurations']['tez-site']["tez.am.view-acls"]
override_uid = str(default("/configurations/cluster-env/override_uid",
                           "true")).lower()

# if NN HA on a secure cluster, access ZooKeeper securely
if stack_supports_zk_security and dfs_ha_enabled and security_enabled:
    hadoop_zkfc_opts = format(
        "-Dzookeeper.sasl.client=true -Dzookeeper.sasl.client.username=zookeeper -Djava.security.auth.login.config={hadoop_conf_secure_dir}/hdfs_jaas.conf -Dzookeeper.sasl.clientconfig=Client"
    )
예제 #30
def get_packages(scope, service_name = None, component_name = None):
  """
  Gets the packages which should be used with the stack's stack-select tool for the
  specified service/component. Not all services/components are used with the stack-select tools,
  so those will return no packages.

  :param scope: the scope of the command
  :param service_name:  the service name, such as ZOOKEEPER
  :param component_name: the component name, such as ZOOKEEPER_SERVER
  :return:  the packages to use with stack-select or None
  """
  from resource_management.libraries.functions.default import default

  if scope not in _PACKAGE_SCOPES:
    raise Fail("The specified scope of {0} is not valid".format(scope))

  config = Script.get_config()

  if service_name is None or component_name is None:
    if 'role' not in config or 'serviceName' not in config:
      raise Fail("Both the role and the service name must be included in the command in order to determine which packages to use with the stack-select tool")

    service_name = config['serviceName']
    component_name = config['role']


  stack_name = default("/clusterLevelParams/stack_name", None)
  if stack_name is None:
    raise Fail("The stack name is not present in the command. Packages for stack-select tool cannot be loaded.")

  stack_packages_config = default("/configurations/cluster-env/stack_packages", None)
  if stack_packages_config is None:
    raise Fail("The stack packages are not defined on the command. Unable to load packages for the stack-select tool")

  data = json.loads(stack_packages_config)

  if stack_name not in data:
    raise Fail(
      "Cannot find stack-select packages for the {0} stack".format(stack_name))

  stack_select_key = "stack-select"
  data = data[stack_name]
  if stack_select_key not in data:
    raise Fail(
      "There are no stack-select packages defined for this command for the {0} stack".format(stack_name))

  # this should now be the dictionary of role name to package name
  data = data[stack_select_key]
  service_name = service_name.upper()
  component_name = component_name.upper()

  if service_name not in data:
    Logger.info("Skipping stack-select on {0} because it does not exist in the stack-select package structure.".format(service_name))
    return None

  data = data[service_name]

  if component_name not in data:
    Logger.info("Skipping stack-select on {0} because it does not exist in the stack-select package structure.".format(component_name))
    return None

  # this one scope is not an array, so transform it into one for now so we can
  # use the same code below
  packages = data[component_name][scope]
  if scope == PACKAGE_SCOPE_STACK_SELECT:
    packages = [packages]

  # grab the package name from the JSON and validate it against the packages
  # that the stack-select tool supports - if it doesn't support it, then try to find the legacy
  # package name if it exists
  supported_packages = get_supported_packages()
  for index, package in enumerate(packages):
    if not is_package_supported(package, supported_packages=supported_packages):
      if _PACKAGE_SCOPE_LEGACY in data[component_name]:
        legacy_package = data[component_name][_PACKAGE_SCOPE_LEGACY]
        Logger.info(
          "The package {0} is not supported by this version of the stack-select tool, defaulting to the legacy package of {1}".format(package, legacy_package))

        # use the legacy package
        packages[index] = legacy_package
      else:
        raise Fail("The package {0} is not supported by this version of the stack-select tool.".format(package))

  # transform the array back to a single element
  if scope == PACKAGE_SCOPE_STACK_SELECT:
    packages = packages[0]

  return packages
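
A usage sketch of get_packages following the docstring's own example; for the stack-select scope the function collapses the result back to a single package name (or returns None for components the tool does not manage):

# Hypothetical call, using the ZOOKEEPER example from the docstring.
package = get_packages(PACKAGE_SCOPE_STACK_SELECT,
                       service_name='ZOOKEEPER',
                       component_name='ZOOKEEPER_SERVER')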