def check_health(self):
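    """Map Ambari alert states to health colors; raise RedHealthError
    or YellowHealthError when critical or warning alerts exist.
    """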
    imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
    other_map = {'OK': 'GREEN'}
    color_counter = collections.Counter()
    important_services = self.get_important_services()
    for alert in self.provider.get_alerts_data(self.service):
        alert_summary = alert.get('state', 'UNKNOWN')
        if self.service in important_services:
            target = imp_map.get(alert_summary, 'RED')
        else:
            target = other_map.get(alert_summary, 'YELLOW')
        color_counter[target] += 1
    if color_counter['RED'] > 0 and color_counter['YELLOW'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical and %(yellow)d warning alert(s)")
            % {'red': color_counter['RED'],
               'yellow': color_counter['YELLOW']})
    elif color_counter['RED'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical alert(s)")
            % {'red': color_counter['RED']})
    elif color_counter['YELLOW'] > 0:
        raise health_check_base.YellowHealthError(
            _("Ambari Monitor has responded that cluster "
              "has %d warning alert(s)")
            % color_counter['YELLOW'])
    return _("No alerts found")
def _check_storm(cluster):
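    """Validate Storm component counts and their mutual dependencies."""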
    dr_count = utils.get_instances_count(cluster, common.DRPC_SERVER)
    ni_count = utils.get_instances_count(cluster, common.NIMBUS)
    su_count = utils.get_instances_count(cluster, common.STORM_UI_SERVER)
    sv_count = utils.get_instances_count(cluster, common.SUPERVISOR)
    if dr_count > 1:
        raise ex.InvalidComponentCountException(common.DRPC_SERVER,
                                                _("0 or 1"), dr_count)
    if ni_count > 1:
        raise ex.InvalidComponentCountException(common.NIMBUS,
                                                _("0 or 1"), ni_count)
    if su_count > 1:
        raise ex.InvalidComponentCountException(common.STORM_UI_SERVER,
                                                _("0 or 1"), su_count)
    if dr_count == 0 and ni_count == 1:
        raise ex.RequiredServiceMissingException(
            common.DRPC_SERVER, required_by=common.NIMBUS)
    if dr_count == 1 and ni_count == 0:
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.DRPC_SERVER)
    if su_count == 1 and (dr_count == 0 or ni_count == 0):
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.STORM_UI_SERVER)
    if dr_count == 1 and sv_count == 0:
        raise ex.RequiredServiceMissingException(
            common.SUPERVISOR, required_by=common.DRPC_SERVER)
    if sv_count > 0 and dr_count == 0:
        raise ex.RequiredServiceMissingException(
            common.DRPC_SERVER, required_by=common.SUPERVISOR)
def wait_ambari_requests(self, requests, cluster_name):
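    """Poll Ambari until all given requests reach COMPLETED state,
    raising HadoopProvisionError if any request fails.
    """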
    requests = set(requests)
    failed = []
    context.sleep(20)
    while requests:
        not_completed = set()
        for req_id in requests:
            request = self.get_request_info(cluster_name, req_id)
            status = request.get("request_status")
            if status == 'COMPLETED':
                continue
            elif status in ['IN_PROGRESS', 'PENDING']:
                not_completed.add(req_id)
            else:
                failed.append(request)
        if failed:
            msg = _("Some Ambari request(s) "
                    "not in COMPLETED state: %(description)s.")
            descrs = []
            for req in failed:
                descr = _(
                    "request %(id)d: %(name)s - in status %(status)s")
                descrs.append(
                    descr % {
                        'id': req.get("id"),
                        'name': req.get("request_context"),
                        'status': req.get("request_status")
                    })
            raise p_exc.HadoopProvisionError(
                msg % {'description': '; '.join(descrs)})
        requests = not_completed
        context.sleep(5)
        LOG.debug("Waiting for %d ambari request(s) to be completed",
                  len(not_completed))
    LOG.debug("All ambari requests have been completed")
def get_alerts_data(self, service=None):
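    """Fetch Ambari alerts grouped by lower-cased service name.

    Results are cached; on failure the error message is stored for
    later health reporting instead of being raised.
    """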
    if self._data is not None:
        # return cached data
        return self._data.get(service, []) if service else self._data
    self._data = {}
    self._cluster_services = []
    try:
        ambari = plugin_utils.get_instance(
            self.cluster, p_common.AMBARI_SERVER)
        password = self.cluster.extra.get("ambari_password")
        with client.AmbariClient(ambari, password=password) as ambari_client:
            resp = ambari_client.get_alerts_data(self.cluster)
        for alert in resp:
            alert = alert.get('Alert', {})
            service_name = alert.get('service_name', '').lower()
            if service_name not in self._data:
                self._data[service_name] = []
                self._cluster_services.append(service_name)
            self._data[service_name].append(alert)
    except Exception as e:
        prefix = _("Can't get response from Ambari Monitor")
        msg = _("%(problem)s: %(description)s") % {
            'problem': prefix, 'description': six.text_type(e)}
        # LOG.exception records the traceback itself, so only the short
        # prefix is logged here; the full message is kept for reporting.
        LOG.exception(prefix)
        self._exception_store = msg
def _get_ha_params():
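    """Build the cluster-scoped boolean configs that toggle NameNode,
    ResourceManager and HBase RegionServer HA.
    """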
    enable_namenode_ha = provisioning.Config(
        name=common.NAMENODE_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable NameNode HA"),
        priority=1)

    enable_resourcemanager_ha = provisioning.Config(
        name=common.RESOURCEMANAGER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable ResourceManager HA"),
        priority=1)

    enable_regionserver_ha = provisioning.Config(
        name=common.HBASE_REGIONSERVER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable HBase RegionServer HA"),
        priority=1)

    return [
        enable_namenode_ha, enable_resourcemanager_ha, enable_regionserver_ha
    ]
def _check_jn_ha(cluster):
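    """Require an odd number (at least 3) of JournalNodes for HA."""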
    jn_count = utils.get_instances_count(cluster, common.JOURNAL_NODE)
    if jn_count < 3:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE,
            _("3 or more. Odd number"),
            jn_count, _("At least 3 JournalNodes are required for HA"))
    if jn_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE,
            _("Odd number"),
            jn_count, _("An odd number of JournalNodes is required for HA"))
def _check_zk_ha(cluster):
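    """Require an odd number (at least 3) of ZooKeeper servers for HA."""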
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if zk_count < 3:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER,
            _("3 or more. Odd number"),
            zk_count, _("At least 3 ZooKeepers are required for HA"))
    if zk_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER,
            _("Odd number"),
            zk_count, _("An odd number of ZooKeepers is required for HA"))
def _check_hive(cluster):
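    """Allow at most one Hive Server and one Metastore, deployed together."""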
    hs_count = utils.get_instances_count(cluster, common.HIVE_SERVER)
    hm_count = utils.get_instances_count(cluster, common.HIVE_METASTORE)
    if hs_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_SERVER,
                                                _("0 or 1"), hs_count)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_METASTORE,
                                                _("0 or 1"), hm_count)
    if hs_count == 0 and hm_count == 1:
        raise ex.RequiredServiceMissingException(
            common.HIVE_SERVER, required_by=common.HIVE_METASTORE)
    if hs_count == 1 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HIVE_METASTORE, required_by=common.HIVE_SERVER)
def _check_ranger(cluster):
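    """Allow at most one Ranger Admin and one UserSync, deployed together."""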
    ra_count = utils.get_instances_count(cluster, common.RANGER_ADMIN)
    ru_count = utils.get_instances_count(cluster, common.RANGER_USERSYNC)
    if ra_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_ADMIN,
                                                _("0 or 1"), ra_count)
    if ru_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_USERSYNC,
                                                _("0 or 1"), ru_count)
    if ra_count == 1 and ru_count == 0:
        raise ex.RequiredServiceMissingException(
            common.RANGER_USERSYNC, required_by=common.RANGER_ADMIN)
    if ra_count == 0 and ru_count == 1:
        raise ex.RequiredServiceMissingException(
            common.RANGER_ADMIN, required_by=common.RANGER_USERSYNC)
def _check_yarn(cluster):
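    """Validate YARN component counts, including the ResourceManager HA case."""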
    rm_count = utils.get_instances_count(cluster, common.RESOURCEMANAGER)
    nm_count = utils.get_instances_count(cluster, common.NODEMANAGER)
    hs_count = utils.get_instances_count(cluster, common.HISTORYSERVER)
    at_count = utils.get_instances_count(cluster, common.APP_TIMELINE_SERVER)

    if cluster.cluster_configs.get("general", {}).get(
            common.RESOURCEMANAGER_HA):
        _check_zk_ha(cluster)

        if rm_count != 2:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER, 2,
                                                    rm_count)
    else:
        if rm_count != 1:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER, 1,
                                                    rm_count)

    if hs_count != 1:
        raise ex.InvalidComponentCountException(common.HISTORYSERVER, 1,
                                                hs_count)
    if at_count != 1:
        raise ex.InvalidComponentCountException(common.APP_TIMELINE_SERVER, 1,
                                                at_count)
    if nm_count == 0:
        raise ex.InvalidComponentCountException(common.NODEMANAGER,
                                                _("1 or more"), nm_count)
def _check_ambari(cluster):
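    """Require exactly one Ambari server and at least one ZooKeeper server."""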
    am_count = utils.get_instances_count(cluster, common.AMBARI_SERVER)
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if am_count != 1:
        raise ex.InvalidComponentCountException(common.AMBARI_SERVER, 1,
                                                am_count)
    if zk_count == 0:
        raise ex.InvalidComponentCountException(common.ZOOKEEPER_SERVER,
                                                _("1 or more"), zk_count)
def _check_hbase(cluster):
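    """Allow at most one HBase Master, deployed together with RegionServers."""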
    hm_count = utils.get_instances_count(cluster, common.HBASE_MASTER)
    hr_count = utils.get_instances_count(cluster, common.HBASE_REGIONSERVER)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HBASE_MASTER,
                                                _("0 or 1"), hm_count)
    if hm_count == 1 and hr_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HBASE_REGIONSERVER, required_by=common.HBASE_MASTER)
    if hr_count > 0 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HBASE_MASTER, required_by=common.HBASE_REGIONSERVER)
def validate_job_execution(self, cluster, job, data):
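    """Verify the cluster version and Spark components before running a job."""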
    if not self.edp_supported(cluster.hadoop_version):
        raise pex.PluginInvalidDataException(
            _('Ambari plugin of {base} or higher required to run {type} '
              'jobs').format(base=EDPSparkEngine.edp_base_version,
                             type=job.type))

    spark_nodes_count = plugin_utils.get_instances_count(
        cluster, p_common.SPARK_JOBHISTORYSERVER)
    if spark_nodes_count != 1:
        raise pex.InvalidComponentCountException(
            p_common.SPARK_JOBHISTORYSERVER, '1', spark_nodes_count)

    super(EDPSparkEngine, self).validate_job_execution(cluster, job, data)
def wait_ambari_request(self, request_id, cluster_name):
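    """Block until a single Ambari request completes, logging progress."""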
    context.sleep(20)
    while True:
        status = self.check_request_status(cluster_name, request_id)
        LOG.debug(
            "Task %(context)s in %(status)s state. "
            "Completed %(percent).1f%%", {
                'context': status["request_context"],
                'status': status["request_status"],
                'percent': status["progress_percent"]
            })
        if status["request_status"] == "COMPLETED":
            return
        if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
            context.sleep(5)
        else:
            raise p_exc.HadoopProvisionError(
                _("Ambari request in %s state") % status["request_status"])
def _check_hdfs(cluster):
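    """Validate HDFS component counts, including the NameNode HA case."""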
    nn_count = utils.get_instances_count(cluster, common.NAMENODE)
    dn_count = utils.get_instances_count(cluster, common.DATANODE)
    snn_count = utils.get_instances_count(cluster, common.SECONDARY_NAMENODE)

    if cluster.cluster_configs.get("general", {}).get(common.NAMENODE_HA):
        _check_zk_ha(cluster)
        _check_jn_ha(cluster)

        if nn_count != 2:
            raise ex.InvalidComponentCountException(common.NAMENODE, 2,
                                                    nn_count)
    else:
        if nn_count != 1:
            raise ex.InvalidComponentCountException(common.NAMENODE, 1,
                                                    nn_count)

        if snn_count != 1:
            raise ex.InvalidComponentCountException(common.SECONDARY_NAMENODE,
                                                    1, snn_count)

    if dn_count == 0:
        raise ex.InvalidComponentCountException(
            common.DATANODE, _("1 or more"), dn_count)
def _check_oozie(cluster):
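    """Allow at most one Oozie server."""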
    count = utils.get_instances_count(cluster, common.OOZIE_SERVER)
    if count > 1:
        raise ex.InvalidComponentCountException(common.OOZIE_SERVER,
                                                _("0 or 1"), count)
def _check_spark(cluster):
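    """Allow at most one Spark JobHistoryServer."""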
    count = utils.get_instances_count(cluster, common.SPARK_JOBHISTORYSERVER)
    if count > 1:
        raise ex.InvalidComponentCountException(common.SPARK_JOBHISTORYSERVER,
                                                _("0 or 1"), count)
def is_ambari_active(self):
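    """Raise RedHealthError if a stored Ambari Monitor error exists."""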
    if self._exception_store:
        raise health_check_base.RedHealthError(self._exception_store)
    return _("Ambari Monitor is healthy")
def get_description(self):
    return _("The Ambari Sahara plugin provides the ability to launch "
             "clusters with Hortonworks Data Platform (HDP) on OpenStack "
             "using Apache Ambari")