def check_health(self):
    # Map alert states to health colors; important services degrade to RED
    # on unknown states, all other services only to YELLOW.
    imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
    other_map = {'OK': 'GREEN'}
    color_counter = collections.Counter()
    important_services = self.get_important_services()
    for alert in self.provider.get_alerts_data(self.service):
        alert_summary = alert.get('state', 'UNKNOWN')
        if self.service in important_services:
            target = imp_map.get(alert_summary, 'RED')
        else:
            target = other_map.get(alert_summary, 'YELLOW')
        color_counter[target] += 1
    if color_counter['RED'] > 0 and color_counter['YELLOW'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical and %(yellow)d warning alert(s)")
            % {'red': color_counter['RED'],
               'yellow': color_counter['YELLOW']})
    elif color_counter['RED'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical alert(s)")
            % {'red': color_counter['RED']})
    elif color_counter['YELLOW'] > 0:
        raise health_check_base.YellowHealthError(
            _("Ambari Monitor has responded that cluster "
              "has %d warning alert(s)") % color_counter['YELLOW'])
    return _("No alerts found")
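# Illustrative sketch (not part of the plugin): how the color tally in
# check_health() above behaves for a hypothetical list of alert states.
# The maps mirror imp_map/other_map; _tally_colors and the sample states
# are made up for demonstration ('collections' is already imported, since
# check_health() uses collections.Counter).
def _tally_colors(states, important):
    imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
    other_map = {'OK': 'GREEN'}
    counter = collections.Counter()
    for state in states:
        if important:
            counter[imp_map.get(state, 'RED')] += 1
        else:
            counter[other_map.get(state, 'YELLOW')] += 1
    return counter

# _tally_colors(['OK', 'WARNING', 'UNKNOWN'], important=True)
# tallies GREEN=1, YELLOW=1, RED=1, so check_health() would raise
# RedHealthError reporting 1 critical and 1 warning alert.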
def _check_storm(cluster):
    dr_count = utils.get_instances_count(cluster, common.DRPC_SERVER)
    ni_count = utils.get_instances_count(cluster, common.NIMBUS)
    su_count = utils.get_instances_count(cluster, common.STORM_UI_SERVER)
    sv_count = utils.get_instances_count(cluster, common.SUPERVISOR)
    if dr_count > 1:
        raise ex.InvalidComponentCountException(common.DRPC_SERVER,
                                                _("0 or 1"), dr_count)
    if ni_count > 1:
        raise ex.InvalidComponentCountException(common.NIMBUS,
                                                _("0 or 1"), ni_count)
    if su_count > 1:
        raise ex.InvalidComponentCountException(common.STORM_UI_SERVER,
                                                _("0 or 1"), su_count)
    if dr_count == 0 and ni_count == 1:
        raise ex.RequiredServiceMissingException(
            common.DRPC_SERVER, required_by=common.NIMBUS)
    if dr_count == 1 and ni_count == 0:
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.DRPC_SERVER)
    if su_count == 1 and (dr_count == 0 or ni_count == 0):
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.STORM_UI_SERVER)
    if dr_count == 1 and sv_count == 0:
        raise ex.RequiredServiceMissingException(
            common.SUPERVISOR, required_by=common.DRPC_SERVER)
    if sv_count > 0 and dr_count == 0:
        raise ex.RequiredServiceMissingException(
            common.DRPC_SERVER, required_by=common.SUPERVISOR)
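# Summary (for reference) of the constraints _check_storm() enforces:
#
#     DRPC_SERVER     <-> NIMBUS          deployed together or not at all
#     STORM_UI_SERVER  -> DRPC + NIMBUS   the UI requires both
#     DRPC_SERVER     <-> SUPERVISOR      each requires the other
#
# plus a 0-or-1 instance limit on DRPC_SERVER, NIMBUS and STORM_UI_SERVER.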
def wait_ambari_requests(self, requests, cluster_name):
    requests = set(requests)
    failed = []
    context.sleep(20)
    while len(requests) > 0:
        completed, not_completed = set(), set()
        for req_id in requests:
            request = self.get_request_info(cluster_name, req_id)
            status = request.get("request_status")
            if status == 'COMPLETED':
                completed.add(req_id)
            elif status in ['IN_PROGRESS', 'PENDING']:
                not_completed.add(req_id)
            else:
                failed.append(request)
        if failed:
            msg = _("Some Ambari request(s) "
                    "not in COMPLETED state: %(description)s.")
            descrs = []
            for req in failed:
                descr = _(
                    "request %(id)d: %(name)s - in status %(status)s")
                descrs.append(descr % {
                    'id': req.get("id"),
                    'name': req.get("request_context"),
                    'status': req.get("request_status")})
            raise p_exc.HadoopProvisionError(msg % {'description': descrs})
        # keep polling only the requests that have not finished yet
        requests = not_completed
        context.sleep(5)
        LOG.debug("Waiting for %d ambari request(s) to be completed",
                  len(not_completed))
    LOG.debug("All ambari requests have been completed")
def get_alerts_data(self, service=None):
    if self._data is not None:
        # return cached data
        return self._data.get(service, []) if service else self._data
    self._data = {}
    self._cluster_services = []
    try:
        ambari = plugin_utils.get_instance(
            self.cluster, p_common.AMBARI_SERVER)
        password = self.cluster.extra.get("ambari_password")
        with client.AmbariClient(ambari,
                                 password=password) as ambari_client:
            resp = ambari_client.get_alerts_data(self.cluster)
        for alert in resp:
            alert = alert.get('Alert', {})
            service = alert.get('service_name').lower()
            if service not in self._data:
                self._data[service] = []
                self._cluster_services.append(service)
            self._data[service].append(alert)
    except Exception as e:
        prefix = _("Can't get response from Ambari Monitor")
        msg = _("%(problem)s: %(description)s") % {
            'problem': prefix, 'description': six.text_type(e)}
        # the traceback is recorded by LOG.exception below, so the stored
        # message only needs the short description
        LOG.exception(prefix)
        self._exception_store = msg
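# Hedged sketch of the cache layout get_alerts_data() builds, based only on
# the keys it reads ('Alert', 'service_name', 'state'); any other fields are
# assumptions about the Ambari REST response:
#
#     self._data = {
#         "hdfs": [{"service_name": "HDFS", "state": "WARNING", ...}],
#         "yarn": [{"service_name": "YARN", "state": "OK", ...}],
#     }
#
# check_health() then consumes self._data[service] one service at a time.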
def _get_ha_params():
    enable_namenode_ha = provisioning.Config(
        name=common.NAMENODE_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable NameNode HA"),
        priority=1)

    enable_resourcemanager_ha = provisioning.Config(
        name=common.RESOURCEMANAGER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable ResourceManager HA"),
        priority=1)

    enable_regionserver_ha = provisioning.Config(
        name=common.HBASE_REGIONSERVER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable HBase RegionServer HA"),
        priority=1)

    return [enable_namenode_ha, enable_resourcemanager_ha,
            enable_regionserver_ha]
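# Hedged usage example: the three Config objects above surface as boolean
# flags under the "general" applicable_target, which the validation code
# reads back via cluster.cluster_configs.get("general", {}). Shown with the
# common.* constants rather than literal key strings, since the exact names
# live in the plugin's common module:
#
#     cluster_configs = {
#         "general": {
#             common.NAMENODE_HA: True,
#             common.RESOURCEMANAGER_HA: False,
#             common.HBASE_REGIONSERVER_HA: False,
#         }
#     }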
def _check_jn_ha(cluster):
    jn_count = utils.get_instances_count(cluster, common.JOURNAL_NODE)
    if jn_count < 3:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("3 or more. Odd number"), jn_count,
            _("At least 3 JournalNodes are required for HA"))
    if jn_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("Odd number"), jn_count,
            _("An odd number of JournalNodes is required for HA"))
def _check_zk_ha(cluster):
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if zk_count < 3:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("3 or more. Odd number"), zk_count,
            _("At least 3 ZooKeepers are required for HA"))
    if zk_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("Odd number"), zk_count,
            _("An odd number of ZooKeepers is required for HA"))
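# Note: the odd-count checks in _check_jn_ha() and _check_zk_ha() reflect
# quorum math -- a quorum needs floor(n/2) + 1 live members, so an even n
# tolerates no more failures than n - 1 does (3 and 4 nodes both survive
# exactly one failure), while costing an extra instance.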
def _check_hive(cluster):
    hs_count = utils.get_instances_count(cluster, common.HIVE_SERVER)
    hm_count = utils.get_instances_count(cluster, common.HIVE_METASTORE)
    if hs_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_SERVER,
                                                _("0 or 1"), hs_count)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_METASTORE,
                                                _("0 or 1"), hm_count)
    if hs_count == 0 and hm_count == 1:
        raise ex.RequiredServiceMissingException(
            common.HIVE_SERVER, required_by=common.HIVE_METASTORE)
    if hs_count == 1 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HIVE_METASTORE, required_by=common.HIVE_SERVER)
def _check_ranger(cluster):
    ra_count = utils.get_instances_count(cluster, common.RANGER_ADMIN)
    ru_count = utils.get_instances_count(cluster, common.RANGER_USERSYNC)
    if ra_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_ADMIN,
                                                _("0 or 1"), ra_count)
    if ru_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_USERSYNC,
                                                _("0 or 1"), ru_count)
    if ra_count == 1 and ru_count == 0:
        raise ex.RequiredServiceMissingException(
            common.RANGER_USERSYNC, required_by=common.RANGER_ADMIN)
    if ra_count == 0 and ru_count == 1:
        raise ex.RequiredServiceMissingException(
            common.RANGER_ADMIN, required_by=common.RANGER_USERSYNC)
def _check_yarn(cluster):
    rm_count = utils.get_instances_count(cluster, common.RESOURCEMANAGER)
    nm_count = utils.get_instances_count(cluster, common.NODEMANAGER)
    hs_count = utils.get_instances_count(cluster, common.HISTORYSERVER)
    at_count = utils.get_instances_count(cluster,
                                         common.APP_TIMELINE_SERVER)
    if cluster.cluster_configs.get("general", {}).get(
            common.RESOURCEMANAGER_HA):
        _check_zk_ha(cluster)
        if rm_count != 2:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER,
                                                    2, rm_count)
    else:
        if rm_count != 1:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER,
                                                    1, rm_count)
    if hs_count != 1:
        raise ex.InvalidComponentCountException(common.HISTORYSERVER,
                                                1, hs_count)
    if at_count != 1:
        raise ex.InvalidComponentCountException(common.APP_TIMELINE_SERVER,
                                                1, at_count)
    if nm_count == 0:
        raise ex.InvalidComponentCountException(common.NODEMANAGER,
                                                _("1 or more"), nm_count)
def _check_ambari(cluster):
    am_count = utils.get_instances_count(cluster, common.AMBARI_SERVER)
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if am_count != 1:
        raise ex.InvalidComponentCountException(common.AMBARI_SERVER,
                                                1, am_count)
    if zk_count == 0:
        raise ex.InvalidComponentCountException(common.ZOOKEEPER_SERVER,
                                                _("1 or more"), zk_count)
def _check_hbase(cluster):
    hm_count = utils.get_instances_count(cluster, common.HBASE_MASTER)
    hr_count = utils.get_instances_count(cluster,
                                         common.HBASE_REGIONSERVER)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HBASE_MASTER,
                                                _("0 or 1"), hm_count)
    if hm_count == 1 and hr_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HBASE_REGIONSERVER, required_by=common.HBASE_MASTER)
    if hr_count > 0 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HBASE_MASTER, required_by=common.HBASE_REGIONSERVER)
def validate_job_execution(self, cluster, job, data):
    if not self.edp_supported(cluster.hadoop_version):
        raise pex.PluginInvalidDataException(
            _('Ambari plugin of {base} or higher required to run {type} '
              'jobs').format(base=EDPSparkEngine.edp_base_version,
                             type=job.type))
    spark_nodes_count = plugin_utils.get_instances_count(
        cluster, p_common.SPARK_JOBHISTORYSERVER)
    if spark_nodes_count != 1:
        raise pex.InvalidComponentCountException(
            p_common.SPARK_JOBHISTORYSERVER, '1', spark_nodes_count)
    super(EDPSparkEngine, self).validate_job_execution(
        cluster, job, data)
def wait_ambari_request(self, request_id, cluster_name):
    context.sleep(20)
    while True:
        status = self.check_request_status(cluster_name, request_id)
        LOG.debug("Task %(context)s in %(status)s state. "
                  "Completed %(percent).1f%%",
                  {'context': status["request_context"],
                   'status': status["request_status"],
                   'percent': status["progress_percent"]})
        if status["request_status"] == "COMPLETED":
            return
        if status["request_status"] in ["IN_PROGRESS", "PENDING"]:
            context.sleep(5)
        else:
            raise p_exc.HadoopProvisionError(
                _("Ambari request in %s state") % status["request_status"])
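# Hedged usage sketch: wait_ambari_request() is the single-request variant
# of wait_ambari_requests() above. A caller would pass the request id
# returned by a prior asynchronous Ambari call; 'start_service' below is a
# hypothetical method name, not part of the client shown here:
#
#     request_id = ambari_client.start_service(cluster_name, "HDFS")
#     ambari_client.wait_ambari_request(request_id, cluster_name)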
def _check_hdfs(cluster):
    nn_count = utils.get_instances_count(cluster, common.NAMENODE)
    dn_count = utils.get_instances_count(cluster, common.DATANODE)
    snn_count = utils.get_instances_count(cluster,
                                          common.SECONDARY_NAMENODE)
    if cluster.cluster_configs.get("general", {}).get(common.NAMENODE_HA):
        _check_zk_ha(cluster)
        _check_jn_ha(cluster)
        if nn_count != 2:
            raise ex.InvalidComponentCountException(common.NAMENODE, 2,
                                                    nn_count)
    else:
        if nn_count != 1:
            raise ex.InvalidComponentCountException(common.NAMENODE, 1,
                                                    nn_count)
        if snn_count != 1:
            raise ex.InvalidComponentCountException(
                common.SECONDARY_NAMENODE, 1, snn_count)
    if dn_count == 0:
        raise ex.InvalidComponentCountException(
            common.DATANODE, _("1 or more"), dn_count)
def _check_oozie(cluster):
    count = utils.get_instances_count(cluster, common.OOZIE_SERVER)
    if count > 1:
        raise ex.InvalidComponentCountException(common.OOZIE_SERVER,
                                                _("0 or 1"), count)
def _check_spark(cluster):
    count = utils.get_instances_count(cluster,
                                      common.SPARK_JOBHISTORYSERVER)
    if count > 1:
        raise ex.InvalidComponentCountException(
            common.SPARK_JOBHISTORYSERVER, _("0 or 1"), count)
def is_ambari_active(self):
    if self._exception_store:
        raise health_check_base.RedHealthError(self._exception_store)
    return _("Ambari Monitor is healthy")
def get_description(self):
    return _("The Ambari Sahara plugin provides the ability to launch "
             "clusters with Hortonworks Data Platform (HDP) on OpenStack "
             "using Apache Ambari")