def get_alerts_data(self, service=None):
    if self._data is not None:
        # return cached data
        return self._data.get(service, []) if service else self._data
    self._data = {}
    self._cluster_services = []
    try:
        ambari = plugin_utils.get_instance(self.cluster,
                                           p_common.AMBARI_SERVER)
        password = self.cluster.extra.get("ambari_password")
        with client.AmbariClient(ambari, password=password) as ambari:
            resp = ambari.get_alerts_data(self.cluster)
        for alert in resp:
            alert = alert.get('Alert', {})
            service = alert.get('service_name').lower()
            if service not in self._data:
                self._data[service] = []
                self._cluster_services.append(service)
            self._data[service].append(alert)
    except Exception as e:
        prefix = _("Can't get response from Ambari Monitor")
        msg = _("%(problem)s: %(description)s") % {
            'problem': prefix, 'description': six.text_type(e)}
        # don't include the exception text here; LOG.exception attaches it
        LOG.exception(prefix)
        self._exception_store = msg
def check_health(self):
    imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
    other_map = {'OK': 'GREEN'}
    color_counter = collections.Counter()
    important_services = self.provider.get_important_services()
    for alert in self.provider.get_alerts_data(self.service):
        alert_summary = alert.get('state', 'UNKNOWN')
        if self.service in important_services:
            target = imp_map.get(alert_summary, 'RED')
        else:
            target = other_map.get(alert_summary, 'YELLOW')
        color_counter[target] += 1
    if color_counter['RED'] > 0 and color_counter['YELLOW'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical and %(yellow)d warning alert(s)")
            % {'red': color_counter['RED'],
               'yellow': color_counter['YELLOW']})
    elif color_counter['RED'] > 0:
        raise health_check_base.RedHealthError(
            _("Ambari Monitor has responded that cluster has "
              "%(red)d critical alert(s)")
            % {'red': color_counter['RED']})
    elif color_counter['YELLOW'] > 0:
        raise health_check_base.YellowHealthError(
            _("Ambari Monitor has responded that cluster "
              "has %d warning alert(s)") % color_counter['YELLOW'])
    return _("No alerts found")
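# A minimal, self-contained sketch of the aggregation pattern used above:
# alert states are mapped to traffic-light colors, counted with
# collections.Counter, and the worst color wins.  The sample alerts and the
# is_important flag below are hypothetical; they are not part of the Ambari
# plugin API.
import collections

def summarize_alerts(alerts, is_important):
    imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
    other_map = {'OK': 'GREEN'}
    counter = collections.Counter()
    for state in alerts:
        if is_important:
            # unknown states on important services are treated as critical
            counter[imp_map.get(state, 'RED')] += 1
        else:
            # unknown states elsewhere only produce a warning
            counter[other_map.get(state, 'YELLOW')] += 1
    if counter['RED']:
        return 'RED'
    if counter['YELLOW']:
        return 'YELLOW'
    return 'GREEN'

# e.g. summarize_alerts(['OK', 'WARNING'], is_important=True) -> 'YELLOW'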
def _check_storm(cluster):
    dr_count = utils.get_instances_count(cluster, common.DRPC_SERVER)
    ni_count = utils.get_instances_count(cluster, common.NIMBUS)
    su_count = utils.get_instances_count(cluster, common.STORM_UI_SERVER)
    sv_count = utils.get_instances_count(cluster, common.SUPERVISOR)
    if dr_count > 1:
        raise ex.InvalidComponentCountException(common.DRPC_SERVER,
                                                _("0 or 1"), dr_count)
    if ni_count > 1:
        raise ex.InvalidComponentCountException(common.NIMBUS,
                                                _("0 or 1"), ni_count)
    if su_count > 1:
        raise ex.InvalidComponentCountException(common.STORM_UI_SERVER,
                                                _("0 or 1"), su_count)
    if dr_count == 0 and ni_count == 1:
        raise ex.RequiredServiceMissingException(common.DRPC_SERVER,
                                                 required_by=common.NIMBUS)
    if dr_count == 1 and ni_count == 0:
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.DRPC_SERVER)
    if su_count == 1 and (dr_count == 0 or ni_count == 0):
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.STORM_UI_SERVER)
    if dr_count == 1 and sv_count == 0:
        raise ex.RequiredServiceMissingException(
            common.SUPERVISOR, required_by=common.DRPC_SERVER)
    if sv_count > 0 and dr_count == 0:
        raise ex.RequiredServiceMissingException(
            common.DRPC_SERVER, required_by=common.SUPERVISOR)
def check_health(self):
    important_services = self.provider.get_important_services()
    observed_data = self.provider.get_health_status(self.service)
    imp_map = {'BAD': 'red', 'CONCERNING': 'yellow', 'GOOD': 'green'}
    summary = observed_data['summary']
    checks = observed_data.get('checks', [])
    failed_checks = []
    for check in checks:
        if check['summary'] != 'GOOD':
            failed_checks.append('%(name)s - %(summary)s state' % {
                'name': check['name'], 'summary': check['summary']})
    additional_info = None
    if failed_checks:
        additional_info = _(
            "The following checks did not pass: %s") % ",".join(failed_checks)
    if self.service in important_services:
        overall = imp_map.get(summary, 'red')
    else:
        overall = 'green'
        if summary != 'GOOD':
            overall = 'yellow'
    msg = _("Cloudera Manager has responded that service is in "
            "the %s state") % summary
    if additional_info:
        msg = _("%(problem)s. %(description)s") % {
            'problem': msg, 'description': additional_info}
    if overall == 'red':
        raise health_check_base.RedHealthError(msg)
    elif overall == 'yellow':
        raise health_check_base.YellowHealthError(msg)
    return msg
def _create_config_obj(self, item, target='general', scope='cluster',
                       high_priority=False):
    def _prepare_value(value):
        if isinstance(value, str):
            return value.strip().lower()
        return value

    conf_name = _prepare_value(item.get('name', None))
    conf_value = _prepare_value(item.get('value', None))
    if not conf_name:
        raise ex.HadoopProvisionError(_("Config missing 'name'"))
    if conf_value is None:
        raise ex.PluginInvalidDataException(
            _("Config '%s' missing 'value'") % conf_name)
    if high_priority or item.get('priority', 2) == 1:
        priority = 1
    else:
        priority = 2
    return p.Config(
        name=conf_name,
        applicable_target=target,
        scope=scope,
        config_type=item.get('config_type', "string"),
        config_values=item.get('config_values', None),
        default_value=conf_value,
        is_optional=item.get('is_optional', True),
        description=item.get('description', None),
        priority=priority)
def _hue_validation(cls, cluster):
    hue_count = cls.get_inst_count(cluster, 'HUE_SERVER')
    if hue_count > 1:
        raise ex.InvalidComponentCountException('HUE_SERVER', _('0 or 1'),
                                                hue_count)
    shs_count = cls.get_inst_count(cluster, 'SPARK_YARN_HISTORY_SERVER')
    hms_count = cls.get_inst_count(cluster, 'HIVE_METASTORE')
    oo_count = cls.get_inst_count(cluster, 'OOZIE_SERVER')
    rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
    if shs_count > 1:
        raise ex.InvalidComponentCountException(
            'SPARK_YARN_HISTORY_SERVER', _('0 or 1'), shs_count)
    if shs_count and not rm_count:
        raise ex.RequiredServiceMissingException(
            'YARN_RESOURCEMANAGER', required_by='SPARK_YARN_HISTORY_SERVER')
    if oo_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException('OOZIE_SERVER',
                                                 required_by='HUE_SERVER')
    if hms_count < 1 and hue_count:
        raise ex.RequiredServiceMissingException('HIVE_METASTORE',
                                                 required_by='HUE_SERVER')
def _await_cldb(self, cluster_context, instances=None, timeout=600):
    instances = instances or cluster_context.get_instances()
    cldb_node = cluster_context.get_instance(mfs.CLDB)
    start_time = timeutils.utcnow()
    retry_count = 0
    with cldb_node.remote() as r:
        LOG.debug("Waiting {count} seconds for CLDB initialization".format(
            count=timeout))
        while timeutils.delta_seconds(start_time,
                                      timeutils.utcnow()) < timeout:
            ec, out = r.execute_command(NODE_LIST_CMD,
                                        raise_when_error=False)
            resp = json.loads(out)
            status = resp['status']
            if str(status).lower() == 'ok':
                ips = [n['ip'] for n in resp['data']]
                retry_count += 1
                for i in instances:
                    if (i.internal_ip not in ips
                            and retry_count > DEFAULT_RETRY_COUNT):
                        msg = _("Node failed to connect to CLDB: %s"
                                ) % i.internal_ip
                        raise ex.HadoopProvisionError(msg)
                break
            else:
                context.sleep(DELAY)
        else:
            raise ex.HadoopProvisionError(_("CLDB failed to start"))
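# The function above follows a common "poll until ready or deadline" shape.
# The sketch below shows the same pattern in isolation, with a hypothetical
# check() callable standing in for the CLDB node-list command; it is not part
# of the MapR plugin.
import time

def wait_until(check, timeout=600, delay=10):
    """Call check() every `delay` seconds until it returns True.

    Raises TimeoutError if the deadline passes without success.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if check():
            return
        time.sleep(delay)
    raise TimeoutError("service failed to start within %s seconds" % timeout)

# e.g. wait_until(lambda: True) returns immediately;
# wait_until(lambda: False, timeout=1, delay=0.2) raises TimeoutError.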
def _hdfs_ha_validation(cls, cluster):
    jn_count = cls.get_inst_count(cluster, 'HDFS_JOURNALNODE')
    zk_count = cls.get_inst_count(cluster, 'ZOOKEEPER_SERVER')
    require_anti_affinity = cls.PU.c_helper.get_required_anti_affinity(
        cluster)
    if jn_count > 0:
        if jn_count < 3:
            raise ex.InvalidComponentCountException(
                'HDFS_JOURNALNODE', _('not less than 3'), jn_count)
        if not jn_count % 2:
            raise ex.InvalidComponentCountException(
                'HDFS_JOURNALNODE', _('be odd'), jn_count)
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='HDFS HA')
        if require_anti_affinity:
            if 'HDFS_SECONDARYNAMENODE' not in \
                    cls._get_anti_affinity(cluster):
                raise ex.NameNodeHAConfigurationError(
                    _('HDFS_SECONDARYNAMENODE should be enabled '
                      'in anti_affinity.'))
            if 'HDFS_NAMENODE' not in cls._get_anti_affinity(cluster):
                raise ex.NameNodeHAConfigurationError(
                    _('HDFS_NAMENODE should be enabled in anti_affinity.'))
def _impala_validation(cls, cluster):
    ics_count = cls.get_inst_count(cluster, 'IMPALA_CATALOGSERVER')
    iss_count = cls.get_inst_count(cluster, 'IMPALA_STATESTORE')
    id_count = cls.get_inst_count(cluster, 'IMPALAD')
    dn_count = cls.get_inst_count(cluster, 'HDFS_DATANODE')
    hms_count = cls.get_inst_count(cluster, 'HIVE_METASTORE')
    if ics_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_CATALOGSERVER',
                                                _('0 or 1'), ics_count)
    if iss_count > 1:
        raise ex.InvalidComponentCountException('IMPALA_STATESTORE',
                                                _('0 or 1'), iss_count)
    if ics_count == 1:
        datanode_ng = u.get_node_groups(cluster, "HDFS_DATANODE")
        impalad_ng = u.get_node_groups(cluster, "IMPALAD")
        datanodes = set(ng.id for ng in datanode_ng)
        impalads = set(ng.id for ng in impalad_ng)
        if datanodes != impalads:
            raise ex.InvalidClusterTopology(
                _("IMPALAD must be installed on every HDFS_DATANODE"))
        if iss_count != 1:
            raise ex.RequiredServiceMissingException('IMPALA_STATESTORE',
                                                     required_by='IMPALA')
        if id_count < 1:
            raise ex.RequiredServiceMissingException('IMPALAD',
                                                     required_by='IMPALA')
        if dn_count < 1:
            raise ex.RequiredServiceMissingException('HDFS_DATANODE',
                                                     required_by='IMPALA')
        if hms_count < 1:
            raise ex.RequiredServiceMissingException('HIVE_METASTORE',
                                                     required_by='IMPALA')
def _yarn_ha_validation(cls, cluster):
    rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
    zk_count = cls.get_inst_count(cluster, 'ZOOKEEPER_SERVER')
    stdb_rm_count = cls.get_inst_count(cluster, 'YARN_STANDBYRM')
    require_anti_affinity = cls.PU.c_helper.get_required_anti_affinity(
        cluster)
    if stdb_rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_STANDBYRM',
                                                _('0 or 1'), stdb_rm_count)
    if stdb_rm_count > 0:
        if rm_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_RESOURCEMANAGER', required_by='RM HA')
        if zk_count < 1:
            raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                     required_by='RM HA')
        if require_anti_affinity:
            if 'YARN_RESOURCEMANAGER' not in \
                    cls._get_anti_affinity(cluster):
                raise ex.ResourceManagerHAConfigurationError(
                    _('YARN_RESOURCEMANAGER should be enabled in '
                      'anti_affinity.'))
            if 'YARN_STANDBYRM' not in cls._get_anti_affinity(cluster):
                raise ex.ResourceManagerHAConfigurationError(
                    _('YARN_STANDBYRM should be'
                      ' enabled in anti_affinity.'))
def _get_ha_params():
    enable_namenode_ha = provisioning.Config(
        name=common.NAMENODE_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable NameNode HA"),
        priority=1)

    enable_resourcemanager_ha = provisioning.Config(
        name=common.RESOURCEMANAGER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable ResourceManager HA"),
        priority=1)

    enable_regionserver_ha = provisioning.Config(
        name=common.HBASE_REGIONSERVER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable HBase RegionServer HA"),
        priority=1)

    return [enable_namenode_ha,
            enable_resourcemanager_ha,
            enable_regionserver_ha]
def _validate_existing_ng_scaling(self, cluster, existing):
    scalable_processes = self._get_scalable_processes()
    dn_to_delete = 0
    for ng in cluster.node_groups:
        if ng.id in existing:
            if ng.count > existing[ng.id] and ("datanode" in
                                               ng.node_processes):
                dn_to_delete += ng.count - existing[ng.id]
            if not set(ng.node_processes).issubset(scalable_processes):
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, _("Spark plugin cannot scale nodegroup"
                               " with processes: %s")
                    % ' '.join(ng.node_processes))

    dn_amount = len(utils.get_instances(cluster, "datanode"))
    rep_factor = utils.get_config_value_or_default('HDFS',
                                                   "dfs.replication",
                                                   cluster)

    if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
        raise ex.ClusterCannotBeScaled(
            cluster.name, _("Spark plugin cannot shrink cluster because "
                            "there would be not enough nodes for HDFS "
                            "replicas (replication factor is %s)")
            % rep_factor)
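# The shrink check above is simple arithmetic: datanodes may be removed only
# if the remaining count still covers the HDFS replication factor.  A
# standalone illustration (hypothetical helper, not plugin code):
def can_remove_datanodes(current_datanodes, to_delete, replication_factor):
    return current_datanodes - to_delete >= replication_factor

# e.g. with 4 datanodes and dfs.replication = 3, removing 1 node is allowed
# (4 - 1 >= 3), but removing 2 is not (4 - 2 < 3).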
def wait_ambari_requests(self, requests, cluster_name):
    requests = set(requests)
    failed = []
    context.sleep(20)
    while len(requests) > 0:
        completed, not_completed = set(), set()
        for req_id in requests:
            request = self.get_request_info(cluster_name, req_id)
            status = request.get("request_status")
            if status == 'COMPLETED':
                completed.add(req_id)
            elif status in ['IN_PROGRESS', 'PENDING']:
                not_completed.add(req_id)
            else:
                failed.append(request)
        if failed:
            msg = _("Some Ambari request(s) "
                    "not in COMPLETED state: %(description)s.")
            descrs = []
            for req in failed:
                descr = _(
                    "request %(id)d: %(name)s - in status %(status)s")
                descrs.append(descr %
                              {'id': req.get("id"),
                               'name': req.get("request_context"),
                               'status': req.get("request_status")})
            raise p_exc.HadoopProvisionError(msg % {'description': descrs})
        requests = not_completed
        context.sleep(5)
        LOG.debug("Waiting for %d ambari request(s) to be completed",
                  len(not_completed))
    LOG.debug("All ambari requests have been completed")
def get_service(self, node_process):
    ui_name = self.get_service_name_by_node_process(node_process)
    if ui_name is None:
        raise e.PluginInvalidDataException(
            _('Service not found in services list'))
    version = self.get_chosen_service_version(ui_name)
    service = self._find_service_instance(ui_name, version)
    if service is None:
        raise e.PluginInvalidDataException(_('Cannot map service'))
    return service
def _check_jn_ha(cluster):
    jn_count = utils.get_instances_count(cluster, common.JOURNAL_NODE)
    if jn_count < 3:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("3 or more. Odd number"), jn_count,
            _("At least 3 JournalNodes are required for HA"))
    if jn_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("Odd number"), jn_count,
            _("An odd number of JournalNodes is required for HA"))
def _check_zk_ha(cluster):
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if zk_count < 3:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("3 or more. Odd number"), zk_count,
            _("At least 3 ZooKeepers are required for HA"))
    if zk_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("Odd number"), zk_count,
            _("An odd number of ZooKeepers is required for HA"))
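# Both HA checks above enforce the same quorum rule: at least three members
# and an odd count, so a majority survives (count - 1) // 2 failures.  A
# minimal standalone predicate (hypothetical helper, not part of the plugin)
# capturing that rule:
def is_valid_quorum_size(count):
    """Return True for quorum sizes that tolerate (count - 1) // 2 failures."""
    return count >= 3 and count % 2 == 1

# e.g. is_valid_quorum_size(3) -> True, is_valid_quorum_size(4) -> False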
def stop(self, cluster_context, instances=None):
    instances = instances or cluster_context.get_instances()
    zookeepers = cluster_context.filter_instances(instances, mng.ZOOKEEPER)
    utils.add_provisioning_step(cluster_context.cluster.id,
                                _("Stop ZooKeeper nodes"), len(zookeepers))
    self._stop_zk_nodes(zookeepers)
    utils.add_provisioning_step(cluster_context.cluster.id,
                                _("Stop Warden nodes"), len(instances))
    self._stop_warden_on_nodes(instances)
def _basic_validation(cls, cluster):
    mng_count = cls.get_inst_count(cluster, 'CLOUDERA_MANAGER')
    if mng_count != 1:
        raise ex.InvalidComponentCountException('CLOUDERA_MANAGER',
                                                1, mng_count)

    nn_count = cls.get_inst_count(cluster, 'HDFS_NAMENODE')
    if nn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_NAMENODE',
                                                1, nn_count)

    snn_count = cls.get_inst_count(cluster, 'HDFS_SECONDARYNAMENODE')
    if snn_count != 1:
        raise ex.InvalidComponentCountException('HDFS_SECONDARYNAMENODE',
                                                1, snn_count)

    dn_count = cls.get_inst_count(cluster, 'HDFS_DATANODE')
    replicas = cls.PU.get_config_value('HDFS', 'dfs_replication', cluster)
    if dn_count < replicas:
        raise ex.InvalidComponentCountException(
            'HDFS_DATANODE', replicas, dn_count,
            _('Number of datanodes must not be less than dfs_replication.'))

    du_reserved = cls.PU.get_config_value('DATANODE',
                                          'dfs_datanode_du_reserved',
                                          cluster)
    du_reserved = du_reserved / 1073741824.
    for node_group in cluster.node_groups:
        volume_size = node_group.volumes_size
        if volume_size and volume_size < du_reserved:
            raise ex.InvalidVolumeSizeException(volume_size, du_reserved)

    rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
    if rm_count > 1:
        raise ex.InvalidComponentCountException('YARN_RESOURCEMANAGER',
                                                _('0 or 1'), rm_count)

    hs_count = cls.get_inst_count(cluster, 'YARN_JOBHISTORY')
    if hs_count > 1:
        raise ex.InvalidComponentCountException('YARN_JOBHISTORY',
                                                _('0 or 1'), hs_count)

    if rm_count > 0 and hs_count < 1:
        raise ex.RequiredServiceMissingException(
            'YARN_JOBHISTORY', required_by='YARN_RESOURCEMANAGER')

    nm_count = cls.get_inst_count(cluster, 'YARN_NODEMANAGER')
    if rm_count == 0:
        if nm_count > 0:
            raise ex.RequiredServiceMissingException(
                'YARN_RESOURCEMANAGER', required_by='YARN_NODEMANAGER')
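# Note on the arithmetic above: dfs_datanode_du_reserved is configured in
# bytes, while node_group.volumes_size is expressed in GB, so the reserved
# space is divided by 1073741824 (1024 ** 3) before the comparison.  A tiny
# illustrative check (standalone sketch, not plugin code):
def volume_large_enough(volume_size_gb, du_reserved_bytes):
    return volume_size_gb >= du_reserved_bytes / 1024 ** 3

# e.g. volume_large_enough(20, 10 * 1024 ** 3) -> True  (20 GB vs 10 GiB)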
class NodeRequiredServiceMissingException(e.RequiredServiceMissingException):
    MISSING_MSG = _('Node "%(ng_name)s" is missing component %(component)s')
    REQUIRED_MSG = _('%(message)s, required by %(required_by)s')

    def __init__(self, service_name, ng_name, required_by=None):
        super(NodeRequiredServiceMissingException, self).__init__(
            service_name, required_by)
        args = {'ng_name': ng_name, 'component': service_name}
        self.message = (
            NodeRequiredServiceMissingException.MISSING_MSG % args)
        if required_by:
            args = {'message': self.message, 'required_by': required_by}
            self.message = (
                NodeRequiredServiceMissingException.REQUIRED_MSG % args)
def get_service_by_role(self, role, cluster=None, instance=None):
    if cluster:
        cm_cluster = self.get_cloudera_cluster(cluster)
    elif instance:
        cm_cluster = self.get_cloudera_cluster(instance.cluster)
    else:
        raise ValueError(_("'cluster' or 'instance' argument missing"))

    if role in ['NAMENODE', 'DATANODE', 'SECONDARYNAMENODE',
                'HDFS_GATEWAY']:
        return cm_cluster.get_service(self.HDFS_SERVICE_NAME)
    elif role in ['RESOURCEMANAGER', 'NODEMANAGER', 'JOBHISTORY',
                  'YARN_GATEWAY']:
        return cm_cluster.get_service(self.YARN_SERVICE_NAME)
    elif role in ['OOZIE_SERVER']:
        return cm_cluster.get_service(self.OOZIE_SERVICE_NAME)
    elif role in ['HIVESERVER2', 'HIVEMETASTORE', 'WEBHCAT']:
        return cm_cluster.get_service(self.HIVE_SERVICE_NAME)
    elif role in ['HUE_SERVER']:
        return cm_cluster.get_service(self.HUE_SERVICE_NAME)
    elif role in ['SPARK_YARN_HISTORY_SERVER']:
        return cm_cluster.get_service(self.SPARK_SERVICE_NAME)
    elif role in ['SERVER']:
        return cm_cluster.get_service(self.ZOOKEEPER_SERVICE_NAME)
    elif role in ['MASTER', 'REGIONSERVER']:
        return cm_cluster.get_service(self.HBASE_SERVICE_NAME)
    elif role in ['AGENT']:
        return cm_cluster.get_service(self.FLUME_SERVICE_NAME)
    elif role in ['SENTRY_SERVER']:
        return cm_cluster.get_service(self.SENTRY_SERVICE_NAME)
    elif role in ['SQOOP_SERVER']:
        return cm_cluster.get_service(self.SQOOP_SERVICE_NAME)
    elif role in ['SOLR_SERVER']:
        return cm_cluster.get_service(self.SOLR_SERVICE_NAME)
    elif role in ['HBASE_INDEXER']:
        return cm_cluster.get_service(self.KS_INDEXER_SERVICE_NAME)
    elif role in ['CATALOGSERVER', 'STATESTORE', 'IMPALAD', 'LLAMA']:
        return cm_cluster.get_service(self.IMPALA_SERVICE_NAME)
    elif role in ['KMS']:
        return cm_cluster.get_service(self.KMS_SERVICE_NAME)
    elif role in ['JOURNALNODE']:
        return cm_cluster.get_service(self.HDFS_SERVICE_NAME)
    elif role in ['YARN_STANDBYRM']:
        return cm_cluster.get_service(self.YARN_SERVICE_NAME)
    elif role in ['KAFKA_BROKER']:
        return cm_cluster.get_service(self.KAFKA_SERVICE_NAME)
    else:
        raise ValueError(
            _("Process %(process)s is not supported by CDH plugin")
            % {'process': role})
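# The long elif chain above is essentially a lookup table.  A possible
# table-driven alternative is sketched below; the ROLE_TO_SERVICE_ATTR dict,
# the lookup_service helper, and the reuse of the *_SERVICE_NAME attribute
# names are illustrative assumptions only, not how the CDH plugin is
# actually organized.
ROLE_TO_SERVICE_ATTR = {
    'NAMENODE': 'HDFS_SERVICE_NAME',
    'DATANODE': 'HDFS_SERVICE_NAME',
    'SECONDARYNAMENODE': 'HDFS_SERVICE_NAME',
    'JOURNALNODE': 'HDFS_SERVICE_NAME',
    'RESOURCEMANAGER': 'YARN_SERVICE_NAME',
    'NODEMANAGER': 'YARN_SERVICE_NAME',
    'YARN_STANDBYRM': 'YARN_SERVICE_NAME',
    'OOZIE_SERVER': 'OOZIE_SERVICE_NAME',
    # ... remaining roles omitted for brevity
}

def lookup_service(api, cm_cluster, role):
    # api is the object holding the *_SERVICE_NAME attributes
    attr = ROLE_TO_SERVICE_ATTR.get(role)
    if attr is None:
        raise ValueError("Process %s is not supported" % role)
    return cm_cluster.get_service(getattr(api, attr))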
def _check_hive(cluster):
    hs_count = utils.get_instances_count(cluster, common.HIVE_SERVER)
    hm_count = utils.get_instances_count(cluster, common.HIVE_METASTORE)
    if hs_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_SERVER,
                                                _("0 or 1"), hs_count)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_METASTORE,
                                                _("0 or 1"), hm_count)
    if hs_count == 0 and hm_count == 1:
        raise ex.RequiredServiceMissingException(
            common.HIVE_SERVER, required_by=common.HIVE_METASTORE)
    if hs_count == 1 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HIVE_METASTORE, required_by=common.HIVE_SERVER)
def _check_ranger(cluster):
    ra_count = utils.get_instances_count(cluster, common.RANGER_ADMIN)
    ru_count = utils.get_instances_count(cluster, common.RANGER_USERSYNC)
    if ra_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_ADMIN,
                                                _("0 or 1"), ra_count)
    if ru_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_USERSYNC,
                                                _("0 or 1"), ru_count)
    if ra_count == 1 and ru_count == 0:
        raise ex.RequiredServiceMissingException(
            common.RANGER_USERSYNC, required_by=common.RANGER_ADMIN)
    if ra_count == 0 and ru_count == 1:
        raise ex.RequiredServiceMissingException(
            common.RANGER_ADMIN, required_by=common.RANGER_USERSYNC)
def validate_additional_ng_scaling(cluster, additional):
    rm = vu.get_resourcemanager(cluster)
    scalable_processes = _get_scalable_processes()

    for ng_id in additional:
        ng = u.get_by_id(cluster.node_groups, ng_id)
        if not set(ng.node_processes).issubset(scalable_processes):
            msg = _("Vanilla plugin cannot scale nodegroup "
                    "with processes: %s")
            raise ex.NodeGroupCannotBeScaled(
                ng.name, msg % ' '.join(ng.node_processes))
        if not rm and 'nodemanager' in ng.node_processes:
            msg = _("Vanilla plugin cannot scale a node group with "
                    "processes whose master processes are not running "
                    "in the cluster")
            raise ex.NodeGroupCannotBeScaled(ng.name, msg)
def _check_decommission(cluster, instances, check_func, option):
    utils.plugin_option_poll(
        cluster, is_decommissioned, option, _("Wait for decommissioning"),
        5, {'cluster': cluster, 'check_func': check_func,
            'instances': instances})
def validate_job_execution(self, cluster, job, data):
    if not self.edp_supported(cluster.hadoop_version):
        raise ex.InvalidDataException(
            _('Storm {base} required to run {type} jobs').format(
                base=EdpPyleusEngine.edp_base_version, type=job.type))
    super(EdpPyleusEngine, self).validate_job_execution(cluster, job, data)
def invoke(self, method, relpath=None, params=None, data=None,
           headers=None):
    """Invoke an API method.

    :return: Raw body or JSON dictionary (if response content type is
             JSON).
    """
    path = self._join_uri(relpath)
    resp = self._client.execute(method, path, params=params, data=data,
                                headers=headers)
    try:
        body = resp.read()
    except Exception as e:
        # do not shadow the exceptions module (imported as 'ex') with the
        # caught exception
        raise ex.CMApiException(
            _("Command %(method)s %(path)s failed: %(msg)s")
            % {'method': method, 'path': path,
               'msg': six.text_type(e)})

    LOG.debug("{method} got response: {body}".format(method=method,
                                                     body=body[:32]))
    # Is the response application/json?
    if (len(body) != 0 and resp.info().getmaintype() == "application"
            and resp.info().getsubtype() == "json"):
        try:
            json_dict = json.loads(body)
            return json_dict
        except Exception:
            LOG.error('JSON decode error: {body}'.format(body=body))
            raise
    else:
        return body
def get(self, relpath=None, params=None):
    """Invoke the GET method on a resource.

    :param relpath: Optional. A relative path to this resource's path.
    :param params: Key-value data.

    :return: A dictionary of the JSON result.
    """
    for retry in six.moves.xrange(self.retries + 1):
        if retry:
            context.sleep(self.retry_sleep)
        try:
            return self.invoke("GET", relpath, params)
        except (socket.error, urllib.error.URLError) as e:
            if "timed out" in six.text_type(e).lower():
                if retry < self.retries:
                    LOG.warning("Timeout issuing GET request for "
                                "{path}. Will retry".format(
                                    path=self._join_uri(relpath)))
                else:
                    LOG.warning("Timeout issuing GET request for "
                                "{path}. No retries left".format(
                                    path=self._join_uri(relpath)))
            else:
                raise
    else:
        raise ex.CMApiException(_("Get retry max time reached."))
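# The method above relies on Python's for/else: the else clause runs only
# when the loop exhausts every attempt without returning.  A self-contained
# sketch of the same retry shape (fetch() and RetryError are hypothetical
# stand-ins, not Cloudera Manager client APIs):
import time

class RetryError(Exception):
    pass

def get_with_retries(fetch, retries=3, retry_sleep=1):
    for attempt in range(retries + 1):
        if attempt:
            time.sleep(retry_sleep)
        try:
            return fetch()
        except TimeoutError:
            # swallow timeouts until the attempts are used up
            continue
    else:
        raise RetryError("GET retry limit reached")

# e.g. get_with_retries(lambda: "ok") -> "ok"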
def validate_additional_ng_scaling(cls, cluster, additional):
    rm = cls.PU.get_resourcemanager(cluster)
    scalable_processes = cls._get_scalable_processes()

    for ng_id in additional:
        ng = u.get_by_id(cluster.node_groups, ng_id)
        if not set(ng.node_processes).issubset(scalable_processes):
            msg = _("CDH plugin cannot scale nodegroup with processes: "
                    "%(processes)s")
            raise ex.NodeGroupCannotBeScaled(
                ng.name,
                msg % {'processes': ' '.join(ng.node_processes)})
        if not rm and 'YARN_NODEMANAGER' in ng.node_processes:
            msg = _("CDH plugin cannot scale a node group with "
                    "processes whose master processes are not running "
                    "in the cluster")
            raise ex.NodeGroupCannotBeScaled(ng.name, msg)
def _check_yarn(cluster):
    rm_count = utils.get_instances_count(cluster, common.RESOURCEMANAGER)
    nm_count = utils.get_instances_count(cluster, common.NODEMANAGER)
    hs_count = utils.get_instances_count(cluster, common.HISTORYSERVER)
    at_count = utils.get_instances_count(cluster,
                                         common.APP_TIMELINE_SERVER)

    if cluster.cluster_configs.get("general", {}).get(
            common.RESOURCEMANAGER_HA):
        _check_zk_ha(cluster)

        if rm_count != 2:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER,
                                                    2, rm_count)
    else:
        if rm_count != 1:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER,
                                                    1, rm_count)
    if hs_count != 1:
        raise ex.InvalidComponentCountException(common.HISTORYSERVER,
                                                1, hs_count)
    if at_count != 1:
        raise ex.InvalidComponentCountException(common.APP_TIMELINE_SERVER,
                                                1, at_count)
    if nm_count == 0:
        raise ex.InvalidComponentCountException(common.NODEMANAGER,
                                                _("1 or more"), nm_count)
def check_health(self):
    instances = self.cluster_context.get_instances(
        node_process=management.ZOOKEEPER)
    active_count = 0
    for instance in instances:
        if self._is_zookeeper_running(instance):
            active_count += 1
    if active_count == 0:
        raise health_check_base.RedHealthError(
            _("ZooKeeper is not in running state"))
    if active_count < len(instances):
        raise health_check_base.YellowHealthError(
            _("Some ZooKeeper processes are not in running state"))
    return _("ZooKeeper is in running state")