def validate_additional_ng_scaling(cluster, additional):
    rm = vu.get_resourcemanager(cluster)
    scalable_processes = _get_scalable_processes()

    for ng_id in additional:
        ng = u.get_by_id(cluster.node_groups, ng_id)
        if not set(ng.node_processes).issubset(scalable_processes):
            msg = _("Vanilla plugin cannot scale nodegroup "
                    "with processes: %s")
            raise ex.NodeGroupCannotBeScaled(
                ng.name, msg % ' '.join(ng.node_processes))

        if not rm and 'nodemanager' in ng.node_processes:
            msg = _("Vanilla plugin cannot scale node group with processes "
                    "which have no master-processes run in cluster")
            raise ex.NodeGroupCannotBeScaled(ng.name, msg)
def _check_decommission(cluster, instances, check_func, option):
    utils.plugin_option_poll(
        cluster, is_decommissioned, option, _("Wait for decommissioning"),
        5, {'cluster': cluster, 'check_func': check_func,
            'instances': instances})
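# `is_decommissioned`, the poll predicate passed above, is not part of this
# fragment. A minimal sketch of such a predicate, assuming `check_func(cluster)`
# returns a mapping of instance FQDNs to their decommission status (an
# assumption for illustration, not necessarily the plugin's implementation):
def is_decommissioned(cluster, check_func, instances):
    statuses = check_func(cluster)
    for instance in instances:
        if statuses.get(instance.fqdn()) != 'decommissioned':
            return False
    return True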
def validate_job_execution(self, cluster, job, data):
    if (not self.edp_supported(cluster.hadoop_version) or
            not v_utils.get_spark_history_server(cluster)):
        raise ex.PluginInvalidDataException(
            _('Spark {base} or higher required to run {type} jobs').format(
                base=EdpSparkEngine.edp_base_version, type=job.type))

    super(EdpSparkEngine, self).validate_job_execution(cluster, job, data)
def configure_instances(pctx, instances):
    if len(instances) == 0:
        return

    utils.add_provisioning_step(
        instances[0].cluster_id, _("Configure instances"), len(instances))

    for instance in instances:
        with context.set_current_instance_id(instance.instance_id):
            _configure_instance(pctx, instance)
def await_datanodes(cluster):
    datanodes_count = len(vu.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    l_message = _("Waiting on %s datanodes to start up") % datanodes_count
    with vu.get_namenode(cluster).remote() as r:
        utils.plugin_option_poll(
            cluster, _check_datanodes_count,
            config_helper.DATANODES_STARTUP_TIMEOUT, l_message, 1, {
                'remote': r, 'count': datanodes_count})
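# `_check_datanodes_count`, the poll predicate used above, is not shown in
# this fragment. A rough sketch, assuming it runs "hdfs dfsadmin -report" on
# the namenode and parses the live-datanode count; the exact shell pipeline
# is an illustration, not necessarily the command the plugin uses:
def _check_datanodes_count(remote, count):
    if count < 1:
        return True
    exit_code, stdout = remote.execute_command(
        'sudo su -lc "hdfs dfsadmin -report" hadoop | '
        "grep 'Live datanodes' | grep -o '[0-9]\\+' | head -n 1")
    return exit_code == 0 and bool(stdout.strip()) and int(stdout) == count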
def validate_existing_ng_scaling(pctx, cluster, existing):
    scalable_processes = _get_scalable_processes()
    dn_to_delete = 0
    for ng in cluster.node_groups:
        if ng.id in existing:
            if ng.count > existing[ng.id] and "datanode" in ng.node_processes:
                dn_to_delete += ng.count - existing[ng.id]

            if not set(ng.node_processes).issubset(scalable_processes):
                msg = _("Vanilla plugin cannot scale nodegroup "
                        "with processes: %s")
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, msg % ' '.join(ng.node_processes))

    dn_amount = len(vu.get_datanodes(cluster))
    rep_factor = cu.get_config_value(pctx, 'HDFS', 'dfs.replication', cluster)

    if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
        msg = _("Vanilla plugin cannot shrink cluster because it would be "
                "not enough nodes for replicas (replication factor is %s)")
        raise ex.ClusterCannotBeScaled(cluster.name, msg % rep_factor)
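# Both scaling validators above depend on `_get_scalable_processes`, which is
# not included in this fragment. A plausible sketch, assuming only HDFS
# datanodes and YARN nodemanagers may be added or removed on a live cluster:
def _get_scalable_processes():
    return ['datanode', 'nodemanager']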
def test_check_decommission(self, plugin_option_poll):
    check_func = mock.Mock()
    option = mock.Mock()
    is_dec = scaling.is_decommissioned
    mess = _("Wait for decommissioning")
    sample_dict = {'cluster': self.cluster,
                   'check_func': check_func,
                   'instances': self.instances}

    scaling._check_decommission(self.cluster, self.instances,
                                check_func, option)

    plugin_option_poll.assert_called_once_with(self.cluster, is_dec, option,
                                               mess, 5, sample_dict)
def get_config_value(pctx, service, name, cluster=None):
    if cluster:
        for ng in cluster.node_groups:
            cl_param = ng.configuration().get(service, {}).get(name)
            if cl_param is not None:
                return cl_param

    for c in pctx['all_confs']:
        if c.applicable_target == service and c.name == name:
            return c.default_value

    raise ex.PluginNotFoundException(
        {"name": name, "service": service},
        _("Unable to get parameter '%(name)s' from service %(service)s"))
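# Example of how get_config_value is used by the validators above: a value set
# on any node group overrides the plugin-wide default held in pctx. `pctx` and
# `cluster` here are whatever plugin context and cluster object the caller
# already has.
rep_factor = get_config_value(pctx, 'HDFS', 'dfs.replication', cluster)
# With no cluster argument, only the plugin defaults in pctx are consulted:
default_rep = get_config_value(pctx, 'HDFS', 'dfs.replication')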
def validate_zookeeper_node_count(zk_ng, existing, additional):
    zk_amount = 0
    for ng in zk_ng:
        if ng.id in existing:
            zk_amount += existing[ng.id]
        else:
            zk_amount += ng.count

    for ng_id in additional:
        ng = u.get_by_id(zk_ng, ng_id)
        if "zookeeper" in ng.node_processes:
            zk_amount += ng.count

    if (zk_amount % 2) != 1:
        msg = _("Vanilla plugin cannot scale cluster because it must keep"
                " zookeeper service in odd.")
        raise ex.ClusterCannotBeScaled(zk_ng[0].cluster.name, msg)
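# Illustration of the odd-count rule enforced above: ZooKeeper relies on a
# majority quorum, so the plugin only accepts ensembles of 1, 3, 5, ... nodes.
# Scaling an ensemble from 3 to 4 ZooKeeper instances is therefore rejected,
# while scaling from 3 to 5 passes the check.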
def test_await_datanodes(self, plugin_option_poll, add_provisioning_step,
                         check_cluster_exists, get_datanodes, get_namenode):
    cluster = mock.Mock()
    get_datanodes.return_value = ['node1']
    r = mock.Mock()
    remote = mock.Mock(return_value=r)
    remote.__enter__ = remote
    remote.__exit__ = mock.Mock()
    namenode = mock.Mock()
    namenode.remote.return_value = remote
    get_namenode.return_value = namenode
    mess = _('Waiting on 1 datanodes to start up')
    test_data = {'remote': r, 'count': 1}
    timeout = config_helper.DATANODES_STARTUP_TIMEOUT

    rs.await_datanodes(cluster)

    get_datanodes.assert_called_once_with(cluster)
    get_namenode.assert_called_once_with(cluster)
    plugin_option_poll.assert_called_once_with(
        cluster, rs._check_datanodes_count, timeout, mess, 1, test_data)
    config.configure_topology_data(pctx, cluster)
    run.start_dn_nm_processes(instances)
    swift_helper.install_ssl_certs(instances)
    config.configure_zookeeper(cluster)
    run.refresh_zk_servers(cluster)


def _get_instances_with_service(instances, service):
    return [instance for instance in instances
            if service in instance.node_group.node_processes]


@utils.event_wrapper(
    True, step=_("Update include files"), param=('cluster', 0))
def _update_include_files(cluster, dec_instances=None):
    dec_instances = dec_instances or []
    dec_instances_ids = [instance.id for instance in dec_instances]

    instances = utils.get_instances(cluster)

    inst_filter = lambda inst: inst.id not in dec_instances_ids

    datanodes = filter(inst_filter, vu.get_datanodes(cluster))
    nodemanagers = filter(inst_filter, vu.get_nodemanagers(cluster))

    dn_hosts = utils.generate_fqdn_host_names(datanodes)
    nm_hosts = utils.generate_fqdn_host_names(nodemanagers)
    for instance in instances:
        with instance.remote() as r:
def format_namenode(instance):
    instance.remote().execute_command(
        'sudo su - -c "hdfs namenode -format" hadoop')


@utils.event_wrapper(
    True, step=_("Refresh %s nodes") % "HDFS", param=('cluster', 0))
def refresh_hadoop_nodes(cluster):
    nn = vu.get_namenode(cluster)
    nn.remote().execute_command(
        'sudo su - -c "hdfs dfsadmin -refreshNodes" hadoop')


@utils.event_wrapper(
    True, step=_("Refresh %s nodes") % "YARN", param=('cluster', 0))
def refresh_yarn_nodes(cluster):
    rm = vu.get_resourcemanager(cluster)
    rm.remote().execute_command(
        'sudo su - -c "yarn rmadmin -refreshNodes" hadoop')


def _oozie_share_lib(remote):
    LOG.debug("Sharing Oozie libs")
    # remote.execute_command('sudo su - -c "/opt/oozie/bin/oozie-setup.sh '
    #                        'sharelib create -fs hdfs://%s:8020" hadoop'
    #                        % nn_hostname)

    # TODO(alazarev) return 'oozie-setup.sh sharelib create' back
    # when #1262023 is resolved
                                               '/hdfs/datanode')
    dirs['hadoop_log_dir'] = _make_hadoop_paths(
        storage_paths, '/hadoop/logs')[0]
    dirs['hadoop_secure_dn_log_dir'] = _make_hadoop_paths(
        storage_paths, '/hadoop/logs/secure')[0]
    dirs['yarn_log_dir'] = _make_hadoop_paths(
        storage_paths, '/yarn/logs')[0]

    return dirs


def _make_hadoop_paths(paths, hadoop_dir):
    return [path + hadoop_dir for path in paths]


@utils.event_wrapper(
    True, step=_("Configure topology data"), param=('cluster', 1))
def configure_topology_data(pctx, cluster):
    if config_helper.is_data_locality_enabled(pctx, cluster):
        LOG.warning("Node group awareness is not implemented in YARN yet "
                    "so enable_hypervisor_awareness set to False explicitly")
        tpl_map = th.generate_topology_map(cluster, is_node_awareness=False)
        topology_data = "\n".join(
            [k + " " + v for k, v in tpl_map.items()]) + "\n"
        for ng in cluster.node_groups:
            for i in ng.instances:
                i.remote().write_file_to(HADOOP_CONF_DIR + "/topology.data",
                                         topology_data, run_as_root=True)
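# Small, self-contained illustration of the topology.data content produced by
# configure_topology_data above; the host names and rack paths below are
# invented, and the real map comes from th.generate_topology_map.
tpl_map = {'cluster-worker-001': '/rack1', 'cluster-worker-002': '/rack2'}
topology_data = "\n".join(k + " " + v for k, v in tpl_map.items()) + "\n"
# topology_data == "cluster-worker-001 /rack1\ncluster-worker-002 /rack2\n"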
def validate_cluster_creating(pctx, cluster):
    nn_count = _get_inst_count(cluster, 'namenode')
    if nn_count != 1:
        raise ex.InvalidComponentCountException('namenode', 1, nn_count)

    snn_count = _get_inst_count(cluster, 'secondarynamenode')
    if snn_count > 1:
        raise ex.InvalidComponentCountException('secondarynamenode',
                                                _('0 or 1'), snn_count)

    rm_count = _get_inst_count(cluster, 'resourcemanager')
    if rm_count > 1:
        raise ex.InvalidComponentCountException('resourcemanager',
                                                _('0 or 1'), rm_count)

    hs_count = _get_inst_count(cluster, 'historyserver')
    if hs_count > 1:
        raise ex.InvalidComponentCountException('historyserver',
                                                _('0 or 1'), hs_count)

    nm_count = _get_inst_count(cluster, 'nodemanager')
    if rm_count == 0:
        if nm_count > 0:
            raise ex.RequiredServiceMissingException(
                'resourcemanager', required_by='nodemanager')

    oo_count = _get_inst_count(cluster, 'oozie')
    dn_count = _get_inst_count(cluster, 'datanode')
    if oo_count > 1:
        raise ex.InvalidComponentCountException('oozie', _('0 or 1'),
                                                oo_count)

    if oo_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException('datanode',
                                                     required_by='oozie')

        if nm_count < 1:
            raise ex.RequiredServiceMissingException('nodemanager',
                                                     required_by='oozie')

        if hs_count != 1:
            raise ex.RequiredServiceMissingException('historyserver',
                                                     required_by='oozie')

    spark_hist_count = _get_inst_count(cluster, 'spark history server')
    if spark_hist_count > 1:
        raise ex.InvalidComponentCountException('spark history server',
                                                _('0 or 1'),
                                                spark_hist_count)

    rep_factor = cu.get_config_value(pctx, 'HDFS', 'dfs.replication', cluster)
    if dn_count < rep_factor:
        raise ex.InvalidComponentCountException(
            'datanode', rep_factor, dn_count,
            _('Number of datanodes must be not less than dfs.replication.'))

    hive_count = _get_inst_count(cluster, 'hiveserver')
    if hive_count > 1:
        raise ex.InvalidComponentCountException('hive', _('0 or 1'),
                                                hive_count)

    zk_count = _get_inst_count(cluster, 'zookeeper')
    if zk_count > 0 and (zk_count % 2) != 1:
        raise ex.InvalidComponentCountException(
            'zookeeper', _('odd'), zk_count,
            _('Number of zookeeper nodes should be in odd.'))
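# `_get_inst_count`, used throughout the validator above, is not part of this
# fragment. A minimal sketch, assuming it simply sums the instance counts of
# all node groups that include the given process:
def _get_inst_count(cluster, process):
    return sum(ng.count for ng in cluster.node_groups
               if process in ng.node_processes)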
def get_description(self):
    return _('The Apache Vanilla plugin provides the ability to launch '
             'upstream Vanilla Apache Hadoop cluster without any '
             'management consoles. It can also deploy the Oozie '
             'component.')