def start_secondarynamenodes(self, cluster):
    snns = vu.get_secondarynamenodes(cluster)
    if len(snns) == 0:
        return
    cpo.add_provisioning_step(
        cluster.id,
        utils.start_process_event_message("SecondaryNameNodes"),
        len(snns))
    for snn in snns:
        self._start_secondarynamenode(snn)
def start_zk_server(instances):
    utils.add_provisioning_step(
        instances[0].cluster_id,
        utils.start_process_event_message("ZooKeeper"),
        len(instances))
    with context.PluginsThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                tg.spawn('ZK-start-processes-%s' % instance.instance_name,
                         _start_zk_processes, instance, 'start')
def _start_slave_processes(self, sl_instances):
    if len(sl_instances) == 0:
        return
    cpo.add_provisioning_step(
        sl_instances[0].cluster_id,
        utils.start_process_event_message("Slave"),
        len(sl_instances))
    with context.ThreadGroup() as tg:
        for i in sl_instances:
            tg.spawn("storm-start-sl-%s" % i.instance_name,
                     self._start_slaves, i)
def start_secondarynamenodes(self, cluster):
    snns = vu.get_secondarynamenodes(cluster)
    if len(snns) == 0:
        return
    cpo.add_provisioning_step(
        snns[0].cluster_id,
        utils.start_process_event_message("SecondaryNameNodes"),
        len(snns))
    for snn in vu.get_secondarynamenodes(cluster):
        self._start_secondarynamenode(snn)
def _start_slave_processes(self, sl_instances):
    if len(sl_instances) == 0:
        return
    utils.add_provisioning_step(
        sl_instances[0].cluster_id,
        utils.start_process_event_message("Slave"),
        len(sl_instances))
    with context.PluginsThreadGroup() as tg:
        for i in sl_instances:
            tg.spawn('storm-start-sl-%s' % i.instance_name,
                     self._start_slaves, i)
def _start_datanode_processes(self, dn_instances):
    if len(dn_instances) == 0:
        return
    cpo.add_provisioning_step(
        dn_instances[0].cluster_id,
        utils.start_process_event_message("DataNodes"),
        len(dn_instances))
    with context.ThreadGroup() as tg:
        for i in dn_instances:
            tg.spawn("spark-start-dn-%s" % i.instance_name,
                     self._start_datanode, i)
def _start_zookeeper_processes(self, zk_instances):
    if len(zk_instances) == 0:
        return
    cpo.add_provisioning_step(
        zk_instances[0].cluster_id,
        utils.start_process_event_message("Zookeeper"),
        len(zk_instances))
    with context.ThreadGroup() as tg:
        for i in zk_instances:
            tg.spawn("storm-start-zk-%s" % i.instance_name,
                     self._start_zookeeper, i)
def _start_datanode_processes(self, dn_instances):
    if len(dn_instances) == 0:
        return
    cpo.add_provisioning_step(
        dn_instances[0].cluster_id,
        utils.start_process_event_message("DataNodes"),
        len(dn_instances))
    with context.ThreadGroup() as tg:
        for i in dn_instances:
            tg.spawn('spark-start-dn-%s' % i.instance_name,
                     self._start_datanode, i)
def _start_zookeeper_processes(self, zk_instances):
    if len(zk_instances) == 0:
        return
    utils.add_provisioning_step(
        zk_instances[0].cluster_id,
        utils.start_process_event_message("Zookeeper"),
        len(zk_instances))
    with context.PluginsThreadGroup() as tg:
        for i in zk_instances:
            tg.spawn('storm-start-zk-%s' % i.instance_name,
                     self._start_zookeeper, i)
def refresh_zk_servers(cluster, to_delete_instances=None):
    instances = vu.get_zk_servers(cluster)
    if to_delete_instances:
        for instance in to_delete_instances:
            if instance in instances:
                instances.remove(instance)
    utils.add_provisioning_step(
        cluster.id,
        utils.start_process_event_message("ZooKeeper"),
        len(instances))
    with context.PluginsThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                tg.spawn('ZK-restart-processes-%s' % instance.instance_name,
                         _start_zk_processes, instance, 'restart')
def test_start_dn_nm_processes(self, instances_with_services,
                               add_provisioning_step,
                               set_current_instance_id, _start_processes):
    ins = mock.Mock()
    ins.cluster_id = '111'
    ins.instance_id = '123'
    ins.instance_name = 'ins_1'
    instances = [ins]
    instances_with_services.return_value = instances
    mess = pu.start_process_event_message('DataNodes, NodeManagers')
    ins.node_group.node_processes = ['datanode', 'test']

    rs.start_dn_nm_processes(instances)

    instances_with_services.assert_called_once_with(
        instances, ['datanode', 'nodemanager'])
    add_provisioning_step.assert_called_once_with('111', mess, 1)
    set_current_instance_id.assert_called_once_with('123')
    _start_processes.assert_called_once_with(ins, ['datanode'])
def _start_tt_dn_processes(self, instances):
    tt_dn_names = ["datanode", "tasktracker"]
    instances = utils.instances_with_services(instances, tt_dn_names)
    if not instances:
        return
    cpo.add_provisioning_step(
        instances[0].cluster_id,
        utils.start_process_event_message("DataNodes, TaskTrackers"),
        len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            processes = set(i.node_group.node_processes)
            tt_dn_procs = processes.intersection(tt_dn_names)
            tg.spawn("vanilla-start-tt-dn-%s" % i.instance_name,
                     self._start_tt_dn, i, list(tt_dn_procs))
def start_dn_nm_processes(instances):
    filternames = ['datanode', 'nodemanager']
    instances = pu.instances_with_services(instances, filternames)
    if len(instances) == 0:
        return
    cpo.add_provisioning_step(
        instances[0].cluster_id,
        pu.start_process_event_message("DataNodes, NodeManagers"),
        len(instances))
    with context.ThreadGroup() as tg:
        for instance in instances:
            processes = set(instance.node_group.node_processes)
            processes = processes.intersection(filternames)
            tg.spawn('vanilla-start-processes-%s' % instance.instance_name,
                     _start_processes, instance, list(processes))
def start_dn_nm_processes(instances):
    filternames = ['datanode', 'nodemanager']
    instances = pu.instances_with_services(instances, filternames)
    if len(instances) == 0:
        return
    cpo.add_provisioning_step(
        instances[0].cluster_id,
        pu.start_process_event_message("DataNodes, NodeManagers"),
        len(instances))
    with context.ThreadGroup() as tg:
        for instance in instances:
            with context.set_current_instance_id(instance.instance_id):
                processes = set(instance.node_group.node_processes)
                processes = processes.intersection(filternames)
                tg.spawn('vanilla-start-processes-%s' % instance.instance_name,
                         _start_processes, instance, list(processes))
def _start_tt_dn_processes(self, instances):
    tt_dn_names = ["datanode", "tasktracker"]
    instances = utils.instances_with_services(instances, tt_dn_names)
    if not instances:
        return
    cpo.add_provisioning_step(
        instances[0].cluster_id,
        utils.start_process_event_message("DataNodes, TaskTrackers"),
        len(instances))
    with context.ThreadGroup() as tg:
        for i in instances:
            processes = set(i.node_group.node_processes)
            tt_dn_procs = processes.intersection(tt_dn_names)
            tg.spawn('vanilla-start-tt-dn-%s' % i.instance_name,
                     self._start_tt_dn, i, list(tt_dn_procs))
class StormProvider(p.ProvisioningPluginBase): def __init__(self): self.processes = { "Zookeeper": ["zookeeper"], "Storm": ["nimbus", "supervisor"] } def get_title(self): return "Apache Storm" def get_description(self): return (_("This plugin provides an ability to launch Storm " "cluster without any management consoles.")) def get_labels(self): default = {'enabled': {'status': True}, 'stable': {'status': True}} deprecated = { 'enabled': { 'status': True }, 'deprecated': { 'status': True } } result = {'plugin_labels': copy.deepcopy(default)} result['version_labels'] = { '1.2': copy.deepcopy(default), '1.1.0': copy.deepcopy(default), '1.0.1': copy.deepcopy(deprecated), } return result def get_versions(self): return ['1.0.1', '1.1.0', '1.2'] def get_configs(self, storm_version): return c_helper.get_plugin_configs() def get_node_processes(self, storm_version): return self.processes def validate(self, cluster): # validate Storm Master Node and Storm Slaves sm_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "nimbus")]) if sm_count < 1: raise ex.RequiredServiceMissingException("Storm nimbus") if sm_count >= 2: raise ex.InvalidComponentCountException("Storm nimbus", "1", sm_count) sl_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "supervisor")]) if sl_count < 1: raise ex.InvalidComponentCountException("Storm supervisor", _("1 or more"), sl_count) def update_infra(self, cluster): pass def configure_cluster(self, cluster): self._setup_instances(cluster) def start_cluster(self, cluster): sm_instance = utils.get_instance(cluster, "nimbus") sl_instances = utils.get_instances(cluster, "supervisor") zk_instances = utils.get_instances(cluster, "zookeeper") # start zookeeper processes self._start_zookeeper_processes(zk_instances) # start storm master if sm_instance: self._start_storm_master(sm_instance) # start storm slaves self._start_slave_processes(sl_instances) LOG.info("Cluster {cluster} has been started successfully".format( cluster=cluster.name)) self._set_cluster_info(cluster) def get_edp_engine(self, cluster, job_type): if job_type in edp_engine.EdpStormEngine.get_supported_job_types(): return edp_engine.EdpStormEngine(cluster) if job_type in edp_engine.EdpPyleusEngine.get_supported_job_types(): return edp_engine.EdpPyleusEngine(cluster) return None def get_edp_job_types(self, versions=None): res = {} for vers in self.get_versions(): if not versions or vers in versions: storm_engine = edp_engine.EdpStormEngine pyleus_engine = edp_engine.EdpPyleusEngine res[vers] = (storm_engine.get_supported_job_types() + pyleus_engine.get_supported_job_types()) return res def get_edp_config_hints(self, job_type, version): if edp_engine.EdpStormEngine.edp_supported(version): return edp_engine.EdpStormEngine.get_possible_job_config(job_type) if edp_engine.EdpPyleusEngine.edp_supported(version): return edp_engine.EdpPyleusEngine.get_possible_job_config(job_type) return {} def get_open_ports(self, node_group): ports_map = {'nimbus': [8080]} ports = [] for process in node_group.node_processes: if process in ports_map: ports.extend(ports_map[process]) return ports def _extract_configs_to_extra(self, cluster): st_master = utils.get_instance(cluster, "nimbus") zk_servers = utils.get_instances(cluster, "zookeeper") extra = dict() config_instances = '' if st_master is not None: if zk_servers is not None: zknames = [] for zk in zk_servers: zknames.append(zk.hostname()) config_instances = c_helper.generate_storm_config( st_master.hostname(), zknames, cluster.hadoop_version) config = 
self._convert_dict_to_yaml(config_instances) supervisor_conf = c_helper.generate_slave_supervisor_conf() nimbus_ui_conf = c_helper.generate_master_supervisor_conf() zk_conf = c_helper.generate_zookeeper_conf() pyleus_conf = c_helper.generate_pyleus_config() for ng in cluster.node_groups: extra[ng.id] = { 'st_instances': config, 'slave_sv_conf': supervisor_conf, 'master_sv_conf': nimbus_ui_conf, 'zk_conf': zk_conf, 'pyleus_conf': pyleus_conf } return extra @utils.event_wrapper(True, step=utils.start_process_event_message("StormMaster")) def _start_storm_master(self, sm_instance): with utils.get_remote(sm_instance) as r: run.start_storm_nimbus_and_ui(r) LOG.info("Storm master at {host} has been started".format( host=sm_instance.hostname())) def _start_slave_processes(self, sl_instances): if len(sl_instances) == 0: return utils.add_provisioning_step(sl_instances[0].cluster_id, utils.start_process_event_message("Slave"), len(sl_instances)) with context.PluginsThreadGroup() as tg: for i in sl_instances: tg.spawn('storm-start-sl-%s' % i.instance_name, self._start_slaves, i) @utils.event_wrapper(True) def _start_slaves(self, instance): with instance.remote() as r: run.start_storm_supervisor(r) def _start_zookeeper_processes(self, zk_instances): if len(zk_instances) == 0: return utils.add_provisioning_step( zk_instances[0].cluster_id, utils.start_process_event_message("Zookeeper"), len(zk_instances)) with context.PluginsThreadGroup() as tg: for i in zk_instances: tg.spawn('storm-start-zk-%s' % i.instance_name, self._start_zookeeper, i) @utils.event_wrapper(True) def _start_zookeeper(self, instance): with instance.remote() as r: run.start_zookeeper(r) def _setup_instances(self, cluster, instances=None): extra = self._extract_configs_to_extra(cluster) if instances is None: instances = utils.get_instances(cluster) self._push_configs_to_nodes(cluster, extra, instances) def _push_configs_to_nodes(self, cluster, extra, new_instances): all_instances = utils.get_instances(cluster) utils.add_provisioning_step(cluster.id, _("Push configs to nodes"), len(all_instances)) with context.PluginsThreadGroup() as tg: for instance in all_instances: if instance in new_instances: tg.spawn('storm-configure-%s' % instance.instance_name, self._push_configs_to_new_node, cluster, extra, instance) else: tg.spawn('storm-reconfigure-%s' % instance.instance_name, self._push_configs_to_existing_node, cluster, extra, instance) def _convert_dict_to_yaml(self, dict_to_convert): new_dict = dict_to_convert.copy() for key in dict_to_convert: if isinstance(dict_to_convert[key], six.string_types): new_dict[key] = "\"" + dict_to_convert[key] + "\"" stream = yaml.dump(new_dict, default_flow_style=False) stream = stream.replace("\'", "") return stream @utils.event_wrapper(True) def _push_configs_to_new_node(self, cluster, extra, instance): ng_extra = extra[instance.node_group.id] files_supervisor = { '/etc/supervisor/supervisord.conf': ng_extra['slave_sv_conf'] } files_storm = { '/usr/local/storm/conf/storm.yaml': ng_extra['st_instances'] } files_zk = { '/opt/zookeeper/zookeeper/conf/zoo.cfg': ng_extra['zk_conf'] } files_supervisor_master = { '/etc/supervisor/supervisord.conf': ng_extra['master_sv_conf'] } file_pyleus_conf = { '/home/ubuntu/.pyleus.conf': ng_extra['pyleus_conf'] } with utils.get_remote(instance) as r: node_processes = instance.node_group.node_processes r.write_files_to(files_storm, run_as_root=True) if 'zookeeper' in node_processes: self._push_zk_configs(r, files_zk) if 'nimbus' in node_processes: 
self._push_supervisor_configs(r, files_supervisor_master) self._push_supervisor_configs(r, file_pyleus_conf) if 'supervisor' in node_processes: self._push_supervisor_configs(r, files_supervisor) @utils.event_wrapper(True) def _push_configs_to_existing_node(self, cluster, extra, instance): node_processes = instance.node_group.node_processes need_storm_update = ('nimbus' in node_processes or 'supervisor' in node_processes) need_zookeeper_update = 'zookeeper' in node_processes ng_extra = extra[instance.node_group.id] r = utils.get_remote(instance) if need_storm_update: storm_path = '/usr/local/storm/conf/storm.yaml' files_storm = {storm_path: ng_extra['st_instances']} r.write_files_to(files_storm) if need_zookeeper_update: zk_path = '/opt/zookeeper/zookeeper/conf/zoo.cfg' files_zookeeper = {zk_path: ng_extra['zk_conf']} self._push_zk_configs(r, files_zookeeper) def _set_cluster_info(self, cluster): st_master = utils.get_instance(cluster, "nimbus") info = {} if st_master: port = "8080" info['Strom'] = { 'Web UI': 'http://%s:%s' % (st_master.get_ip_or_dns_name(), port) } ctx = context.ctx() conductor.cluster_update(ctx, cluster, {'info': info}) def _push_zk_configs(self, r, files): r.write_files_to(files, run_as_root=True) def _push_supervisor_configs(self, r, files): r.append_to_files(files, run_as_root=True) # Scaling def _get_running_topologies_names(self, cluster): master = utils.get_instance(cluster, "nimbus") cmd = ("%(storm)s -c nimbus.host=%(host)s " "list | grep ACTIVE | awk '{print $1}'") % ( { "storm": "/usr/local/storm/bin/storm", "host": master.hostname() }) with utils.get_remote(master) as r: ret, stdout = r.execute_command(cmd) names = stdout.split('\n') topology_names = names[0:len(names) - 1] return topology_names @utils.event_wrapper(True, step=_("Rebalance Topology"), param=('cluster', 1)) def rebalance_topology(self, cluster): topology_names = self._get_running_topologies_names(cluster) master = utils.get_instance(cluster, "nimbus") for topology_name in topology_names: cmd = ('%(rebalance)s -c nimbus.host=%(host)s %(topology_name)s' ) % ({ "rebalance": "/usr/local/storm/bin/storm rebalance", "host": master.hostname(), "topology_name": topology_name }) with utils.get_remote(master) as r: ret, stdout = r.execute_command(cmd) def validate_scaling(self, cluster, existing, additional): self._validate_existing_ng_scaling(cluster, existing) self._validate_additional_ng_scaling(cluster, additional) def scale_cluster(self, cluster, instances): self._setup_instances(cluster, instances) # start storm slaves self._start_slave_processes(instances) self.rebalance_topology(cluster) LOG.info("Storm scaling has been started.") def _get_scalable_processes(self): return ["supervisor"] def _validate_additional_ng_scaling(self, cluster, additional): scalable_processes = self._get_scalable_processes() for ng_id in additional: ng = utils.get_by_id(cluster.node_groups, ng_id) if not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Storm plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes)) def _validate_existing_ng_scaling(self, cluster, existing): scalable_processes = self._get_scalable_processes() for ng in cluster.node_groups: if ng.id in existing: if not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Storm plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes))
class VersionHandler(avm.AbstractVersionHandler): def get_plugin_configs(self): return c_helper.get_plugin_configs() def get_node_processes(self): return { "HDFS": ["namenode", "datanode", "secondarynamenode"], "MapReduce": ["tasktracker", "jobtracker"], "JobFlow": ["oozie"], "Hive": ["hiveserver"] } def validate(self, cluster): nn_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "namenode")]) if nn_count != 1: raise ex.InvalidComponentCountException("namenode", 1, nn_count) snn_count = sum([ ng.count for ng in utils.get_node_groups(cluster, 'secondarynamenode') ]) if snn_count > 1: raise ex.InvalidComponentCountException('secondarynamenode', _('0 or 1'), snn_count) jt_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "jobtracker")]) if jt_count > 1: raise ex.InvalidComponentCountException("jobtracker", _('0 or 1'), jt_count) oozie_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "oozie")]) if oozie_count > 1: raise ex.InvalidComponentCountException("oozie", _('0 or 1'), oozie_count) hive_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "hiveserver")]) if jt_count == 0: tt_count = sum([ ng.count for ng in utils.get_node_groups(cluster, "tasktracker") ]) if tt_count > 0: raise ex.RequiredServiceMissingException( "jobtracker", required_by="tasktracker") if oozie_count > 0: raise ex.RequiredServiceMissingException("jobtracker", required_by="oozie") if hive_count > 0: raise ex.RequiredServiceMissingException("jobtracker", required_by="hive") if hive_count > 1: raise ex.InvalidComponentCountException("hive", _('0 or 1'), hive_count) def configure_cluster(self, cluster): instances = utils.get_instances(cluster) self._setup_instances(cluster, instances) def start_namenode(self, cluster): nn = vu.get_namenode(cluster) self._start_namenode(nn) @cpo.event_wrapper(True, step=utils.start_process_event_message("NameNode")) def _start_namenode(self, nn_instance): with remote.get_remote(nn_instance) as r: run.format_namenode(r) run.start_processes(r, "namenode") def start_secondarynamenodes(self, cluster): snns = vu.get_secondarynamenodes(cluster) if len(snns) == 0: return cpo.add_provisioning_step( cluster.id, utils.start_process_event_message("SecondaryNameNodes"), len(snns)) for snn in snns: self._start_secondarynamenode(snn) @cpo.event_wrapper(True) def _start_secondarynamenode(self, snn): run.start_processes(remote.get_remote(snn), "secondarynamenode") def start_jobtracker(self, cluster): jt = vu.get_jobtracker(cluster) if jt: self._start_jobtracker(jt) @cpo.event_wrapper(True, step=utils.start_process_event_message("JobTracker")) def _start_jobtracker(self, jt_instance): run.start_processes(remote.get_remote(jt_instance), "jobtracker") def start_oozie(self, cluster): oozie = vu.get_oozie(cluster) if oozie: self._start_oozie(cluster, oozie) @cpo.event_wrapper(True, step=utils.start_process_event_message("Oozie")) def _start_oozie(self, cluster, oozie): nn_instance = vu.get_namenode(cluster) with remote.get_remote(oozie) as r: if c_helper.is_mysql_enable(cluster): run.mysql_start(r, oozie) run.oozie_create_db(r) run.oozie_share_lib(r, nn_instance.hostname()) run.start_oozie(r) LOG.info( _LI("Oozie service at {host} has been started").format( host=nn_instance.hostname())) def start_hiveserver(self, cluster): hs = vu.get_hiveserver(cluster) if hs: self._start_hiveserver(cluster, hs) @cpo.event_wrapper(True, step=utils.start_process_event_message("HiveServer")) def _start_hiveserver(self, cluster, hive_server): oozie = vu.get_oozie(cluster) with 
remote.get_remote(hive_server) as r: run.hive_create_warehouse_dir(r) run.hive_copy_shared_conf(r, edp.get_hive_shared_conf_path('hadoop')) if c_helper.is_mysql_enable(cluster): if not oozie or hive_server.hostname() != oozie.hostname(): run.mysql_start(r, hive_server) run.hive_create_db(r, cluster.extra['hive_mysql_passwd']) run.hive_metastore_start(r) LOG.info( _LI("Hive Metastore server at {host} has been " "started").format(host=hive_server.hostname())) def start_cluster(self, cluster): self.start_namenode(cluster) self.start_secondarynamenodes(cluster) self.start_jobtracker(cluster) self._start_tt_dn_processes(utils.get_instances(cluster)) self._await_datanodes(cluster) LOG.info( _LI("Hadoop services in cluster {cluster} have been started"). format(cluster=cluster.name)) self.start_oozie(cluster) self.start_hiveserver(cluster) LOG.info( _LI('Cluster {cluster} has been started successfully').format( cluster=cluster.name)) self._set_cluster_info(cluster) @cpo.event_wrapper(True, step=_("Await %s start up") % "DataNodes", param=('cluster', 1)) def _await_datanodes(self, cluster): datanodes_count = len(vu.get_datanodes(cluster)) if datanodes_count < 1: return l_message = _("Waiting on %s datanodes to start up") % datanodes_count LOG.info(l_message) with remote.get_remote(vu.get_namenode(cluster)) as r: poll_utils.plugin_option_poll(cluster, run.check_datanodes_count, c_helper.DATANODES_STARTUP_TIMEOUT, l_message, 1, { 'remote': r, 'count': datanodes_count }) def _generate_hive_mysql_password(self, cluster): extra = cluster.extra.to_dict() if cluster.extra else {} password = extra.get('hive_mysql_passwd') if not password: password = six.text_type(uuid.uuid4()) extra['hive_mysql_passwd'] = password conductor.cluster_update(context.ctx(), cluster, {'extra': extra}) return password def _extract_configs_to_extra(self, cluster): oozie = vu.get_oozie(cluster) hive = vu.get_hiveserver(cluster) extra = dict() if hive: extra['hive_mysql_passwd'] = self._generate_hive_mysql_password( cluster) for ng in cluster.node_groups: extra[ng.id] = { 'xml': c_helper.generate_xml_configs( cluster, ng, extra['hive_mysql_passwd'] if hive else None), 'setup_script': c_helper.generate_setup_script( ng.storage_paths(), c_helper.extract_environment_confs(ng.configuration()), append_oozie=(oozie and oozie.node_group.id == ng.id)) } if c_helper.is_data_locality_enabled(cluster): topology_data = th.generate_topology_map( cluster, CONF.enable_hypervisor_awareness) extra['topology_data'] = "\n".join( [k + " " + v for k, v in topology_data.items()]) + "\n" return extra def decommission_nodes(self, cluster, instances): tts = vu.get_tasktrackers(cluster) dns = vu.get_datanodes(cluster) decommission_dns = False decommission_tts = False for i in instances: if 'datanode' in i.node_group.node_processes: dns.remove(i) decommission_dns = True if 'tasktracker' in i.node_group.node_processes: tts.remove(i) decommission_tts = True nn = vu.get_namenode(cluster) jt = vu.get_jobtracker(cluster) if decommission_tts: sc.decommission_tt(jt, instances, tts) if decommission_dns: sc.decommission_dn(nn, instances, dns) def validate_scaling(self, cluster, existing, additional): self._validate_existing_ng_scaling(cluster, existing) self._validate_additional_ng_scaling(cluster, additional) def scale_cluster(self, cluster, instances): self._setup_instances(cluster, instances) run.refresh_nodes(remote.get_remote(vu.get_namenode(cluster)), "dfsadmin") jt = vu.get_jobtracker(cluster) if jt: run.refresh_nodes(remote.get_remote(jt), "mradmin") 
self._start_tt_dn_processes(instances) def _start_tt_dn_processes(self, instances): tt_dn_names = ["datanode", "tasktracker"] instances = utils.instances_with_services(instances, tt_dn_names) if not instances: return cpo.add_provisioning_step( instances[0].cluster_id, utils.start_process_event_message("DataNodes, TaskTrackers"), len(instances)) with context.ThreadGroup() as tg: for i in instances: processes = set(i.node_group.node_processes) tt_dn_procs = processes.intersection(tt_dn_names) tg.spawn('vanilla-start-tt-dn-%s' % i.instance_name, self._start_tt_dn, i, list(tt_dn_procs)) @cpo.event_wrapper(True) def _start_tt_dn(self, instance, tt_dn_procs): with instance.remote() as r: run.start_processes(r, *tt_dn_procs) @cpo.event_wrapper(True, step=_("Setup instances and push configs"), param=('cluster', 1)) def _setup_instances(self, cluster, instances): if (CONF.use_identity_api_v3 and CONF.use_domain_for_proxy_users and vu.get_hiveserver(cluster) and c_helper.is_swift_enable(cluster)): cluster = proxy.create_proxy_user_for_cluster(cluster) instances = utils.get_instances(cluster) extra = self._extract_configs_to_extra(cluster) cluster = conductor.cluster_get(context.ctx(), cluster) self._push_configs_to_nodes(cluster, extra, instances) def _push_configs_to_nodes(self, cluster, extra, new_instances): all_instances = utils.get_instances(cluster) new_ids = set([instance.id for instance in new_instances]) with context.ThreadGroup() as tg: for instance in all_instances: if instance.id in new_ids: tg.spawn('vanilla-configure-%s' % instance.instance_name, self._push_configs_to_new_node, cluster, extra, instance) else: tg.spawn('vanilla-reconfigure-%s' % instance.instance_name, self._push_configs_to_existing_node, cluster, extra, instance) def _push_configs_to_new_node(self, cluster, extra, instance): ng_extra = extra[instance.node_group.id] private_key, public_key = c_helper.get_hadoop_ssh_keys(cluster) files = { '/etc/hadoop/core-site.xml': ng_extra['xml']['core-site'], '/etc/hadoop/mapred-site.xml': ng_extra['xml']['mapred-site'], '/etc/hadoop/hdfs-site.xml': ng_extra['xml']['hdfs-site'], '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'], 'id_rsa': private_key, 'authorized_keys': public_key } key_cmd = ('sudo mkdir -p /home/hadoop/.ssh/ && ' 'sudo mv id_rsa authorized_keys /home/hadoop/.ssh && ' 'sudo chown -R hadoop:hadoop /home/hadoop/.ssh && ' 'sudo chmod 600 /home/hadoop/.ssh/{id_rsa,authorized_keys}') with remote.get_remote(instance) as r: # TODO(aignatov): sudo chown is wrong solution. But it works. 
r.execute_command('sudo chown -R $USER:$USER /etc/hadoop') r.execute_command('sudo chown -R $USER:$USER /opt/oozie/conf') r.write_files_to(files) r.execute_command('sudo chmod 0500 /tmp/sahara-hadoop-init.sh') r.execute_command('sudo /tmp/sahara-hadoop-init.sh ' '>> /tmp/sahara-hadoop-init.log 2>&1') r.execute_command(key_cmd) if c_helper.is_data_locality_enabled(cluster): r.write_file_to( '/etc/hadoop/topology.sh', f.get_file_text( 'plugins/vanilla/v1_2_1/resources/topology.sh')) r.execute_command('sudo chmod +x /etc/hadoop/topology.sh') self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance) def _push_configs_to_existing_node(self, cluster, extra, instance): node_processes = instance.node_group.node_processes need_update = (c_helper.is_data_locality_enabled(cluster) or 'namenode' in node_processes or 'jobtracker' in node_processes or 'oozie' in node_processes or 'hiveserver' in node_processes) if not need_update: return with remote.get_remote(instance) as r: self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance) def _write_topology_data(self, r, cluster, extra): if c_helper.is_data_locality_enabled(cluster): topology_data = extra['topology_data'] r.write_file_to('/etc/hadoop/topology.data', topology_data) def _push_master_configs(self, r, cluster, extra, instance): ng_extra = extra[instance.node_group.id] node_processes = instance.node_group.node_processes if 'namenode' in node_processes: self._push_namenode_configs(cluster, r) if 'jobtracker' in node_processes: self._push_jobtracker_configs(cluster, r) if 'oozie' in node_processes: self._push_oozie_configs(ng_extra, r) if 'hiveserver' in node_processes: self._push_hive_configs(ng_extra, r) def _push_namenode_configs(self, cluster, r): r.write_file_to( '/etc/hadoop/dn.incl', utils.generate_fqdn_host_names(vu.get_datanodes(cluster))) def _push_jobtracker_configs(self, cluster, r): r.write_file_to( '/etc/hadoop/tt.incl', utils.generate_fqdn_host_names(vu.get_tasktrackers(cluster))) def _push_oozie_configs(self, ng_extra, r): r.write_file_to('/opt/oozie/conf/oozie-site.xml', ng_extra['xml']['oozie-site']) def _push_hive_configs(self, ng_extra, r): files = {'/opt/hive/conf/hive-site.xml': ng_extra['xml']['hive-site']} r.write_files_to(files) def _set_cluster_info(self, cluster): nn = vu.get_namenode(cluster) jt = vu.get_jobtracker(cluster) oozie = vu.get_oozie(cluster) info = {} if jt: ui_port = c_helper.get_port_from_config( 'MapReduce', 'mapred.job.tracker.http.address', cluster) jt_port = c_helper.get_port_from_config('MapReduce', 'mapred.job.tracker', cluster) info['MapReduce'] = { 'Web UI': 'http://%s:%s' % (jt.management_ip, ui_port), 'JobTracker': '%s:%s' % (jt.hostname(), jt_port) } if nn: ui_port = c_helper.get_port_from_config('HDFS', 'dfs.http.address', cluster) nn_port = c_helper.get_port_from_config('HDFS', 'fs.default.name', cluster) info['HDFS'] = { 'Web UI': 'http://%s:%s' % (nn.management_ip, ui_port), 'NameNode': 'hdfs://%s:%s' % (nn.hostname(), nn_port) } if oozie: # TODO(yrunts) change from hardcode value info['JobFlow'] = { 'Oozie': 'http://%s:11000' % oozie.management_ip } ctx = context.ctx() conductor.cluster_update(ctx, cluster, {'info': info}) def _get_scalable_processes(self): return ["datanode", "tasktracker"] def _validate_additional_ng_scaling(self, cluster, additional): jt = vu.get_jobtracker(cluster) scalable_processes = self._get_scalable_processes() for ng_id in additional: ng = g.get_by_id(cluster.node_groups, ng_id) if 
not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Vanilla plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes)) if not jt and 'tasktracker' in ng.node_processes: raise ex.NodeGroupCannotBeScaled( ng.name, _("Vanilla plugin cannot scale node group with " "processes which have no master-processes run " "in cluster")) def _validate_existing_ng_scaling(self, cluster, existing): scalable_processes = self._get_scalable_processes() dn_to_delete = 0 for ng in cluster.node_groups: if ng.id in existing: if (ng.count > existing[ng.id] and "datanode" in ng.node_processes): dn_to_delete += ng.count - existing[ng.id] if not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Vanilla plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes)) dn_amount = len(vu.get_datanodes(cluster)) rep_factor = c_helper.get_config_value('HDFS', 'dfs.replication', cluster) if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor: raise ex.ClusterCannotBeScaled( cluster.name, _("Vanilla plugin cannot shrink cluster because " "it would be not enough nodes for replicas " "(replication factor is %s)") % rep_factor) def get_edp_engine(self, cluster, job_type): if job_type in edp_engine.EdpOozieEngine.get_supported_job_types(): return edp_engine.EdpOozieEngine(cluster) return None def get_edp_job_types(self): return edp_engine.EdpOozieEngine.get_supported_job_types() def get_edp_config_hints(self, job_type): return edp_engine.EdpOozieEngine.get_possible_job_config(job_type) def get_open_ports(self, node_group): cluster = node_group.cluster ports = [] if "namenode" in node_group.node_processes: ports.append( c_helper.get_port_from_config('HDFS', 'dfs.http.address', cluster)) ports.append(8020) if "datanode" in node_group.node_processes: ports.append( c_helper.get_port_from_config('HDFS', 'dfs.datanode.http.address', cluster)) ports.append( c_helper.get_port_from_config('HDFS', 'dfs.datanode.address', cluster)) ports.append( c_helper.get_port_from_config('HDFS', 'dfs.datanode.ipc.address', cluster)) if "jobtracker" in node_group.node_processes: ports.append( c_helper.get_port_from_config( 'MapReduce', 'mapred.job.tracker.http.address', cluster)) ports.append(8021) if "tasktracker" in node_group.node_processes: ports.append( c_helper.get_port_from_config( 'MapReduce', 'mapred.task.tracker.http.address', cluster)) if "secondarynamenode" in node_group.node_processes: ports.append( c_helper.get_port_from_config('HDFS', 'dfs.secondary.http.address', cluster)) if "oozie" in node_group.node_processes: ports.append(11000) if "hive" in node_group.node_processes: ports.append(9999) ports.append(10000) return ports def on_terminate_cluster(self, cluster): proxy.delete_proxy_user_for_cluster(cluster)
def _step_description(x):
    return {'step': gu.start_process_event_message(x),
            'param': ('cluster', 0)}
        if 'nodemanager' in processes:
            r.execute_command(
                'sudo su - -c "yarn-daemon.sh start nodemanager" hadoop')


def start_hadoop_process(instance, process):
    instance.remote().execute_command(
        'sudo su - -c "hadoop-daemon.sh start %s" hadoop' % process)


def start_yarn_process(instance, process):
    instance.remote().execute_command(
        'sudo su - -c "yarn-daemon.sh start %s" hadoop' % process)


@cpo.event_wrapper(True,
                   step=pu.start_process_event_message("HistoryServer"))
def start_historyserver(instance):
    instance.remote().execute_command(
        'sudo su - -c "mr-jobhistory-daemon.sh start historyserver" hadoop')


@cpo.event_wrapper(True, step=pu.start_process_event_message("Oozie"))
def start_oozie_process(pctx, instance):
    with context.set_current_instance_id(instance.instance_id):
        with instance.remote() as r:
            if c_helper.is_mysql_enabled(pctx, instance.cluster):
                _start_mysql(r)
                LOG.debug("Creating Oozie DB Schema")
                sql_script = files.get_file_text(
                    'plugins/vanilla/hadoop2/resources/create_oozie_db.sql')
class StormProvider(p.ProvisioningPluginBase): def __init__(self): self.processes = { "Zookeeper": ["zookeeper"], "Storm": ["nimbus", "supervisor"] } def get_title(self): return "Apache Storm" def get_description(self): return (_("This plugin provides an ability to launch Storm " "cluster without any management consoles.")) def get_versions(self): return ['0.9.2'] def get_configs(self, storm_version): return c_helper.get_plugin_configs() def get_node_processes(self, storm_version): return self.processes def validate(self, cluster): # validate Storm Master Node and Storm Slaves sm_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "nimbus")]) if sm_count != 1: raise ex.RequiredServiceMissingException("Storm nimbus") sl_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "supervisor")]) if sl_count < 1: raise ex.InvalidComponentCountException("Storm supervisor", _("1 or more"), sl_count) def update_infra(self, cluster): pass def configure_cluster(self, cluster): self._setup_instances(cluster) def start_cluster(self, cluster): sm_instance = utils.get_instance(cluster, "nimbus") sl_instances = utils.get_instances(cluster, "supervisor") zk_instance = utils.get_instances(cluster, "zookeeper") if zk_instance: self._start_zookeeper_processes(zk_instance) # start storm master if sm_instance: self._start_storm_master(sm_instance) # start storm slaves self._start_slave_processes(sl_instances) LOG.info( _LI('Cluster {cluster} has been started successfully').format( cluster=cluster.name)) self._set_cluster_info(cluster) def get_edp_engine(self, cluster, job_type): if job_type in edp_engine.EdpEngine.get_supported_job_types(): return edp_engine.EdpEngine(cluster) return None def get_edp_job_types(self, versions=[]): res = {} for vers in self.get_versions(): if not versions or vers in versions: if edp_engine.EdpEngine.edp_supported(vers): res[vers] = edp_engine.EdpEngine.get_supported_job_types() return res def get_edp_config_hints(self, job_type, version): if edp_engine.EdpEngine.edp_supported(version): return edp_engine.EdpEngine.get_possible_job_config(job_type) return {} def _extract_configs_to_extra(self, cluster): st_master = utils.get_instance(cluster, "nimbus") zk_servers = utils.get_instances(cluster, "zookeeper") extra = dict() config_instances = '' if st_master is not None: if zk_servers is not None: zknames = [] for zk in zk_servers: zknames.append(zk.hostname()) config_instances = c_helper.generate_storm_config( st_master.hostname(), zknames) config = self._convert_dict_to_yaml(config_instances) supervisor_conf = c_helper.generate_slave_supervisor_conf() nimbus_ui_conf = c_helper.generate_master_supervisor_conf() zk_conf = c_helper.generate_zookeeper_conf() for ng in cluster.node_groups: extra[ng.id] = { 'st_instances': config, 'slave_sv_conf': supervisor_conf, 'master_sv_conf': nimbus_ui_conf, 'zk_conf': zk_conf } return extra @cpo.event_wrapper(True, step=utils.start_process_event_message("StormMaster")) def _start_storm_master(self, sm_instance): with remote.get_remote(sm_instance) as r: run.start_storm_nimbus_and_ui(r) LOG.info( _LI("Storm master at {host} has been started").format( host=sm_instance.hostname())) def _start_slave_processes(self, sl_instances): if len(sl_instances) == 0: return cpo.add_provisioning_step(sl_instances[0].cluster_id, utils.start_process_event_message("Slave"), len(sl_instances)) with context.ThreadGroup() as tg: for i in sl_instances: tg.spawn('storm-start-sl-%s' % i.instance_name, self._start_slaves, i) @cpo.event_wrapper(True) def 
_start_slaves(self, instance): with instance.remote() as r: run.start_storm_supervisor(r) def _start_zookeeper_processes(self, zk_instances): if len(zk_instances) == 0: return cpo.add_provisioning_step( zk_instances[0].cluster_id, utils.start_process_event_message("Zookeeper"), len(zk_instances)) with context.ThreadGroup() as tg: for i in zk_instances: tg.spawn('storm-start-zk-%s' % i.instance_name, self._start_zookeeper, i) @cpo.event_wrapper(True) def _start_zookeeper(self, instance): with instance.remote() as r: run.start_zookeeper(r) def _setup_instances(self, cluster, instances=None): extra = self._extract_configs_to_extra(cluster) if instances is None: instances = utils.get_instances(cluster) self._push_configs_to_nodes(cluster, extra, instances) def _push_configs_to_nodes(self, cluster, extra, new_instances): all_instances = utils.get_instances(cluster) cpo.add_provisioning_step(cluster.id, _("Push configs to nodes"), len(all_instances)) with context.ThreadGroup() as tg: for instance in all_instances: if instance in new_instances: tg.spawn('storm-configure-%s' % instance.instance_name, self._push_configs_to_new_node, cluster, extra, instance) else: tg.spawn('storm-reconfigure-%s' % instance.instance_name, self._push_configs_to_existing_node, cluster, extra, instance) def _convert_dict_to_yaml(self, dict_to_convert): new_dict = dict_to_convert.copy() for key in dict_to_convert: if isinstance(dict_to_convert[key], six.string_types): new_dict[key] = "\"" + dict_to_convert[key] + "\"" stream = yaml.dump(new_dict, default_flow_style=False) stream = stream.replace("\'", "") return stream @cpo.event_wrapper(True) def _push_configs_to_new_node(self, cluster, extra, instance): ng_extra = extra[instance.node_group.id] files_supervisor = { '/etc/supervisor/supervisord.conf': ng_extra['slave_sv_conf'] } files_storm = { '/usr/local/storm/conf/storm.yaml': ng_extra['st_instances'] } files_zk = { '/opt/zookeeper/zookeeper/conf/zoo.cfg': ng_extra['zk_conf'] } files_supervisor_master = { '/etc/supervisor/supervisord.conf': ng_extra['master_sv_conf'] } with remote.get_remote(instance) as r: node_processes = instance.node_group.node_processes r.write_files_to(files_storm, run_as_root=True) if 'zookeeper' in node_processes: self._push_zk_configs(r, files_zk) if 'nimbus' in node_processes: self._push_supervisor_configs(r, files_supervisor_master) if 'supervisor' in node_processes: self._push_supervisor_configs(r, files_supervisor) @cpo.event_wrapper(True) def _push_configs_to_existing_node(self, cluster, extra, instance): node_processes = instance.node_group.node_processes need_storm_update = ('nimbus' in node_processes or 'supervisor' in node_processes) need_zookeeper_update = 'zookeeper' in node_processes ng_extra = extra[instance.node_group.id] r = remote.get_remote(instance) if need_storm_update: storm_path = '/usr/local/storm/conf/storm.yaml' files_storm = {storm_path: ng_extra['st_instances']} r.write_files_to(files_storm) if need_zookeeper_update: zk_path = '/opt/zookeeper/zookeeper-3.4.6/conf/zoo.cfg' files_zookeeper = {zk_path: ng_extra['zk_conf']} self._push_zk_configs(r, files_zookeeper) def _set_cluster_info(self, cluster): st_master = utils.get_instance(cluster, "nimbus") info = {} if st_master: port = "8080" info['Strom'] = { 'Web UI': 'http://%s:%s' % (st_master.management_ip, port) } ctx = context.ctx() conductor.cluster_update(ctx, cluster, {'info': info}) def _push_zk_configs(self, r, files): r.write_files_to(files, run_as_root=True) def _push_supervisor_configs(self, r, files): 
r.append_to_files(files, run_as_root=True)
def _step_description(x):
    return {
        'step': gu.start_process_event_message(x),
        'param': ('cluster', 0)
    }
            r.execute_command(
                'sudo su - -c "yarn-daemon.sh start nodemanager" hadoop')


def start_hadoop_process(instance, process):
    instance.remote().execute_command(
        'sudo su - -c "hadoop-daemon.sh start %s" hadoop' % process)


def start_yarn_process(instance, process):
    instance.remote().execute_command(
        'sudo su - -c "yarn-daemon.sh start %s" hadoop' % process)


@utils.event_wrapper(True,
                     step=utils.start_process_event_message("HistoryServer"))
def start_historyserver(instance):
    instance.remote().execute_command(
        'sudo su - -c "mr-jobhistory-daemon.sh start historyserver" hadoop')


@utils.event_wrapper(True, step=utils.start_process_event_message("Oozie"))
def start_oozie_process(pctx, instance):
    with context.set_current_instance_id(instance.instance_id):
        with instance.remote() as r:
            if config_helper.is_mysql_enabled(pctx, instance.cluster):
                _start_mysql(r)
                LOG.debug("Creating Oozie DB Schema")
                sql_script = utils.get_file_text(
                    'plugins/vanilla/hadoop2/resources/create_oozie_db.sql',
                    'sahara_plugin_vanilla')
# limitations under the License.

from sahara.plugins import utils
from sahara.plugins.sandbox.hadoop2 import run_scripts as run
from sahara.plugins.sandbox import utils as vu
from sahara.utils import cluster_progress_ops as cpo
from sahara.utils import files


def start_namenode(cluster, backup=None):
    nn = vu.get_namenode(cluster)
    _start_namenode(nn, backup)


@cpo.event_wrapper(
    True, step=utils.start_process_event_message('NameNode'))
def _start_namenode(nn, backup=None):
    if backup is None:
        run.format_namenode(nn)
    run.start_hadoop_process(nn, 'namenode')


def start_secondarynamenode(cluster):
    snn = vu.get_secondarynamenode(cluster)
    if snn:
        _start_secondarynamenode(snn)


@cpo.event_wrapper(
    True, step=utils.start_process_event_message("SecondaryNameNodes"))
def _start_secondarynamenode(snn):
# See the License for the specific language governing permissions and
# limitations under the License.

from sahara.plugins import utils
from sahara.plugins.vanilla.hadoop2 import run_scripts as run
from sahara.plugins.vanilla import utils as vu
from sahara.utils import cluster_progress_ops as cpo


def start_namenode(cluster):
    nn = vu.get_namenode(cluster)
    _start_namenode(nn)


@cpo.event_wrapper(
    True, step=utils.start_process_event_message('NameNode'))
def _start_namenode(nn):
    run.format_namenode(nn)
    run.start_hadoop_process(nn, 'namenode')


def start_secondarynamenode(cluster):
    snn = vu.get_secondarynamenode(cluster)
    if snn:
        _start_secondarynamenode(snn)


@cpo.event_wrapper(
    True, step=utils.start_process_event_message("SecondaryNameNodes"))
def _start_secondarynamenode(snn):
    run.start_hadoop_process(snn, 'secondarynamenode')
class VersionHandler(avm.AbstractVersionHandler):
    def __init__(self):
        self.pctx = {
            'env_confs': c_helper.get_env_configs(),
            'all_confs': c_helper.get_plugin_configs()
        }

    def get_plugin_configs(self):
        return self.pctx['all_confs']

    def get_node_processes(self):
        return {
            "Hadoop": [],
            "MapReduce": ["historyserver"],
            "HDFS": ["namenode", "datanode", "secondarynamenode"],
            "YARN": ["resourcemanager", "nodemanager"],
            "JobFlow": ["oozie"],
            "Hive": ["hiveserver"]
        }

    def validate(self, cluster):
        vl.validate_cluster_creating(self.pctx, cluster)

    def update_infra(self, cluster):
        pass

    def configure_cluster(self, cluster):
        c.configure_cluster(self.pctx, cluster)

    def start_namenode(self, cluster):
        nn = vu.get_namenode(cluster)
        self._start_namenode(nn)

    @cpo.event_wrapper(True,
                       step=utils.start_process_event_message('NameNode'))
    def _start_namenode(self, nn):
        run.format_namenode(nn)
        run.start_hadoop_process(nn, 'namenode')

    def start_secondarynamenodes(self, cluster):
        snns = vu.get_secondarynamenodes(cluster)
        if len(snns) == 0:
            return
        cpo.add_provisioning_step(
            snns[0].cluster_id,
            utils.start_process_event_message("SecondaryNameNodes"),
            len(snns))
        for snn in vu.get_secondarynamenodes(cluster):
            self._start_secondarynamenode(snn)

    @cpo.event_wrapper(True)
    def _start_secondarynamenode(self, snn):
        run.start_hadoop_process(snn, 'secondarynamenode')

    def start_resourcemanager(self, cluster):
        rm = vu.get_resourcemanager(cluster)
        if rm:
            self._start_resourcemanager(rm)

    @cpo.event_wrapper(
        True, step=utils.start_process_event_message('ResourceManager'))
    def _start_resourcemanager(self, snn):
        run.start_yarn_process(snn, 'resourcemanager')

    def start_historyserver(self, cluster):
        hs = vu.get_historyserver(cluster)
        if hs:
            run.start_historyserver(hs)

    def start_oozie(self, cluster):
        oo = vu.get_oozie(cluster)
        if oo:
            run.start_oozie_process(self.pctx, oo)

    def start_hiveserver(self, cluster):
        hiveserver = vu.get_hiveserver(cluster)
        if hiveserver:
            run.start_hiveserver_process(self.pctx, hiveserver)

    def start_cluster(self, cluster):
        self.start_namenode(cluster)
        self.start_secondarynamenodes(cluster)
        self.start_resourcemanager(cluster)

        run.start_dn_nm_processes(utils.get_instances(cluster))
        run.await_datanodes(cluster)

        self.start_historyserver(cluster)
        self.start_oozie(cluster)
        self.start_hiveserver(cluster)

        self._set_cluster_info(cluster)

    def decommission_nodes(self, cluster, instances):
        sc.decommission_nodes(self.pctx, cluster, instances)

    def validate_scaling(self, cluster, existing, additional):
        vl.validate_additional_ng_scaling(cluster, additional)
        vl.validate_existing_ng_scaling(self.pctx, cluster, existing)

    def scale_cluster(self, cluster, instances):
        sc.scale_cluster(self.pctx, cluster, instances)

    def _set_cluster_info(self, cluster):
        nn = vu.get_namenode(cluster)
        rm = vu.get_resourcemanager(cluster)
        hs = vu.get_historyserver(cluster)
        oo = vu.get_oozie(cluster)
        info = {}

        if rm:
            info['YARN'] = {
                'Web UI': 'http://%s:%s' % (rm.management_ip, '8088'),
                'ResourceManager': 'http://%s:%s' % (rm.management_ip, '8032')
            }

        if nn:
            info['HDFS'] = {
                'Web UI': 'http://%s:%s' % (nn.management_ip, '50070'),
                'NameNode': 'hdfs://%s:%s' % (nn.hostname(), '9000')
            }

        if oo:
            info['JobFlow'] = {
                'Oozie': 'http://%s:%s' % (oo.management_ip, '11000')
            }

        if hs:
            info['MapReduce JobHistory Server'] = {
                'Web UI': 'http://%s:%s' % (hs.management_ip, '19888')
            }

        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {'info': info})

    def get_edp_engine(self, cluster, job_type):
        if job_type in edp_engine.EdpOozieEngine.get_supported_job_types():
            return edp_engine.EdpOozieEngine(cluster)
        return None

    def get_edp_job_types(self):
        return edp_engine.EdpOozieEngine.get_supported_job_types()

    def get_edp_config_hints(self, job_type):
        return edp_engine.EdpOozieEngine.get_possible_job_config(job_type)

    def get_open_ports(self, node_group):
        return c.get_open_ports(node_group)
class SparkProvider(p.ProvisioningPluginBase): def __init__(self): self.processes = { "HDFS": ["namenode", "datanode"], "Spark": ["master", "slave"] } def get_title(self): return "Apache Spark" def get_description(self): return _("This plugin provides an ability to launch Spark on Hadoop " "CDH cluster without any management consoles.") def get_labels(self): default = {'enabled': {'status': True}, 'stable': {'status': True}} result = {'plugin_labels': copy.deepcopy(default)} result['version_labels'] = { version: copy.deepcopy(default) for version in self.get_versions() } return result def get_versions(self): return ['1.6.0', '1.3.1'] def get_configs(self, hadoop_version): return c_helper.get_plugin_configs() def get_node_processes(self, hadoop_version): return self.processes def validate(self, cluster): nn_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "namenode")]) if nn_count != 1: raise ex.InvalidComponentCountException("namenode", 1, nn_count) dn_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "datanode")]) if dn_count < 1: raise ex.InvalidComponentCountException("datanode", _("1 or more"), nn_count) rep_factor = utils.get_config_value_or_default('HDFS', "dfs.replication", cluster) if dn_count < rep_factor: raise ex.InvalidComponentCountException( 'datanode', _('%s or more') % rep_factor, dn_count, _('Number of %(dn)s instances should not be less ' 'than %(replication)s') % { 'dn': 'datanode', 'replication': 'dfs.replication' }) # validate Spark Master Node and Spark Slaves sm_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "master")]) if sm_count != 1: raise ex.RequiredServiceMissingException("Spark master") sl_count = sum( [ng.count for ng in utils.get_node_groups(cluster, "slave")]) if sl_count < 1: raise ex.InvalidComponentCountException("Spark slave", _("1 or more"), sl_count) def update_infra(self, cluster): pass def configure_cluster(self, cluster): self._setup_instances(cluster) @cpo.event_wrapper(True, step=utils.start_process_event_message("NameNode")) def _start_namenode(self, nn_instance): with remote.get_remote(nn_instance) as r: run.format_namenode(r) run.start_processes(r, "namenode") def start_spark(self, cluster): sm_instance = utils.get_instance(cluster, "master") if sm_instance: self._start_spark(cluster, sm_instance) @cpo.event_wrapper( True, step=utils.start_process_event_message("SparkMasterNode")) def _start_spark(self, cluster, sm_instance): with remote.get_remote(sm_instance) as r: run.start_spark_master(r, self._spark_home(cluster)) LOG.info(_LI("Spark service has been started")) def start_cluster(self, cluster): nn_instance = utils.get_instance(cluster, "namenode") dn_instances = utils.get_instances(cluster, "datanode") # Start the name node self._start_namenode(nn_instance) # start the data nodes self._start_datanode_processes(dn_instances) run.await_datanodes(cluster) LOG.info(_LI("Hadoop services have been started")) with remote.get_remote(nn_instance) as r: r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/") r.execute_command("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/") # start spark nodes self.start_spark(cluster) swift_helper.install_ssl_certs(utils.get_instances(cluster)) LOG.info(_LI('Cluster has been started successfully')) self._set_cluster_info(cluster) def _spark_home(self, cluster): return utils.get_config_value_or_default("Spark", "Spark home", cluster) def _extract_configs_to_extra(self, cluster): sp_master = utils.get_instance(cluster, "master") sp_slaves = 
utils.get_instances(cluster, "slave") extra = dict() config_master = config_slaves = '' if sp_master is not None: config_master = c_helper.generate_spark_env_configs(cluster) if sp_slaves is not None: slavenames = [] for slave in sp_slaves: slavenames.append(slave.hostname()) config_slaves = c_helper.generate_spark_slaves_configs(slavenames) else: config_slaves = "\n" # Any node that might be used to run spark-submit will need # these libs for swift integration config_defaults = c_helper.generate_spark_executor_classpath(cluster) extra['job_cleanup'] = c_helper.generate_job_cleanup_config(cluster) extra['sp_master'] = config_master extra['sp_slaves'] = config_slaves extra['sp_defaults'] = config_defaults if c_helper.is_data_locality_enabled(cluster): topology_data = th.generate_topology_map( cluster, CONF.enable_hypervisor_awareness) extra['topology_data'] = "\n".join( [k + " " + v for k, v in topology_data.items()]) + "\n" return extra def _add_instance_ng_related_to_extra(self, cluster, instance, extra): extra = extra.copy() ng = instance.node_group nn = utils.get_instance(cluster, "namenode") extra['xml'] = c_helper.generate_xml_configs(ng.configuration(), instance.storage_paths(), nn.hostname(), None) extra['setup_script'] = c_helper.generate_hadoop_setup_script( instance.storage_paths(), c_helper.extract_hadoop_environment_confs(ng.configuration())) return extra def _start_datanode_processes(self, dn_instances): if len(dn_instances) == 0: return cpo.add_provisioning_step( dn_instances[0].cluster_id, utils.start_process_event_message("DataNodes"), len(dn_instances)) with context.ThreadGroup() as tg: for i in dn_instances: tg.spawn('spark-start-dn-%s' % i.instance_name, self._start_datanode, i) @cpo.event_wrapper(mark_successful_on_exit=True) def _start_datanode(self, instance): with instance.remote() as r: run.start_processes(r, "datanode") def _setup_instances(self, cluster, instances=None): extra = self._extract_configs_to_extra(cluster) if instances is None: instances = utils.get_instances(cluster) self._push_configs_to_nodes(cluster, extra, instances) def _push_configs_to_nodes(self, cluster, extra, new_instances): all_instances = utils.get_instances(cluster) cpo.add_provisioning_step(cluster.id, _("Push configs to nodes"), len(all_instances)) with context.ThreadGroup() as tg: for instance in all_instances: extra = self._add_instance_ng_related_to_extra( cluster, instance, extra) if instance in new_instances: tg.spawn('spark-configure-%s' % instance.instance_name, self._push_configs_to_new_node, cluster, extra, instance) else: tg.spawn('spark-reconfigure-%s' % instance.instance_name, self._push_configs_to_existing_node, cluster, extra, instance) @cpo.event_wrapper(mark_successful_on_exit=True) def _push_configs_to_new_node(self, cluster, extra, instance): files_hadoop = { os.path.join(c_helper.HADOOP_CONF_DIR, "core-site.xml"): extra['xml']['core-site'], os.path.join(c_helper.HADOOP_CONF_DIR, "hdfs-site.xml"): extra['xml']['hdfs-site'], } sp_home = self._spark_home(cluster) files_spark = { os.path.join(sp_home, 'conf/spark-env.sh'): extra['sp_master'], os.path.join(sp_home, 'conf/slaves'): extra['sp_slaves'], os.path.join(sp_home, 'conf/spark-defaults.conf'): extra['sp_defaults'] } files_init = { '/tmp/sahara-hadoop-init.sh': extra['setup_script'], 'id_rsa': cluster.management_private_key, 'authorized_keys': cluster.management_public_key } # pietro: This is required because the (secret) key is not stored in # .ssh which hinders password-less ssh required by spark scripts key_cmd = 
('sudo cp $HOME/id_rsa $HOME/.ssh/; ' 'sudo chown $USER $HOME/.ssh/id_rsa; ' 'sudo chmod 600 $HOME/.ssh/id_rsa') storage_paths = instance.storage_paths() dn_path = ' '.join(c_helper.make_hadoop_path(storage_paths, '/dfs/dn')) nn_path = ' '.join(c_helper.make_hadoop_path(storage_paths, '/dfs/nn')) hdfs_dir_cmd = ('sudo mkdir -p %(nn_path)s %(dn_path)s &&' 'sudo chown -R hdfs:hadoop %(nn_path)s %(dn_path)s &&' 'sudo chmod 755 %(nn_path)s %(dn_path)s' % { "nn_path": nn_path, "dn_path": dn_path }) with remote.get_remote(instance) as r: r.execute_command('sudo chown -R $USER:$USER /etc/hadoop') r.execute_command('sudo chown -R $USER:$USER %s' % sp_home) r.write_files_to(files_hadoop) r.write_files_to(files_spark) r.write_files_to(files_init) r.execute_command('sudo chmod 0500 /tmp/sahara-hadoop-init.sh') r.execute_command('sudo /tmp/sahara-hadoop-init.sh ' '>> /tmp/sahara-hadoop-init.log 2>&1') r.execute_command(hdfs_dir_cmd) r.execute_command(key_cmd) if c_helper.is_data_locality_enabled(cluster): r.write_file_to( '/etc/hadoop/topology.sh', f.get_file_text('plugins/spark/resources/topology.sh')) r.execute_command('sudo chmod +x /etc/hadoop/topology.sh') self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance) self._push_cleanup_job(r, cluster, extra, instance) @cpo.event_wrapper(mark_successful_on_exit=True) def _push_configs_to_existing_node(self, cluster, extra, instance): node_processes = instance.node_group.node_processes need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or 'namenode' in node_processes) need_update_spark = ('master' in node_processes or 'slave' in node_processes) if need_update_spark: sp_home = self._spark_home(cluster) files = { os.path.join(sp_home, 'conf/spark-env.sh'): extra['sp_master'], os.path.join(sp_home, 'conf/slaves'): extra['sp_slaves'], os.path.join(sp_home, 'conf/spark-defaults.conf'): extra['sp_defaults'] } r = remote.get_remote(instance) r.write_files_to(files) self._push_cleanup_job(r, cluster, extra, instance) if need_update_hadoop: with remote.get_remote(instance) as r: self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance) def _write_topology_data(self, r, cluster, extra): if c_helper.is_data_locality_enabled(cluster): topology_data = extra['topology_data'] r.write_file_to('/etc/hadoop/topology.data', topology_data) def _push_master_configs(self, r, cluster, extra, instance): node_processes = instance.node_group.node_processes if 'namenode' in node_processes: self._push_namenode_configs(cluster, r) def _push_cleanup_job(self, r, cluster, extra, instance): node_processes = instance.node_group.node_processes if 'master' in node_processes: if extra['job_cleanup']['valid']: r.write_file_to('/etc/hadoop/tmp-cleanup.sh', extra['job_cleanup']['script']) r.execute_command("chmod 755 /etc/hadoop/tmp-cleanup.sh") cmd = 'sudo sh -c \'echo "%s" > /etc/cron.d/spark-cleanup\'' r.execute_command(cmd % extra['job_cleanup']['cron']) else: r.execute_command("sudo rm -f /etc/hadoop/tmp-cleanup.sh") r.execute_command("sudo rm -f /etc/crond.d/spark-cleanup") def _push_namenode_configs(self, cluster, r): r.write_file_to( '/etc/hadoop/dn.incl', utils.generate_fqdn_host_names( utils.get_instances(cluster, "datanode"))) r.write_file_to('/etc/hadoop/dn.excl', '') def _set_cluster_info(self, cluster): nn = utils.get_instance(cluster, "namenode") sp_master = utils.get_instance(cluster, "master") info = {} if nn: address = utils.get_config_value_or_default( 'HDFS', 
'dfs.http.address', cluster) port = address[address.rfind(':') + 1:] info['HDFS'] = { 'Web UI': 'http://%s:%s' % (nn.get_ip_or_dns_name(), port) } info['HDFS']['NameNode'] = 'hdfs://%s:8020' % nn.hostname() if sp_master: port = utils.get_config_value_or_default('Spark', 'Master webui port', cluster) if port is not None: info['Spark'] = { 'Web UI': 'http://%s:%s' % (sp_master.get_ip_or_dns_name(), port) } ctx = context.ctx() conductor.cluster_update(ctx, cluster, {'info': info}) # Scaling def validate_scaling(self, cluster, existing, additional): self._validate_existing_ng_scaling(cluster, existing) self._validate_additional_ng_scaling(cluster, additional) def decommission_nodes(self, cluster, instances): sls = utils.get_instances(cluster, "slave") dns = utils.get_instances(cluster, "datanode") decommission_dns = False decommission_sls = False for i in instances: if 'datanode' in i.node_group.node_processes: dns.remove(i) decommission_dns = True if 'slave' in i.node_group.node_processes: sls.remove(i) decommission_sls = True nn = utils.get_instance(cluster, "namenode") spark_master = utils.get_instance(cluster, "master") if decommission_sls: sc.decommission_sl(spark_master, instances, sls) if decommission_dns: sc.decommission_dn(nn, instances, dns) def scale_cluster(self, cluster, instances): master = utils.get_instance(cluster, "master") r_master = remote.get_remote(master) run.stop_spark(r_master, self._spark_home(cluster)) self._setup_instances(cluster, instances) nn = utils.get_instance(cluster, "namenode") run.refresh_nodes(remote.get_remote(nn), "dfsadmin") dn_instances = [ instance for instance in instances if 'datanode' in instance.node_group.node_processes ] self._start_datanode_processes(dn_instances) swift_helper.install_ssl_certs(instances) run.start_spark_master(r_master, self._spark_home(cluster)) LOG.info(_LI("Spark master service has been restarted")) def _get_scalable_processes(self): return ["datanode", "slave"] def _validate_additional_ng_scaling(self, cluster, additional): scalable_processes = self._get_scalable_processes() for ng_id in additional: ng = ug.get_by_id(cluster.node_groups, ng_id) if not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Spark plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes)) def _validate_existing_ng_scaling(self, cluster, existing): scalable_processes = self._get_scalable_processes() dn_to_delete = 0 for ng in cluster.node_groups: if ng.id in existing: if ng.count > existing[ng.id] and ("datanode" in ng.node_processes): dn_to_delete += ng.count - existing[ng.id] if not set(ng.node_processes).issubset(scalable_processes): raise ex.NodeGroupCannotBeScaled( ng.name, _("Spark plugin cannot scale nodegroup" " with processes: %s") % ' '.join(ng.node_processes)) dn_amount = len(utils.get_instances(cluster, "datanode")) rep_factor = utils.get_config_value_or_default('HDFS', "dfs.replication", cluster) if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor: raise ex.ClusterCannotBeScaled( cluster.name, _("Spark plugin cannot shrink cluster because " "there would be not enough nodes for HDFS " "replicas (replication factor is %s)") % rep_factor) def get_edp_engine(self, cluster, job_type): if edp_engine.EdpEngine.job_type_supported(job_type): return edp_engine.EdpEngine(cluster) if shell_engine.ShellEngine.job_type_supported(job_type): return shell_engine.ShellEngine(cluster) return None def get_edp_job_types(self, versions=None): res = {} for vers in 
self.get_versions(): if not versions or vers in versions: res[vers] = shell_engine.ShellEngine.get_supported_job_types() if edp_engine.EdpEngine.edp_supported(vers): res[vers].extend( edp_engine.EdpEngine.get_supported_job_types()) return res def get_edp_config_hints(self, job_type, version): if (edp_engine.EdpEngine.edp_supported(version) and edp_engine.EdpEngine.job_type_supported(job_type)): return edp_engine.EdpEngine.get_possible_job_config(job_type) if shell_engine.ShellEngine.job_type_supported(job_type): return shell_engine.ShellEngine.get_possible_job_config(job_type) return {} def get_open_ports(self, node_group): cluster = node_group.cluster ports_map = { 'namenode': [8020, 50070, 50470], 'datanode': [50010, 1004, 50075, 1006, 50020], 'master': [ int( utils.get_config_value_or_default("Spark", "Master port", cluster)), int( utils.get_config_value_or_default("Spark", "Master webui port", cluster)), ], 'slave': [ int( utils.get_config_value_or_default("Spark", "Worker webui port", cluster)) ] } ports = [] for process in node_group.node_processes: if process in ports_map: ports.extend(ports_map[process]) return ports def recommend_configs(self, cluster, scaling=False): want_to_configure = { 'cluster_configs': { 'dfs.replication': ('HDFS', 'dfs.replication') } } provider = ru.HadoopAutoConfigsProvider( want_to_configure, self.get_configs(cluster.hadoop_version), cluster, scaling) provider.apply_recommended_configs()