def _push_configs_to_existing_node(self, cluster, extra, instance):
    node_processes = instance.node_group.node_processes
    need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                          'namenode' in node_processes)
    need_update_spark = ('master' in node_processes or
                         'slave' in node_processes)

    if need_update_spark:
        ng_extra = extra[instance.node_group.id]
        sp_home = self._spark_home(cluster)
        files = {
            os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
            os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves'],
            os.path.join(
                sp_home,
                'conf/spark-defaults.conf'): ng_extra['sp_defaults']
        }
        r = remote.get_remote(instance)
        r.write_files_to(files)
        self._push_cleanup_job(r, cluster, extra, instance)
    if need_update_hadoop:
        with remote.get_remote(instance) as r:
            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
def _push_configs_to_existing_node(self, cluster, extra, instance):
    node_processes = instance.node_group.node_processes
    need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                          'namenode' in node_processes)
    need_update_spark = ('master' in node_processes or
                         'slave' in node_processes)

    if need_update_spark:
        ng_extra = extra[instance.node_group.id]
        sp_home = self._spark_home(cluster)
        files = {
            os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
            os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves'],
            os.path.join(sp_home, 'conf/spark-defaults.conf'):
                ng_extra['sp_defaults']
        }
        r = remote.get_remote(instance)
        r.write_files_to(files)
        self._push_cleanup_job(r, cluster, extra, instance)
    if need_update_hadoop:
        with remote.get_remote(instance) as r:
            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
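# A minimal stand-alone sketch (an assumption, not part of the source) of
# the two remote-handle styles the functions above mix: a bare handle for
# one-off file pushes, and a context manager when several operations share
# one connection. ``FakeRemote`` merely stands in for remote.get_remote().
class FakeRemote(object):
    def __enter__(self):
        return self

    def __exit__(self, *exc):
        pass

    def write_files_to(self, files, run_as_root=False):
        for path, data in files.items():
            print('write %s (%d bytes)' % (path, len(data)))

    def execute_command(self, cmd, raise_when_error=True):
        print('run: %s' % cmd)
        return 0, ''


r = FakeRemote()                       # bare handle, as in the spark push
r.write_files_to({'/tmp/example.conf': 'key=value'})

with FakeRemote() as r:                # context-managed, as in the hadoop push
    ret, stdout = r.execute_command('echo ok')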
def start_cluster(self, cluster): nn_instance = utils.get_instance(cluster, "namenode") sm_instance = utils.get_instance(cluster, "master") dn_instances = utils.get_instances(cluster, "datanode") # Start the name node with remote.get_remote(nn_instance) as r: run.format_namenode(r) run.start_processes(r, "namenode") # start the data nodes self._start_slave_datanode_processes(dn_instances) LOG.info(_LI("Hadoop services in cluster %s have been started"), cluster.name) with remote.get_remote(nn_instance) as r: r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/") r.execute_command("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/") # start spark nodes if sm_instance: with remote.get_remote(sm_instance) as r: run.start_spark_master(r, self._spark_home(cluster)) LOG.info(_LI("Spark service at '%s' has been started"), sm_instance.hostname()) LOG.info(_LI('Cluster %s has been started successfully'), cluster.name) self._set_cluster_info(cluster)
def start_cluster(self, cluster): nn_instance = utils.get_instance(cluster, "namenode") sm_instance = utils.get_instance(cluster, "master") dn_instances = utils.get_instances(cluster, "datanode") # Start the name node with remote.get_remote(nn_instance) as r: run.format_namenode(r) run.start_processes(r, "namenode") # start the data nodes self._start_slave_datanode_processes(dn_instances) LOG.info("Hadoop services in cluster %s have been started" % cluster.name) with remote.get_remote(nn_instance) as r: r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/") r.execute_command(("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/")) # start spark nodes if sm_instance: with remote.get_remote(sm_instance) as r: run.start_spark_master(r, self._spark_home(cluster)) LOG.info("Spark service at '%s' has been started", sm_instance.hostname()) LOG.info('Cluster %s has been started successfully' % cluster.name) self._set_cluster_info(cluster)
def scale_cluster(self, cluster, instances):
    self._setup_instances(cluster, instances)

    run.refresh_nodes(remote.get_remote(vu.get_namenode(cluster)),
                      "dfsadmin")
    jt = vu.get_jobtracker(cluster)
    if jt:
        run.refresh_nodes(remote.get_remote(jt), "mradmin")

    self._start_tt_dn_processes(instances)
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(
                            inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({'/etc/hadoop/tt.incl':
                          utils.generate_fqdn_host_names(survived_inst),
                          '/etc/hadoop/tt.excl': "",
                          })
def decommission_tt(jt, inst_to_be_deleted, survived_inst):
    with remote.get_remote(jt) as r:
        r.write_file_to('/etc/hadoop/tt.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(jt), "mradmin")
        context.sleep(3)
        r.write_files_to({
            '/etc/hadoop/tt.incl':
                utils.generate_fqdn_host_names(survived_inst),
            '/etc/hadoop/tt.excl': "",
        })
def scale_cluster(self, cluster, instances):
    master = utils.get_instance(cluster, "master")
    r_master = remote.get_remote(master)

    run.stop_spark(r_master, self._spark_home(cluster))

    self._setup_instances(cluster, instances)
    nn = utils.get_instance(cluster, "namenode")
    run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
    self._start_slave_datanode_processes(instances)

    run.start_spark_master(r_master, self._spark_home(cluster))
    LOG.info(_LI("Spark master service at '%s' has been restarted"),
             master.hostname())
def scale_cluster(self, cluster, instances):
    master = utils.get_instance(cluster, "master")
    r_master = remote.get_remote(master)

    run.stop_spark(r_master, self._spark_home(cluster))

    self._setup_instances(cluster, instances)
    nn = utils.get_instance(cluster, "namenode")
    run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
    dn_instances = [instance for instance in instances if
                    "datanode" in instance.node_group.node_processes]
    self._start_datanode_processes(dn_instances)

    run.start_spark_master(r_master, self._spark_home(cluster))
    LOG.info(_LI("Spark master service has been restarted"))
def start_cluster(self, cluster):
    nn_instance = vu.get_namenode(cluster)
    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_processes(r, "namenode")

    for snn in vu.get_secondarynamenodes(cluster):
        run.start_processes(remote.get_remote(snn), "secondarynamenode")

    jt_instance = vu.get_jobtracker(cluster)
    if jt_instance:
        run.start_processes(remote.get_remote(jt_instance), "jobtracker")

    self._start_tt_dn_processes(utils.get_instances(cluster))

    self._await_datanodes(cluster)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    oozie = vu.get_oozie(cluster)
    if oozie:
        with remote.get_remote(oozie) as r:
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r, oozie)
                run.oozie_create_db(r)
            run.oozie_share_lib(r, nn_instance.hostname())
            run.start_oozie(r)
            LOG.info(_LI("Oozie service at '%s' has been started"),
                     nn_instance.hostname())

    hive_server = vu.get_hiveserver(cluster)
    if hive_server:
        with remote.get_remote(hive_server) as r:
            run.hive_create_warehouse_dir(r)
            run.hive_copy_shared_conf(
                r, edp.get_hive_shared_conf_path('hadoop'))

            if c_helper.is_mysql_enable(cluster):
                if not oozie or hive_server.hostname() != oozie.hostname():
                    run.mysql_start(r, hive_server)
                run.hive_create_db(r, cluster.extra['hive_mysql_passwd'])
                run.hive_metastore_start(r)
                LOG.info(
                    _LI("Hive Metastore server at %s has been "
                        "started"), hive_server.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
    self._set_cluster_info(cluster)
def start_cluster(self, cluster):
    nn_instance = vu.get_namenode(cluster)
    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_processes(r, "namenode")

    for snn in vu.get_secondarynamenodes(cluster):
        run.start_processes(remote.get_remote(snn), "secondarynamenode")

    jt_instance = vu.get_jobtracker(cluster)
    if jt_instance:
        run.start_processes(remote.get_remote(jt_instance), "jobtracker")

    self._start_tt_dn_processes(utils.get_instances(cluster))

    self._await_datanodes(cluster)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    oozie = vu.get_oozie(cluster)
    if oozie:
        with remote.get_remote(oozie) as r:
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r, oozie)
                run.oozie_create_db(r)
            run.oozie_share_lib(r, nn_instance.hostname())
            run.start_oozie(r)
            LOG.info(_LI("Oozie service at '%s' has been started"),
                     nn_instance.hostname())

    hive_server = vu.get_hiveserver(cluster)
    if hive_server:
        with remote.get_remote(hive_server) as r:
            run.hive_create_warehouse_dir(r)
            run.hive_copy_shared_conf(
                r, edp.get_hive_shared_conf_path('hadoop'))

            if c_helper.is_mysql_enable(cluster):
                if not oozie or hive_server.hostname() != oozie.hostname():
                    run.mysql_start(r, hive_server)
                run.hive_create_db(r)
                run.hive_metastore_start(r)
                LOG.info(_LI("Hive Metastore server at %s has been "
                             "started"), hive_server.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'), cluster.name)
    self._set_cluster_info(cluster)
def _get_job_status_from_remote(self, job_execution, retries=3):
    topology_name, inst_id = self._get_instance_if_running(
        job_execution)
    if topology_name is None or inst_id is None:
        return edp.JOB_STATUSES_TERMINATED

    topology_name = self._get_topology_name(job_execution)
    master = plugin_utils.get_instance(self.cluster, "nimbus")

    cmd = (
        "%(storm)s -c nimbus.host=%(host)s "
        "list | grep %(topology_name)s | awk '{print $2}'") % (
        {
            "storm": "/usr/local/storm/bin/storm",
            "host": master.hostname(),
            "topology_name": topology_name
        })

    for i in range(retries):
        with remote.get_remote(master) as r:
            ret, stdout = r.execute_command("%s " % (cmd))

        # If the status is ACTIVE, the topology is still running
        if stdout.strip() == "ACTIVE":
            return {"status": edp.JOB_STATUS_RUNNING}
        else:
            if i == retries - 1:
                return {"status": edp.JOB_STATUS_KILLED}
            context.sleep(10)
def _execute_remote_job(self, master, wf_dir, cmd):
    with remote.get_remote(master) as r:
        ret, stdout = r.execute_command(
            "cd %s; %s > /dev/null 2>&1 & echo $!" % (wf_dir, cmd))

    return ret, stdout
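# A small, self-contained illustration (an assumption, not the engine's
# code) of the "cmd > /dev/null 2>&1 & echo $!" trick used above: the
# command is backgrounded, the shell prints its PID, and the caller reads
# that PID from stdout.
import subprocess

out = subprocess.check_output("sleep 5 > /dev/null 2>&1 & echo $!",
                              shell=True)
pid = int(out.decode().strip())  # the same parsing a caller needs to do
print("background pid:", pid)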
def _upload_wrapper_xml(self, where, job_dir, job_configs):
    xml_name = 'spark.xml'
    proxy_configs = job_configs.get('proxy_configs')
    configs = {}
    cfgs = job_configs.get('configs', {})
    if proxy_configs:
        configs[sw.HADOOP_SWIFT_USERNAME] = proxy_configs.get(
            'proxy_username')
        configs[sw.HADOOP_SWIFT_PASSWORD] = key_manager.get_secret(
            proxy_configs.get('proxy_password'))
        configs[sw.HADOOP_SWIFT_TRUST_ID] = proxy_configs.get(
            'proxy_trust_id')
        configs[sw.HADOOP_SWIFT_DOMAIN_NAME] = CONF.proxy_user_domain_name
    else:
        targets = [sw.HADOOP_SWIFT_USERNAME]
        configs = {k: cfgs[k] for k in targets if k in cfgs}
        if sw.HADOOP_SWIFT_PASSWORD in cfgs:
            configs[sw.HADOOP_SWIFT_PASSWORD] = (
                key_manager.get_secret(cfgs[sw.HADOOP_SWIFT_PASSWORD])
            )

    for s3_cfg_key in s3_common.S3_DS_CONFIGS:
        if s3_cfg_key in cfgs:
            if s3_cfg_key == s3_common.S3_SECRET_KEY_CONFIG:
                configs[s3_cfg_key] = (
                    key_manager.get_secret(cfgs[s3_cfg_key])
                )
            else:
                configs[s3_cfg_key] = cfgs[s3_cfg_key]

    content = xmlutils.create_hadoop_xml(configs)
    with remote.get_remote(where) as r:
        dst = os.path.join(job_dir, xml_name)
        r.write_file_to(dst, content)
    return xml_name
def _upload_job_files_to_hdfs(self, where, job_dir, job, configs,
                              proxy_configs=None):
    mains = list(job.mains) if job.mains else []
    libs = list(job.libs) if job.libs else []
    builtin_libs = edp.get_builtin_binaries(job, configs)
    uploaded_paths = []
    hdfs_user = self.get_hdfs_user()
    job_dir_suffix = 'lib' if job.type != edp.JOB_TYPE_SHELL else ''
    lib_dir = os.path.join(job_dir, job_dir_suffix)

    with remote.get_remote(where) as r:
        job_binaries = mains + libs
        self._prepare_job_binaries(job_binaries, r)

        # upload mains
        uploaded_paths.extend(self._upload_job_binaries(r, mains,
                                                        proxy_configs,
                                                        hdfs_user,
                                                        job_dir))
        # upload libs
        if len(libs) and job_dir_suffix:
            # HDFS 2.2.0 fails to put file if the lib dir does not exist
            self.create_hdfs_dir(r, lib_dir)
        uploaded_paths.extend(self._upload_job_binaries(r, libs,
                                                        proxy_configs,
                                                        hdfs_user,
                                                        lib_dir))
        # upload builtin libs
        for lib in builtin_libs:
            h.put_file_to_hdfs(r, lib['raw'], lib['name'], lib_dir,
                               hdfs_user)
            uploaded_paths.append(lib_dir + '/' + lib['name'])
    return uploaded_paths
def _get_job_status_from_remote(self, job_execution, retries=3):
    topology_name, inst_id = self._get_instance_if_running(job_execution)
    if topology_name is None or inst_id is None:
        return edp.JOB_STATUSES_TERMINATED

    topology_name = self._get_topology_name(job_execution)
    master = plugin_utils.get_instance(self.cluster, "nimbus")

    cmd = ("%(storm)s -c nimbus.host=%(host)s "
           "list | grep %(topology_name)s | awk '{print $2}'") % (
        {
            "storm": "/usr/local/storm/bin/storm",
            "host": master.hostname(),
            "topology_name": topology_name
        })

    for i in range(retries):
        with remote.get_remote(master) as r:
            ret, stdout = r.execute_command("%s " % (cmd))

        # If the status is ACTIVE, the topology is still running
        if stdout.strip() == "ACTIVE":
            return {"status": edp.JOB_STATUS_RUNNING}
        else:
            if i == retries - 1:
                return {"status": edp.JOB_STATUS_KILLED}
            context.sleep(10)
def _exec_cmd_on_remote_instance(self, master, cmd):
    if master is not None:
        with remote.get_remote(master) as r:
            ret, stdout = r.execute_command("%s > /dev/null 2>&1 & echo $!"
                                            % cmd)

            return ret, stdout
def _push_configs_to_new_node(self, cluster, extra, instance):
    ng_extra = extra[instance.node_group.id]

    files_supervisor = {
        '/etc/supervisor/supervisord.conf': ng_extra['slave_sv_conf']
    }
    files_storm = {
        '/usr/local/storm/conf/storm.yaml': ng_extra['st_instances']
    }
    files_zk = {
        '/opt/zookeeper/zookeeper/conf/zoo.cfg': ng_extra['zk_conf']
    }
    files_supervisor_master = {
        '/etc/supervisor/supervisord.conf': ng_extra['master_sv_conf']
    }

    with remote.get_remote(instance) as r:
        node_processes = instance.node_group.node_processes
        r.write_files_to(files_storm, run_as_root=True)
        if 'zookeeper' in node_processes:
            self._push_zk_configs(r, files_zk)
        if 'nimbus' in node_processes:
            self._push_supervisor_configs(r, files_supervisor_master)
        if 'supervisor' in node_processes:
            self._push_supervisor_configs(r, files_supervisor)
def upload_job_files(where, job_dir, job, libs_subdir=True,
                     proxy_configs=None):
    mains = job.mains or []
    libs = job.libs or []
    uploaded_paths = []

    def upload(r, dir, job_file):
        dst = os.path.join(dir, job_file.name)
        raw_data = dispatch.get_raw_binary(job_file, proxy_configs)
        r.write_file_to(dst, raw_data)
        uploaded_paths.append(dst)

    with remote.get_remote(where) as r:
        libs_dir = job_dir
        if libs_subdir and libs:
            libs_dir = os.path.join(libs_dir, "libs")
            r.execute_command("mkdir -p %s" % libs_dir)
        for job_file in mains:
            upload(r, job_dir, job_file)
        for job_file in libs:
            upload(r, libs_dir, job_file)
    return uploaded_paths
def scale_cluster(self, cluster, instances):
    master = utils.get_instance(cluster, "master")
    r_master = remote.get_remote(master)

    run.stop_spark(r_master, self._spark_home(cluster))

    self._setup_instances(cluster, instances)
    nn = utils.get_instance(cluster, "namenode")
    run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
    dn_instances = [instance for instance in instances if
                    'datanode' in instance.node_group.node_processes]
    self._start_datanode_processes(dn_instances)

    swift_helper.install_ssl_certs(instances)

    run.start_spark_master(r_master, self._spark_home(cluster))
    LOG.info("Spark master service has been restarted")
def _upload_job_files(self, where, job_dir, job, job_configs):

    def upload(r, dir, job_file, proxy_configs):
        dst = os.path.join(dir, job_file.name)
        raw_data = dispatch.get_raw_binary(job_file, proxy_configs)
        r.write_file_to(dst, raw_data)
        return dst

    def upload_builtin(r, dir, builtin):
        dst = os.path.join(dir, builtin['name'])
        r.write_file_to(dst, builtin['raw'])
        return dst

    builtin_libs = []
    if edp.is_adapt_spark_for_swift_enabled(
            job_configs.get('configs', {})):
        path = 'service/edp/resources/edp-spark-wrapper.jar'
        name = 'builtin-%s.jar' % six.text_type(uuid.uuid4())
        builtin_libs = [{'raw': files.get_file_text(path), 'name': name}]

    uploaded_paths = []
    builtin_paths = []
    with remote.get_remote(where) as r:
        mains = list(job.mains) if job.mains else []
        libs = list(job.libs) if job.libs else []
        for job_file in mains + libs:
            uploaded_paths.append(
                upload(r, job_dir, job_file,
                       job_configs.get('proxy_configs')))
        for builtin in builtin_libs:
            builtin_paths.append(upload_builtin(r, job_dir, builtin))
    return uploaded_paths, builtin_paths
def start_cluster(self, cluster): nn_instance = utils.get_instance(cluster, "namenode") dn_instances = utils.get_instances(cluster, "datanode") # Start the name node self._start_namenode(nn_instance) # start the data nodes self._start_datanode_processes(dn_instances) LOG.info( _LI("Hadoop services in cluster {cluster} have been started"). format(cluster=cluster.name)) with remote.get_remote(nn_instance) as r: r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/") r.execute_command("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/") # start spark nodes self.start_spark(cluster) LOG.info( _LI('Cluster {cluster} has been started successfully').format( cluster=cluster.name)) self._set_cluster_info(cluster)
def start_cluster(self, cluster): nn_instance = utils.get_instance(cluster, "namenode") dn_instances = utils.get_instances(cluster, "datanode") zep_instance = utils.get_instance(cluster, "zeppelin") # Start the name node self._start_namenode(nn_instance) # start the data nodes self._start_datanode_processes(dn_instances) LOG.info(_LI("Hadoop services have been started")) with remote.get_remote(nn_instance) as r: r.execute_command("sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/") r.execute_command("sudo -u hdfs hdfs dfs -chown $USER " "/user/$USER/") # start spark nodes self.start_spark(cluster) # start zeppelin, if necessary if zep_instance: self._start_zeppelin(zep_instance) LOG.info(_LI('Cluster has been started successfully')) self._set_cluster_info(cluster)
def _upload_job_files_to_hdfs(self, where, job_dir, job, configs, proxy_configs=None): mains = job.mains or [] libs = job.libs or [] builtin_libs = edp.get_builtin_binaries(job, configs) uploaded_paths = [] hdfs_user = self.get_hdfs_user() job_dir_suffix = "lib" if job.type != edp.JOB_TYPE_SHELL else "" lib_dir = os.path.join(job_dir, job_dir_suffix) with remote.get_remote(where) as r: for main in mains: raw_data = dispatch.get_raw_binary(main, proxy_configs) h.put_file_to_hdfs(r, raw_data, main.name, job_dir, hdfs_user) uploaded_paths.append(job_dir + "/" + main.name) if len(libs) and job_dir_suffix: # HDFS 2.2.0 fails to put file if the lib dir does not exist self.create_hdfs_dir(r, lib_dir) for lib in libs: raw_data = dispatch.get_raw_binary(lib, proxy_configs) h.put_file_to_hdfs(r, raw_data, lib.name, lib_dir, hdfs_user) uploaded_paths.append(lib_dir + "/" + lib.name) for lib in builtin_libs: h.put_file_to_hdfs(r, lib["raw"], lib["name"], lib_dir, hdfs_user) uploaded_paths.append(lib_dir + "/" + lib["name"]) return uploaded_paths
def _upload_job_files_to_hdfs(self, where, job_dir, job, configs,
                              proxy_configs=None):
    mains = job.mains or []
    libs = job.libs or []
    builtin_libs = edp.get_builtin_binaries(job, configs)
    uploaded_paths = []
    hdfs_user = self.get_hdfs_user()
    lib_dir = job_dir + '/lib'

    with remote.get_remote(where) as r:
        for main in mains:
            raw_data = dispatch.get_raw_binary(main, proxy_configs)
            h.put_file_to_hdfs(r, raw_data, main.name, job_dir, hdfs_user)
            uploaded_paths.append(job_dir + '/' + main.name)
        if len(libs) > 0:
            # HDFS 2.2.0 fails to put file if the lib dir does not exist
            self.create_hdfs_dir(r, lib_dir)
        for lib in libs:
            raw_data = dispatch.get_raw_binary(lib, proxy_configs)
            h.put_file_to_hdfs(r, raw_data, lib.name, lib_dir, hdfs_user)
            uploaded_paths.append(lib_dir + '/' + lib.name)
        for lib in builtin_libs:
            h.put_file_to_hdfs(r, lib['raw'], lib['name'], lib_dir,
                               hdfs_user)
            uploaded_paths.append(lib_dir + '/' + lib['name'])
    return uploaded_paths
def _push_configs_to_new_node(self, cluster, extra, instance):
    ng_extra = extra[instance.node_group.id]

    files_hadoop = {
        '/etc/hadoop/conf/core-site.xml': ng_extra['xml']['core-site'],
        '/etc/hadoop/conf/hdfs-site.xml': ng_extra['xml']['hdfs-site'],
    }

    sp_home = self._spark_home(cluster)
    files_spark = {
        os.path.join(sp_home, 'conf/spark-env.sh'): ng_extra['sp_master'],
        os.path.join(sp_home, 'conf/slaves'): ng_extra['sp_slaves']
    }

    files_init = {
        '/tmp/sahara-hadoop-init.sh': ng_extra['setup_script'],
        'id_rsa': cluster.management_private_key,
        'authorized_keys': cluster.management_public_key
    }

    # pietro: This is required because the (secret) key is not stored in
    # .ssh which hinders password-less ssh required by spark scripts
    key_cmd = ('sudo cp $HOME/id_rsa $HOME/.ssh/; '
               'sudo chown $USER $HOME/.ssh/id_rsa; '
               'sudo chmod 600 $HOME/.ssh/id_rsa')

    storage_paths = instance.node_group.storage_paths()
    dn_path = ' '.join(c_helper.make_hadoop_path(storage_paths,
                                                 '/dfs/dn'))
    nn_path = ' '.join(c_helper.make_hadoop_path(storage_paths,
                                                 '/dfs/nn'))

    hdfs_dir_cmd = ('sudo mkdir -p %(nn_path)s %(dn_path)s &&'
                    'sudo chown -R hdfs:hadoop %(nn_path)s %(dn_path)s &&'
                    'sudo chmod 755 %(nn_path)s %(dn_path)s' %
                    {"nn_path": nn_path, "dn_path": dn_path})

    with remote.get_remote(instance) as r:
        r.execute_command('sudo chown -R $USER:$USER /etc/hadoop')
        r.execute_command('sudo chown -R $USER:$USER %s' % sp_home)
        r.write_files_to(files_hadoop)
        r.write_files_to(files_spark)
        r.write_files_to(files_init)
        r.execute_command('sudo chmod 0500 /tmp/sahara-hadoop-init.sh')
        r.execute_command('sudo /tmp/sahara-hadoop-init.sh '
                          '>> /tmp/sahara-hadoop-init.log 2>&1')

        r.execute_command(hdfs_dir_cmd)
        r.execute_command(key_cmd)

        if c_helper.is_data_locality_enabled(cluster):
            r.write_file_to(
                '/etc/hadoop/topology.sh',
                f.get_file_text('plugins/spark/resources/topology.sh'))
            r.execute_command('sudo chmod +x /etc/hadoop/topology.sh')

        self._write_topology_data(r, cluster, extra)
        self._push_master_configs(r, cluster, extra, instance)
        self._push_cleanup_job(r, cluster, extra, instance)
def create_workflow_dir(where, job, hdfs_user):
    constructed_dir = '/user/%s/' % hdfs_user
    constructed_dir = _add_postfix(constructed_dir)
    constructed_dir += '%s/%s' % (job.name, six.text_type(uuid.uuid4()))
    with remote.get_remote(where) as r:
        h.create_dir(r, constructed_dir, hdfs_user)

    return constructed_dir
def _create_hdfs_workflow_dir(self, where, job):
    constructed_dir = '/user/%s/' % self.get_hdfs_user()
    constructed_dir = self._add_postfix(constructed_dir)
    constructed_dir += '%s/%s' % (job.name, uuidutils.generate_uuid())
    with remote.get_remote(where) as r:
        self.create_hdfs_dir(r, constructed_dir)

    return constructed_dir
def decommission_dn(nn, inst_to_be_deleted, survived_inst): with remote.get_remote(nn) as r: r.write_file_to('/etc/hadoop/dn.excl', utils.generate_fqdn_host_names( inst_to_be_deleted)) run.refresh_nodes(remote.get_remote(nn), "dfsadmin") context.sleep(3) poll_utils.plugin_option_poll( nn.cluster, _is_decommissioned, c_helper.DECOMMISSIONING_TIMEOUT, _("Decommission %s") % "DataNodes", 3, { 'r': r, 'inst_to_be_deleted': inst_to_be_deleted}) r.write_files_to({ '/etc/hadoop/dn.incl': utils. generate_fqdn_host_names(survived_inst), '/etc/hadoop/dn.excl': ""})
def _create_hdfs_workflow_dir(self, where, job):
    constructed_dir = '/user/%s/' % self.get_hdfs_user()
    constructed_dir = self._add_postfix(constructed_dir)
    constructed_dir += '%s/%s' % (job.name, six.text_type(uuid.uuid4()))
    with remote.get_remote(where) as r:
        self.create_hdfs_dir(r, constructed_dir)

    return constructed_dir
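# Illustrative sketch (values below are assumptions): the directory built
# by the workflow-dir helpers above has the shape
# /user/<hdfs_user>/<postfix>/<job.name>/<uuid>; with an empty postfix:
import uuid

hdfs_user = 'hadoop'        # assumed HDFS user
job_name = 'wordcount'      # assumed job name
constructed_dir = '/user/%s/' % hdfs_user
constructed_dir += '%s/%s' % (job_name, uuid.uuid4())
print(constructed_dir)      # e.g. /user/hadoop/wordcount/1b2e...-...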
def decommission_dn(nn, inst_to_be_deleted, survived_inst): with remote.get_remote(nn) as r: r.write_file_to('/etc/hadoop/dn.excl', utils.generate_fqdn_host_names( inst_to_be_deleted)) run.refresh_nodes(remote.get_remote(nn), "dfsadmin") context.sleep(3) poll_utils.plugin_option_poll( nn.cluster, is_decommissioned, config_helper.DECOMMISSIONING_TIMEOUT, _("Decommission %s") % "DataNodes", 3, {'r': r, 'inst_to_be_deleted': inst_to_be_deleted}) r.write_files_to({'/etc/hadoop/dn.incl': utils.generate_fqdn_host_names(survived_inst), '/etc/hadoop/dn.excl': ""})
def _push_configs_to_new_node(self, cluster, extra, instance): ng_extra = extra[instance.node_group.id] files_hadoop = { "/etc/hadoop/conf/core-site.xml": ng_extra["xml"]["core-site"], "/etc/hadoop/conf/hdfs-site.xml": ng_extra["xml"]["hdfs-site"], } sp_home = self._spark_home(cluster) files_spark = { os.path.join(sp_home, "conf/spark-env.sh"): ng_extra["sp_master"], os.path.join(sp_home, "conf/slaves"): ng_extra["sp_slaves"], } files_init = { "/tmp/sahara-hadoop-init.sh": ng_extra["setup_script"], "id_rsa": cluster.management_private_key, "authorized_keys": cluster.management_public_key, } # pietro: This is required because the (secret) key is not stored in # .ssh which hinders password-less ssh required by spark scripts key_cmd = ( "sudo cp $HOME/id_rsa $HOME/.ssh/; " "sudo chown $USER $HOME/.ssh/id_rsa; " "sudo chmod 600 $HOME/.ssh/id_rsa" ) for ng in cluster.node_groups: dn_path = c_helper.extract_hadoop_path(ng.storage_paths(), "/dfs/dn") nn_path = c_helper.extract_hadoop_path(ng.storage_paths(), "/dfs/nn") hdfs_dir_cmd = ("sudo mkdir -p %s %s;" "sudo chown -R hdfs:hadoop %s %s;" "sudo chmod 755 %s %s;") % ( nn_path, dn_path, nn_path, dn_path, nn_path, dn_path, ) with remote.get_remote(instance) as r: r.execute_command("sudo chown -R $USER:$USER /etc/hadoop") r.execute_command("sudo chown -R $USER:$USER %s" % sp_home) r.write_files_to(files_hadoop) r.write_files_to(files_spark) r.write_files_to(files_init) r.execute_command("sudo chmod 0500 /tmp/sahara-hadoop-init.sh") r.execute_command("sudo /tmp/sahara-hadoop-init.sh " ">> /tmp/sahara-hadoop-init.log 2>&1") r.execute_command(hdfs_dir_cmd) r.execute_command(key_cmd) if c_helper.is_data_locality_enabled(cluster): r.write_file_to("/etc/hadoop/topology.sh", f.get_file_text("plugins/spark/resources/topology.sh")) r.execute_command("sudo chmod +x /etc/hadoop/topology.sh") self._write_topology_data(r, cluster, extra) self._push_master_configs(r, cluster, extra, instance)
def _push_configs_to_existing_node(self, cluster, extra, instance):
    node_processes = instance.node_group.node_processes
    need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                          "namenode" in node_processes)
    need_update_spark = ("master" in node_processes or
                         "slave" in node_processes)

    if need_update_spark:
        ng_extra = extra[instance.node_group.id]
        sp_home = self._spark_home(cluster)
        files = {
            os.path.join(sp_home,
                         "conf/spark-env.sh"): ng_extra["sp_master"],
            os.path.join(sp_home, "conf/slaves"): ng_extra["sp_slaves"],
        }
        r = remote.get_remote(instance)
        r.write_files_to(files)
    if need_update_hadoop:
        with remote.get_remote(instance) as r:
            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
def _push_configs_to_existing_node(self, cluster, extra, instance):
    node_processes = instance.node_group.node_processes
    need_update_hadoop = (c_helper.is_data_locality_enabled(cluster) or
                          'namenode' in node_processes)
    need_update_spark = ('master' in node_processes or
                         'slave' in node_processes)

    if need_update_spark:
        ng_extra = extra[instance.node_group.id]
        files = {
            '/opt/spark/conf/spark-env.sh': ng_extra['sp_master'],
            '/opt/spark/conf/slaves': ng_extra['sp_slaves'],
        }
        r = remote.get_remote(instance)
        r.write_files_to(files)
    if need_update_hadoop:
        with remote.get_remote(instance) as r:
            self._write_topology_data(r, cluster, extra)
            self._push_master_configs(r, cluster, extra, instance)
def _start_oozie(self, cluster, oozie): nn_instance = vu.get_namenode(cluster) with remote.get_remote(oozie) as r: if c_helper.is_mysql_enable(cluster): run.mysql_start(r, oozie) run.oozie_create_db(r) run.oozie_share_lib(r, nn_instance.hostname()) run.start_oozie(r) LOG.info(_LI("Oozie service at {host} has been started").format(host=nn_instance.hostname()))
def create_workflow_dir(where, path, job, use_uuid=None):
    if use_uuid is None:
        use_uuid = six.text_type(uuid.uuid4())

    constructed_dir = _append_slash_if_needed(path)
    constructed_dir += '%s/%s' % (job.name, use_uuid)
    with remote.get_remote(where) as r:
        ret, stdout = r.execute_command("mkdir -p %s" % constructed_dir)

    return constructed_dir
def cancel_job(self, job_execution):
    pid, instance = self._get_instance_if_running(job_execution)
    if instance is not None:
        with remote.get_remote(instance) as r:
            ret, stdout = r.execute_command("kill -SIGINT %s" % pid,
                                            raise_when_error=False)
            if ret == 0:
                # We had some effect, check the status
                return self._get_job_status_from_remote(r, pid,
                                                        job_execution)
def scale_cluster(self, cluster, instances):
    master = utils.get_instance(cluster, "master")
    r_master = remote.get_remote(master)

    run.stop_spark(r_master, self._spark_home(cluster))

    self._setup_instances(cluster, instances)
    nn = utils.get_instance(cluster, "namenode")
    run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
    dn_instances = [instance for instance in instances if
                    'datanode' in instance.node_group.node_processes]
    self._start_datanode_processes(dn_instances)

    run.start_spark_master(r_master, self._spark_home(cluster))
    LOG.info(
        _LI("Spark master service at {host} has been restarted")
        .format(host=master.hostname()))
def _start_oozie(self, cluster, oozie): nn_instance = vu.get_namenode(cluster) with remote.get_remote(oozie) as r: if c_helper.is_mysql_enable(cluster): run.mysql_start(r, oozie) run.oozie_create_db(r) run.oozie_share_lib(r, nn_instance.hostname()) run.start_oozie(r) LOG.info(_LI("Oozie service at '%s' has been started"), nn_instance.hostname())
def _start_oozie(self, cluster, oozie):
    nn_instance = vu.get_namenode(cluster)

    with remote.get_remote(oozie) as r:
        with context.set_current_instance_id(oozie.instance_id):
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r)
                run.oozie_create_db(r)
            run.oozie_share_lib(r, nn_instance.hostname())
            run.start_oozie(r)
            LOG.info(_LI("Oozie service has been started"))
def decommission_dn(nn, inst_to_be_deleted, survived_inst): with remote.get_remote(nn) as r: r.write_file_to('/etc/hadoop/dn.excl', utils.generate_fqdn_host_names(inst_to_be_deleted)) run.refresh_nodes(remote.get_remote(nn), "dfsadmin") context.sleep(3) timeout = config_helper.get_decommissioning_timeout( nn.node_group.cluster) s_time = timeutils.utcnow() all_found = False while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout: cmd = r.execute_command( "sudo su -c 'hadoop dfsadmin -report' hadoop") all_found = True datanodes_info = parse_dfs_report(cmd[1]) for i in inst_to_be_deleted: for dn in datanodes_info: if (dn["Name"].startswith(i.internal_ip)) and ( dn["Decommission Status"] != "Decommissioned"): all_found = False break if all_found: r.write_files_to({ '/etc/hadoop/dn.incl': utils.generate_fqdn_host_names(survived_inst), '/etc/hadoop/dn.excl': "", }) break context.sleep(3) if not all_found: ex.DecommissionError( _("Cannot finish decommission of cluster %(cluster)s in " "%(seconds)d seconds") % { "cluster": nn.node_group.cluster, "seconds": timeout })
def _get_running_topologies_names(self, cluster):
    master = utils.get_instance(cluster, "nimbus")

    cmd = ("%(storm)s -c nimbus.host=%(host)s "
           "list | grep ACTIVE | awk '{print $1}'") % (
        {"storm": "/usr/local/storm/bin/storm",
         "host": master.hostname()})

    with remote.get_remote(master) as r:
        ret, stdout = r.execute_command(cmd)
    names = stdout.split("\n")
    topology_names = names[0:len(names) - 1]
    return topology_names
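# A stand-alone sketch of why the slice above drops the last element:
# the storm output ends with a newline, so split("\n") leaves a trailing
# empty string. The sample output below is an assumption.
stdout = "topology-a\ntopology-b\n"       # assumed command output
names = stdout.split("\n")                # ['topology-a', 'topology-b', '']
topology_names = names[0:len(names) - 1]  # ['topology-a', 'topology-b']
assert topology_names == ["topology-a", "topology-b"]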
def decommission_dn(nn, inst_to_be_deleted, survived_inst): with remote.get_remote(nn) as r: r.write_file_to('/etc/hadoop/dn.excl', utils.generate_fqdn_host_names( inst_to_be_deleted)) run.refresh_nodes(remote.get_remote(nn), "dfsadmin") context.sleep(3) timeout = c_helper.get_decommissioning_timeout( nn.node_group.cluster) s_time = timeutils.utcnow() all_found = False while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout: cmd = r.execute_command( "sudo -u hdfs hadoop dfsadmin -report") all_found = True datanodes_info = parse_dfs_report(cmd[1]) for i in inst_to_be_deleted: for dn in datanodes_info: if (dn["Name"].startswith(i.internal_ip)) and ( dn["Decommission Status"] != "Decommissioned"): all_found = False break if all_found: r.write_files_to({'/etc/hadoop/dn.incl': utils. generate_fqdn_host_names(survived_inst), '/etc/hadoop/dn.excl': "", }) break context.sleep(3) if not all_found: ex.DecommissionError( _("Cannot finish decommission of cluster %(cluster)s in " "%(seconds)d seconds") % {"cluster": nn.node_group.cluster, "seconds": timeout})
def _await_datanodes(self, cluster):
    datanodes_count = len(vu.get_datanodes(cluster))
    if datanodes_count < 1:
        return

    l_message = _("Waiting on %s datanodes to start up") % datanodes_count
    LOG.info(l_message)
    with remote.get_remote(vu.get_namenode(cluster)) as r:
        poll_utils.plugin_option_poll(
            cluster, run.check_datanodes_count,
            c_helper.DATANODES_STARTUP_TIMEOUT, l_message, 1, {
                'remote': r, 'count': datanodes_count})
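# A generic sketch (an assumption, not the library's code) of the polling
# contract used above: call func(**kwargs) every ``sleep`` seconds until
# it returns True or the timeout elapses. plugin_option_poll is assumed
# to behave roughly like this, with the timeout read from a cluster
# configuration option.
import time


def simple_poll(func, timeout, sleep, kwargs):
    deadline = time.time() + timeout
    while time.time() < deadline:
        if func(**kwargs):
            return
        time.sleep(sleep)
    raise RuntimeError("timed out after %s seconds" % timeout)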
def _start_hiveserver(self, cluster, hive_server):
    oozie = vu.get_oozie(cluster)

    with remote.get_remote(hive_server) as r:
        run.hive_create_warehouse_dir(r)
        run.hive_copy_shared_conf(
            r, edp.get_hive_shared_conf_path("hadoop"))

        if c_helper.is_mysql_enable(cluster):
            if not oozie or hive_server.hostname() != oozie.hostname():
                run.mysql_start(r, hive_server)
            run.hive_create_db(r, cluster.extra["hive_mysql_passwd"])
            run.hive_metastore_start(r)
            LOG.info(_LI("Hive Metastore server at {host} has been "
                         "started").format(host=hive_server.hostname()))
def create_workflow_dir(where, path, job, use_uuid=None, chmod=""): if use_uuid is None: use_uuid = uuidutils.generate_uuid() constructed_dir = _append_slash_if_needed(path) constructed_dir += '%s/%s' % (job.name, use_uuid) with remote.get_remote(where) as r: if chmod: r.execute_command("mkdir -p -m %s %s" % (chmod, constructed_dir)) else: r.execute_command("mkdir -p %s" % constructed_dir) return constructed_dir
def _push_configs_to_existing_node(self, cluster, extra, instance):
    node_processes = instance.node_group.node_processes
    need_update = (c_helper.is_data_locality_enabled(cluster) or
                   'namenode' in node_processes or
                   'jobtracker' in node_processes or
                   'oozie' in node_processes or
                   'hiveserver' in node_processes)

    if not need_update:
        return

    with remote.get_remote(instance) as r:
        self._write_topology_data(r, cluster, extra)
        self._push_master_configs(r, cluster, extra, instance)