def get_open_ports(self, node_group):
    cluster = node_group.cluster
    ports_map = {
        'namenode': [8020, 50070, 50470],
        'datanode': [50010, 1004, 50075, 1006, 50020],
        'master': [
            int(c_helper.get_config_value("Spark", "Master port", cluster)),
            int(c_helper.get_config_value("Spark", "Master webui port",
                                          cluster)),
        ],
        'slave': [
            int(c_helper.get_config_value("Spark", "Worker webui port",
                                          cluster))
        ]
    }

    ports = []
    for process in node_group.node_processes:
        if process in ports_map:
            ports.extend(ports_map[process])

    return ports
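# Illustration only (not plugin code): assuming the stock Spark defaults of
# 7077 (master port), 8080 (master web UI) and 8081 (worker web UI), a node
# group whose node_processes is ["master", "slave"] would get back
# [7077, 8080, 8081]; the caller (typically security-group setup) can then
# open exactly those ports for the node group.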
def __init__(self, cluster):
    super(EdpEngine, self).__init__(cluster)
    self.master = plugin_utils.get_instance(cluster, "master")
    self.plugin_params["spark-user"] = ""
    self.plugin_params["spark-submit"] = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")
    self.plugin_params["deploy-mode"] = "client"
    # Bake the configured port into the template; the master host is only
    # known per job run, so %(host)s stays as a placeholder.
    port = c_helper.get_config_value("Spark", "Master port", cluster)
    self.plugin_params["master"] = 'spark://%(host)s:' + str(port)
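# A minimal sketch (an assumption about how the base engine consumes this
# parameter, not code from this module): the template is presumably completed
# with the actual master hostname when a job is submitted.
master_url = self.plugin_params["master"] % {"host": self.master.hostname()}
# e.g. "spark://spark-master-001:7077" (7077 being the usual default port)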
def get_driver_classpath(self):
    cp = c_helper.get_config_value("Spark",
                                   "Executor extra classpath",
                                   self.cluster)
    if cp:
        cp = " --driver-class-path " + cp
    return cp
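# Illustration only (the classpath value is hypothetical): with
# "Executor extra classpath" set to "/usr/lib/hadoop-mapreduce/*", this
# returns " --driver-class-path /usr/lib/hadoop-mapreduce/*" (note the
# leading space), so callers can concatenate it straight into the
# spark-submit command line; an empty/unset value is returned unchanged.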
def _validate_existing_ng_scaling(self, cluster, existing):
    scalable_processes = self._get_scalable_processes()
    dn_to_delete = 0
    for ng in cluster.node_groups:
        if ng.id in existing:
            if ng.count > existing[ng.id] and ("datanode" in
                                               ng.node_processes):
                dn_to_delete += ng.count - existing[ng.id]
            if not set(ng.node_processes).issubset(scalable_processes):
                raise ex.NodeGroupCannotBeScaled(
                    ng.name,
                    _("Spark plugin cannot scale nodegroup"
                      " with processes: %s") % " ".join(ng.node_processes))

    dn_amount = len(utils.get_instances(cluster, "datanode"))
    rep_factor = c_helper.get_config_value("HDFS", "dfs.replication",
                                           cluster)

    if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
        raise ex.ClusterCannotBeScaled(
            cluster.name,
            _("Spark plugin cannot shrink cluster because "
              "there would be not enough nodes for HDFS "
              "replicas (replication factor is %s)") % rep_factor)
def validate(self, cluster):
    nn_count = sum([ng.count for ng
                    in utils.get_node_groups(cluster, "namenode")])
    if nn_count != 1:
        raise ex.InvalidComponentCountException("namenode", 1, nn_count)

    dn_count = sum([ng.count for ng
                    in utils.get_node_groups(cluster, "datanode")])
    if dn_count < 1:
        raise ex.InvalidComponentCountException("datanode", _("1 or more"),
                                                dn_count)

    rep_factor = c_helper.get_config_value('HDFS', "dfs.replication",
                                           cluster)
    if dn_count < rep_factor:
        raise ex.InvalidComponentCountException(
            'datanode', _('%s or more') % rep_factor, dn_count,
            _('Number of %(dn)s instances should not be less '
              'than %(replication)s')
            % {'dn': 'datanode', 'replication': 'dfs.replication'})

    # validate Spark Master Node and Spark Slaves
    sm_count = sum([ng.count for ng
                    in utils.get_node_groups(cluster, "master")])
    if sm_count != 1:
        raise ex.RequiredServiceMissingException("Spark master")

    sl_count = sum([ng.count for ng
                    in utils.get_node_groups(cluster, "slave")])
    if sl_count < 1:
        raise ex.InvalidComponentCountException("Spark slave", _("1 or more"),
                                                sl_count)
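# Illustration only (hypothetical template): with dfs.replication left at the
# usual HDFS default of 3 and a template asking for just 2 datanodes,
# dn_count (2) < rep_factor (3), so validate() raises
# InvalidComponentCountException asking for "3 or more" datanode instances
# before any VMs are provisioned.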
def _set_cluster_info(self, cluster):
    nn = utils.get_instance(cluster, "namenode")
    sp_master = utils.get_instance(cluster, "master")
    info = {}

    if nn:
        address = c_helper.get_config_value("HDFS", "dfs.http.address",
                                            cluster)
        port = address[address.rfind(":") + 1:]
        info["HDFS"] = {
            "Web UI": "http://%s:%s" % (nn.management_ip, port)
        }
        info["HDFS"]["NameNode"] = "hdfs://%s:8020" % nn.hostname()

    if sp_master:
        port = c_helper.get_config_value("Spark", "Master webui port",
                                         cluster)
        if port is not None:
            info["Spark"] = {
                "Web UI": "http://%s:%s" % (sp_master.management_ip, port)
            }

    ctx = context.ctx()
    conductor.cluster_update(ctx, cluster, {"info": info})
def decommission_sl(master, inst_to_be_deleted, survived_inst):
    if survived_inst is not None:
        slavenames = []
        for slave in survived_inst:
            slavenames.append(slave.hostname())
        slaves_content = c_helper.generate_spark_slaves_configs(slavenames)
    else:
        slaves_content = "\n"

    cluster = master.cluster
    sp_home = c_helper.get_config_value("Spark", "Spark home", cluster)
    r_master = remote.get_remote(master)
    run.stop_spark(r_master, sp_home)

    # write new slaves file to master
    files = {os.path.join(sp_home, 'conf/slaves'): slaves_content}
    r_master.write_files_to(files)

    # write new slaves file to each survived slave as well
    for i in survived_inst:
        with remote.get_remote(i) as r:
            r.write_files_to(files)

    run.start_spark_master(r_master, sp_home)
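# A minimal sketch (assumption, not the plugin's actual helper): the slaves
# file content presumably follows Spark's conf/slaves format, i.e. one worker
# hostname per line.
def generate_spark_slaves_configs(workernames):
    return "\n".join(workernames)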
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id)
    paths = job_utils.upload_job_files(master, wf_dir, job,
                                       libs_subdir=False)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)

    # The rest of the paths will be passed with --jars
    additional_jars = ",".join(paths)
    if additional_jars:
        additional_jars = "--jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    job_class = job_execution.job_configs.configs["edp.java.main_class"]

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = " ".join(job_execution.job_configs.get('args', []))

    # The redirects of stdout and stderr will preserve output in the wf_dir
    cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
        spark_submit,
        app_jar,
        job_class,
        additional_jars,
        host,
        port,
        args)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError("Spark job execution failed. Exit status = %s, "
                     "stdout = %s" % (ret, stdout))
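# Illustration only; every path, class name and port below is hypothetical.
# The assembled cmd has this shape (note the app jar precedes --class in this
# version of the engine):
#
#   /opt/spark/bin/spark-submit app.jar --class org.example.Main \
#       --jars lib-a.jar,lib-b.jar --master spark://master-host:7077 arg1 arg2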
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    proxy_configs = job_execution.job_configs.get('proxy_configs')

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id)
    paths = job_utils.upload_job_files(master, wf_dir, job,
                                       libs_subdir=False,
                                       proxy_configs=proxy_configs)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)

    # The rest of the paths will be passed with --jars
    additional_jars = ",".join(paths)
    if additional_jars:
        additional_jars = "--jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    job_class = job_execution.job_configs.configs["edp.java.main_class"]

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = " ".join(job_execution.job_configs.get('args', []))

    # The redirects of stdout and stderr will preserve output in the wf_dir
    cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
        spark_submit,
        app_jar,
        job_class,
        additional_jars,
        host,
        port,
        args)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError(
        _("Spark job execution failed. Exit status = "
          "%(status)s, stdout = %(stdout)s") %
        {'status': ret, 'stdout': stdout})
def _spark_home(self, cluster):
    return c_helper.get_config_value("Spark", "Spark home", cluster)
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(job_execution.job_configs))

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id, "700")

    paths, builtin_paths = self._upload_job_files(
        master, wf_dir, job, updated_job_configs)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]
    builtin_paths = [os.path.basename(p) for p in builtin_paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)
    job_class = updated_job_configs["configs"]["edp.java.main_class"]

    # If we uploaded builtins then we are using a wrapper jar. It will
    # be the first one on the builtin list and the original app_jar needs
    # to be added to the 'additional' jars
    if builtin_paths:
        wrapper_jar = builtin_paths.pop(0)
        wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
        wrapper_xml = self._upload_wrapper_xml(master, wf_dir,
                                               updated_job_configs)
        wrapper_args = "%s %s" % (wrapper_xml, job_class)

        additional_jars = ",".join([app_jar] + paths + builtin_paths)

    else:
        wrapper_jar = wrapper_class = wrapper_args = ""
        additional_jars = ",".join(paths)

    # All additional jars are passed with the --jars option
    if additional_jars:
        additional_jars = " --jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = updated_job_configs.get('args', [])
    args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
    if args:
        args = " " + args

    if wrapper_jar and wrapper_class:
        # Substrings which may be empty have spaces
        # embedded if they are non-empty
        cmd = ('%(spark_submit)s%(driver_cp)s'
               ' --class %(wrapper_class)s%(addnl_jars)s'
               ' --master spark://%(host)s:%(port)s'
               ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
            {"spark_submit": spark_submit,
             "driver_cp": self.get_driver_classpath(),
             "wrapper_class": wrapper_class,
             "addnl_jars": additional_jars,
             "host": host,
             "port": port,
             "wrapper_jar": wrapper_jar,
             "wrapper_args": wrapper_args,
             "args": args})
    else:
        cmd = ('%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
               ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s') % (
            {"spark_submit": spark_submit,
             "job_class": job_class,
             "addnl_jars": additional_jars,
             "host": host,
             "port": port,
             "app_jar": app_jar,
             "args": args})

    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    # The redirects of stdout and stderr will preserve output in the wf_dir
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError(
        _("Spark job execution failed. Exit status = "
          "%(status)s, stdout = %(stdout)s") %
        {'status': ret, 'stdout': stdout})
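# Brief sketch (illustrative, not part of this engine): the composite job id
# returned above is "<pid>@<instance_id>", so a later cancel or poll step
# would split it to locate the process and the instance it runs on; a cancel
# step might then send a signal to that pid on that instance.
pid, instance_id = job_id.split("@", 1)  # job_id is a hypothetical variable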
def run_job(self, job_execution):
    ctx = context.ctx()
    job = conductor.job_get(ctx, job_execution.job_id)

    additional_sources, updated_job_configs = (
        job_utils.resolve_data_source_references(job_execution.job_configs))

    for data_source in additional_sources:
        if data_source and data_source.type == 'hdfs':
            h.configure_cluster_for_hdfs(self.cluster, data_source)
            break

    # We'll always run the driver program on the master
    master = plugin_utils.get_instance(self.cluster, "master")

    # TODO(tmckay): wf_dir should probably be configurable.
    # The only requirement is that the dir is writable by the image user
    wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                           job_execution.id, "700")

    paths, builtin_paths = self._upload_job_files(
        master, wf_dir, job, updated_job_configs)

    # We can shorten the paths in this case since we'll run out of wf_dir
    paths = [os.path.basename(p) for p in paths]
    builtin_paths = [os.path.basename(p) for p in builtin_paths]

    # TODO(tmckay): for now, paths[0] is always assumed to be the app
    # jar and we generate paths in order (mains, then libs).
    # When we have a Spark job type, we can require a "main" and set
    # the app jar explicitly to be "main"
    app_jar = paths.pop(0)
    job_class = updated_job_configs["configs"]["edp.java.main_class"]

    # If we uploaded builtins then we are using a wrapper jar. It will
    # be the first one on the builtin list and the original app_jar needs
    # to be added to the 'additional' jars
    if builtin_paths:
        wrapper_jar = builtin_paths.pop(0)
        wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
        wrapper_xml = self._upload_wrapper_xml(master, wf_dir,
                                               updated_job_configs)
        wrapper_args = "%s %s" % (wrapper_xml, job_class)

        additional_jars = ",".join([app_jar] + paths + builtin_paths)

    else:
        wrapper_jar = wrapper_class = wrapper_args = ""
        additional_jars = ",".join(paths)

    # All additional jars are passed with the --jars option
    if additional_jars:
        additional_jars = " --jars " + additional_jars

    # Launch the spark job using spark-submit and deploy_mode = client
    host = master.hostname()
    port = c_helper.get_config_value("Spark", "Master port", self.cluster)
    spark_submit = os.path.join(
        c_helper.get_config_value("Spark", "Spark home", self.cluster),
        "bin/spark-submit")

    # TODO(tmckay): we need to clean up wf_dirs on long running clusters
    # TODO(tmckay): probably allow for general options to spark-submit
    args = updated_job_configs.get('args', [])
    args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
    if args:
        args = " " + args

    if wrapper_jar and wrapper_class:
        # Substrings which may be empty have spaces
        # embedded if they are non-empty
        cmd = ('%(spark_submit)s%(driver_cp)s'
               ' --class %(wrapper_class)s%(addnl_jars)s'
               ' --master spark://%(host)s:%(port)s'
               ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
            {"spark_submit": spark_submit,
             "driver_cp": self.get_driver_classpath(),
             "wrapper_class": wrapper_class,
             "addnl_jars": additional_jars,
             "host": host,
             "port": port,
             "wrapper_jar": wrapper_jar,
             "wrapper_args": wrapper_args,
             "args": args})
    else:
        cmd = ('%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
               ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s') % (
            {"spark_submit": spark_submit,
             "job_class": job_class,
             "addnl_jars": additional_jars,
             "host": host,
             "port": port,
             "app_jar": app_jar,
             "args": args})

    job_execution = conductor.job_execution_get(ctx, job_execution.id)
    if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
        return (None, edp.JOB_STATUS_KILLED, None)

    # If an exception is raised here, the job_manager will mark
    # the job failed and log the exception
    # The redirects of stdout and stderr will preserve output in the wf_dir
    with remote.get_remote(master) as r:
        # Upload the command launch script
        launch = os.path.join(wf_dir, "launch_command")
        r.write_file_to(launch, self._job_script())
        r.execute_command("chmod +x %s" % launch)
        ret, stdout = r.execute_command(
            "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
            % (wf_dir, cmd))

    if ret == 0:
        # Success, we'll add the wf_dir in job_execution.extra and store
        # pid@instance_id as the job id
        # We know the job is running so return "RUNNING"
        return (stdout.strip() + "@" + master.id,
                edp.JOB_STATUS_RUNNING,
                {'spark-path': wf_dir})

    # Hmm, no exception but something failed.
    # Since we're using backgrounding with redirect, this is unlikely.
    raise e.EDPError(_("Spark job execution failed. Exit status = "
                       "%(status)s, stdout = %(stdout)s") %
                     {'status': ret, 'stdout': stdout})
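# Illustration only; all paths, hostnames and ports below are hypothetical.
# The two command shapes built above differ mainly in which jar is launched
# and where --class points:
#
# Wrapper case (builtins uploaded, e.g. for Swift-wrapped data sources):
#   /opt/spark/bin/spark-submit --driver-class-path /usr/lib/hadoop/lib/* \
#       --class org.openstack.sahara.edp.SparkWrapper \
#       --jars app.jar,lib-a.jar --master spark://master-host:7077 \
#       builtin-wrapper.jar spark.xml org.example.Main arg1
#
# Plain case (no builtins):
#   /opt/spark/bin/spark-submit --class org.example.Main --jars lib-a.jar \
#       --master spark://master-host:7077 app.jar arg1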