Example #1
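Computes the firewall ports to open for a node group: fixed HDFS ports for the namenode and datanode processes, plus the Spark master and worker ports read from the cluster configuration.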
    def get_open_ports(self, node_group):
        cluster = node_group.cluster
        ports_map = {
            'namenode': [8020, 50070, 50470],
            'datanode': [50010, 1004, 50075, 1006, 50020],
            'master': [
                int(c_helper.get_config_value("Spark", "Master port",
                                              cluster)),
                int(c_helper.get_config_value("Spark", "Master webui port",
                                              cluster)),
            ],
            'slave': [
                int(c_helper.get_config_value("Spark", "Worker webui port",
                                              cluster))
            ]
        }

        ports = []
        for process in node_group.node_processes:
            if process in ports_map:
                ports.extend(ports_map[process])

        return ports
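Note the int() casts: the configured values may not arrive as integers. Assuming the plugin defaults mirror stock Spark standalone (master port 7077, web UI ports 8080 and 8081), a group running both 'master' and 'slave' processes would open [7077, 8080, 8081].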
Example #2
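Initializes the EDP engine: locates the master instance, derives the spark-submit path from the 'Spark home' config value, selects client deploy mode, and builds the master URL from 'Master port'.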
 def __init__(self, cluster):
     super(EdpEngine, self).__init__(cluster)
     self.master = plugin_utils.get_instance(cluster, "master")
     self.plugin_params["spark-user"] = ""
     self.plugin_params["spark-submit"] = os.path.join(
         c_helper.get_config_value("Spark",
                                   "Spark home",
                                   self.cluster),
         "bin/spark-submit")
     self.plugin_params["deploy-mode"] = "client"
     port = c_helper.get_config_value("Spark", "Master port", cluster)
     # %(host)s is substituted later by the engine; only the port is
     # known at construction time
     self.plugin_params["master"] = ('spark://%(host)s:' + str(port))
Example #3
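Turns the 'Executor extra classpath' config value into a --driver-class-path option for spark-submit, returning an empty string when the value is unset.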
 def get_driver_classpath(self):
     cp = c_helper.get_config_value("Spark",
                                    "Executor extra classpath",
                                    self.cluster)
     if cp:
         cp = " --driver-class-path " + cp
     return cp
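For example, if 'Executor extra classpath' were set to a hypothetical /opt/spark/extra, this returns ' --driver-class-path /opt/spark/extra'; when unset, the option drops out of the spark-submit command entirely (see the cmd construction in Examples #11 and #12).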
Example #4
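Validates a scaling request against existing node groups: groups containing non-scalable processes are rejected, and the cluster may not shrink its datanode count below the HDFS replication factor.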
    def _validate_existing_ng_scaling(self, cluster, existing):
        scalable_processes = self._get_scalable_processes()
        dn_to_delete = 0
        for ng in cluster.node_groups:
            if ng.id in existing:
                if ng.count > existing[ng.id] and ("datanode"
                                                   in ng.node_processes):
                    dn_to_delete += ng.count - existing[ng.id]
                if not set(ng.node_processes).issubset(scalable_processes):
                    raise ex.NodeGroupCannotBeScaled(
                        ng.name,
                        _("Spark plugin cannot scale nodegroup"
                          " with processes: %s") % ' '.join(ng.node_processes))

        dn_amount = len(utils.get_instances(cluster, "datanode"))
        rep_factor = c_helper.get_config_value('HDFS', "dfs.replication",
                                               cluster)

        if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
            raise ex.ClusterCannotBeScaled(
                cluster.name,
                _("Spark plugin cannot shrink cluster because "
                  "there would be not enough nodes for HDFS "
                  "replicas (replication factor is %s)") % rep_factor)
Example #5
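Validates cluster composition: exactly one namenode, at least one datanode (and no fewer than dfs.replication), exactly one Spark master, and at least one slave.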
    def validate(self, cluster):
        nn_count = sum([ng.count for ng
                        in utils.get_node_groups(cluster, "namenode")])
        if nn_count != 1:
            raise ex.InvalidComponentCountException("namenode", 1, nn_count)

        dn_count = sum([ng.count for ng
                        in utils.get_node_groups(cluster, "datanode")])
        if dn_count < 1:
            raise ex.InvalidComponentCountException("datanode", _("1 or more"),
                                                    dn_count)

        rep_factor = c_helper.get_config_value('HDFS', "dfs.replication",
                                               cluster)
        if dn_count < rep_factor:
            raise ex.InvalidComponentCountException(
                'datanode', _('%s or more') % rep_factor, dn_count,
                _('Number of %(dn)s instances should not be less '
                  'than %(replication)s')
                % {'dn': 'datanode', 'replication': 'dfs.replication'})

        # validate Spark Master Node and Spark Slaves
        sm_count = sum([ng.count for ng
                        in utils.get_node_groups(cluster, "master")])

        if sm_count != 1:
            raise ex.RequiredServiceMissingException("Spark master")

        sl_count = sum([ng.count for ng
                        in utils.get_node_groups(cluster, "slave")])

        if sl_count < 1:
            raise ex.InvalidComponentCountException("Spark slave",
                                                    _("1 or more"),
                                                    sl_count)
Example #6
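Publishes cluster endpoint info: the HDFS web UI port is sliced out of the dfs.http.address config value, and the Spark web UI port comes from 'Master webui port'.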
    def _set_cluster_info(self, cluster):
        nn = utils.get_instance(cluster, "namenode")
        sp_master = utils.get_instance(cluster, "master")
        info = {}

        if nn:
            address = c_helper.get_config_value('HDFS', 'dfs.http.address',
                                                cluster)
            port = address[address.rfind(':') + 1:]
            info['HDFS'] = {
                'Web UI': 'http://%s:%s' % (nn.management_ip, port)
            }
            info['HDFS']['NameNode'] = 'hdfs://%s:8020' % nn.hostname()

        if sp_master:
            port = c_helper.get_config_value('Spark', 'Master webui port',
                                             cluster)
            if port is not None:
                info['Spark'] = {
                    'Web UI': 'http://%s:%s' % (sp_master.management_ip, port)
                }
        ctx = context.ctx()
        conductor.cluster_update(ctx, cluster, {"info": info})
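For instance, with dfs.http.address set to '0.0.0.0:50070', the slice address[address.rfind(':') + 1:] yields '50070' for the web UI URL.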
Example #7
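Decommissions Spark slaves: regenerates the conf/slaves file from the surviving instances, stops Spark on the master, pushes the new file to the master and each surviving slave, and restarts the Spark master.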
def decommission_sl(master, inst_to_be_deleted, survived_inst):
    if survived_inst is not None:
        slavenames = []
        for slave in survived_inst:
            slavenames.append(slave.hostname())
        slaves_content = c_helper.generate_spark_slaves_configs(slavenames)
    else:
        slaves_content = "\n"

    cluster = master.cluster
    sp_home = c_helper.get_config_value("Spark", "Spark home", cluster)
    r_master = remote.get_remote(master)
    run.stop_spark(r_master, sp_home)

    # write the new slaves file to the master
    files = {os.path.join(sp_home, 'conf/slaves'): slaves_content}
    r_master.write_files_to(files)

    # write the new slaves file to each surviving slave as well
    # (survived_inst may be None, per the check above)
    for i in survived_inst or []:
        with remote.get_remote(i) as r:
            r.write_files_to(files)

    run.start_spark_master(r_master, sp_home)
Example #8
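Runs an EDP job: uploads job files to a workflow directory on the master, builds a spark-submit command targeting spark://host:port (both read via c_helper.get_config_value), launches it in the background through a launch script, and returns pid@instance_id as the job id.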
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id)
        paths = job_utils.upload_job_files(master, wf_dir, job,
                                           libs_subdir=False)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)

        # The rest of the paths will be passed with --jars
        additional_jars = ",".join(paths)
        if additional_jars:
            additional_jars = "--jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark",
                                      "Spark home",
                                      self.cluster),
            "bin/spark-submit")

        job_class = job_execution.job_configs.configs["edp.java.main_class"]

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = " ".join(job_execution.job_configs.get('args', []))

        # The redirects of stdout and stderr will preserve output in the wf_dir
        cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
            spark_submit,
            app_jar,
            job_class,
            additional_jars,
            host,
            port,
            args)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError("Spark job execution failed. Exit status = %s, "
                         "stdout = %s" % (ret, stdout))
Example #9
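A later revision of run_job that also forwards proxy_configs when uploading job files and reports failures with a translatable error message.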
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        proxy_configs = job_execution.job_configs.get('proxy_configs')

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id)
        paths = job_utils.upload_job_files(master,
                                           wf_dir,
                                           job,
                                           libs_subdir=False,
                                           proxy_configs=proxy_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)

        # The rest of the paths will be passed with --jars
        additional_jars = ",".join(paths)
        if additional_jars:
            additional_jars = "--jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark", "Spark home", self.cluster),
            "bin/spark-submit")

        job_class = job_execution.job_configs.configs["edp.java.main_class"]

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = " ".join(job_execution.job_configs.get('args', []))

        # The redirects of stdout and stderr will preserve output in the wf_dir
        cmd = "%s %s --class %s %s --master spark://%s:%s %s" % (
            spark_submit, app_jar, job_class, additional_jars, host, port,
            args)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example #10
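A one-line accessor for the configured Spark installation directory.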
 def _spark_home(self, cluster):
     return c_helper.get_config_value("Spark", "Spark home", cluster)
Example #11
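A further revision of run_job: resolves data source references, supports a wrapper jar (org.openstack.sahara.edp.SparkWrapper) for builtin handling, injects Swift URL suffixes into job arguments, and honors a pending kill request before launching.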
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")

        paths, builtin_paths = self._upload_job_files(master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master, wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)

            additional_jars = ",".join([app_jar] + paths + builtin_paths)

        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark", "Spark home", self.cluster),
            "bin/spark-submit")

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
        if args:
            args = " " + args

        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = ('%(spark_submit)s%(driver_cp)s'
                   ' --class %(wrapper_class)s%(addnl_jars)s'
                   ' --master spark://%(host)s:%(port)s'
                   ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
                       {
                           "spark_submit": spark_submit,
                           "driver_cp": self.get_driver_classpath(),
                           "wrapper_class": wrapper_class,
                           "addnl_jars": additional_jars,
                           "host": host,
                           "port": port,
                           "wrapper_jar": wrapper_jar,
                           "wrapper_args": wrapper_args,
                           "args": args
                       })
        else:
            cmd = ('%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
                   ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s'
                   ) % ({
                       "spark_submit": spark_submit,
                       "job_class": job_class,
                       "addnl_jars": additional_jars,
                       "host": host,
                       "port": port,
                       "app_jar": app_jar,
                       "args": args
                   })

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example #12
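Like the previous example, but additionally configures the cluster for HDFS when any resolved data source is of type 'hdfs'.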
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs)
        )

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")

        paths, builtin_paths = self._upload_job_files(
            master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the 'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master,
                                                   wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)

            additional_jars = ",".join([app_jar] + paths + builtin_paths)

        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark",
                                      "Spark home",
                                      self.cluster),
            "bin/spark-submit")

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
        if args:
            args = " " + args

        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark_submit)s%(driver_cp)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s'
                ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "driver_cp": self.get_driver_classpath(),
                    "wrapper_class": wrapper_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "wrapper_jar": wrapper_jar,
                    "wrapper_args": wrapper_args,
                    "args": args
                })
        else:
            cmd = (
                '%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "job_class": job_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "app_jar": app_jar,
                    "args": args
                })

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no exception but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})