Example #1
0
 def test_inject_swift_url_suffix(self):
     self.assertEqual("swift://ex.sahara/o",
                      su.inject_swift_url_suffix("swift://ex/o"))
     self.assertEqual("swift://ex.sahara/o",
                      su.inject_swift_url_suffix("swift://ex.sahara/o"))
     self.assertEqual("hdfs://my/path",
                      su.inject_swift_url_suffix("hdfs://my/path"))
     self.assertEqual(12345, su.inject_swift_url_suffix(12345))
     self.assertEqual(['test'], su.inject_swift_url_suffix(['test']))
 def test_inject_swift_url_suffix(self):
     self.assertEqual(su.inject_swift_url_suffix("swift://ex/o"),
                      "swift://ex.sahara/o")
     self.assertEqual(su.inject_swift_url_suffix("swift://ex.sahara/o"),
                      "swift://ex.sahara/o")
     self.assertEqual(su.inject_swift_url_suffix("hdfs://my/path"),
                      "hdfs://my/path")
     self.assertEqual(su.inject_swift_url_suffix(12345), 12345)
     self.assertEqual(su.inject_swift_url_suffix(['test']), ['test'])
Example #3
0
 def test_inject_swift_url_suffix(self):
     self.assertEqual("swift://ex.sahara/o",
                      su.inject_swift_url_suffix("swift://ex/o"))
     self.assertEqual("swift://ex.sahara/o",
                      su.inject_swift_url_suffix("swift://ex.sahara/o"))
     self.assertEqual("hdfs://my/path",
                      su.inject_swift_url_suffix("hdfs://my/path"))
     self.assertEqual(12345, su.inject_swift_url_suffix(12345))
     self.assertEqual(['test'], su.inject_swift_url_suffix(['test']))
Example #4
0
    def update_job_dict(self, job_dict, exec_dict):
        pruned_exec_dict, edp_configs = self._prune_edp_configs(exec_dict)
        self._update_dict(job_dict, pruned_exec_dict)

        # Add the separated "edp." configs to the job_dict
        job_dict['edp_configs'] = edp_configs

        # Args are listed, not named. Simply replace them.
        job_dict['args'] = pruned_exec_dict.get('args', [])

        # Find all swift:// paths in args, configs, and params and
        # add the .sahara suffix to the container if it is not there
        # already
        job_dict['args'] = [
            # TODO(tmckay) args for Pig can actually be -param name=value
            # and value could conceivably contain swift paths
            su.inject_swift_url_suffix(arg) for arg in job_dict['args']]

        for k, v in six.iteritems(job_dict.get('configs', {})):
            job_dict['configs'][k] = su.inject_swift_url_suffix(v)

        for k, v in six.iteritems(job_dict.get('params', {})):
            job_dict['params'][k] = su.inject_swift_url_suffix(v)
Example #5
0
    def update_job_dict(self, job_dict, exec_dict):
        pruned_exec_dict, edp_configs = self._prune_edp_configs(exec_dict)
        self._update_dict(job_dict, pruned_exec_dict)

        # Add the separated "edp." configs to the job_dict
        job_dict['edp_configs'] = edp_configs

        # Args are listed, not named. Simply replace them.
        job_dict['args'] = pruned_exec_dict.get('args', [])

        # Find all swift:// paths in args, configs, and params and
        # add the .sahara suffix to the container if it is not there
        # already
        job_dict['args'] = [
            # TODO(tmckay) args for Pig can actually be -param name=value
            # and value could conceivably contain swift paths
            su.inject_swift_url_suffix(arg) for arg in job_dict['args']]

        for k, v in six.iteritems(job_dict.get('configs', {})):
            job_dict['configs'][k] = su.inject_swift_url_suffix(v)

        for k, v in six.iteritems(job_dict.get('params', {})):
            job_dict['params'][k] = su.inject_swift_url_suffix(v)
Example #6
0
    def _build_command(self, wf_dir, paths, builtin_paths,
                       updated_job_configs):

        indep_params = {}

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])
        if self.plugin_params.get('drivers-to-jars', None):
            paths.extend(self.plugin_params['drivers-to-jars'])
        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master,
                                                   wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_files"] = wrapper_xml

            indep_params["addnl_jars"] = ",".join(
                [indep_params["wrapper_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (
                " --jars " + indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join([su.inject_swift_url_suffix(arg)
                                         for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)

        # Handle driver classpath. Because of the way the hadoop
        # configuration is handled in the wrapper class, using
        # wrapper_xml, the working directory must be on the classpath
        self._check_driver_class_path(updated_job_configs, mutual_dict, wf_dir)

        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                ' --files %(addnl_files)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s %(wrapper_args)s%(args)s') % dict(
                mutual_dict)
        else:
            cmd = (
                '%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                ' --class %(job_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s%(args)s') % dict(
                mutual_dict)

        return cmd
Example #7
0
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs))

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster,
                                           spark.SPARK_MASTER.ui_name)

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master, wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)
            additional_jars = ",".join([app_jar] + paths + builtin_paths)
        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        cluster_context = self._get_cluster_context(self.cluster)
        spark_home_dir = spark.Spark().home_dir(cluster_context)

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])

        submit_args = {
            "spark_submit": "%s/bin/spark-submit" % spark_home_dir,
            "addnl_jars": additional_jars,
            "master_url": spark.SPARK_MASTER.submit_url(cluster_context),
            "args": args
        }
        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            submit_args.update({
                "driver_cp": self.get_driver_classpath(),
                "wrapper_class": wrapper_class,
                "wrapper_jar": wrapper_jar,
                "wrapper_args": wrapper_args,
            })
            submit_cmd = ('%(spark_submit)s%(driver_cp)s'
                          ' --class %(wrapper_class)s%(addnl_jars)s'
                          ' --master %(master_url)s'
                          ' %(wrapper_jar)s %(wrapper_args)s %(args)s')
        else:
            submit_args.update({
                "job_class": job_class,
                "app_jar": app_jar,
            })
            submit_cmd = ('%(spark_submit)s --class %(job_class)s'
                          '%(addnl_jars)s --master %(master_url)s'
                          ' %(app_jar)s %(args)s')
        submit_cmd = g._run_as('mapr', submit_cmd % submit_args)

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with master.remote() as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s && ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, submit_cmd),
                raise_when_error=False)

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id, edp.JOB_STATUS_RUNNING, {
                'spark-path': wf_dir
            })

        # Hmm, no execption but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example #8
0
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        indep_params = {}
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(
                job_execution.job_configs, job_execution.id, data_source_urls)
        )

        job_execution = conductor.job_execution_update(
            ctx, job_execution, {"data_source_urls": data_source_urls})

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # It is needed in case we are working with Spark plugin
        self.plugin_params['master'] = (
            self.plugin_params['master'] % {'host': self.master.hostname()})

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(
            self.master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master,
                                                   wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_jars"] = ",".join(
                [indep_params["app_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (
                " --jars " + indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join([su.inject_swift_url_suffix(arg)
                                         for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)
        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % dict(
                mutual_dict)
        else:
            cmd = (
                '%(spark-user)s%(spark-submit)s'
                ' --class %(job_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s%(args)s') % dict(
                mutual_dict)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no execption but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})
Example #9
0
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)

        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs)
        )

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # We'll always run the driver program on the master
        master = plugin_utils.get_instance(self.cluster, "master")

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(master, '/tmp/spark-edp', job,
                                               job_execution.id, "700")

        paths, builtin_paths = self._upload_job_files(
            master, wf_dir, job, updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        app_jar = paths.pop(0)
        job_class = updated_job_configs["configs"]["edp.java.main_class"]

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            wrapper_jar = builtin_paths.pop(0)
            wrapper_class = 'org.openstack.sahara.edp.SparkWrapper'
            wrapper_xml = self._upload_wrapper_xml(master,
                                                   wf_dir,
                                                   updated_job_configs)
            wrapper_args = "%s %s" % (wrapper_xml, job_class)

            additional_jars = ",".join([app_jar] + paths + builtin_paths)

        else:
            wrapper_jar = wrapper_class = wrapper_args = ""
            additional_jars = ",".join(paths)

        # All additional jars are passed with the --jars option
        if additional_jars:
            additional_jars = " --jars " + additional_jars

        # Launch the spark job using spark-submit and deploy_mode = client
        host = master.hostname()
        port = c_helper.get_config_value("Spark", "Master port", self.cluster)
        spark_submit = os.path.join(
            c_helper.get_config_value("Spark",
                                      "Spark home",
                                      self.cluster),
            "bin/spark-submit")

        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        args = updated_job_configs.get('args', [])
        args = " ".join([su.inject_swift_url_suffix(arg) for arg in args])
        if args:
            args = " " + args

        if wrapper_jar and wrapper_class:
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark_submit)s%(driver_cp)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s'
                ' %(wrapper_jar)s %(wrapper_args)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "driver_cp": self.get_driver_classpath(),
                    "wrapper_class": wrapper_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "wrapper_jar": wrapper_jar,
                    "wrapper_args": wrapper_args,
                    "args": args
                })
        else:
            cmd = (
                '%(spark_submit)s --class %(job_class)s%(addnl_jars)s'
                ' --master spark://%(host)s:%(port)s %(app_jar)s%(args)s') % (
                {
                    "spark_submit": spark_submit,
                    "job_class": job_class,
                    "addnl_jars": additional_jars,
                    "host": host,
                    "port": port,
                    "app_jar": app_jar,
                    "args": args
                })

        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!"
                % (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + master.id,
                    edp.JOB_STATUS_RUNNING,
                    {'spark-path': wf_dir})

        # Hmm, no execption but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(_("Spark job execution failed. Exit status = "
                           "%(status)s, stdout = %(stdout)s") %
                         {'status': ret, 'stdout': stdout})
Example #10
0
    def run_job(self, job_execution):
        ctx = context.ctx()
        job = conductor.job_get(ctx, job_execution.job_id)
        indep_params = {}
        data_source_urls = {}
        additional_sources, updated_job_configs = (
            job_utils.resolve_data_source_references(job_execution.job_configs,
                                                     job_execution.id,
                                                     data_source_urls))

        job_execution = conductor.job_execution_update(
            ctx, job_execution, {"data_source_urls": data_source_urls})

        for data_source in additional_sources:
            if data_source and data_source.type == 'hdfs':
                h.configure_cluster_for_hdfs(self.cluster, data_source)
                break

        # It is needed in case we are working with Spark plugin
        self.plugin_params['master'] = (self.plugin_params['master'] % {
            'host': self.master.hostname()
        })

        # TODO(tmckay): wf_dir should probably be configurable.
        # The only requirement is that the dir is writable by the image user
        wf_dir = job_utils.create_workflow_dir(self.master, '/tmp/spark-edp',
                                               job, job_execution.id, "700")
        paths, builtin_paths = self._upload_job_files(self.master, wf_dir, job,
                                                      updated_job_configs)

        # We can shorten the paths in this case since we'll run out of wf_dir
        paths = [os.path.basename(p) for p in paths]
        builtin_paths = [os.path.basename(p) for p in builtin_paths]

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master, wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_files"] = wrapper_xml

            indep_params["addnl_jars"] = ",".join(
                [indep_params["wrapper_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (" --jars " +
                                          indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join(
            [su.inject_swift_url_suffix(arg) for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)
        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = ('%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                   ' --files %(addnl_files)s'
                   ' --class %(wrapper_class)s%(addnl_jars)s'
                   ' --master %(master)s'
                   ' --deploy-mode %(deploy-mode)s'
                   ' %(app_jar)s %(wrapper_args)s%(args)s') % dict(mutual_dict)
        else:
            cmd = ('%(spark-user)s%(spark-submit)s'
                   ' --class %(job_class)s%(addnl_jars)s'
                   ' --master %(master)s'
                   ' --deploy-mode %(deploy-mode)s'
                   ' %(app_jar)s%(args)s') % dict(mutual_dict)
        job_execution = conductor.job_execution_get(ctx, job_execution.id)
        if job_execution.info['status'] == edp.JOB_STATUS_TOBEKILLED:
            return (None, edp.JOB_STATUS_KILLED, None)

        # If an exception is raised here, the job_manager will mark
        # the job failed and log the exception
        # The redirects of stdout and stderr will preserve output in the wf_dir
        with remote.get_remote(self.master) as r:
            # Upload the command launch script
            launch = os.path.join(wf_dir, "launch_command")
            r.write_file_to(launch, self._job_script())
            r.execute_command("chmod u+rwx,g+rx,o+rx %s" % wf_dir)
            r.execute_command("chmod +x %s" % launch)
            ret, stdout = r.execute_command(
                "cd %s; ./launch_command %s > /dev/null 2>&1 & echo $!" %
                (wf_dir, cmd))

        if ret == 0:
            # Success, we'll add the wf_dir in job_execution.extra and store
            # pid@instance_id as the job id
            # We know the job is running so return "RUNNING"
            return (stdout.strip() + "@" + self.master.id,
                    edp.JOB_STATUS_RUNNING, {
                        'spark-path': wf_dir
                    })

        # Hmm, no execption but something failed.
        # Since we're using backgrounding with redirect, this is unlikely.
        raise e.EDPError(
            _("Spark job execution failed. Exit status = "
              "%(status)s, stdout = %(stdout)s") % {
                  'status': ret,
                  'stdout': stdout
              })
Example #11
0
    def _build_command(self, wf_dir, paths, builtin_paths,
                       updated_job_configs):
        indep_params = {}

        # TODO(tmckay): for now, paths[0] is always assumed to be the app
        # jar and we generate paths in order (mains, then libs).
        # When we have a Spark job type, we can require a "main" and set
        # the app jar explicitly to be "main"
        indep_params["app_jar"] = paths.pop(0)
        indep_params["job_class"] = (
            updated_job_configs["configs"]["edp.java.main_class"])

        # If we uploaded builtins then we are using a wrapper jar. It will
        # be the first one on the builtin list and the original app_jar needs
        # to be added to the  'additional' jars
        if builtin_paths:
            indep_params["wrapper_jar"] = builtin_paths.pop(0)
            indep_params["wrapper_class"] = (
                'org.openstack.sahara.edp.SparkWrapper')
            wrapper_xml = self._upload_wrapper_xml(self.master,
                                                   wf_dir,
                                                   updated_job_configs)
            indep_params["wrapper_args"] = "%s %s" % (
                wrapper_xml, indep_params["job_class"])

            indep_params["addnl_files"] = wrapper_xml

            indep_params["addnl_jars"] = ",".join(
                [indep_params["wrapper_jar"]] + paths + builtin_paths)

        else:
            indep_params["addnl_jars"] = ",".join(paths)

        # All additional jars are passed with the --jars option
        if indep_params["addnl_jars"]:
            indep_params["addnl_jars"] = (
                " --jars " + indep_params["addnl_jars"])

        # Launch the spark job using spark-submit and deploy_mode = client
        # TODO(tmckay): we need to clean up wf_dirs on long running clusters
        # TODO(tmckay): probably allow for general options to spark-submit
        indep_params["args"] = updated_job_configs.get('args', [])
        indep_params["args"] = " ".join([su.inject_swift_url_suffix(arg)
                                         for arg in indep_params["args"]])
        if indep_params.get("args"):
            indep_params["args"] = (" " + indep_params["args"])

        mutual_dict = self.plugin_params.copy()
        mutual_dict.update(indep_params)

        # Handle driver classpath. Because of the way the hadoop
        # configuration is handled in the wrapper class, using
        # wrapper_xml, the working directory must be on the classpath
        self._check_driver_class_path(mutual_dict)

        if mutual_dict.get("wrapper_jar"):
            # Substrings which may be empty have spaces
            # embedded if they are non-empty
            cmd = (
                '%(spark-user)s%(spark-submit)s%(driver-class-path)s'
                ' --files %(addnl_files)s'
                ' --class %(wrapper_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s %(wrapper_args)s%(args)s') % dict(
                mutual_dict)
        else:
            cmd = (
                '%(spark-user)s%(spark-submit)s'
                ' --class %(job_class)s%(addnl_jars)s'
                ' --master %(master)s'
                ' --deploy-mode %(deploy-mode)s'
                ' %(app_jar)s%(args)s') % dict(
                mutual_dict)

        return cmd