Ejemplo n.º 1
0
    def watch_jobs(self, job_db, app=None):
        """Use SSH connection to slurm submitnode to monitor jobs.

        Polls the Slurm head node in an endless loop and updates
        ``job_db`` entries as their jobs succeed or fail.

        :param job_db: Dictionary which contains all running jobs.
        :param app: Unused; kept for interface compatibility.
        """
        slurm_connection = SSHClient(
            hostname=SlurmJobManagerCERN.SLURM_HEADNODE_HOSTNAME,
            port=SlurmJobManagerCERN.SLURM_HEADNODE_PORT,
        )
        statuses_to_skip = ['succeeded', 'failed']
        while True:
            logging.debug('Starting a new stream request to watch Jobs')
            try:
                # Map Slurm backend ids -> internal job ids for every job
                # that still needs monitoring.
                slurm_jobs = {}
                for job_key, job_dict in job_db.items():
                    if (not job_dict['deleted']
                            and job_dict['compute_backend'] == 'slurmcern'
                            and job_dict['status'] not in statuses_to_skip):
                        slurm_jobs[job_dict['backend_job_id']] = job_key
                if not slurm_jobs:
                    logging.error('No slurm jobs')
                    continue
                slurm_query_cmd = self.format_slurm_job_query(
                    slurm_jobs.keys())
                stdout = slurm_connection.exec_command(slurm_query_cmd)
                for item in stdout.rstrip().split('\n'):
                    slurm_job_status = item.split('|')[0]
                    slurm_job_id = item.split('|')[1]
                    job_id = slurm_jobs[slurm_job_id]
                    if slurm_job_status in slurmJobStatus['succeeded']:
                        self._record_job_result(job_db, job_id, 'succeeded')
                    if slurm_job_status in slurmJobStatus['failed']:
                        self._record_job_result(job_db, job_id, 'failed')
            except Exception as e:
                logging.error("Unexpected error: {}".format(e), exc_info=True)
            finally:
                # Sleep on every iteration (not only on errors) so the
                # monitor does not busy-loop hammering the head node.
                time.sleep(120)

    def _record_job_result(self, job_db, job_id, status):
        """Record final *status* for *job_id* and persist its logs.

        Fixes the original bug where ``job_dict`` — a leftover loop
        variable from the filtering pass — was used to look up
        ``backend_job_id`` instead of the entry belonging to ``job_id``.
        """
        SlurmJobManagerCERN.get_outputs()
        job_db[job_id]['status'] = status
        job_db[job_id]['deleted'] = True
        job_db[job_id]['log'] = SlurmJobManagerCERN.get_logs(
            backend_job_id=job_db[job_id]['backend_job_id'],
            workspace=job_db[job_id]['obj'].workflow_workspace)
        store_logs(logs=job_db[job_id]['log'], job_id=job_id)
Ejemplo n.º 2
0
    def watch_jobs(self, job_db, app=None):
        """Use SSH connection to slurm submitnode to monitor jobs.

        Polls the Slurm head node in an endless loop and updates
        ``job_db`` entries as their jobs succeed or fail.

        :param job_db: Dictionary which contains all running jobs.
        :param app: Unused; kept for interface compatibility.
        """
        slurm_connection = SSHClient(
            hostname=self.job_manager_cls.SLURM_HEADNODE_HOSTNAME,
            port=self.job_manager_cls.SLURM_HEADNODE_PORT,
        )
        statuses_to_skip = ["succeeded", "failed"]
        while True:
            logging.debug("Starting a new stream request to watch Jobs")
            try:
                # Map Slurm backend ids -> internal job ids for every job
                # that still needs monitoring.
                slurm_jobs = {}
                for job_key, job_dict in job_db.items():
                    if (not job_dict["deleted"]
                            and job_dict["compute_backend"] == "slurmcern"
                            and job_dict["status"] not in statuses_to_skip):
                        slurm_jobs[job_dict["backend_job_id"]] = job_key
                if not slurm_jobs:
                    logging.error("No slurm jobs")
                    continue
                slurm_query_cmd = self.format_slurm_job_query(
                    slurm_jobs.keys())
                stdout = slurm_connection.exec_command(slurm_query_cmd)
                for item in stdout.rstrip().split("\n"):
                    slurm_job_status = item.split("|")[0]
                    slurm_job_id = item.split("|")[1]
                    job_id = slurm_jobs[slurm_job_id]
                    if slurm_job_status in slurmJobStatus["succeeded"]:
                        self._finalize_job(job_db, job_id, "succeeded")
                    if slurm_job_status in slurmJobStatus["failed"]:
                        self._finalize_job(job_db, job_id, "failed")
            except Exception as e:
                logging.error("Unexpected error: {}".format(e), exc_info=True)
            finally:
                # Sleep on every iteration (not only on errors) so the
                # monitor does not busy-loop hammering the head node.
                time.sleep(120)

    def _finalize_job(self, job_db, job_id, status):
        """Record final *status* for *job_id* and persist its logs.

        Fixes the original bug where ``job_dict`` — a leftover loop
        variable from the filtering pass — was used to look up
        ``backend_job_id`` instead of the entry belonging to ``job_id``.
        """
        self.job_manager_cls.get_outputs()
        job_db[job_id]["status"] = status
        job_db[job_id]["deleted"] = True
        job_db[job_id]["log"] = self.job_manager_cls.get_logs(
            backend_job_id=job_db[job_id]["backend_job_id"],
            workspace=job_db[job_id]["obj"].workflow_workspace,
        )
        store_logs(logs=job_db[job_id]["log"], job_id=job_id)
Ejemplo n.º 3
0
 def execute(self):
     """Submit the job to Slurm over SSH and return its backend id.

     Encodes the command, refreshes the Kerberos token, opens the SSH
     connection, stages inputs and job files, then runs ``sbatch``.
     """
     self.cmd = self._encode_cmd(" ".join(self.cmd))
     initialize_krb5_token(workflow_uuid=self.workflow_uuid)
     self.slurm_connection = SSHClient(
         hostname=SlurmJobManagerCERN.SLURM_HEADNODE_HOSTNAME,
         port=SlurmJobManagerCERN.SLURM_HEADNODE_PORT,
     )
     self._transfer_inputs()
     self._dump_job_file()
     self._dump_job_submission_file()
     # --parsable makes sbatch print only the job id.
     submit_cmd = "cd {} && sbatch --parsable {}".format(
         SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
         self.job_description_file)
     sbatch_output = self.slurm_connection.exec_command(submit_cmd)
     return sbatch_output.rstrip()
 def get_outputs():
     """Transfer job outputs to REANA.

     Downloads the whole remote Slurm workspace tree over SFTP into
     the local REANA workspace directory.
     """
     os.chdir(SlurmJobManagerCERN.REANA_WORKSPACE_PATH)
     slurm_connection = SSHClient()
     sftp = slurm_connection.ssh_client.open_sftp()
     try:
         SlurmJobManagerCERN._download_dir(
             sftp, SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
             SlurmJobManagerCERN.REANA_WORKSPACE_PATH)
     finally:
         # Close the SFTP session even when the download fails; the
         # original leaked it on exception.
         sftp.close()
Ejemplo n.º 5
0
class SlurmJobManagerCERN(JobManager):
    """Slurm job management."""

    # NOTE(review): "WORKSAPCE" is a long-standing typo; the attribute
    # name is referenced throughout this module, so renaming it must be
    # done in one coordinated change.
    SLURM_WORKSAPCE_PATH = ""
    """Absolute path inside slurm head node used for submission."""
    REANA_WORKSPACE_PATH = ""
    """Absolute REANA workspace path."""
    SLURM_HEADNODE_HOSTNAME = os.getenv("SLURM_HOSTNAME", "hpc-batch.cern.ch")
    """Hostname of SLURM head-node used for job management via SSH."""
    SLURM_HEADNODE_PORT = os.getenv("SLURM_CLUSTER_PORT", "22")
    """Port of SLURM head node."""
    SLURM_PARTITION = os.getenv("SLURM_PARTITION", "batch-short")
    """Default slurm partition."""
    SLURM_HOME_PATH = os.getenv("SLURM_HOME_PATH", "")
    """Default SLURM home path."""
    def __init__(
        self,
        docker_img=None,
        cmd=None,
        prettified_cmd=None,
        env_vars=None,
        workflow_uuid=None,
        workflow_workspace=None,
        cvmfs_mounts="false",
        shared_file_system=False,
        job_name=None,
        kerberos=False,
        kubernetes_uid=None,
        unpacked_img=False,
    ):
        """Instantiate the Slurm job manager.

        :param docker_img: Docker image.
        :type docker_img: str
        :param cmd: Command to execute.
        :type cmd: list
        :param prettified_cmd: Prettified version of the command to execute.
        :type prettified_cmd: str
        :param env_vars: Environment variables.
        :type env_vars: dict
        :param workflow_uuid: Unique workflow id.
        :type workflow_uuid: str
        :param workflow_workspace: Workflow workspace path.
        :type workflow_workspace: str
        :param cvmfs_mounts: List of CVMFS mounts as a string.
        :type cvmfs_mounts: str
        :param shared_file_system: Whether a shared file system is available.
        :type shared_file_system: bool
        :param job_name: Name of the job.
        :type job_name: str
        :param kerberos: Accepted for interface compatibility; not used here.
        :param kubernetes_uid: Accepted for interface compatibility; not
            used here.
        :param unpacked_img: Accepted for interface compatibility; not
            used here.
        """
        super(SlurmJobManagerCERN, self).__init__(
            docker_img=docker_img,
            cmd=cmd,
            prettified_cmd=prettified_cmd,
            env_vars=env_vars,
            workflow_uuid=workflow_uuid,
            job_name=job_name,
        )
        # Backend identification and workspace configuration.
        self.compute_backend = "Slurm"
        self.shared_file_system = shared_file_system
        self.cvmfs_mounts = cvmfs_mounts
        self.workflow_workspace = workflow_workspace
        # File names created on the Slurm submit node.
        self.job_file = "job.sh"
        self.job_description_file = "job_description.sh"

    def _transfer_inputs(self):
        """Transfer inputs to SLURM submit node.

        Mirrors the local workflow workspace into the user's home area on
        the Slurm head node over SFTP, creating remote directories as
        needed, and records the resulting remote/local workspace paths in
        class attributes for later use by other methods.
        """
        # Explicitly configured home path wins over the remote ``pwd``.
        stdout = self.slurm_connection.exec_command("pwd")
        self.slurm_home_path = SlurmJobManagerCERN.SLURM_HOME_PATH or stdout.rstrip(
        )
        # NOTE(review): ``[1:]`` assumes ``workflow_workspace`` is an
        # absolute path whose leading slash must be stripped before
        # joining onto the remote home — confirm with callers.
        SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH = os.path.join(
            self.slurm_home_path, self.workflow_workspace[1:])
        SlurmJobManagerCERN.REANA_WORKSPACE_PATH = self.workflow_workspace
        self.slurm_connection.exec_command("mkdir -p {}".format(
            SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH))
        sftp = self.slurm_connection.ssh_client.open_sftp()
        os.chdir(self.workflow_workspace)
        # Recreate the directory tree remotely and upload every file.
        for dirpath, dirnames, filenames in os.walk(self.workflow_workspace):
            try:
                sftp.mkdir(os.path.join(self.slurm_home_path, dirpath[1:]))
            except Exception:
                # Remote directory may already exist; best-effort create.
                pass
            for file in filenames:
                sftp.put(
                    os.path.join(dirpath, file),
                    os.path.join(self.slurm_home_path, dirpath[1:], file),
                )

    @JobManager.execution_hook
    def execute(self):
        """Submit the job to Slurm over SSH and return its backend id.

        Encodes the command, refreshes the Kerberos token, opens the SSH
        connection, stages inputs and job files, then runs ``sbatch``.
        """
        self.cmd = self._encode_cmd(" ".join(self.cmd))
        initialize_krb5_token(workflow_uuid=self.workflow_uuid)
        self.slurm_connection = SSHClient(
            hostname=SlurmJobManagerCERN.SLURM_HEADNODE_HOSTNAME,
            port=SlurmJobManagerCERN.SLURM_HEADNODE_PORT,
        )
        self._transfer_inputs()
        self._dump_job_file()
        self._dump_job_submission_file()
        # --parsable makes sbatch print only the job id.
        submit_cmd = "cd {} && sbatch --parsable {}".format(
            SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
            self.job_description_file)
        sbatch_output = self.slurm_connection.exec_command(submit_cmd)
        return sbatch_output.rstrip()

    def _dump_job_submission_file(self):
        """Write the sbatch submission script on the Slurm submit node."""
        command = self._wrap_singularity_cmd()
        job_template = (
            "#!/bin/bash \n"
            "#SBATCH --job-name={} \n"
            "#SBATCH --output=reana_job.%j.out \n"
            "#SBATCH --error=reana_job.%j.err \n"
            "#SBATCH --partition {} \n"
            "#SBATCH --time 60 \n"
            "srun {}"
        ).format(self.job_name,
                 SlurmJobManagerCERN.SLURM_PARTITION,
                 command)
        remote_cmd = 'cd {} && job="{}" && echo "$job"> {}'.format(
            SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
            job_template,
            self.job_description_file,
        )
        self.slurm_connection.exec_command(remote_cmd)

    def _dump_job_file(self):
        """Write the executable job script on the Slurm submit node."""
        script = "#!/bin/bash \n{}".format(self.cmd)
        remote_cmd = 'cd {} && job="{}" && echo "$job" > {} && chmod +x {}'.format(
            SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
            script,
            self.job_file,
            self.job_file,
        )
        self.slurm_connection.exec_command(remote_cmd)

    def _encode_cmd(self, cmd):
        """Return a shell one-liner that base64-decodes and runs *cmd*."""
        # Base64 shields the command from shell quoting/escaping issues.
        payload = base64.b64encode(cmd.encode("utf-8"))
        return "echo {}|base64 -d|bash".format(payload.decode("utf-8"))

    def _wrap_singularity_cmd(self):
        """Wrap the job file invocation in a singularity exec command.

        Bind-mounts the Slurm workspace onto the REANA workspace path so
        the container sees files at their expected location.
        """
        return "singularity exec -B {}:{} docker://{} {}".format(
            SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
            SlurmJobManagerCERN.REANA_WORKSPACE_PATH,
            self.docker_img,
            "./" + self.job_file,
        )

    def get_outputs():
        """Transfer job outputs to REANA.

        Downloads the whole remote Slurm workspace tree over SFTP into
        the local REANA workspace directory.
        """
        os.chdir(SlurmJobManagerCERN.REANA_WORKSPACE_PATH)
        slurm_connection = SSHClient()
        sftp = slurm_connection.ssh_client.open_sftp()
        try:
            SlurmJobManagerCERN._download_dir(
                sftp,
                SlurmJobManagerCERN.SLURM_WORKSAPCE_PATH,
                SlurmJobManagerCERN.REANA_WORKSPACE_PATH,
            )
        finally:
            # Close the SFTP session even when the download fails; the
            # original leaked it on exception.
            sftp.close()

    def _download_dir(sftp, remote_dir, local_dir):
        """Download remote directory content."""
        os.path.exists(local_dir) or os.makedirs(local_dir)
        dir_items = sftp.listdir_attr(remote_dir)
        for item in dir_items:
            remote_path = os.path.join(remote_dir, item.filename)
            local_path = os.path.join(local_dir, item.filename)
            if S_ISDIR(item.st_mode):
                SlurmJobManagerCERN._download_dir(sftp, remote_path,
                                                  local_path)
            else:
                sftp.get(remote_path, local_path)

    def get_logs(backend_job_id, workspace):
        """Return job logs if log files are present.

        Concatenates the job's stderr and stdout files (in that order)
        from *workspace*; on any read failure a descriptive message is
        logged and returned instead of the logs (best-effort contract).
        """
        job_log = ""
        try:
            # stderr first, then stdout, matching the historical order.
            for suffix in (".err", ".out"):
                log_path = os.path.join(
                    workspace, "reana_job." + str(backend_job_id) + suffix)
                with open(log_path, "r") as log_file:
                    job_log += log_file.read()
            return job_log
        except Exception as e:
            msg = "Job logs of {} were not found. {}".format(backend_job_id, e)
            logging.error(msg, exc_info=True)
            return msg