    def stop_job(self,
                 ssh_client,
                 name,
                 job_options,
                 is_singularity,
                 logger,
                 workdir=None):
        """
        Stops a job from the HPC

        @type ssh_client: SshClient
        @param ssh_client: ssh client connected to an HPC login node
        @type name: string
        @param name: name of the job
        @type job_settings: dictionary
        @param job_settings: dictionary with the job options
        @type is_singularity: bool
        @param is_singularity: True if the job is in a container
        @rtype string
        @return Slurm's job name stopped. None if an error arise.
        """
        if not SshClient.check_ssh_client(ssh_client, logger):
            return False

        call = self._build_job_cancellation_call(name, job_options, logger)
        if call is None:
            return False

        return ssh_client.execute_shell_command(call, workdir=workdir)
    def clean_job_aux_files(self,
                            ssh_client,
                            name,
                            job_options,
                            is_singularity,
                            logger,
                            workdir=None):
        """
        Cleans no more needed job files in the HPC

        @type ssh_client: SshClient
        @param ssh_client: ssh client connected to an HPC login node
        @type name: string
        @param name: name of the job
        @type job_settings: dictionary
        @param job_settings: dictionary with the job options
        @type is_singularity: bool
        @param is_singularity: True if the job is in a container
        @rtype string
        @return Slurm's job name stopped. None if an error arise.
        """
        if not SshClient.check_ssh_client(ssh_client, logger):
            return False

        if is_singularity:
            return ssh_client.execute_shell_command("rm " + name + ".script",
                                                    workdir=workdir)
        return True
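stop_job above delegates the actual command construction to _build_job_cancellation_call, which is not part of this listing. The sketch below is an assumption of what that helper could look like for Slurm (cancellation by job name via scancel); the real implementation may differ.

    def _build_job_cancellation_call(self, name, job_options, logger):
        """Return the shell command that cancels the job, or None on error."""
        # Hypothetical sketch: the real helper is not shown in this listing.
        if not name:
            logger.error("Cannot cancel a job without a name")
            return None
        # Slurm supports cancellation by job name; other workload managers
        # would need their own cancellation command here.
        return "scancel --name " + name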
Example 3
    def get_states(self, workdir, credentials, job_names, logger):
        # TODO: set the start time of the query
        # (sacct only checks the current day)
        call = "cat msomonitor.data"

        client = SshClient(credentials)

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=workdir,
                                                         wait_result=True)

        client.close_connection()

        states = {}
        if exit_code == 0:
            states = self._parse_states(output, logger)

        return states
Example 4
def configure_execution(
        config,
        credentials,
        base_dir,
        workdir_prefix,
        simulate,
        **kwargs):  # pylint: disable=W0613
    """ Creates the working directory for the execution """
    ctx.logger.info('Connecting to workload manager..')
    if not simulate:
        wm_type = config['workload_manager']
        ctx.logger.info(' - manager: {wm_type}'.format(wm_type=wm_type))

        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" +
                wm_type +
                "' not supported.")

        if 'credentials' in ctx.instance.runtime_properties:
            credentials = ctx.instance.runtime_properties['credentials']
        try:
            client = SshClient(credentials)
        except Exception as exp:
            raise NonRecoverableError(
                "Failed trying to connect to workload manager: " + str(exp))

        # TODO: use command according to wm
        _, exit_code = client.execute_shell_command(
            'uname',
            wait_result=True)

        if exit_code != 0:
            client.close_connection()
            raise NonRecoverableError(
                "Failed executing on the workload manager: exit code " +
                str(exit_code))

        ctx.instance.runtime_properties['login'] = exit_code == 0

        prefix = workdir_prefix
        if workdir_prefix == "":
            prefix = ctx.blueprint.id

        workdir = wm.create_new_workdir(client, base_dir, prefix, ctx.logger)
        client.close_connection()
        if workdir is None:
            raise NonRecoverableError(
                "failed to create the working directory, base dir: " +
                base_dir)
        ctx.instance.runtime_properties['workdir'] = workdir
        ctx.logger.info('..workload manager ready to be used on ' + workdir)
    else:
        ctx.logger.info(' - [simulation]..')
        ctx.instance.runtime_properties['login'] = True
        ctx.instance.runtime_properties['workdir'] = "simulation"
        ctx.logger.warning('Workload manager connection simulated')
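configure_execution relies on WorkloadManager.create_new_workdir to create the execution directory on the HPC. That method is not shown in this listing; the following is a minimal sketch, assuming it creates a timestamped directory under base_dir over SSH.

    def create_new_workdir(self, ssh_client, base_dir, prefix, logger):
        """Create a unique working directory on the HPC and return its path."""
        # Hypothetical sketch: the real implementation is not part of this
        # listing, so the directory naming scheme below is an assumption.
        from datetime import datetime
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        workdir = base_dir + "/" + prefix + "_" + timestamp
        _, exit_code = ssh_client.execute_shell_command(
            "mkdir -p " + workdir, wait_result=True)
        if exit_code != 0:
            logger.error("Could not create the working directory " + workdir)
            return None
        return workdir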
Example 5
    def get_states(self, workdir, credentials, job_names, logger):
        # TODO: set the start time of the query
        # (sacct only checks the current day)
        call = "sacct -n -o JobName,State -X -P --name=" + ','.join(job_names)

        client = SshClient(credentials)

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=workdir,
                                                         wait_result=True)

        client.close_connection()

        states = {}
        if exit_code == 0:
            states = self._parse_states(output, logger)
        else:
            logger.warning("Failed to get states")

        return states
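_parse_states, called above, is not included in this listing. A minimal sketch follows, assuming it splits the `sacct -n -o JobName,State -X -P` output into one pipe-separated name/state pair per line.

    def _parse_states(self, raw_output, logger):
        """Turn sacct pipe-separated output into a {job_name: state} dict."""
        # Hypothetical sketch: the real parser is not part of this listing.
        states = {}
        for line in raw_output.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                job_name, state = line.split('|', 1)
            except ValueError:
                logger.warning("Could not parse sacct line: " + line)
                continue
            states[job_name] = state
        return states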
Example 6
def cleanup_execution(
        config,
        credentials,
        skip,
        simulate,
        **kwargs):  # pylint: disable=W0613
    """ Cleans execution working directory """
    if skip:
        return

    ctx.logger.info('Cleaning up...')
    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" +
                wm_type +
                "' not supported.")

        if 'credentials' in ctx.instance.runtime_properties:
            credentials = ctx.instance.runtime_properties['credentials']
        client = SshClient(credentials)
        client.execute_shell_command(
            'rm -r ' + workdir,
            wait_result=True)
        client.close_connection()
        ctx.logger.info('..all clean.')
    else:
        ctx.logger.warning('clean up simulated.')
Example 7
    def _get_states_detailed(workdir, credentials, job_names, logger):
        """
        Get job states by job names

        This function uses the `qstat` command to query Torque.
        Please don't launch this call very frequently. Polling it
        frequently, especially across all users on the cluster,
        will slow down response times and may bring
        scheduling to a crawl.

        It allows a precise mapping of Torque states to
        Slurm states by taking `exit_code` into account.
        Unlike `get_states_tabular` it parses the output on the host
        and uses several SSH commands.
        """
        # identify job ids
        call = "echo {} | xargs -n 1 qselect -N".format(
            shlex_quote(' '.join(map(shlex_quote, job_names))))

        logger.info('TORQUE.PY::_GET_STATES_DETAILED() L232')
        logger.info('call = ' + str(call))

        client = SshClient(credentials)

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=workdir,
                                                         wait_result=True)
        job_ids = Torque._parse_qselect(output)
        if not job_ids:
            return {}

        logger.info('job_ids = ' + str(job_ids))

        # get detailed information about jobs
        call = "qstat -f {}".format(' '.join(map(str, job_ids)))

        logger.info('call = ' + str(call))

        output, exit_code = client.execute_shell_command(call,
                                                         workdir=workdir,
                                                         wait_result=True)
        client.close_connection()
        try:
            job_states = Torque._parse_qstat_detailed(output)
            logger.info('job_states = ' + str(job_states))
        except SyntaxError as e:
            logger.warning(
                "cannot parse state response for job ids=[{}]".format(','.join(
                    map(str, job_ids))))
            logger.warning(
                "{err}\n`qstat -f` output to parse:\n\\[\n{text}\n\\]".format(
                    err=str(e), text=output))
            # TODO: think whether error ignoring is better
            #       for the correct lifecycle
            raise e

        return job_states
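Torque._parse_qselect, used above to turn the qselect output into job ids, is not shown in this listing. The sketch below assumes qselect prints one full job id per line (e.g. "1234.pbs-server") and keeps only the numeric part.

    @staticmethod
    def _parse_qselect(raw_output):
        """Extract numeric job ids from `qselect` output."""
        # Hypothetical sketch: the real parser is not part of this listing.
        job_ids = []
        for line in raw_output.splitlines():
            line = line.strip()
            if not line:
                continue
            # keep the leading numeric id, drop the server suffix if present
            job_ids.append(int(line.split('.')[0]))
        return job_ids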
Example 8
def send_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Sends a job to the workload manager """
    ctx.logger.info('TASKS.PY::SEND_JOB L455')
    simulate = ctx.instance.runtime_properties['simulate']

    name = kwargs['name']
    is_singularity = 'hpc.nodes.SingularityJob' in ctx.node.\
        type_hierarchy

    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = ctx.instance.runtime_properties['workload_manager']

        if wm_type != 'K8S':
            client = SshClient(ctx.instance.runtime_properties['credentials'])
        else:
            client = None

        wm = WorkloadManager.factory(wm_type)
        if not wm:
            if wm_type != 'K8S':
                client.close_connection()
            raise NonRecoverableError(
                "Workload Manager '" +
                wm_type +
                "' not supported.")
        context_vars = {
            'CFY_EXECUTION_ID': ctx.execution_id,
            'CFY_JOB_NAME': name
        }
        is_submitted = wm.submit_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir,
                                     context=context_vars)

        if wm_type != 'K8S':
            client.close_connection()
    else:
        ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
        is_submitted = True

    if is_submitted:
        ctx.logger.info('Job ' + name + ' (' + ctx.instance.id + ') sent.')
    else:
        ctx.logger.error(
            'Job ' + name + ' (' + ctx.instance.id + ') not sent.')
        raise NonRecoverableError(
            'Job ' + name + ' (' + ctx.instance.id + ') not sent.')

    ctx.instance.runtime_properties['job_name'] = name

    ctx.logger.info('TASKS.PY::SEND_JOB *** END *** L517')
Example 9
def cleanup_job(job_options, skip, **kwargs):  # pylint: disable=W0613
    """Clean up the auxiliary files of the job"""
    ctx.logger.info('TASKS.PY::CLEANUP_JOB L503')
    if skip:
        return

    try:
        simulate = ctx.instance.runtime_properties['simulate']
    except KeyError:
        # The job wasn't configured properly, so no cleanup is needed
        ctx.logger.warning('Job was not cleaned up as it was not configured.')
        return

    try:
        name = kwargs['name']
        if not simulate:
            is_singularity = 'hpc.nodes.SingularityJob' in ctx.node.\
                type_hierarchy
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']

            client = SshClient(ctx.instance.runtime_properties['credentials'])

            wm = WorkloadManager.factory(wm_type)
            if not wm:
                client.close_connection()
                raise NonRecoverableError(
                    "Workload Manager '" +
                    wm_type +
                    "' not supported.")
            is_clean = wm.clean_job_aux_files(client,
                                              name,
                                              job_options,
                                              is_singularity,
                                              ctx.logger,
                                              workdir=workdir)

            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_clean = True

        if is_clean:
            ctx.logger.info(
                'Job ' + name + ' (' + ctx.instance.id + ') cleaned.')
        else:
            ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                             ') not cleaned.')
    except Exception as exp:
        print(traceback.format_exc())
        ctx.logger.error(
            'Something happened when trying to clean up: ' + str(exp))

    ctx.logger.info('TASKS.PY::CLEANUP_JOB L573')
Example 10
def stop_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Stops a job in the workload manager """
    ctx.logger.info('TASKS.PY::STOP_JOB L557')
    try:
        simulate = ctx.instance.runtime_properties['simulate']
    except KeyError:
        # The job wasn't configured properly, so there is nothing to stop
        ctx.logger.warning('Job was not stopped as it was not configured.')
        return

    try:
        name = kwargs['name']
        is_singularity = 'hpc.nodes.SingularityJob' in ctx.node.\
            type_hierarchy

        if not simulate:
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']
            client = SshClient(ctx.instance.runtime_properties['credentials'])

            wm = WorkloadManager.factory(wm_type)
            if not wm:
                client.close_connection()
                raise NonRecoverableError(
                    "Workload Manager '" +
                    wm_type +
                    "' not supported.")
            is_stopped = wm.stop_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)

            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_stopped = True

        if is_stopped:
            ctx.logger.info(
                'Job ' + name + ' (' + ctx.instance.id + ') stopped.')
        else:
            ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                             ') not stopped.')
            raise NonRecoverableError('Job ' + name + ' (' + ctx.instance.id +
                                      ') not stopped.')
    except Exception as exp:
        print(traceback.format_exc())
        ctx.logger.error(
            'Something happened when trying to stop: ' + str(exp))

    ctx.logger.info('TASKS.PY::STOP_JOB L628')
Example 11
def deploy_job(script,
               inputs,
               credentials,
               wm_type,
               workdir,
               name,
               logger,
               skip_cleanup):  # pylint: disable=W0613
    """ Exec a deployment job script that receives SSH credentials as input """

    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError(
            "Workload Manager '" +
            wm_type +
            "' not supported.")

    # Execute the script and manage the output
    success = False
    client = SshClient(credentials)
    if wm._create_shell_script(client,
                               name,
                               ctx.get_resource(script),
                               logger,
                               workdir=workdir):
        call = "./" + name
        for dinput in inputs:
            str_input = str(dinput)
            if ('\n' in str_input or ' ' in str_input) and str_input[0] != '"':
                call += ' "' + str_input + '"'
            else:
                call += ' ' + str_input
        _, exit_code = client.execute_shell_command(
            call,
            workdir=workdir,
            wait_result=True)
        if exit_code != 0:
            logger.warning(
                "failed to deploy job: call '" + call + "', exit code " +
                str(exit_code))
        else:
            success = True

        if not skip_cleanup:
            if not client.execute_shell_command(
                    "rm " + name,
                    workdir=workdir):
                logger.warning("failed removing bootstrap script")

    client.close_connection()

    return success
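The quoting loop in deploy_job wraps any input that contains spaces or newlines (and is not already quoted) in double quotes before appending it to the call. A standalone illustration of the resulting command line, with made-up input values:

# Illustration only: how the quoting loop above assembles the call string.
name = "bootstrap.sh"                        # hypothetical script name
inputs = ["alpha", "two words", '"quoted"']  # hypothetical inputs
call = "./" + name
for dinput in inputs:
    str_input = str(dinput)
    if ('\n' in str_input or ' ' in str_input) and str_input[0] != '"':
        call += ' "' + str_input + '"'
    else:
        call += ' ' + str_input
print(call)  # ./bootstrap.sh alpha "two words" "quoted"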
    def publish(self, ssh_client, logger, workdir=None):
        """
        Publish the local file in the external repository

        @type ssh_client: SshClient
        @param ssh_client: ssh client connected to an HPC login node
        @rtype bool
        @return False if something went wrong
        """
        if not SshClient.check_ssh_client(ssh_client, logger):
            return False

        call = self._build_publish_call(logger)
        if call is None:
            return False

        return ssh_client.execute_shell_command(call,
                                                workdir=workdir,
                                                wait_result=False)
Example 13
def publish(publish_list, **kwargs):
    """ Publish the job outputs """
    ctx.logger.info('TASKS.PY::PUBLISH L610')
    try:
        simulate = ctx.instance.runtime_properties['simulate']
    except KeyError as exp:
        # The job wasn't configured properly, no need to publish
        ctx.logger.warning(
            'Job outputs were not published as' +
            ' the job was not configured properly.')
        return

    try:
        name = kwargs['name']
        published = True
        if not simulate:
            workdir = ctx.instance.runtime_properties['workdir']
            client = SshClient(ctx.instance.runtime_properties['credentials'])

            for publish_item in publish_list:
                if not published:
                    break
                exrep = ExternalRepository.factory(publish_item)
                if not exrep:
                    client.close_connection()
                    raise NonRecoverableError(
                        "External repository '" +
                        publish_item['dataset']['type'] +
                        "' not supported.")
                published = exrep.publish(client, ctx.logger, workdir)

            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')

        if published:
            ctx.logger.info(
                'Job ' + name + ' (' + ctx.instance.id + ') published.')
        else:
            ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                             ') not published.')
            raise NonRecoverableError('Job ' + name + ' (' + ctx.instance.id +
                                      ') not published.')
    except Exception as exp:
        print(traceback.format_exc())
        ctx.logger.error(
            'Cannot publish: ' + str(exp))

    ctx.logger.info('TASKS.PY::PUBLISH L680')
Example 14
def cleanup_execution(
        config,
        credentials,
        skip,
        simulate,
        **kwargs):  # pylint: disable=W0613
    """ Cleans execution working directory """
    ctx.logger.info('TASKS.PY::CLEANUP_EXECUTION L148')
    if skip:
        ctx.logger.info('TASKS.PY::CLEANUP_EXECUTION *** SKIP *** L154')
        return

    ctx.logger.info('Cleaning up...')
    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" +
                wm_type +
                "' not supported.")

        if 'credentials' in ctx.instance.runtime_properties:
            credentials = ctx.instance.runtime_properties['credentials']

        if wm_type != 'K8S':
            client = SshClient(credentials)
            client.execute_shell_command(
                'rm -r ' + workdir,
                wait_result=True)
            client.close_connection()
        else:
            try:
                os.rmdir(workdir)
            except Exception as exp:
                print(traceback.format_exc())
                ctx.logger.error(
                    'Something happened when trying to clean up: ' +
                    str(exp))

        ctx.logger.info('..all clean.')
    else:
        ctx.logger.warning('clean up simulated.')

    ctx.logger.info('TASKS.PY::CLEANUP_EXECUTION *** END *** L179')
    def submit_job(self,
                   ssh_client,
                   name,
                   job_settings,
                   is_singularity,
                   logger,
                   workdir=None,
                   context=None):
        """
        Sends a job to the HPC

        @type ssh_client: SshClient
        @param ssh_client: ssh client connected to an HPC login node
        @type name: string
        @param name: name of the job
        @type job_settings: dictionary
        @param job_settings: dictionary with the job options
        @type is_singularity: bool
        @param is_singularity: True if the job is in a container
        @rtype string
        @param logger: Logger object to print log messages
        @rtype logger
        @param workdir: Path of the working directory of the job
        @rtype string
        @param context: Dictionary containing context env vars
        @rtype dictionary of strings
        @return Slurm's job name sent. None if an error arise.
        """
        if ssh_client:
            if not SshClient.check_ssh_client(ssh_client, logger):
                return False

            if is_singularity:
                # generate script content for singularity
                script_content = self._build_container_script(
                    name, job_settings, logger)
                if script_content is None:
                    return False

                if not self._create_shell_script(ssh_client,
                                                 name + ".script",
                                                 script_content,
                                                 logger,
                                                 workdir=workdir):
                    return False

                # @TODO: use more general type names (e.g., BATCH/INLINE, etc)
                settings = {"type": "SBATCH", "command": name + ".script"}

                if 'scale' in job_settings:
                    settings['scale'] = job_settings['scale']
                    if 'scale_max_in_parallel' in job_settings:
                        settings['scale_max_in_parallel'] = \
                            job_settings['scale_max_in_parallel']
            else:
                settings = job_settings
        else:  # K8S
            settings = job_settings

        # build the call to submit the job
        response = self._build_job_submission_call(name, settings, logger)

        if 'error' in response:
            logger.error("Couldn't build the call to send the job: " +
                         response['error'])
            return False

        # prepare the scale env variables
        if 'scale_env_mapping_call' in response:
            scale_env_mapping_call = response['scale_env_mapping_call']
            output, exit_code = ssh_client.execute_shell_command(
                scale_env_mapping_call, workdir=workdir, wait_result=True)
            if exit_code != 0:
                logger.error("Scale env vars mapping '" +
                             scale_env_mapping_call + "' failed with code " +
                             str(exit_code) + ":\n" + output)
                return False

        # submit the job
        call = response['call']

        if ssh_client:
            output, exit_code = ssh_client.execute_shell_command(
                call, env=context, workdir=workdir, wait_result=True)
        else:  # K8S
            os.chdir(workdir)
            output = ''
            exit_code = subprocess.call(call, shell=True)
            #output = check_output(['chmod', '+x', 'touch.script'])

        if exit_code != 0:
            logger.error("Job submission '" + call + "' exited with code " +
                         str(exit_code) + ":\n" + output)
            return False
        return True
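submit_job expects _build_job_submission_call to return a dictionary with a 'call' entry and, optionally, 'error' and 'scale_env_mapping_call' entries; the helper itself is not part of this listing. Below is a minimal sketch of the SBATCH/SRUN cases, with the option names assumed.

    def _build_job_submission_call(self, name, job_settings, logger):
        """Return {'call': ...} on success or {'error': ...} on failure."""
        # Hypothetical sketch: the real helper is not shown here; only the
        # shape of the returned dict is taken from how submit_job consumes it.
        if 'type' not in job_settings or 'command' not in job_settings:
            return {'error': "'type' and 'command' are mandatory job settings"}

        if job_settings['type'] == 'SBATCH':
            # batch submission of a previously generated script
            call = "sbatch -J '" + name + "' " + job_settings['command']
        elif job_settings['type'] == 'SRUN':
            call = "nohup srun -J '" + name + "' " + \
                   job_settings['command'] + " &"
        else:
            return {'error': "unsupported job type: " +
                             str(job_settings['type'])}

        return {'call': call}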