Ejemplo n.º 1
0
def get_states(monitor_jobs, logger):
    """Retrieve the state of every job by asking each monitor.

    Args:
        monitor_jobs: mapping of host -> settings dict. Each settings dict
            is read for the keys 'type', 'config' and 'names'.
        logger: logger handed to the workload manager (or to the fallback
            helper when the monitor type is not supported).

    Returns:
        A dict holding the merged partial states reported by all monitors.
    """
    states = {}

    # items() (rather than iteritems()) works on both Python 2 and 3.
    for host, settings in monitor_jobs.items():
        if settings['type'] == "PROMETHEUS":  # external
            partial_states = _get_prometheus(host, settings['config'],
                                             settings['names'])
        else:  # internal
            wm = WorkloadManager.factory(settings['type'])
            if wm:
                credentials = settings['config']
                client = SshClient(credentials['host'],
                                   credentials['user'],
                                   credentials['password'],
                                   use_login_shell=credentials['login_shell'])
                try:
                    partial_states = wm.get_states(client, settings['names'],
                                                   logger)
                finally:
                    # Always release the SSH connection, even if the
                    # workload manager query raises.
                    client.close_connection()
            else:
                partial_states = _no_states(host, settings['type'],
                                            settings['names'], logger)
        states.update(partial_states)

    return states
Ejemplo n.º 2
0
def cleanup_hpc(config, skip, simulate, **kwargs):  # pylint: disable=W0613
    """Remove the working directory stored in the instance runtime properties.

    Args:
        config: dict read for 'workload_manager' and 'credentials'.
        skip: when truthy, nothing is done at all.
        simulate: when truthy, only a warning is logged and no connection
            is opened.
        **kwargs: ignored.

    Raises:
        NonRecoverableError: if the configured workload manager is not
            supported.
    """
    if skip:
        return

    ctx.logger.info('Cleaning up...')
    if simulate:
        ctx.logger.warning('HPC clean up simulated.')
        return

    workdir = ctx.instance.runtime_properties['workdir']
    wm_type = config['workload_manager']
    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError("Workload Manager '" + wm_type +
                                  "' not supported.")
    credentials = config['credentials']
    client = SshClient(credentials['host'],
                       credentials['user'],
                       credentials['password'],
                       use_login_shell=credentials['login_shell'])
    try:
        _, exit_code = wm._execute_shell_command(client,
                                                 'rm -r ' + workdir,
                                                 wait_result=True)
    finally:
        # Release the SSH connection even if the remote command raises.
        client.close_connection()

    # Clean up stays best-effort (no exception), but a failed removal is
    # reported instead of being logged as a success.
    if exit_code != 0:
        ctx.logger.warning("failed to remove working directory '" + workdir +
                           "': exit code " + str(exit_code))
    else:
        ctx.logger.info('..all clean.')
Ejemplo n.º 3
0
def prepare_hpc(config, base_dir, workdir_prefix, simulate, **kwargs):  # pylint: disable=W0613
    """Check the connection to the login node and create a working directory.

    On success the instance runtime properties 'login' and 'workdir' are
    set. In simulation mode no connection is made and they are set to
    True and "simulation" respectively.

    Args:
        config: dict read for 'workload_manager' and 'credentials'.
        base_dir: base directory passed to the workload manager when
            creating the working directory.
        workdir_prefix: prefix for the working directory; when empty the
            blueprint id is used instead.
        simulate: when truthy, skip every remote operation.
        **kwargs: ignored.

    Raises:
        NonRecoverableError: if the workload manager is not supported, the
            connection test fails, or the working directory could not be
            created.
    """
    ctx.logger.info('Connecting to login node:')
    if not simulate:
        wm_type = config['workload_manager']
        ctx.logger.info(' - manager: {wm_type}'.format(wm_type=wm_type))

        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")

        credentials = config['credentials']
        ctx.logger.info(
            ' - remote host: {user}@{host}, {login_sh} remote login..'.format(
                login_sh='with' if credentials['login_shell'] else 'w/o',
                user=credentials['user'],
                host=credentials['host']))
        client = SshClient(credentials['host'],
                           credentials['user'],
                           credentials['password'],
                           use_login_shell=credentials['login_shell'])
        try:
            # A trivial remote command is used as the connectivity test.
            _, exit_code = wm._execute_shell_command(client,
                                                     'uname',
                                                     wait_result=True)

            # Compare by value: identity checks against int literals are
            # implementation-dependent.
            if exit_code != 0:
                raise NonRecoverableError(
                    "failed to connect to HPC: exit code " + str(exit_code))

            # Reaching this point means the connectivity test succeeded.
            ctx.instance.runtime_properties['login'] = True

            # Truthiness check so empty str/unicode both fall back to the
            # blueprint id (an identity test against "" is unreliable).
            prefix = workdir_prefix if workdir_prefix else ctx.blueprint.id

            workdir = wm.create_new_workdir(client, base_dir, prefix)
        finally:
            # Close the connection on every path, including failures.
            client.close_connection()

        if workdir is None:
            raise NonRecoverableError(
                "failed to create the working directory, base dir: " +
                base_dir)
        ctx.instance.runtime_properties['workdir'] = workdir
        ctx.logger.info('..HPC ready on ' + workdir)
    else:
        ctx.logger.info(' - [simulation]..')
        ctx.instance.runtime_properties['login'] = True
        ctx.instance.runtime_properties['workdir'] = "simulation"
        ctx.logger.warning('HPC login connection simulated')
Ejemplo n.º 4
0
def stop_job(job_options, **kwargs):  # pylint: disable=W0613
    """Stop a job in the HPC.

    Reads 'simulate', 'credentials', and (when not simulating) 'workdir'
    and 'workload_manager' from the instance runtime properties.

    Args:
        job_options: options forwarded to the workload manager's stop_job.
        **kwargs: must contain 'name', the name of the job to stop.

    Raises:
        NonRecoverableError: if the workload manager is not supported or
            the job could not be stopped.
    """
    simulate = ctx.instance.runtime_properties['simulate']

    credentials = ctx.instance.runtime_properties['credentials']
    name = kwargs['name']
    is_singularity = 'hpc.nodes.singularity_job' in ctx.node.\
        type_hierarchy

    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = ctx.instance.runtime_properties['workload_manager']

        # Validate the workload manager before opening the connection so
        # an unsupported type does not leave an SSH client open.
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")

        client = SshClient(credentials['host'],
                           credentials['user'],
                           credentials['password'],
                           use_login_shell=credentials['login_shell'])

        # TODO(emepetres): manage errors
        try:
            is_stopped = wm.stop_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)
        finally:
            # Release the connection even if stopping the job raises.
            client.close_connection()
    else:
        ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
        is_stopped = True

    if is_stopped:
        ctx.logger.info('Job ' + name + ' (' + ctx.instance.id + ') stopped.')
    else:
        ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                         ') not stopped.')
        raise NonRecoverableError('Job ' + name + ' (' + ctx.instance.id +
                                  ') not stopped.')
Ejemplo n.º 5
0
def deploy_job(script, inputs, credentials, wm_type, workdir, name, logger,
               skip_cleanup):  # pylint: disable=W0613
    """Upload and execute a deployment script on the remote host.

    The script resource is written remotely under `name` inside `workdir`,
    invoked as "./<name> <inputs...>", and removed afterwards unless
    `skip_cleanup` is set.

    Args:
        script: resource path resolved through ctx.get_resource.
        inputs: iterable of strings appended as arguments to the call.
        credentials: dict read for 'host', 'user', 'password' and
            'login_shell', used to open the SSH connection.
        wm_type: workload manager type passed to WorkloadManager.factory.
        workdir: remote directory where the script is created and run.
        name: file name of the remote script.
        logger: logger used for warnings.
        skip_cleanup: when truthy, the remote script is not removed.

    Returns:
        True if the script ran with exit code 0; False if the script could
        not be created or exited with a non-zero code.

    Raises:
        NonRecoverableError: if the workload manager is not supported.
    """

    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError("Workload Manager '" + wm_type +
                                  "' not supported.")

    # Defaults to failure so the function has a defined result when the
    # script could not even be created.
    deployed = False

    # Execute the script and manage the output
    client = SshClient(credentials['host'],
                       credentials['user'],
                       credentials['password'],
                       use_login_shell=credentials['login_shell'])
    try:
        if wm._create_shell_script(client,
                                   name,
                                   ctx.get_resource(script),
                                   logger,
                                   workdir=workdir):
            call = ' '.join(["./" + name] + list(inputs))
            _, exit_code = wm._execute_shell_command(client,
                                                     call,
                                                     workdir=workdir,
                                                     wait_result=True)
            # Value comparison; identity against an int literal is
            # implementation-dependent.
            deployed = exit_code == 0
            if not deployed:
                logger.warning("failed to deploy job: call '" + call +
                               "', exit code " + str(exit_code))

            if not skip_cleanup:
                if not wm._execute_shell_command(
                        client, "rm " + name, workdir=workdir):
                    logger.warning("failed removing bootstrap script")
    finally:
        # Release the connection on every path, including exceptions.
        client.close_connection()

    return deployed