Example #1
def ssh():
    ssh_config = load_config("credentials.json")["ssh"]
    client = SshClient(hostname=ssh_config["hostname"],
                       username=ssh_config["username"],
                       password=ssh_config["password"],
                       port=ssh_config["port"])
    yield client
    client.destroy()  # teardown: close the connection once the consumer is done
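The yield-then-teardown shape of this function is what pytest expects from a generator-based fixture: the connected client is handed to the test, and the code after the yield runs as teardown. A minimal sketch of how it might be registered and consumed, assuming pytest is in use and that load_config and SshClient behave exactly as above (run_command is a hypothetical method used only for illustration):

import pytest

@pytest.fixture(name="ssh")
def ssh_fixture():
    ssh_config = load_config("credentials.json")["ssh"]
    client = SshClient(hostname=ssh_config["hostname"],
                       username=ssh_config["username"],
                       password=ssh_config["password"],
                       port=ssh_config["port"])
    yield client          # injected into any test that declares an "ssh" argument
    client.destroy()      # teardown: runs after the test finishes


def test_login(ssh):
    # run_command is a hypothetical SshClient method, shown only to illustrate usage.
    assert ssh.run_command("uname") is not None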
Example #2
def stop_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Stops a job in the HPC """
    try:
        simulate = ctx.instance.runtime_properties['simulate']

        credentials = ctx.instance.runtime_properties['credentials']
        name = kwargs['name']
        is_singularity = ('hpc.nodes.singularity_job' in
                          ctx.node.type_hierarchy)

        if not simulate:
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']
            client = SshClient(credentials['host'], credentials['user'],
                               credentials['password'])

            # TODO(emepetres): manage errors
            wm = WorkloadManager.factory(wm_type)
            if not wm:
                raise NonRecoverableError("Workload Manager '" + wm_type +
                                          "' not supported.")
            is_stopped = wm.stop_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)

            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_stopped = True

        if is_stopped:
            ctx.logger.info('Job ' + name + ' (' + ctx.instance.id +
                            ') stopped.')
        else:
            ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                             ') not stopped.')
            raise NonRecoverableError('Job ' + name + ' (' + ctx.instance.id +
                                      ') not stopped.')
    except KeyError:
        # The job was not configured properly, so there is nothing to stop
        ctx.logger.warning('Job was not stopped as it was not configured.')
Example #3
def cleanup_job(job_options, skip, **kwargs):  # pylint: disable=W0613
    """Clean the aux files of the job in the HPC"""
    if skip:
        return

    try:
        simulate = ctx.instance.runtime_properties['simulate']
        name = kwargs['name']
        if not simulate:
            is_singularity = ('hpc.nodes.singularity_job' in
                              ctx.node.type_hierarchy)
            credentials = ctx.instance.runtime_properties['credentials']
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']

            client = SshClient(credentials['host'], credentials['user'],
                               credentials['password'])

            # TODO(emepetres): manage errors
            wm = WorkloadManager.factory(wm_type)
            if not wm:
                raise NonRecoverableError("Workload Manager '" + wm_type +
                                          "' not supported.")
            is_clean = wm.clean_job_aux_files(client,
                                              name,
                                              job_options,
                                              is_singularity,
                                              ctx.logger,
                                              workdir=workdir)

            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_clean = True

        if is_clean:
            ctx.logger.info('Job ' + name + ' (' + ctx.instance.id +
                            ') cleaned.')
        else:
            ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                             ') not cleaned.')
    except KeyError:
        # The job wasn't configured properly, so no cleanup needed
        ctx.logger.warning('Job was not cleaned up as it was not configured.')
Example #4
def send_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Sends a job to the HPC """
    simulate = ctx.instance.runtime_properties['simulate']

    credentials = ctx.instance.runtime_properties['credentials']
    name = kwargs['name']
    is_singularity = ('hpc.nodes.singularity_job' in
                      ctx.node.type_hierarchy)

    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = ctx.instance.runtime_properties['workload_manager']
        client = SshClient(credentials['host'], credentials['user'],
                           credentials['password'])

        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")
        is_submitted = wm.submit_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)
        client.close_connection()
    else:
        ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
        is_submitted = True

    if is_submitted:
        ctx.logger.info('Job ' + name + ' (' + ctx.instance.id + ') sent.')
    else:
        ctx.logger.error('Job ' + name + ' (' + ctx.instance.id +
                         ') not sent.')
        raise NonRecoverableError('Job ' + name + ' (' + ctx.instance.id +
                                  ') not sent.')

    ctx.instance.runtime_properties['job_name'] = name
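The same boilerplate recurs in every operation above: build an SshClient from the stored credentials, resolve the workload manager with WorkloadManager.factory, raise NonRecoverableError if it is unsupported, and close the connection at the end. A sketch of how that pattern could be factored into a context manager, using only the calls already shown in these examples (the helper name hpc_session is hypothetical):

from contextlib import contextmanager

@contextmanager
def hpc_session(credentials, wm_type):
    # Resolve the workload manager first, so no connection is opened
    # for an unsupported manager.
    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError("Workload Manager '" + wm_type +
                                  "' not supported.")
    client = SshClient(credentials['host'], credentials['user'],
                       credentials['password'])
    try:
        yield client, wm
    finally:
        client.close_connection()  # always runs, even if the operation raises

With such a helper, send_job would shrink to a with-block around wm.submit_job, and the close_connection call could no longer be skipped by an unexpected exception.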
Example #5
def prepare_hpc(config, base_dir, workdir_prefix, simulate, **kwargs):  # pylint: disable=W0613
    """ Tries to connect to a login node """
    ctx.logger.info('Connecting to login node..')
    if not simulate:
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")
        credentials = config['credentials']
        client = SshClient(credentials['host'], credentials['user'],
                           credentials['password'])
        _, exit_code = wm._execute_shell_command(client,
                                                 'uname',
                                                 wait_result=True)

        if exit_code != 0:
            raise NonRecoverableError("failed to connect to HPC: exit code " +
                                      str(exit_code))

        ctx.instance.runtime_properties['login'] = exit_code == 0

        prefix = workdir_prefix
        if workdir_prefix is "":
            prefix = ctx.blueprint.id

        workdir = wm.create_new_workdir(client, base_dir, prefix)
        client.close_connection()
        if workdir is None:
            raise NonRecoverableError(
                "failed to create the working directory, base dir: " +
                base_dir)
        ctx.instance.runtime_properties['workdir'] = workdir
        ctx.logger.info('..HPC ready on ' + workdir)
    else:
        ctx.instance.runtime_properties['login'] = True
        ctx.instance.runtime_properties['workdir'] = "simulation"
        ctx.logger.warning('HPC login connection simulated')
Example #6
def cleanup_hpc(config, skip, simulate, **kwargs):  # pylint: disable=W0613
    """ Tries to connect to a login node """
    if skip:
        return

    ctx.logger.info('Cleaning up...')
    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")
        credentials = config['credentials']
        client = SshClient(credentials['host'], credentials['user'],
                           credentials['password'])
        _, exit_code = wm._execute_shell_command(client,
                                                 'rm -r ' + workdir,
                                                 wait_result=True)
        client.close_connection()
        ctx.logger.info('..all clean.')
    else:
        ctx.logger.warning('HPC clean up simulated.')
Example #7
def get_states(monitor_jobs, logger):
    """ Retrieves the status of every job asking to the monitors"""
    states = {}

    for host, settings in monitor_jobs.items():
        if settings['type'] == "PROMETHEUS":  # external
            partial_states = _get_prometheus(host, settings['config'],
                                             settings['names'])
        else:  # internal
            wm = WorkloadManager.factory(settings['type'])
            if wm:
                credentials = settings['config']
                client = SshClient(credentials['host'], credentials['user'],
                                   credentials['password'])
                partial_states = wm.get_states(client, settings['names'],
                                               logger)
                client.close_connection()
            else:
                partial_states = _no_states(host, settings['type'],
                                            settings['names'], logger)
        states.update(partial_states)

    return states
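The shape get_states expects for monitor_jobs is only implicit in the loop above: each monitored host maps to a settings dict with 'type', 'config' and 'names'. A hypothetical input with placeholder values (the 'SLURM' type and the contents of the PROMETHEUS config are assumptions, since the example only shows which keys are read for the internal case):

monitor_jobs = {
    "hpc.example.org": {                    # key: the monitored host
        "type": "SLURM",                    # assumed manager name; anything but "PROMETHEUS" is internal
        "config": {"host": "hpc.example.org",
                   "user": "jdoe",
                   "password": "secret"},   # keys read by SshClient above
        "names": ["job_1", "job_2"],        # job names to query
    },
    "monitor.example.org": {
        "type": "PROMETHEUS",               # external monitor branch
        "config": {"url": "http://monitor.example.org:9090"},  # placeholder; real keys depend on _get_prometheus
        "names": ["job_3"],
    },
}

states = get_states(monitor_jobs, logger)   # merged dict of per-job states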
Example #8
def deploy_job(script, inputs, credentials, wm_type, workdir, name, logger,
               skip_cleanup):  # pylint: disable=W0613
    """ Exec a eployment job script that receives SSH credentials as input """

    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError("Workload Manager '" + wm_type +
                                  "' not supported.")

    # Execute the script and manage the output
    client = SshClient(credentials['host'], credentials['user'],
                       credentials['password'])
    exit_code = -1  # default in case the shell script cannot be created
    if wm._create_shell_script(client,
                               name,
                               ctx.get_resource(script),
                               logger,
                               workdir=workdir):
        call = "./" + name
        for dinput in inputs:
            call += ' ' + dinput
        _, exit_code = wm._execute_shell_command(client,
                                                 call,
                                                 workdir=workdir,
                                                 wait_result=True)
        if exit_code != 0:
            logger.warning("failed to deploy job: call '" + call +
                           "', exit code " + str(exit_code))

        if not skip_cleanup:
            if not wm._execute_shell_command(
                    client, "rm " + name, workdir=workdir):
                logger.warning("failed removing bootstrap script")

    client.close_connection()

    return exit_code == 0
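Finally, a hedged sketch of how deploy_job might be invoked from inside a Cloudify operation, where ctx.get_resource can resolve the blueprint-relative script path; every concrete value below is a placeholder:

succeeded = deploy_job(
    script="scripts/bootstrap.sh",          # blueprint-relative path (placeholder)
    inputs=["first_arg", "second_arg"],     # appended to the call as CLI arguments
    credentials={"host": "hpc.example.org",
                 "user": "jdoe",
                 "password": "secret"},
    wm_type="SLURM",                        # assumed name; must be accepted by WorkloadManager.factory
    workdir="/home/jdoe/run_01",
    name="bootstrap.sh",                    # name of the shell script created remotely
    logger=ctx.logger,
    skip_cleanup=False)
if not succeeded:
    ctx.logger.error("deployment script failed")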