Example #1
def _testscheduler(job):
    """Find out what scheduler is on the system."""
    schedulerqueries = getattr(schedulers, "QUERY")

    LOG.info("No environment for this host '%s' is specified - attempting to "
             "determine it!", job["resource"])

    # Go through the schedulers we are supporting.
    for param in schedulerqueries:

        try:

            shellwrappers.sendtossh(job, schedulerqueries[param])

            job["scheduler"] = param

            LOG.info("The environment on this host is '%s'", param)
            break

        except exceptions.SSHError:

            LOG.debug("Environment is not '%s'", param)

    if job["scheduler"] is "":

        raise exceptions.SchedulercheckError("Could not find the job "
                                             "scheduling system.")
Example #2
def test_sendtossh_retries(mock_sendtoshell, mock_time):
    """
    This test will check that if an error code of 255 is returned, the call
    is retried and that the SSHError exception is eventually raised once the
    retries are exhausted.
    """

    job = {
        "port": "22",
        "user": "******",
        "host": "massive-machine",
        "env-fix": "false"
    }

    args = ["ls"]

    # Set the return values of sendtoshell.
    mock_sendtoshell.return_value = "Output message", "Error message", 255

    # Set the timeout for retries to 0 seconds to speed up the test.
    mock_time.return_value = None

    with pytest.raises(exceptions.SSHError):

        sendtossh(job, args)

    assert mock_sendtoshell.call_count == 3, "This method should retry 3 times"
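
The mock_sendtoshell and mock_time arguments are injected by patch decorators that are not shown here. A minimal sketch of how such a test is typically wired up, assuming the patch targets are longbow.shellwrappers.sendtoshell and time.sleep (both targets are assumptions); note that decorators apply bottom-up, so the lowest one supplies the first mock argument:

from unittest import mock


# A minimal sketch only: the patch targets below are assumptions and may not
# match the real test module's import paths.
@mock.patch('time.sleep')                         # injected as mock_time
@mock.patch('longbow.shellwrappers.sendtoshell')  # injected as mock_sendtoshell
def test_sendtossh_retries(mock_sendtoshell, mock_time):
    ...  # test body as shown above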
Example #3
def stage_upstream(jobs):
    """Transfer files for all jobs, to a remote HPC machine.

    A method for staging files for each job to the target HPC host. The
    underlying utility behind this transfer is rsync, thus it is possible
    to supply rsync file masks to blacklist unwanted large files. By default
    rsync is configured to transfer blockwise and only transfer the
    newest/changed blocks, this saves a lot of time during persistant staging.

    Required arguments are:

    jobs (dictionary) - The Longbow jobs data structure, see configuration.py
                        for more information about the format of this
                        structure.

    """
    LOG.info("Staging files for job/s.")

    for item in [a for a in jobs if "lbowconf" not in a]:

        job = jobs[item]
        destdir = job["destdir"]

        LOG.info("Transfering files for job '%s' to host '%s'", item,
                 job["resource"])

        try:

            shellwrappers.sendtossh(job, ["mkdir -p " + destdir + "\n"])

            LOG.info("Creation of directory '%s' - successful.", destdir)

        except exceptions.SSHError:

            LOG.error(
                "Creation of directory '%s' - failed. Make sure that you "
                "have write permissions at the top level of the path given.",
                destdir)

            raise

        # Transfer files upstream.
        try:

            shellwrappers.upload(job)

        except exceptions.RsyncError:

            raise exceptions.StagingError(
                "Could not stage '{0}' upstream, make sure that you have "
                "supplied the correct remote working directory and that you "
                "have chosen a path that you can write to.".format(
                    job["localworkdir"]))

    LOG.info("Staging files upstream - complete.")
Example #4
def status(job):
    """Query a job status."""
    # Initialise variables.
    states = {"h": "Held", "qw": "Queued", "r": "Running"}

    jobstate = ""

    shellout = shellwrappers.sendtossh(job, ["qstat -u " + job["user"]])

    # qstat will return a table, so split lines into a list.
    stdout = shellout[0].split("\n")

    # Look up the job state and convert it to Longbow terminology.
    # Now match the jobid against the list of jobs, extract the line and
    # split it into a list
    for line in stdout:

        line = line.split()

        if len(line) > 0 and job["jobid"] in line[0]:

            jobstate = states[line[4]]
            break

    if jobstate == "":

        jobstate = "Finished"

    return jobstate
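
As a worked illustration of the parsing above, a single qstat row splits on whitespace; the job id is field 0 and the state code is field 4 (the row below is made up, not real qstat output):

# Made-up qstat-style row, for illustration only.
line = "123456 0.50000 myjob someuser r 01/01/2024 09:00:00".split()

print(line[0])  # '123456' -> matched against job["jobid"]
print(line[4])  # 'r'      -> mapped to "Running" via the states table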
Example #5
def delete(job):
    """Delete a job."""
    # Initialise variables.
    jobid = job["jobid"]

    try:

        if int(job["replicates"]) > 1:

            shellout = shellwrappers.sendtossh(job, ["qdel " + jobid + "[]"])

        else:

            shellout = shellwrappers.sendtossh(job, ["qdel " + jobid])

    except exceptions.SSHError:

        raise exceptions.JobdeleteError("Unable to delete job.")

    return shellout[0]
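
The trailing "[]" follows the PBS array-job naming convention used when a job runs as multiple replicates; a quick illustration with a hypothetical job id:

jobid = "123456"               # hypothetical job id

print("qdel " + jobid + "[]")  # replicated (array) job: qdel 123456[]
print("qdel " + jobid)         # single job:             qdel 123456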
Example #6
def _testhandler(job):
    """Find out what job handler is on the system."""
    # Initialise variables.
    handlers = {
        "aprun": ["which aprun"],
        "mpirun": ["which mpirun"]
    }

    LOG.info("No queue handler was specified for host '%s' - attempting to "
             "find it", job["resource"])

    modules = []

    # Go through the handlers and find out which is there. Load modules first
    # as this is necessary for some remote resources
    for module in job["modules"].split(","):

        module = module.replace(" ", "")
        modules.extend(["module load " + module + "\n"])

    for param in handlers:

        try:

            cmd = modules[:]
            cmd.extend(handlers[param])
            shellwrappers.sendtossh(job, cmd)

            job["handler"] = param

            LOG.info("The batch queue handler is '%s'", param)
            break

        except exceptions.SSHError:

            LOG.debug("The batch queue handler is not '%s'", param)

    if job["handler"] is "":

        raise exceptions.HandlercheckError("Could not find the batch queue "
                                           "handler.")
Example #7
def test_sendtossh_errorcode(mock_sendtoshell):
    """
    This test will check that the SSHError exception is raised when the error
    code is neither 0 nor 255.
    """

    job = {
        "port": "22",
        "user": "******",
        "host": "massive-machine",
        "env-fix": "false"
    }

    args = ["ls"]

    # Set the return values of sendtoshell.
    mock_sendtoshell.return_value = "Output message", "Error message", 1

    with pytest.raises(exceptions.SSHError):

        sendtossh(job, args)
Example #8
def test_sendtossh_envfix(mock_sendtoshell):
    """
    Testing that the environment fix is switched on.
    """

    job = {
        "port": "22",
        "user": "******",
        "host": "massive-machine",
        "env-fix": "true"
    }

    # Set the return values of sendtoshell.
    mock_sendtoshell.return_value = "Output message", "Error message", 0

    sendtossh(job, ["ls"])

    callargs = mock_sendtoshell.call_args[0][0]
    testargs = ("ssh -p 22 juan_trique-ponee@massive-machine source "
                "/etc/profile; ls")

    assert " ".join(callargs) == testargs
Example #9
def test_sendtossh_formattest(mock_sendtoshell):
    """
    Testing the format of the ssh call sent to the shell. This test will
    check that the call gets formed correctly.
    """

    job = {
        "port": "22",
        "user": "******",
        "host": "massive-machine",
        "env-fix": "false"
    }

    # Set the return values of sendtoshell.
    mock_sendtoshell.return_value = "Output message", "Error message", 0

    sendtossh(job, ["ls"])

    callargs = mock_sendtoshell.call_args[0][0]
    testargs = "ssh -p 22 juan_trique-ponee@massive-machine ls"

    assert " ".join(callargs) == testargs
Example #10
def delete(job):
    """Delete a job."""
    # Initialise variables.
    jobid = job["jobid"]

    try:

        shellout = shellwrappers.sendtossh(job, ["bkill " + jobid])

    except exceptions.SSHError:

        raise exceptions.JobdeleteError("Unable to delete job.")

    return shellout[0]
Example #11
def status(job):
    """Query a job status."""
    # Initialise variables.
    states = {
        "CA": "Cancelled",
        "CD": "Completed",
        "CF": "Configuring",
        "CG": "Completing",
        "F": "Failed",
        "NF": "Node Failure",
        "PD": "Pending",
        "PR": "Preempted",
        "R": "Running",
        "S": "Suspended",
        "TO": "Timed out"
    }

    jobstate = ""

    shellout = shellwrappers.sendtossh(job, ["squeue -u " + job["user"]])

    # squeue will return a table, so split lines into a list.
    stdout = shellout[0].split("\n")

    # Look up the job state and convert it to Longbow terminology.
    # Now match the jobid against the list of jobs, extract the line and
    # split it into a list
    for line in stdout:

        line = line.split()

        if len(line) > 0 and job["jobid"] in line[0]:

            jobstate = states[line[4]]
            break

    if jobstate == "":

        jobstate = "Finished"

    return jobstate
Example #12
def status(job):
    """Query a job status."""
    # Initialise variables.
    states = {
        "DONE": "Job Exited Properly",
        "EXIT": "Job Exited in Error",
        "PEND": "Queued",
        "PSUSP": "Suspended",
        "RUN": "Running",
        "SSUSP": "Suspended",
        "UNKWN": "Unknown Status",
        "USUSP": "Suspended",
        "WAIT": "Waiting for Start Time",
        "ZOMBI": "Zombie Job"
    }

    jobstate = ""

    shellout = shellwrappers.sendtossh(job, ["bjobs -u " + job["user"]])

    # bjobs will return a table, so split lines into a list.
    stdout = shellout[0].split("\n")

    # Look up the job state and convert it to Longbow terminology.
    # Now match the jobid against the list of jobs, extract the line and
    # split it into a list
    for line in stdout:

        line = line.split()

        if len(line) > 0 and job["jobid"] in line[0]:

            jobstate = states[line[2]]
            break

    if jobstate == "":

        jobstate = "Finished"

    return jobstate
Example #13
def status(job):
    """Query a job status."""
    # Initialise variables.
    states = {
        "B": "Subjob(s) Running",
        "E": "Exiting",
        "H": "Held",
        "M": "Job Moved to Server",
        "Q": "Queued",
        "R": "Running",
        "S": "Suspended",
        "T": "Job Moved to New Location",
        "U": "Cycle-Harvesting Job is Suspended Due to Keyboard Activity",
        "W": "Waiting for Start Time",
        "X": "Subjob Completed Execution/Has Been Deleted"
    }

    jobstate = ""

    shellout = shellwrappers.sendtossh(job, ["qstat -u " + job["user"]])

    # PBS will return a table, so split lines into a list.
    stdout = shellout[0].split("\n")

    for line in stdout:

        line = line.split()

        if len(line) > 0 and job["jobid"] in line[0]:

            jobstate = states[line[9]]
            break

    if jobstate == "":

        jobstate = "Finished"

    return jobstate
Example #14
def test_sendtossh_returncheck(mock_sendtoshell):
    """
    This test will check that the sendtossh method will exit and return the
    raw return values from sendtoshell.
    """

    job = {
        "port": "22",
        "user": "******",
        "host": "massive-machine",
        "env-fix": "false"
    }

    args = ["ls"]

    # Set the return values of sendtoshell.
    mock_sendtoshell.return_value = "Output message", "Error message", 0

    output = sendtossh(job, args)

    assert output[0] == "Output message", "method is not returning stdout"
    assert output[1] == "Error message", "method is not returning stderr"
    assert output[2] == 0, "method is not returning the error code"
Example #15
def submit(job):
    """Submit a job."""
    # cd into the working directory and submit the job.
    cmd = ["cd " + job["destdir"] + "\n", "bsub < " + job["subfile"]]

    # Process the submit
    try:

        shellout = shellwrappers.sendtossh(job, cmd)

    except exceptions.SSHError as inst:

        if "limit" in inst.stderr:

            raise exceptions.QueuemaxError

        else:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. The following output "
                "came back from the SSH call:\nstdout: {0}\nstderr {1}".format(
                    inst.stdout, inst.stderr))

    try:

        # Do the regex in Longbow rather than in the subprocess.
        jobid = re.search(r'\d+', shellout[0]).group()

    except AttributeError:

        raise exceptions.JobsubmitError(
            "Could not detect the job id during submission, this means that "
            "either the submission failed in an unexpected way, or that "
            "Longbow could not understand the returned information.")

    # Put jobid into the job dictionary.
    job["jobid"] = jobid
Example #16
def submit(job):
    """Submit a job."""
    # Change into the working directory and submit the job.
    cmd = ["cd " + job["destdir"] + "\n", "qsub " + job["subfile"]]

    try:

        shellout = shellwrappers.sendtossh(job, cmd)

    except exceptions.SSHError as inst:

        if "would exceed" in inst.stderr and "per-user limit" in inst.stderr:

            raise exceptions.QueuemaxError

        elif "set_booleans" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. The likely cause is "
                "that your particular PBS install is not receiving the "
                "information/options/parameters it requires, e.g. "
                "'#PBS -l mem=20gb'. Check the PBS documentation and edit "
                "the configuration files to provide the necessary "
                "information, e.g. 'memory = 20' in the job configuration "
                "file.")

        elif "Job rejected by all possible destinations" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. This may be because "
                "you need to provide PBS with your account code and the "
                "account flag your PBS install expects (Longbow defaults to "
                "A). Check the PBS documentation and edit the configuration "
                "files to provide the necessary information e.g. "
                "'accountflag = P' and 'account = ABCD-01234-EFG'")

        elif "Job must specify budget (-A option)" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. This may be because "
                "you provided PBS with an account flag other than 'A' which "
                "your PBS install expects")

        elif "Job exceeds queue and/or server resource limits" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. PBS has reported "
                "that 'Job exceeds queue and/or server resource limits'. "
                "This may be because you set a walltime or some other "
                "quantity that exceeds the maximum allowed on your system.")

        elif "budget" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. This may be that you "
                "have entered an incorrect account code.")

        elif "illegal -N value" in inst.stderr:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. This is due to the job "
                "name being too long, consult your system administrators/"
                "documentation to query this policy (try < 15 chars).")

        else:

            raise exceptions.JobsubmitError(
                "Something went wrong when submitting. The following output "
                "came back from the SSH call:\nstdout: {0}\nstderr {1}".format(
                    inst.stdout, inst.stderr))

    try:

        # Do the regex in Longbow rather than in the subprocess.
        jobid = re.search(r'\d+', shellout[0]).group()

    except AttributeError:

        raise exceptions.JobsubmitError(
            "Could not detect the job id during submission, this means that "
            "either the submission failed in an unexpected way, or that "
            "Longbow could not understand the returned information.")

    # Put jobid into the job dictionary.
    job["jobid"] = jobid
Example #17
def checkapp(jobs):
    """Test that executables and their modules are launchable.

    This method attempts to check that the application executable required to
    run a job, or many jobs, is present on the specified host. It can make use
    of the module system, loading either user-specified modules supplied in
    configuration files or internal defaults. Users of codes that are not
    supported out of the box will have to specify the modules explicitly
    within configuration files.

    Required arguments are:

    jobs (dictionary) - The Longbow jobs data structure, see configuration.py
                        for more information about the format of this
                        structure.

    """
    checked = {}

    LOG.info("Testing the executables defined for each job.")

    for job in [a for a in jobs if "lbowconf" not in a]:

        # If we haven't checked this resource then it is likely not in the dict
        if jobs[job]["resource"] not in checked:

            checked[jobs[job]["resource"]] = []

        # Now check if we have tested this exec already.
        if jobs[job]["executable"] not in checked[jobs[job]["resource"]]:

            # If not then add it to the list now.
            checked[jobs[job]["resource"]].extend([jobs[job]["executable"]])

            LOG.info("Checking executable '%s' on '%s'",
                     jobs[job]["executable"], jobs[job]["resource"])

            cmd = []

            if jobs[job]["modules"] is "":

                LOG.debug("Checking without modules.")

            else:
                LOG.debug("Checking with modules.")

                for module in jobs[job]["modules"].split(","):

                    module = module.replace(" ", "")
                    cmd.extend(["module load " + module + "\n"])

            cmd.extend(["which " + jobs[job]["executable"]])

            try:

                shellwrappers.sendtossh(jobs[job], cmd)
                LOG.info("Executable check - passed.")

            except exceptions.SSHError:

                raise exceptions.ExecutableError("Executable check - failed.")
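
For reference, a minimal, illustrative jobs structure for checkapp(); only the keys read in the loop above are shown, the names are hypothetical, and the connection keys needed by shellwrappers.sendtossh() are omitted:

# Illustrative only: a stripped-down jobs structure with hypothetical values.
jobs = {
    "job1": {
        "resource": "remote-hpc",   # hypothetical host alias
        "executable": "pmemd.MPI",  # hypothetical executable to probe
        "modules": "amber",         # hypothetical module to load first
    }
}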