Example 1
def get_slurm_queues():
    '''
    Get a list of all available queues to submit a job to.

    **Returns**

        avail_queues: *list, str*
            A list of available queues by name.
    '''
    sinfo_path = which("sinfo")
    assert sinfo_path is not None,\
        "Error - Unable to find sinfo in PATH."

    p = subprocess.Popen([sinfo_path],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    all_queues = p.stdout.read().decode("utf-8").strip()
    if all_queues == '':
        close_pipes(p)
        return []
    all_queues = all_queues.split("\n")[1:]
    all_queues = [q.split()[0] for q in all_queues if q.split()[1] == 'up']
    all_queues = list(set(all_queues))
    close_pipes(p)
    return [q if "*" not in q else q.replace("*", "") for q in all_queues]
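
A minimal usage sketch for the queue getter above. It assumes the function is exposed by squid.jobs.slurm (consistent with the squid.jobs.slurm.Job class referenced in the later submit_job example) and that SLURM's sinfo is on PATH.

# Hypothetical usage sketch; the import path is an assumption based on the
# squid.jobs.slurm.Job reference elsewhere in these examples.
from squid.jobs.slurm import get_slurm_queues

queues = get_slurm_queues()
print("Available SLURM partitions: " + ", ".join(queues))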
Example 2
def get_nbs_queues():
    '''
    Get a list of all available queues to submit a job to.

    **Returns**

        avail_queues: *list, str*
            A list of available queues by name.
    '''
    qlist_path = which("qlist")
    assert qlist_path is not None,\
        "Error - Unable to find qlist in PATH."

    p = subprocess.Popen([qlist_path],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    all_queues = p.stdout.read().decode("utf-8").strip().split('\n')[:-1]
    all_queues = [a.split() for a in all_queues]
    all_queues = [a[0] for a in all_queues if len(a) > 1]
    close_pipes(p)
    return [
        a.lower() for a in all_queues
        if a.lower() not in ["queue", "name", ""]
    ]
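
The NBS variant can be exercised the same way; again the import path is a guess, assuming it lives alongside the squid.jobs.nbs.Job class mentioned later.

# Hypothetical usage sketch; assumes get_nbs_queues is exposed by
# squid.jobs.nbs and that NBS's qlist command is on PATH.
from squid.jobs.nbs import get_nbs_queues

print("Available NBS queues: " + ", ".join(get_nbs_queues()))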
Example 3
def get_ovito_obj(version="2.9.0"):
    '''
    This function returns the path to the ovitos executable.  Note, the
    version check below currently only accepts version 2.9.0.
    '''
    ovito_path = which("ovitos")
    assert ovito_path is not None,\
        "Error - Unable to find ovitos in PATH."

    # Determine version
    ovito_pipe = subprocess.Popen([ovito_path, "-v"],
                                  shell=False,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
    stdout = str(ovito_pipe.stdout.read().decode("utf-8").strip())

    assert "Ovito" in stdout,\
        "Error - Unable to access Ovito.  Please ensure it is in your PATH \
environment variable!"
    assert version in stdout,\
        "Error - Incorrect Ovito version!  It should be %s, but is %s."\
        % (version, stdout.strip().split()[-1])

    close_pipes(ovito_pipe)

    return ovito_path
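
A sketch of how the returned path might be used: since get_ovito_obj hands back the ovitos executable path rather than a Python object, a typical caller would launch an analysis script with it. The import path and the script name below are assumptions.

# Hypothetical usage sketch; the import path and script name are made up.
import subprocess
from squid.utils import get_ovito_obj  # import path is a guess

ovitos = get_ovito_obj(version="2.9.0")
subprocess.call([ovitos, "render_snapshots.py"])  # run an ovitos script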
Example 4
def submit_job(name, job_to_submit, **kwargs):
    '''
    Code to submit a simulation to the specified queue and queueing system.

    **Parameters**

        name: *str*
            Name of the job to be submitted to the queue.
        job_to_submit: *str*
            String holding code you wish to submit.
        queue: *str, optional*
            What queue to run the simulation on (queueing system dependent).
        walltime: *str, optional*
            How long to post the job on the queue for in d-h:m:s where d are
            days, h are hours, m are minutes, and s are seconds.  Default is
            for 30 minutes (00:30:00).
        cpus_per_task: *int, optional*
            How many cores to assign to each task.  Note, the total
            number of cores requested is cpus_per_task * ntasks.
        ntasks: *int, optional*
            How many tasks (MPI ranks) to run.  Note, the total
            number of cores requested is cpus_per_task * ntasks.
        nodes: *int, optional*
            How many nodes to run the simulation on.
        sub_flag: *str, optional*
            Additional strings/flags/arguments to add at the end when we
            submit a job using sbatch.  That is: sbatch demo.slurm sub_flag.
        unique_name: *bool, optional*
            Whether to force the requirement of a unique name or not.  NOTE! If
            you submit simulations from the same folder, ensure that this is
            True lest you have a redundancy problem! To overcome said issue,
            you can set redundancy to True as well (but only if the simulation
            is truly redundant).
        outfile_name: *str, optional*
            The name to give the output file.  If None, one is generated from
            the job name (plus the job array index and job id).
        allocation: *str, optional*
            The SLURM allocation to submit the job to.
        jobarray: *str or tuple, int, optional*
            Values to use for a job array.  Either a string of specific
            indices (for example, jobarray="1,2,4,5" submits those indices
            and skips 3) or a (start, end) pair describing a range.
        gpu: *int, optional*
            How many GPUs to use, if submitting to a GPU node.
        redundancy: *bool, optional*
            With redundancy on, if the job is submitted and unique_name is on,
            then if another job of the same name is running, a pointer to that
            job will instead be returned.

    **Returns**

        job_obj: :class:`squid.jobs.slurm.Job`
            A Job object.
    '''
    # Store the defaults
    params = {
        "queue": "shared",
        "ntasks": 1,
        "cpus_per_task": 1,
        "nodes": 1,
        "walltime": "00:30:00",
        "sub_flag": "",
        "unique_name": True,
        "redundancy": False,
        "allocation": None,
        "gpu": None,
        "jobarray": None,
        "sandbox": None,
        "outfile_name": None,
        "email": None,
    }
    AVAIL_GPU_PARTS = ["unlimited", "gpuk80", "gpup100", "debugger"]

    ## We used to check to see if any pointless values were being passed, but
    ## we found it was easier to simply just ignore those and not crash.

    ## Ensure we are passing only the above
    #for key, value in kwargs.items():
    #    assert key in params,\
    #        "Error - Unknown variable (%s) passed to slurm.submit_job." % key
    params.update(kwargs)

    # Ensure variables of correct types
    param_types = {
        "queue": lambda s: str(s).strip(),
        "ntasks": int,
        "cpus_per_task": int,
        "nodes": int,
        "unique_name": bool,
        "redundancy": bool,
        "walltime": lambda s: str(s).strip(),
        "sub_flag": lambda s: str(s).strip()
    }
    for k, f in param_types.items():
        params[k] = f(params[k])

    # Ensure default values make sense
    # Check Queue
    slurm_queues = get_slurm_queues()
    assert params["queue"] in slurm_queues,\
        "Error - Invalid queue (%s) requested.  Options: %s"\
        % (params["queue"], ", ".join(slurm_queues))
    # Check ntasks and nodes
    if params["cpus_per_task"] * params["ntasks"] > 24 * params["nodes"]:
        print("Warning - You requested %d tasks and %d cpus_per_task.  This \
equates to %d nodes on marcc; however, you only requested %d nodes." %
              (params["cpus_per_task"], params["ntasks"],
               (params["cpus_per_task"] * params["ntasks"] - 1) // 24 + 1,
               params["nodes"]))
        print("\tWill adjust nodes accordingly...")
        params["nodes"] =\
            (params["cpus_per_task"] * params["ntasks"] - 1) // 24 + 1

    # We need to remove gpu nodes from available nodes on SLURM/MARCC
    gpu_flag_slurm = "#SBATCH --exclude=gpu004,gpu005"
    # However, if we want to submit to GPU, handle accordingly
    if params["gpu"] is not None:
        msg = "Error - queue (%s) not available with gpus.  Choose one: %s"\
              % (params["queue"], ', '.join(AVAIL_GPU_PARTS))
        assert params["queue"] in AVAIL_GPU_PARTS, msg

        gpu_flag_slurm = "#SBATCH --gres=gpu:%d" % params["gpu"]
        # On MARCC we need gpu tasks, and 6 cores per task
        params["ntasks"] = params["gpu"]
        params["cpus_per_task"] = 6

    if params["allocation"] is None:
        allocation = ""
    else:
        allocation = "#SBATCH --account=" + params["allocation"]

    # Generate your script
    jobarray_outfile_append = ""
    job_array_script = ""
    if params["jobarray"] is not None:
        if isinstance(params["jobarray"], str):
            job_array_script = "#SBATCH --array=%s"\
                % simplify_numerical_array(params["jobarray"])
        else:
            job_array_script = "#SBATCH --array=%d-%d"\
                % tuple(params["jobarray"])
        jobarray_outfile_append = ".a%a"
    if params["outfile_name"] is None:
        params["outfile_name"] = name + jobarray_outfile_append + ".o%j"
    generic_script = '''#!/bin/sh
#SBATCH --job-name="''' + name + '''"
#SBATCH --output="''' + params["outfile_name"] + '''"
#SBATCH --nodes=''' + str(params["nodes"]) + '''
#SBATCH --ntasks=''' + str(params["ntasks"]) + ('''
#SBATCH --cpus-per-task=''' + str(
        params["cpus_per_task"]) if params["cpus_per_task"] > 1 else "") + '''
#SBATCH --partition=''' + params["queue"] + '''
#SBATCH --time=''' + params["walltime"] + '''
''' + allocation + '''
''' + gpu_flag_slurm + '''
''' + job_array_script + '''
'''
    # Take care of sandboxing if needed
    if params["sandbox"] is not None:
        raise Exception("Sandbox not implemented in slurm.")

    # Add in your script now
    generic_script = generic_script +\
        "\ndate\n" +\
        job_to_submit +\
        "\ndate\n\n"

    with open(name + '.slurm', 'w') as f:
        f.write(generic_script)

    # Get a list of all jobs
    all_jobs = get_job("RUNNING", detail=0) + get_job("PENDING", detail=0)
    job_exists = name in all_jobs
    if params["redundancy"] and job_exists:
        try:
            job_info = get_job(name, detail=1)[0]
        except IndexError:
            # Job finished in process of submitting and redundancy call
            job_to_return = Job(name)
            # Attach the redundancy flag
            job_to_return.redundancy = True
            return job_to_return
        job_to_return = Job(job_info[0], job_id=job_info[-1])
        job_to_return.redundancy = True
        return job_to_return
    elif params["unique_name"] and job_exists:
        raise Exception("Job with name %s already exists in the queue!" % name)

    # Submit job
    cmd = 'sbatch %s.slurm %s' % (name, params["sub_flag"])
    job_pipe = subprocess.Popen(cmd.split(),
                                shell=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)

    # Get the error message
    job_err = job_pipe.stderr.read().decode("utf-8")

    # If we figure out redundancy, add it here
    # CODE FOR REDUNDANCY IN SLURM
    job_id = job_pipe.stdout.read().decode("utf-8")

    if "Submitted batch job" not in job_id:
        print("\nFailed to submit the job!")
        print("--------------- JOB OUTPUT ---------------")
        print(job_id)
        print("---------------- JOB ERROR ---------------")
        print(job_err)
        print("---------------------------------")
        sys.stdout.flush()
        raise Exception("Failed to submit job %s to SLURM." % name)

    job_id = job_id.split()[-1].strip()

    close_pipes(job_pipe)
    return Job(name, job_id=job_id)
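
A hedged usage sketch of the SLURM submit_job above. The import path follows the squid.jobs.slurm.Job reference in the docstring, and the queue name, command, and resource numbers are illustrative only.

# Hypothetical usage sketch; queue, command, and resources are made up, and
# the Job attribute name (job_id) is assumed from the constructor call above.
from squid.jobs.slurm import submit_job

job = submit_job(
    "water_md",                     # job name; also names water_md.slurm
    "mpirun lmp_mpi -in water.in",  # the shell code to run inside the job
    queue="shared",
    ntasks=4,
    cpus_per_task=1,
    walltime="01:00:00",
)
print("Submitted SLURM job:", job.job_id)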
Example 5
def get_orca_obj(parallel=True):
    '''
    This function will find the orca executable and the corresponding openmpi
    executable.  It will handle errors accordingly.

    **Parameters**

        parallel: *bool, optional*
            Whether we guarantee the relevant parallel openmpi is setup (True)
            or not (False).

    **Returns**

        orca_path: *str*
            The path to the orca executable.
    '''
    # This is to ensure we read in ORCA correctly
    orca_string_id = "An Ab Initio, DFT and Semiempirical electronic structure"
    # This is to find the version
    version_string_id = "Program Version"

    orca_path = which("orca")
    assert orca_path is not None,\
        "Error - Unable to find orca in PATH."
    # Determine orca version
    orca_pipe = subprocess.Popen(
        [orca_path, "FAKE_FILE"], shell=False,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = str(orca_pipe.stdout.read().decode("utf-8").strip())
    # stderr = str(p.stderr.read().decode("utf-8").strip())

    assert orca_string_id in stdout,\
        "Error - Unable to access Orca.  Please ensure it is in your PATH \
environment variable!"
    assert version_string_id in stdout,\
        "Error - Unable to assess Orca version!"

    orca_version = stdout.split(version_string_id)[1].strip().split()[0]

    # If running in parallel, ensure we have the correct version of openmpi
    ompi_pipe = None
    if parallel:
        ompi_version_should_be = {
            "4.1.2": "3.1",
            "4.2.0": "3.1"
        }
        assert orca_version in ompi_version_should_be,\
            "Error - Please contact squid dev.  We do not have the required \
openmpi version stored for Orca %s." % orca_version

        # Find openmpi
        ompi_path = which("mpiexec")
        ompi_pipe = subprocess.Popen(
            [ompi_path, "--V"], shell=False,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout = str(ompi_pipe.stdout.read().decode("utf-8").strip())
        # stderr = str(p.stderr.read().decode("utf-8").strip())

        # Simple check for openmpi
        assert "open" in stdout.lower(),\
            "Error - Unable to access openmpi.  Please ensure it is in your \
PATH environment variable!"

        ompi_version = stdout.strip().split("\n")[0].split()[-1]
        ompi_version_held = ompi_version_should_be[orca_version]

        assert ompi_version.startswith(ompi_version_held),\
            "Error - Incorrect openmpi version for the loaded orca version. \
Should be openmpi %s (found %s) for orca %s."\
            % (ompi_version_held, ompi_version, orca_version)

    close_pipes(orca_pipe)
    close_pipes(ompi_pipe)

    return orca_path
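
A short sketch of calling get_orca_obj; the import path is an assumption. It simply shows that the function hands back the orca executable path once the version and openmpi checks pass.

# Hypothetical usage sketch; the import path is a guess.
from squid.orca import get_orca_obj

orca_path = get_orca_obj(parallel=True)
print("Using orca executable at: " + orca_path)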
Example 6
def get_job(s_flag, detail=0):
    '''
    Get a list of all jobs currently on your queue.  From this, only return
    the values that have s_flag in them.  The *detail* variable can be used
    to specify how much information you want returned.

    **Parameters**

        s_flag: *str*
            A string to parse out job information with.
        detail: *int, optional*
            The amount of information you want returned.

    **Returns**

        all_jobs: *list*
            Depending on *detail*, you get the following:

                - *detail* =0: *list, str*
                    List of all job names on the queue.

                - *detail* =1: *list, tuple, str*
                    List of all jobs on the queue as:
                        (job name, time run, job status, job id)

                - *detail* =2: *list, tuple, str*
                    List of all jobs on the queue as:
                        (job name,
                         time run,
                         job status,
                         job id,
                         queue,
                         number of processors)

                - *detail* =3: *list, str*
                    List of all job ids on the queue.
    '''
    detail = int(detail)

    main_detail = detail
    if detail <= 0:
        detail = 1

    # Get input from jlist as a string
    jlist_path = which("jlist")
    qlist_path = which("qlist")
    jshow_path = which("jshow")

    assert jlist_path is not None,\
        "Error - Cannot find jlist."
    assert qlist_path is not None,\
        "Error - Cannot find qlist."
    assert jshow_path is not None,\
        "Error - Cannot find jshow."

    p = subprocess.Popen([jlist_path],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    output = p.stdout.read().decode("utf-8")

    # Get data from string
    pattern = getpass.getuser() +\
        r'[\s]+([\S]+)[\s]+([\S]+)[\s]+([\S]+)'
    info = re.findall(pattern, output)

    # Get a list of names
    names = []
    for a in info:
        names.append(a[0])

    if len(names) > 0:
        out_ids = output.split("\n")
        out_ids = [
            x.split()[0] for x in out_ids
            if len(x.split()) > 0 and is_numeric(x.split()[0])
        ]
        info = [tuple(list(i) + [j]) for i, j in zip(info, out_ids)]

    # If user wants more information
    all_jobs = None
    if detail == 3:
        close_pipes(p)
        all_jobs = [i[-1] for i in info]
    elif detail == 2:
        for i, a in enumerate(info):
            p = subprocess.Popen([jshow_path, a[0]], stdout=subprocess.PIPE)
            s = p.stdout.read().decode("utf-8")
            serv = s[s.find('Queue name:'):].split()[2].strip()
            threads = 1
            if "Slot Reservations" in s:
                threads = s[s.find('Slot Reservations'):].split()[4]
                threads = threads.strip()
            info[i] = info[i] + (
                serv,
                threads,
            )
        close_pipes(p)
        all_jobs = info

    if all_jobs is None:
        # Return appropriate information
        close_pipes(p)
        if detail == 1:
            all_jobs = info
        else:
            all_jobs = names

    job_indices = [i for i, j in enumerate(all_jobs) if s_flag in " ".join(j)]
    chosen_jobs = [all_jobs[i] for i in job_indices]
    if main_detail == 0:
        return [j[0] for j in chosen_jobs]
    else:
        return chosen_jobs
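
A sketch of the two most common detail levels for get_job; the import path is a guess, and the "water" search string is illustrative.

# Hypothetical usage sketch; assumes get_job is exposed by squid.jobs.nbs.
from squid.jobs.nbs import get_job

# Names of all queued/running jobs whose jlist line contains "water"
names = get_job("water", detail=0)

# (job name, time run, job status, job id) tuples for the same jobs
details = get_job("water", detail=1)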
Example 7
def submit_job(name, job_to_submit, **kwargs):
    '''
    Code to submit a simulation to the specified queue and queueing system.

    **Parameters**

        name: *str*
            Name of the job to be submitted to the queue.
        job_to_submit: *str*
            String holding code you wish to submit.
        queue: *str, optional*
            What queue to run the simulation on (queueing system dependent).
        walltime: *str, optional*
            How long to post the job on the queue for in d-h:m:s where d are
            days, h are hours, m are minutes, and s are seconds.  Default is
            for 30 minutes (00:30:00).
        nprocs: *int, optional*
            How many processors to run the simulation on.
        sub_flag: *str, optional*
            Additional strings/flags/arguments to add at the end when we
            submit a job using jsub.  That is: jsub demo.nbs sub_flag.
        unique_name: *bool, optional*
            Whether to force the requirement of a unique name or not.  NOTE! If
            you submit simulations from the same folder, ensure that this is
            True lest you have a redundancy problem! To overcome said issue,
            you can set redundancy to True as well (but only if the simulation
            is truly redundant).
        outfile_name: *str, optional*
            The name to give the output file.  If None, one is generated from
            the job name.
        xhosts: *str* or *list, str, optional*
            Which host(s) to submit the job to.
        email: *str, optional*
            An email address for sending job information to.
        priority: *int, optional*
            What priority to give the submitted job.
        sandbox: *bool or list, optional*
            Whether to sandbox the job or not.  A pair of lists may also be
            given, holding the input files to copy into the sandbox and the
            output files to copy back out.
        redundancy: *bool, optional*
            With redundancy on, if the job is submitted and unique_name is on,
            then if another job of the same name is running, a pointer to that
            job will instead be returned.

    **Returns**

        job_obj: :class:`squid.jobs.nbs.Job`
            A Job object.
    '''
    # Store the defaults
    params = {
        "queue": "shared",
        "nprocs": 1,
        "sub_flag": "",
        "unique_name": True,
        "redundancy": False,
        "outfile_name": None,
        "xhosts": None,
        "email": None,
        "priority": None,
        "sandbox": True,
        "walltime": None,
    }
    ## Ensure we are passing only the above
    #for key, value in kwargs.items():
    #    assert key in params,\
    #        "Error - Unknown variable (%s) passed to nbs.submit_job." % key
    params.update(kwargs)

    if params["walltime"] is not None:
        print("Warning - Walltime is not handled in NBS yet.  Your job will \
have the default time of the given queue.")

    # Ensure variables of correct types
    # Note - sandbox is not coerced here, since it may be either a bool or a
    # pair of lists of filenames.
    param_types = {
        "queue": lambda s: str(s).strip(),
        "nprocs": int,
        "sub_flag": lambda s: str(s).strip(),
        "unique_name": bool,
        "redundancy": bool,
    }
    for k, f in param_types.items():
        params[k] = f(params[k])

    # Ensure default values make sense
    # Check Queue
    nbs_queues = get_nbs_queues()
    assert params["queue"] in nbs_queues,\
        "Error - Invalid queue (%s) requested.  Options: %s"\
        % (params["queue"], ", ".join(nbs_queues))

    jsub_path = which("jsub")
    assert jsub_path is not None,\
        "Error - Unable to find jsub path!"

    # Deal with variables accordingly
    if params["xhosts"] is not None:
        if isinstance(params["xhosts"], str):
            xhosts = "##NBS-xhost: \"%s\"" % params["xhosts"]
        elif is_array(params["xhosts"]):
            xhosts = "##NBS-xhost: " +\
                     ", ".join(map(lambda x: '"' + x + '"', params["xhosts"]))
        else:
            raise Exception("xhosts has been passed oddly!")
    else:
        xhosts = ""

    # Generate your script
    generic_script = '''#!/bin/sh
##NBS-name: ''' + name + '''
##NBS-nproc: ''' + str(params["nprocs"]) + '''
##NBS-queue: ''' + params["queue"] + '''
''' + ["", "##NBS-unique: yes"][int(params["unique_name"])] + '''
''' + xhosts

    # If emailing, set here
    if params["email"] is not None:
        generic_script += "##NBS-email: " + params["email"] + "\n"

    # If priority is set, add it
    if params["priority"] is not None:
        if int(params["priority"]) > 255:
            params["priority"] = 255
        if int(params["priority"]) < 1:
            params["priority"] = 1
        generic_script += "##NBS-priority: " + str(params["priority"]) + "\n"

    # Take care of sandboxing if needed
    if params["sandbox"]:
        generic_script = generic_script + '''
##NBS-fdisk: 8192
##NBS-fswap: 8192
##NBS-sandbox: yes
##NBS-tmp-sandbox: yes
'''
        # If sandbox is a pair of lists, declare which files get copied
        # into the sandbox and which get copied back out.
        if is_array(params["sandbox"]):
            for sb_in in params["sandbox"][0]:
                generic_script = generic_script + '''
##NBS-input: ''' + sb_in
            for sb_out in params["sandbox"][1]:
                generic_script = generic_script + '''
##NBS-output: ''' + sb_out + ''' -overwrite'''
        generic_script = generic_script + "\n\n"

    # Add in your script now
    generic_script = generic_script + "\ndate\n" + job_to_submit + "\ndate\n\n"

    with open(name + '.nbs', 'w') as f:
        f.write(generic_script)

    # Submit job
    cmd = "%s %s.nbs %s" % (jsub_path, name, params["sub_flag"])
    job_pipe = subprocess.Popen(cmd.split(),
                                shell=False,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
    job_err = job_pipe.stderr.read().decode("utf-8")

    if params["redundancy"] and "+notunique:" in job_err:
        try:
            job_info = get_job(name, detail=1)[0]
        except IndexError:
            # Job finished in process of submitting and redundancy call
            job_to_return = Job(name)
            # Attach the redundancy flag
            job_to_return.redundancy = True
            return job_to_return
        job_to_return = Job(job_info[0], job_id=job_info[-1])
        job_to_return.redundancy = True
        close_pipes(job_pipe)
        return job_to_return
    elif "+notunique:" in job_err:
        raise Exception("Job with name %s already exists in the queue!" % name)

    job_id_str = job_pipe.stdout.read().decode("utf-8")

    if "submitted to queue" not in job_id_str:
        print("\nFailed to submit the job!")
        print("--------------- JOB OUTPUT ---------------")
        print(job_id_str)
        print("---------------- JOB ERROR ---------------")
        print(job_err)
        print("---------------------------------")
        sys.stdout.flush()
        raise Exception("Failed to submit job %s to NBS." % name)

    try:
        job_id = job_id_str.split("submitted to queue")[0].split()[-1][2:-1]
    except IndexError:
        print("ERROR - job_id_str is:")
        print(job_id_str)
        print("Defaulting to None, should still work... FIX!")
        job_id = None
    close_pipes(job_pipe)
    return Job(name, job_id=job_id)
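
A hedged usage sketch of the NBS submit_job above; the import path, queue name, command, and file names are assumptions.

# Hypothetical usage sketch; queue, command, and file names are made up.
from squid.jobs.nbs import submit_job

job = submit_job(
    "orca_single_point",            # job name; also names orca_single_point.nbs
    "orca water.orca > water.out",  # the shell code to run inside the job
    queue="shared",
    nprocs=4,
    unique_name=True,
    sandbox=(["water.orca"], ["water.out"]),  # input/output files to copy
)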
Example 8
def get_lmp_obj(parallel=True):
    '''
    This function will find the lmp executable and a corresponding mpi
    executable.  It will handle errors accordingly.

    **Parameters**

        parallel: *bool, optional*
            Whether to get corresponding mpiexec info or not.

    **Returns**

        lmp_path: *str*
            Path to a lammps executable.
        mpi_path: *str*
            Path to an mpi executable.
    '''

    # If running in parallel, ensure we have mpi
    mpi_path = None
    if parallel:
        mpi_path = which("mpiexec")
        assert mpi_path is not None,\
            "Error - Unable to find mpiexec in PATH."
        p = subprocess.Popen([mpi_path, "-h"],
                             shell=False,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout = str(p.stdout.read().decode("utf-8").strip())
        close_pipes(p)

        # Simple check for openmpi
        assert "mpi" in stdout.lower(),\
            "Error - Unable to access mpiexec.  Please ensure it is in your \
PATH environment variable!"

    # First, look for lmp_X in order of common names
    lmp_path = None
    common_names = ["lmp_mpi", "lmp_serial", "lmp_smrff"]
    lmp_string_id = "Large-scale Atomic/Molecular Massively Parallel Simulator"
    for name in common_names:
        if which(name) is not None:
            # Check stuff
            if mpi_path is not None:
                cmd = [mpi_path, "-n", "1", which(name), "-h"]
            else:
                cmd = [which(name), "-h"]
            p = subprocess.Popen(cmd,
                                 shell=False,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            stdout = str(p.stdout.read().decode("utf-8").strip())

            if lmp_string_id not in stdout:
                close_pipes(p)
                continue
            else:
                # If it works, then save it
                close_pipes(p)
                lmp_path = which(name)
                break
    assert lmp_path is not None,\
        "Error - Unable to find lmp executable.  Please ensure it is \
in your PATH environment variable!"

    return lmp_path, mpi_path
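
A sketch of launching LAMMPS with the discovered executables; the input file name is made up and the import path is a guess.

# Hypothetical usage sketch; assumes get_lmp_obj is importable from squid
# and that lmp_mpi (or lmp_serial/lmp_smrff) and mpiexec are on PATH.
import subprocess
from squid.lammps import get_lmp_obj  # import path is a guess

lmp_path, mpi_path = get_lmp_obj(parallel=True)
subprocess.call([mpi_path, "-n", "4", lmp_path, "-in", "water.in"])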