def view_queue(request, machine_name):
    """Returns the current state of the queue in a list

    Keyword arguments:
    request -- Django HttpRequest
    machine_name -- name of the machine
    """
    pass
    machine = slurmutil.GRID_RESOURCE_TABLE.get(machine_name, None)
    if not machine:
        return json_response(status="ERROR",
                             status_code=400,
                             error="Invalid machine name: %s" % machine_name)
    env = slurmutil.get_cred_env(request.user)
    mycmd = "ssh " + machine["hostname"] + " ' " + machine["qstat"][
        "bin"] + " '"
    (output, error, retcode) = run_command(mycmd)
    if retcode != 0:
        return json_response(status="ERROR",
                             status_code=500,
                             error="Unable to get queue: %s" % error)
    patt = re.compile(
        r'(?P<jobid>[^\s]+)\s+(?P<partition>[^\s]+)\s+(?P<job_name>[^\s]+)\s+(?P<user>[^\s]+)\s+(?P<state>[^\s]+)\s+(?P<time>[^\s]+)\s+(?P<nodes>\d+)\s+(?P<nodelist>.*)$'
    )
    output = output.splitlines()
    output = [x.strip() for x in output]
    output = filter(lambda line: patt.match(line), output)
    output = map(lambda x: patt.match(x).groupdict(), output)
    #print( list(output)  )
    return list(output)
Exemple #2
0
def get_info(request, machine_name, job_id):
    """Gets the information of a job, given the id

    Keyword arguments:
    machine_name -- name of the machine
    job_id -- the job id
    """
    pass
    machine = slurmutil.GRID_RESOURCE_TABLE.get(machine_name, None)
    if not machine:
        return json_response(status="ERROR",
                             status_code=400,
                             error="Invalid machine name: %s" % machine_name)
    env = slurmutil.get_cred_env(request.user)
    mycmd = "ssh " + machine["hostname"] + " ' " + ' sacct -j  ' + job_id + " '"
    (output, error, retcode) = run_command(mycmd)
    if retcode != 0:
        return json_response(status="ERROR",
                             status_code=500,
                             error="Unable to get queue: %s" % error)
    patt = re.compile(
        r'(?P<jobid>[^\s]+)\s+(?P<jobname>[^\s]+)\s+(?P<partition>[^\s]+)\s+(?P<account>[^\s]+)\s+(?P<alloccpus>[^\s]+)\s+(?P<state>[^\s]+)\s+(?P<exitcode>.*)$'
    )
    output = output.splitlines()
    output = [x.strip() for x in output]
    output = filter(lambda line: patt.match(line), output)
    output = list(map(lambda x: patt.match(x).groupdict(), output))[2:]
    #print( output  )
    return (output)
def submit_job(request, machine_name):
    """Submits a job to the queue

    Keyword arguments:
    request -- Django HttpRequest
    machine_name -- name of the machine
    """
    pass
    machine = slurmutil.GRID_RESOURCE_TABLE.get(machine_name, None)
    if not machine:
        return json_response(status="ERROR",
                             status_code=400,
                             error="Invalid machine name: %s" % machine_name)
    qsub = machine['qsub']['bin']
    env = slurmutil.get_cred_env(request.user)
    user = request.user  # User.objects.get(username=username)
    if request.POST.get("jobfile", False):
        # Create command for sbatch on an existing slurm file
        job_file_path = request.POST.get("jobfile")
        jobfile = job_file_path
        cmd = "%s %s" % (qsub, job_file_path)
    elif request.POST.get("jobscript", False):
        # Create command for qsub from stdin data
        job_script = request.POST.get("jobscript").encode()

        # Creates a temporary job file
        tmp_job_file = tempfile.NamedTemporaryFile(
            prefix="newt_", dir='/HOME/nscc-gz_jiangli/tmp', delete=False)
        print(job_script)
        tmp_job_file.write(job_script)
        tmp_job_file.flush()
        jobfile = tmp_job_file.name
        cmd = "%s %s" % (qsub, tmp_job_file.name)
    else:
        return json_response(status="ERROR",
                             status_code=400,
                             error="No data received")
    job = HPCJob(user=user, jobfile=jobfile, machine=machine_name)
    job.save()
    try:
        #runner = GlobusHelper(request.user)
        cmd_str = "ssh " + machine["hostname"] + '  " ' + cmd + ' " '
        print(cmd_str)
        (output, error, retcode) = run_command(cmd_str, env=env)
    except Exception as ex:
        return json_response(status="ERROR",
                             status_code=500,
                             error="qsub failed with error: %s" % str(ex))
    if retcode != 0:
        return json_response(status="ERROR",
                             status_code=500,
                             error="qsub failed with error: %s" % error)
    job.jobid = output.strip().split(' ')[-1]
    job.save()
    return {"jobid": job.jobid}
def delete_job(request, machine_name, job_id):
    machine = slurmutil.GRID_RESOURCE_TABLE.get(machine_name, None)
    if not machine:
        return json_response(status="ERROR",
                             status_code=400,
                             error="Invalid machine name: %s" % machine_name)
    env = slurmutil.get_cred_env(request.user)
    mycmd = "ssh " + machine["hostname"] + " ' " + ' scancel  ' + job_id + " '"
    (output, error, retcode) = run_command(mycmd)
    if retcode != 0:
        return json_response(status="ERROR",
                             status_code=500,
                             error="Unable to get queue: %s" % error)
    return (output)
def get_info(request, machine_name, job_id):
    """Gets the information of a job, given the id

    Keyword arguments:
    machine_name -- name of the machine
    job_id -- the job id
    """
    pass
    machine = slurmutil.GRID_RESOURCE_TABLE.get(machine_name, None)
    if not machine:
        return json_response(status="ERROR",
                             status_code=400,
                             error="Invalid machine name: %s" % machine_name)
    env = slurmutil.get_cred_env(request.user)
    mycmd = "ssh " + machine["hostname"] + " ' " + ' sacct -j  ' + job_id + " '"
    job = HPCJob.objects.get(machine=machine_name, jobid=job_id)
    if job.state == "COMPLETED" or job.state == "FAILED":
        return {
            "partition": job.state,
            "jobid": job.jobid,
            "state": job.state,
            "exitcode": job.exit_code,
            "jobname": job.job_name
        }
    (output, error, retcode) = run_command(mycmd)
    if retcode != 0:
        return json_response(status="ERROR",
                             status_code=500,
                             error="Unable to get queue: %s" % error)
    patt = re.compile(
        r'(?P<jobid>[^\s]+)\s+(?P<jobname>[^\s]+)\s+(?P<partition>[^\s]+)\s+(?P<account>[^\s]+)\s+(?P<alloccpus>[^\s]+)\s+(?P<state>[^\s]+)\s+(?P<exitcode>.*)$'
    )
    output = output.splitlines()
    output = [x.strip() for x in output]
    output = filter(lambda line: patt.match(line), output)
    output = list(map(lambda x: patt.match(x).groupdict(), output))[2:]
    #print( output  )
    # {"partition": "work", "account": "nscc-gz", "alloccpus": "24", "jobid": "3422542", "state": "COMPLETED", "exitcode": "0:0", "jobname": "newt_cs0r+"}
    job.partition = output[0]["partition"]
    job.exit_code = output[0]["exitcode"].split(':')[1]
    job.job_name = output[0]["jobname"]
    job.state = output[0]["state"]
    job.save()
    return (output)