Example #1
def upload_request():

    system_addr = EXT_TRANSFER_MACHINE_INTERNAL
    system_name = EXT_TRANSFER_MACHINE_PUBLIC

    targetPath = request.form.get("targetPath",
                                  None)  # path to save file in cluster
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description="Failed to upload file",
                       error=f"'targetPath' {v}"), 400

    sourcePath = request.form.get("sourcePath", None)  # path from the local FS
    v = validate_input(sourcePath)
    if v != "":
        return jsonify(description="Failed to upload file",
                       error=f"'sourcePath' {v}"), 400

    [headers, ID] = get_tracing_headers(request)
    # checks if targetPath is a valid path
    check = is_valid_dir(targetPath, headers, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="sourcePath error"), 400, check["headers"]

    # obtain new task from Tasks microservice
    task_id = create_task(headers, service="storage")

    if task_id == -1:
        return jsonify(error="Error creating task"), 400

    # asynchronous task creation
    try:
        update_task(task_id, headers, async_task.QUEUED)

        aTask = threading.Thread(target=upload_task,
                                 name=ID,
                                 args=(headers, system_name, system_addr,
                                       targetPath, sourcePath, task_id))

        storage_tasks[task_id] = aTask

        storage_tasks[task_id].start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created",
                       task_url=task_url,
                       task_id=task_id)
        return data, 201

    except Exception as e:
        data = jsonify(error=e)
        return data, 400
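
The validate_input helper is used above but not defined in this listing. A minimal, hypothetical sketch of the contract the caller relies on (empty string means the value is usable, anything else is an error description); the real FirecREST helper performs stricter checks:

# Hypothetical sketch of validate_input; the caller only tests for v != "".
def validate_input(text):
    if text is None:
        return "not set in request"
    if text == "":
        return "value is empty"
    # reject shell metacharacters that could break the remote command
    if any(c in text for c in ";&|<>`$"):
        return "has invalid characters"
    return ""
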
Example #2
def cancel_job_task(headers, system_name, system_addr, action, task_id):
    # exec scancel command
    resp = exec_remote_command(headers, system_name, system_addr, action)

    app.logger.info(resp)

    data = resp["msg"]

    # in case of error:
    # permission denied, jobid to be canceled is owned by user without permission
    if resp["error"] == 210:
        update_task(task_id, headers, async_task.ERROR,
                    "User does not have permission to cancel job")
        return

    if resp["error"] == -2:
        update_task(task_id, headers, async_task.ERROR,
                    "Machine is not available")
        return

    if resp["error"] != 0:
        err_msg = resp["msg"]
        if in_str(err_msg, "OPENSSH"):
            err_msg = "User does not have permissions to access machine"
        update_task(task_id, headers, async_task.ERROR, err_msg)
        return

    # scancel is a special case: it doesn't return an error code for invalid
    # or already-completed jobs, but with -v its stderr is captured even on
    # success, so if the keyword "error" appears on stderr scancel has failed:
    if in_str(data, "error"):
        # error message: "scancel: error: Kill job error on job id 5: Invalid job id specified"
        # desired output: "Kill job error on job id 5: Invalid job id specified"
        err_msg = data[(data.index("error") + 7):]
        update_task(task_id, headers, async_task.ERROR, err_msg)
        return

    # otherwise
    update_task(task_id, headers, async_task.SUCCESS, data)
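
The slice offset above is easy to misread: index finds the first "error", and 7 is len("error") plus len(": "). A worked check using the message quoted in the comment:

# Worked example of the error-message slice used in cancel_job_task above.
data = "scancel: error: Kill job error on job id 5: Invalid job id specified"
err_msg = data[(data.index("error") + 7):]  # skip past the first "error: " (5 + 2 chars)
print(err_msg)  # Kill job error on job id 5: Invalid job id specified
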
Example #3
def acct_task(headers, system_name, system_addr, action, task_id):
    # exec remote command
    resp = exec_remote_command(headers, system_name, system_addr, action)

    app.logger.info(resp)

    # in case of error:
    if resp["error"] == -2:
        update_task(task_id, headers, async_task.ERROR,
                    "Machine is not available")
        return

    # in case of error:
    if resp["error"] != 0:
        err_msg = resp["msg"]
        if in_str(err_msg, "OPENSSH"):
            err_msg = "User does not have permissions to access machine"
        update_task(task_id, headers, async_task.ERROR, err_msg)
        return

    if len(resp["msg"]) == 0:
        update_task(task_id, headers, async_task.SUCCESS, {}, True)
        return

    # on success:
    joblist = resp["msg"].split("$")
    jobs = []
    for job in joblist:
        # output from sacct uses '|' as field separator
        jobaux = job.split("|")
        jobinfo = {
            "jobid": jobaux[0],
            "partition": jobaux[1],
            "name": jobaux[2],
            "user": jobaux[3],
            "state": jobaux[4],
            "start_time": jobaux[5],
            "time": jobaux[6],
            "time_left": jobaux[7],
            "nodes": jobaux[8],
            "nodelist": jobaux[9]
        }

        jobs.append(jobinfo)

    # since this is JSON data to be stored in Tasks, is_json=True
    update_task(task_id, headers, async_task.SUCCESS, jobs, is_json=True)
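
The remote command joins sacct records with "$", and sacct itself separates fields with "|" (--parsable2). The parsing above, rerun standalone on a fabricated record:

# Standalone rerun of the record parsing above; the sample data is made up.
sample = "123|debug|test_job|testuser|COMPLETED|2021-01-01T10:00:00|00:01:00|2021-01-01T10:01:00|1|nid00001"
fields = ["jobid", "partition", "name", "user", "state",
          "start_time", "time", "time_left", "nodes", "nodelist"]
jobs = [dict(zip(fields, record.split("|"))) for record in sample.split("$")]
print(jobs[0]["state"])  # COMPLETED
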
Example #4
def list_job(jobid):

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve job information", error="Machine does not exists"), 400, header

    # check if jobid is a valid SLURM job id
    if not is_jobid(jobid):
        return jsonify(description="Failed to retrieve job information", error=f"{jobid} is not a valid job ID"), 400

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve job information"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve job information"), 404, header

    username = get_username(auth_header)
    app.logger.info(f"Getting SLURM information of job={jobid} from {system_name} ({system_addr})")

    # format codes: jobid (%i), partition (%P), jobname (%j), user (%u),
    #               job state (%T), time used (%M), start time (%S),
    #               time left (%L), nodes allocated (%D) and nodelist/reason (%R)
    action = "squeue -u {username} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' --noheader -j {jobid}".\
        format(username=username,jobid=jobid)

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header,service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information",error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id, 1, 1))

        aTask.start()

        task_url = "{KONG_URL}/tasks/{task_id}".format(KONG_URL=KONG_URL, task_id=task_id)

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        data = jsonify(description="Failed to retrieve job information",error=e)
        return data, 400
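
For reference, the %-codes in the squeue format string above expand as follows (per squeue's man page):

# Reference table for the squeue format codes used above (not part of the service).
SQUEUE_FORMAT_CODES = {
    "%i": "job id",          "%P": "partition",
    "%j": "job name",        "%u": "user name",
    "%T": "job state",       "%M": "time used",
    "%S": "start time",      "%L": "time left",
    "%D": "number of nodes", "%R": "nodelist or reason",
}
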
Example #5
def list_job_task(auth_header, system_name, system_addr, action, task_id, pageSize, pageNumber):
    # exec command
    resp = exec_remote_command(auth_header, system_name, system_addr, action)

    app.logger.info(resp)

    # in case of error:
    if resp["error"] == -2:
        update_task(task_id, auth_header,async_task.ERROR,"Machine is not available")
        return

    if resp["error"] == 1:
        err_msg = resp["msg"]
        if in_str(err_msg,"OPENSSH"):
            err_msg = "User does not have permissions to access machine"
        update_task(task_id, auth_header,async_task.ERROR ,err_msg)
        return

    if len(resp["msg"]) == 0:
         #update_task(task_id, auth_header, async_task.SUCCESS, "You don't have active jobs on {machine}".format(machine=machine))
         update_task(task_id, auth_header, async_task.SUCCESS,{},True)
         return


    # on success:
    jobList = resp["msg"].split("$")
    app.logger.info("Size jobs: %d" % len(jobList))

    # pagination
    totalSize   = len(jobList)
    pageNumber  = float(pageNumber)
    pageSize    = float(pageSize)

    totalPages = int(ceil(float(totalSize) / float(pageSize)))

    app.logger.info(f"Total Size: {totalSize}")
    app.logger.info(f"Total Pages: {totalPages}")

    if pageNumber < 0 or pageNumber > totalPages - 1:
        app.logger.warning(
            "pageNumber ({pageNumber}) is out of range for total pages ({totalPages})".format(pageNumber=pageNumber,
                                                                                              totalPages=totalPages))
        app.logger.warning("set to default")
        pageNumber = 0

    beg_reg = int(pageNumber * pageSize)
    end_reg = int((pageNumber + 1) * pageSize - 1)

    app.logger.info("Initial reg {beg_reg}, final reg: {end_reg}".format(beg_reg=beg_reg, end_reg=end_reg))

    jobList = jobList[beg_reg:end_reg + 1]

    jobs = {}
    for job_index in range(len(jobList)):
        job = jobList[job_index]
        jobaux = job.split("|")
        jobinfo = {"jobid": jobaux[0], "partition": jobaux[1], "name": jobaux[2],
                   "user": jobaux[3], "state": jobaux[4], "start_time": jobaux[5],
                   "time": jobaux[6], "time_left": jobaux[7],
                   "nodes": jobaux[8], "nodelist": jobaux[9]}

        # now looking for log and err files location
        jobinfo = get_slurm_files(auth_header, system_name, system_addr, task_id,jobinfo,True)

        # add jobinfo to the array
        jobs[str(job_index)]=jobinfo

    data = jobs

    update_task(task_id, auth_header, async_task.SUCCESS, data, True)
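
A quick standalone check of the page-window arithmetic above, with 23 records and pages of 10:

# Worked example of the pagination logic above (illustrative values).
from math import ceil

jobList = [f"job{i}" for i in range(23)]
pageSize, pageNumber = 10, 1
totalPages = int(ceil(len(jobList) / float(pageSize)))  # 3
beg_reg = int(pageNumber * pageSize)                    # 10
end_reg = int((pageNumber + 1) * pageSize - 1)          # 19
print(jobList[beg_reg:end_reg + 1])                     # ['job10', 'job11', ..., 'job19']
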
Example #6
def list_jobs():

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve jobs information", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve jobs information"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve jobs information"), 404, header

    username = get_username(auth_header)

    app.logger.info(f"Getting SLURM information of jobs from {system_name} ({system_addr})")

    # job list comma separated:
    jobs        = request.args.get("jobs", None)
    pageSize    = request.args.get("pageSize", None)
    pageNumber  = request.args.get("pageNumber", None)

    if pageSize is not None and pageNumber is not None:
        try:
            pageNumber  = int(pageNumber)
            pageSize    = int(pageSize)

            if pageSize not in [10,25,50,100]:
                pageSize = 25

        except ValueError:
            pageNumber = 0
            pageSize = 25
            app.logger.error("Wrong pageNumber and/or pageSize")
    else:
        # defaults if not set
        pageNumber  = 0
        pageSize    = 25

    # by default empty
    job_list = ""
    if jobs is not None:
        try:
            # check if input is correct:
            job_aux_list = jobs.split(",")
            if '' in job_aux_list:
                return jsonify(error="Jobs list wrong format",description="Failed to retrieve job information"), 400

            for jobid in job_aux_list:
                if not is_jobid(jobid):
                    return jsonify(error=f"{jobid} is not a valid job ID", description="Failed to retrieve job information"), 400

            job_list="--job={jobs}".format(jobs=jobs)
        except Exception:
            return jsonify(error="Jobs list wrong format",description="Failed to retrieve job information"), 400

    # format codes: jobid (%i), partition (%P), jobname (%j), user (%u),
    #               job state (%T), time used (%M), start time (%S),
    #               time left (%L), nodes allocated (%D) and nodelist/reason (%R)
    action = f"squeue -u {username} {job_list} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' --noheader"

    try:
        task_id = create_task(auth_header,service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information",error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id, pageSize, pageNumber))

        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        data = jsonify(description="Failed to retrieve job information",error=e)
        return data, 400
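
is_jobid is referenced above but not defined in this listing; a plausible minimal version simply accepts strings naming a positive integer:

# Hypothetical sketch of is_jobid as used above.
def is_jobid(jobid):
    try:
        return int(jobid) > 0
    except ValueError:
        return False
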
Example #7
def submit_job_path():
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job", error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header={"X-Machine-Does-Not-Exists":"Machine does not exists"}
        return jsonify(description="Failed to submit job",error="Machine does not exists"), 400, header

    # SYSTEMS_PUBLIC and SYS_INTERNALS are index-aligned, so the endpoint is found by index

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job"), 404, header

    try:
        targetPath = request.form["targetPath"]
    except KeyError as e:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath is None:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath == "":
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter value is empty")
        return data, 400


    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, auth_header, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(auth_header,service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job",error='Error creating task'), 400

    # e.g. targetPath = "/home/testuser/test/sbatch.sh"
    # split by / and discard the last element (the file name): ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # if targetPath ends with "/", the split leaves an empty string as the
    # last element; discard it
    if job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)


    try:
        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_path_task,
                                 args=(auth_header, system_name, system_addr, targetPath, job_dir, task_id))

        aTask.start()
        retval = update_task(task_id, auth_header, async_task.QUEUED, TASKS_URL)

        task_url = "{KONG_URL}/tasks/{task_id}".format(KONG_URL=KONG_URL, task_id=task_id)
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201

    except Exception as e:
        data = jsonify(description="Failed to submit job",error=e)
        return data, 400
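
The split/join above is essentially a dirname that tolerates a trailing slash; the same result via the standard library, as a sanity check:

# Equivalent derivation of job_dir using os.path (illustrative check).
import os

for targetPath in ("/home/testuser/test/sbatch.sh",
                   "/home/testuser/test/sbatch.sh/"):
    print(os.path.dirname(targetPath.rstrip("/")))  # /home/testuser/test
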
Example #8
def submit_job_upload():

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header={"X-Machine-Does-Not-Exists":"Machine does not exists"}
        return jsonify(description="Failed to submit job file",error="Machine does not exists"), 400, header

    # SYSTEMS_PUBLIC and SYS_INTERNALS are index-aligned, so the endpoint is found by index

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job file"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job file"), 404, header

    job_base_fs = COMPUTE_BASE_FS[system_idx]

    try:
        # check if the post request has the file part
        if 'file' not in request.files:
            app.logger.error('No batch file part')
            error = jsonify(description="Failed to submit job file", error='No batch file part')
            return error, 400

        job_file = {'filename': secure_filename(request.files['file'].filename), 'content': request.files['file'].read()}

        # if the user does not select a file, the browser also
        # submits an empty part without a filename
        if job_file['filename'] == '':
            app.logger.error('No batch file selected')
            error = jsonify(description="Failed to submit job file", error='No batch file selected')
            return error, 400

    except RequestEntityTooLarge as re:
        app.logger.error(re.description)
        data = jsonify(description="Failed to submit job file", error=f"File is bigger than {MAX_FILE_SIZE} MB")
        return data, 413
    except Exception as e:
        data = jsonify(description="Failed to submit job file",error=e)
        return data, 400


    task_id = create_task(auth_header,service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job file",error='Error creating task'), 400

    # create tmp dir named after the task_id
    # (the hash_id from Tasks, which is user-task_id internally)
    tmpdir = "{task_id}".format(task_id=task_id)

    username = get_username(auth_header)

    job_dir = f"{job_base_fs}/{username}/firecrest/{tmpdir}"

    app.logger.info(f"Job dir: {job_dir}")

    try:
        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_task,
                                 args=(auth_header, system_name, system_addr, job_file, job_dir, task_id))

        aTask.start()
        retval = update_task(task_id, auth_header,async_task.QUEUED)

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201

    except Exception as e:
        data = jsonify(description="Failed to submit job",error=e)
        return data, 400
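
secure_filename is Werkzeug's sanitizer; it flattens anything in the user-supplied filename that could escape the upload directory:

# Behaviour of werkzeug's secure_filename, as used above.
from werkzeug.utils import secure_filename

print(secure_filename("My cool movie.mov"))    # My_cool_movie.mov
print(secure_filename("../../../etc/passwd"))  # etc_passwd
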
Example #9
def submit_job_path_task(auth_header, system_name, system_addr, fileName, job_dir, task_id):

    # initialized up front so the sbatch action below cannot hit a NameError
    # if the token fails to decode
    scopes_parameters = ''

    try:
        # get scopes from token
        decoded = jwt.decode(auth_header[7:], verify=False)
        # scope: "openid profile email firecrest-tds.cscs.ch/storage/something"
        scopes = decoded['scope'].split(' ')

        # SCOPES syntax: id_service/microservice/parameter
        for s in scopes:
            s2 = s.split('/')
            if s2[0] == FIRECREST_SERVICE:
                if s2[1] == 'storage':
                    if scopes_parameters != '':
                        scopes_parameters = scopes_parameters + ','

                    scopes_parameters = scopes_parameters + s2[2]

        if scopes_parameters != '':
            scopes_parameters = '--firecrest=' + scopes_parameters

        app.logger.info("scope parameters: " + scopes_parameters)


    except Exception as e:
        app.logger.error(type(e))

        app.logger.error(e.args)


    action=f"sbatch --chdir={job_dir} {scopes_parameters} -- {fileName}"

    resp = exec_remote_command(auth_header, system_name, system_addr, action)

    app.logger.info(resp)

    # in case of error:
    if resp["error"] != 0:
        if resp["error"] == -2:
            update_task(task_id, auth_header, async_task.ERROR,"Machine is not available")
            return

        if resp["error"] == 1:
            err_msg = resp["msg"]
            if in_str(err_msg,"OPENSSH"):
                err_msg = "User does not have permissions to access machine"
            update_task(task_id, auth_header, async_task.ERROR ,err_msg)
            return
        err_msg = resp["msg"]
        update_task(task_id, auth_header, async_task.ERROR, err_msg)


    jobid = extract_jobid(resp["msg"])

    msg = {"result":"Job submitted", "jobid":jobid}


    # now looking for log and err files location
    job_extra_info = get_slurm_files(auth_header, system_name, system_addr, task_id,msg)

    update_task(task_id, auth_header,async_task.SUCCESS, job_extra_info,True)
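
The scope loop above folds every storage scope of FIRECREST_SERVICE into a single --firecrest sbatch flag. The same logic rerun standalone on a fabricated token scope (the service name is assumed for the example):

# Standalone rerun of the scope parsing above with fabricated input.
FIRECREST_SERVICE = "firecrest-tds.cscs.ch"  # assumed value, for illustration only
scope = "openid profile email firecrest-tds.cscs.ch/storage/something"

params = []
for s in scope.split(' '):
    s2 = s.split('/')
    if len(s2) == 3 and s2[0] == FIRECREST_SERVICE and s2[1] == 'storage':
        params.append(s2[2])

scopes_parameters = '--firecrest=' + ','.join(params) if params else ''
print(scopes_parameters)  # --firecrest=something
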
Example #10
def os_to_fs(task_id):
    upl_file = uploaded_files[task_id]
    system_name = upl_file["system_name"]
    system_addr = upl_file["system_addr"]
    username = upl_file["user"]
    objectname = upl_file["source"]
    headers = {}
    headers[TRACER_HEADER] = upl_file['trace_id']

    try:
        app.logger.info(upl_file["msg"])

        # the certificate is encrypted with the CERT_CIPHER_KEY key,
        # here it is decrypted
        cert = upl_file["msg"]["cert"]
        cipher = Fernet(CERT_CIPHER_KEY)
        # decryption produces bytes, but remember the cert is stored
        # as str (not bytes) in the JSON
        pub_cert = cipher.decrypt(cert[0].encode('utf-8')).decode('utf-8')

        # cert[0] is the public certificate (user-key-cert.pub),
        # cert[1] is the temp directory
        td = cert[1]

        app.logger.info(f"Temp dir: {td}")

        if not os.path.exists(td):
            # retrieve public certificate and store in temp dir location
            str_to_file(pub_cert, td, "user-key-cert.pub")

            # the user's public and private keys live in the Storage path; symlink them so the same key files are not used concurrently
            os.symlink(os.getcwd() + "/user-key.pub",
                       td + "/user-key.pub")  # link on temp dir
            os.symlink(os.getcwd() + "/user-key",
                       td + "/user-key")  # link on temp dir

            # stat.S_IRUSR -> owner has read permission
            os.chmod(td + "/user-key-cert.pub", stat.S_IRUSR)

        cert_list = [
            f"{td}/user-key-cert.pub", f"{td}/user-key.pub", f"{td}/user-key",
            td
        ]

        # start download from OS to FS
        update_task(task_id, headers, async_task.ST_DWN_BEG)

        # execute download
        result = exec_remote_command(username, system_name, system_addr, "",
                                     "storage_cert", cert_list)

        # if no error, then download is complete
        if result["error"] == 0:
            update_task(task_id, headers, async_task.ST_DWN_END)

            # No need to delete the dictionary, it will be cleaned on next iteration

            # delete upload request
            # del uploaded_files[task_id]

            # the object must be deleted after it is moved to storage, but for
            # big files delete_object takes a long time and often gives a
            # timeout between the system and the staging area. Using
            # delete_object_after with a short TTL (10 minutes here) triggers
            # the staging area's internal mechanism to delete the file
            # automatically, without needing a connection

            staging.delete_object_after(containername=username,
                                        prefix=task_id,
                                        objectname=objectname,
                                        ttl=int(time.time()) + 600)

        else:
            # if error, should be prepared for try again
            upl_file["status"] = async_task.ST_DWN_ERR
            uploaded_files[task_id] = upl_file

            # update, but conserve "msg" as the data for the download from OS, to be used for a retry in the next iteration
            update_task(task_id,
                        headers,
                        async_task.ST_DWN_ERR,
                        msg=upl_file,
                        is_json=True)

    except Exception as e:
        app.logger.error(e)
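
The certificate decrypted here was encrypted by upload_task (Example #12) with the same CERT_CIPHER_KEY; the Fernet round-trip in isolation:

# Round-trip of the Fernet encryption used for the certificate (illustrative key).
from cryptography.fernet import Fernet

CERT_CIPHER_KEY = Fernet.generate_key()  # in the service this comes from configuration
cipher = Fernet(CERT_CIPHER_KEY)

# encrypt side (upload_task): bytes in, str out so it can be stored in JSON
cert_enc = cipher.encrypt("ssh-rsa-cert-v01 AAAA...".encode('utf-8')).decode('utf-8')
# decrypt side (os_to_fs): str back to bytes, decrypt, decode back to str
pub_cert = cipher.decrypt(cert_enc.encode('utf-8')).decode('utf-8')
print(pub_cert)  # ssh-rsa-cert-v01 AAAA...
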
Example #11
def submit_job_path():

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job",
                       error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job",
                       error="Machine does not exists"), 400, header

    # SYSTEMS_PUBLIC and SYS_INTERNALS are index-aligned, so the endpoint is found by index

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]
    use_plugin = USE_SPANK_PLUGIN[system_idx]

    targetPath = request.form.get("targetPath", None)
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description="Failed to submit job",
                       error=f"'targetPath' {v}"), 400

    # check "account parameter"
    account = request.form.get("account", None)
    if account is not None:
        v = validate_input(account)
        if v != "":
            return jsonify(description="Invalid account",
                           error=f"'account' {v}"), 400

    [headers, ID] = get_tracing_headers(request)
    # check if machine is accessible by user:
    resp = exec_remote_command(headers, system_name, system_addr,
                               f"ID={ID} true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {
                "X-Permission-Denied":
                "User does not have permissions to access machine or path"
            }
            return jsonify(description="Failed to submit job"), 404, header

    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, headers, system_name, system_addr)

    if not check["result"]:
        return jsonify(
            description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(headers, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job",
                       error='Error creating task'), 400

    # e.g. targetPath = "/home/testuser/test/sbatch.sh"
    # split by / and discard the last element (the file name): ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # if targetPath ends with "/", the split leaves an empty string as the
    # last element; discard it
    if job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)

    try:
        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_path_task,
                                 name=ID,
                                 args=(headers, system_name, system_addr,
                                       targetPath, job_dir, account,
                                       use_plugin, task_id))

        aTask.start()
        retval = update_task(task_id, headers, async_task.QUEUED, TASKS_URL)

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created",
                       task_id=task_id,
                       task_url=task_url)
        return data, 201

    except Exception as e:
        data = jsonify(description="Failed to submit job", error=e)
        return data, 400
Example #12
def upload_task(headers, system_name, system_addr, targetPath, sourcePath,
                task_id):

    fileName = sourcePath.split("/")[-1]

    # container to bind:
    container_name = get_username(headers[AUTH_HEADER_NAME])
    ID = headers.get(TRACER_HEADER, '')
    # use task_id as the hash_id, since it is no longer needed for (failed) redirection
    uploaded_files[task_id] = {
        "user": container_name,
        "system_name": system_name,
        "system_addr": system_addr,
        "target": targetPath,
        "source": fileName,
        "status": async_task.ST_URL_ASK,
        "hash_id": task_id,
        "trace_id": ID
    }

    data = uploaded_files[task_id]

    global staging
    data["msg"] = f"Waiting for Presigned URL to upload file to staging area ({staging.get_object_storage()})"

    # switch the task payload to a dictionary containing the upload data (for backup purposes), adding the URL call
    update_task(task_id, headers, async_task.ST_URL_ASK, data, is_json=True)

    # check if staging token is valid
    if not staging.renew_token():
        msg = "Staging Area auth error, try again later"
        data["msg"] = msg
        data["status"] = async_task.ERROR
        update_task(task_id, headers, async_task.ERROR, data, is_json=True)
        return

    # create or return container
    if not staging.is_container_created(container_name):
        errno = staging.create_container(container_name)
        if errno == -1:
            msg = f"Could not create container {container_name} in Staging Area ({staging.get_object_storage()})"
            data["msg"] = msg
            data["status"] = async_task.ERROR
            update_task(task_id, headers, async_task.ERROR, data, is_json=True)
            return

    object_prefix = task_id

    # create temporary upload form
    resp = staging.create_upload_form(sourcePath,
                                      container_name,
                                      object_prefix,
                                      STORAGE_TEMPURL_EXP_TIME,
                                      STORAGE_MAX_FILE_SIZE,
                                      internal=False)

    # create download URL for later download from Object Storage to filesystem
    app.logger.info("Creating URL for later download")
    download_url = staging.create_temp_url(container_name, object_prefix,
                                           fileName, STORAGE_TEMPURL_EXP_TIME)

    # create certificate for later download from OS to filesystem
    app.logger.info(f"Creating certificate for later download")
    options = f"-s -G -o '{targetPath}/{fileName}' -- '{download_url}'"
    exp_time = STORAGE_TEMPURL_EXP_TIME
    certs = create_certificate(headers, system_name, system_addr,
                               f"ID={ID} curl", options, exp_time)

    if not certs[0]:
        msg = "Could not create certificate for download from Staging Area to filesystem"
        app.logger.error(msg)
        data["msg"] = msg
        data["status"] = async_task.ERROR
        update_task(task_id, headers, async_task.ERROR, data, is_json=True)
        return

    # converts file to string to store in Tasks
    cert_pub = file_to_str(fileName=certs[0])
    # key_pub  = file_to_str(fileName=certs[1])
    # key_priv = file_to_str(fileName=certs[2])
    temp_dir = certs[3]

    # encrypt certificate with CERT_CIPHER_KEY key
    cipher = Fernet(CERT_CIPHER_KEY)
    # the data to be encrypted must be bytes; to save it as JSON,
    # the encrypted cert is decoded back to a string
    cert_pub_enc = cipher.encrypt(cert_pub.encode('utf-8')).decode('utf-8')

    resp["download_url"] = download_url
    resp["action"] = f"curl {options}"
    resp["cert"] = [cert_pub_enc, temp_dir]

    data["msg"] = resp
    data["status"] = async_task.ST_URL_REC

    app.logger.info("Cert and url created correctly")

    update_task(task_id, headers, async_task.ST_URL_REC, data, is_json=True)

    return
Example #13
def download_task(headers, system_name, system_addr, sourcePath, task_id):
    object_name = sourcePath.split("/")[-1]
    global staging

    # check if staging area token is valid
    if not staging.renew_token():
        msg = "Staging area auth error"
        update_task(task_id, headers, async_task.ERROR, msg)
        return

    # create container if it doesn't exist:
    container_name = get_username(headers[AUTH_HEADER_NAME])

    if not staging.is_container_created(container_name):
        errno = staging.create_container(container_name)

        if errno == -1:
            msg = f"Could not create container {container_name} in Staging Area ({staging.get_object_storage()})"
            update_task(task_id, headers, async_task.ERROR, msg)
            return

    # upload file to swift
    object_prefix = task_id

    upload_url = staging.create_upload_form(sourcePath, container_name,
                                            object_prefix,
                                            STORAGE_TEMPURL_EXP_TIME,
                                            STORAGE_MAX_FILE_SIZE)

    # advise Tasks that the upload is beginning:
    update_task(task_id, headers, async_task.ST_UPL_BEG)

    # upload starts:
    res = exec_remote_command(headers, system_name, system_addr,
                              upload_url["command"])

    # if upload to SWIFT fails:
    if res["error"] != 0:
        msg = f"Upload to Staging area has failed. Object: {object_name}"

        error_str = res["msg"]
        if in_str(error_str, "OPENSSH"):
            error_str = "User does not have permissions to access machine"
        msg = f"{msg}. {error_str}"

        app.logger.error(msg)
        update_task(task_id, headers, async_task.ST_UPL_ERR, msg)
        return

    # create a download temp URL for the file, valid for
    # STORAGE_TEMPURL_EXP_TIME seconds
    temp_url = staging.create_temp_url(container_name,
                                       object_prefix,
                                       object_name,
                                       STORAGE_TEMPURL_EXP_TIME,
                                       internal=False)

    # if temp URL creation failed:
    if temp_url is None:
        msg = f"Temp URL creation failed. Object: {object_name}"
        update_task(task_id, headers, async_task.ERROR, msg)
        return

    # if successfully created: store temp_url in the task with success status
    update_task(task_id, headers, async_task.ST_UPL_END, temp_url)
    # mark for deletion STORAGE_TEMPURL_EXP_TIME seconds from now (default 30 days)
    retval = staging.delete_object_after(containername=container_name,
                                         prefix=object_prefix,
                                         objectname=object_name,
                                         ttl=int(time.time()) +
                                         STORAGE_TEMPURL_EXP_TIME)

    if retval == 0:
        app.logger.info(
            f"Setting {STORAGE_TEMPURL_EXP_TIME} [s] as X-Delete-At")
    else:
        app.logger.error("Object couldn't be marked as X-Delete-At")
Example #14
def check_upload_files():

    global staging

    while True:

        # Get updated task status from Tasks microservice DB backend (TaskPersistence)
        get_upload_unfinished_tasks()

        app.logger.info(
            f"Check files in Object Storage - Pending uploads: {len(uploaded_files)}"
        )

        # create a STATIC auxiliary upload list to avoid "RuntimeError: dictionary changed size during iteration"
        # (uploaded_files is shared between threads, and in Python 3 dict.items() is a live view that triggers that error)
        upl_list = [(task_id, upload)
                    for task_id, upload in uploaded_files.items()]

        for task_id, upload in upl_list:
            # check whether the file is ready for download to the filesystem
            try:
                task_status = async_task.status_codes[upload['status']]

                headers = {}
                app.logger.info(f"Status of {task_id}: {task_status}")

                #if upload["status"] in [async_task.ST_URL_REC,async_task.ST_DWN_ERR] :
                if upload["status"] == async_task.ST_URL_REC:
                    app.logger.info(
                        f"Task {task_id} -> File ready to upload or already downloaded"
                    )

                    upl = uploaded_files[task_id]

                    containername = upl["user"]
                    prefix = task_id
                    objectname = upl["source"]
                    headers[TRACER_HEADER] = upl['trace_id']

                    if not staging.is_object_created(containername, prefix,
                                                     objectname):
                        app.logger.info(
                            f"{containername}/{prefix}/{objectname} isn't created in staging area, continue polling"
                        )
                        continue

                    # confirms that file is in OS (auth_header is not needed)
                    update_task(task_id,
                                headers,
                                async_task.ST_UPL_CFM,
                                msg=upload,
                                is_json=True)
                    upload["status"] = async_task.ST_UPL_CFM
                    uploaded_files["task_id"] = upload
                    os_to_fs_task = threading.Thread(target=os_to_fs,
                                                     name=upl['trace_id'],
                                                     args=(task_id, ))
                    os_to_fs_task.start()
                # if the upload to OS is done but the download to FS failed, then resume
                elif upload["status"] == async_task.ST_DWN_ERR:
                    upl = uploaded_files[task_id]
                    containername = upl["user"]
                    prefix = task_id
                    objectname = upl["source"]
                    headers[TRACER_HEADER] = upl['trace_id']
                    # if the file has been deleted from OS, the upload process is broken; restart it
                    if not staging.is_object_created(containername, prefix,
                                                     objectname):
                        app.logger.info(
                            f"{containername}/{prefix}/{objectname} isn't created in staging area, task marked as erroneous"
                        )
                        update_task(
                            task_id, headers, async_task.ERROR,
                            "File was deleted from staging area. Start a new upload process"
                        )
                        upload["status"] = async_task.ERROR
                        continue

                    # if file is still in OS, proceed to new download to FS
                    update_task(task_id, headers, async_task.ST_DWN_BEG)
                    upload["status"] = async_task.ST_DWN_BEG
                    uploaded_files["task_id"] = upload
                    os_to_fs_task = threading.Thread(target=os_to_fs,
                                                     name=upl['trace_id'],
                                                     args=(task_id, ))
                    os_to_fs_task.start()
            except Exception as e:
                app.logger.error("%s: %s", type(e).__name__, e)
                continue

        time.sleep(STORAGE_POLLING_INTERVAL)
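
The static copy above is the standard defence when other threads may mutate a dict you are iterating; in miniature:

# Snapshot idiom used above: iterate a copy so the shared dict can grow mid-loop.
shared = {"t1": {"status": 1}, "t2": {"status": 2}}
for task_id, upload in list(shared.items()):  # list() freezes the view first
    shared["retry_" + task_id] = upload       # adding keys is now safe
print(sorted(shared))  # ['retry_t1', 'retry_t2', 't1', 't2']
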
Example #15
def cancel_job(jobid):

    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to delete job", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to delete job"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to delete job"), 404, header


    app.logger.info(f"Cancel SLURM job={jobid} from {system_name} ({system_addr})")

    # run scancel with -v so errors are reported correctly on stderr
    action = f"scancel -v {jobid}"

    try:
        # obtain new task from TASKS microservice.
        task_id = create_task(auth_header,service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to delete job",error='Error creating task'), 400

        # asynchronous task creation
        aTask = threading.Thread(target=cancel_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))

        aTask.start()

        update_task(task_id, auth_header, async_task.QUEUED)

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        data = jsonify(description="Failed to delete job",error=e)
        return data, 400
Example #16
def submit_job_task(auth_header, system_name, system_addr, job_file, job_dir, task_id):

    try:
        # get scopes from token
        decoded = jwt.decode(auth_header[7:], verify=False)
        # scope: "openid profile email firecrest-tds.cscs.ch/storage/something"
        scopes = decoded.get('scope', '').split(' ')
        scopes_parameters = ''

        # allow empty scope
        if scopes[0] != '':
            # SCOPES syntax: id_service/microservice/parameter
            for s in scopes:
                s2 = s.split('/')
                if s2[0] == FIRECREST_SERVICE:
                    if s2[1] == 'storage':
                        if scopes_parameters != '':
                            scopes_parameters = scopes_parameters + ','

                        scopes_parameters = scopes_parameters + s2[2]

            if scopes_parameters != '':
                scopes_parameters = '--firecrest=' + scopes_parameters

        app.logger.info("scope parameters: " + scopes_parameters)

    except Exception as e:
        app.logger.error(type(e))
        app.logger.error(e.args)
        errmsg = str(e)
        update_task(task_id, auth_header, async_task.ERROR, errmsg)
        return

    # -------------------
    try:
        # create tmpdir for sbatch file
        action = f"timeout {TIMEOUT} mkdir -p -- '{job_dir}'"
        app.logger.info(action)
        retval = exec_remote_command(auth_header, system_name, system_addr, action)

        if retval["error"] != 0:
            app.logger.error(f"(Error: {retval['msg']}")
            update_task(task_id, auth_header, async_task.ERROR, retval["msg"])
            return

        if job_file['content']:
            action = f"cat > {job_dir}/{job_file['filename']}"
            retval = exec_remote_command(auth_header, system_name, system_addr, action, file_transfer="upload", file_content=job_file['content'])
            if retval["error"] != 0:
                app.logger.error(f"(Error: {retval['msg']}")
                update_task(task_id, auth_header, async_task.ERROR, "Failed to upload file")
                return

        # execute sbatch
        action = f"sbatch --chdir={job_dir} {scopes_parameters} -- {job_file['filename']}"
        app.logger.info(action)

        retval = exec_remote_command(auth_header, system_name, system_addr, action)

        if retval["error"] != 0:
            app.logger.error(f"(Error: {retval['msg']}")
            update_task(task_id, auth_header,async_task.ERROR, retval["msg"])
            return

        outlines = retval["msg"]

        if outlines:
            app.logger.info(f"(No error) --> {outlines}")

        # if there's no error, the JobID is extracted from the SLURM output;
        # the standard output is "Submitted batch job 9999", 9999 being the job id,
        # which is handled by the extract_jobid function

        jobid = extract_jobid(outlines)

        msg = {"result" : "Job submitted", "jobid" : jobid}

        # now look for log and err files location
        job_extra_info = get_slurm_files(auth_header, system_name, system_addr, task_id, msg)

        update_task(task_id, auth_header, async_task.SUCCESS, job_extra_info, True)

    except IOError as e:
        app.logger.error(e.filename, exc_info=True, stack_info=True)
        app.logger.error(e.strerror)
        update_task(task_id, auth_header, async_task.ERROR, e.strerror)
    except Exception as e:
        app.logger.error(type(e), exc_info=True, stack_info=True)
        app.logger.error(e)
        traceback.print_exc(file=sys.stdout)
        update_task(task_id, auth_header, async_task.ERROR)



    return
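
extract_jobid is not shown in this listing; given sbatch's standard output quoted in the comment above, a minimal sketch just takes the last token:

# Hypothetical sketch of extract_jobid, based on sbatch's standard output.
def extract_jobid(outline):
    # "Submitted batch job 9999" -> "9999"
    return outline.strip().split()[-1]

print(extract_jobid("Submitted batch job 9999"))  # 9999
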
Example #17
def acct():
    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError as e:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve account information", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve account information"), 400, header
        if in_str(error_str,"Permission") or in_str(error_str,"OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve account information"), 404, header

    # check if starttime (--starttime=) param is set:
    start_time_opt = ""

    try:
        starttime = request.args.get("starttime","")
        if starttime != "":
            # check if starttime parameter is correctly encoded
            if check_sacctTime(starttime):
                start_time_opt  = " --starttime={start_time} ".format(start_time=starttime)
            else:
                app.logger.warning("starttime wrongly encoded")

        # check if endtime (--endtime=) param is set:
        end_time_opt = ""
        endtime   =  request.args.get("endtime","")
        if endtime != "":
            # check if endtime parameter is correctly encoded
            if check_sacctTime(endtime):
                end_time_opt = " --endtime={end_time} ".format(end_time=endtime)
            else:
                app.logger.warning("endtime wrongly encoded")
    except Exception as e:
        data = jsonify(description="Failed to retrieve account information", error=e)
        return data, 400


    # check optional parameter jobs=jobidA,jobidB,jobidC
    jobs_opt = ""

    jobs = request.args.get("jobs","")

    if jobs != "":
        jobs_opt = " --jobs={jobs} ".format(jobs=jobs)

    # sacct
    # -X: no step information is shown (i.e. just jobname, not jobname.batch or jobname.0, etc.)
    # --starttime={starttime}: start of the accounting window
    # --endtime={endtime}: end of the accounting window
    # --jobs={job1,job2,job3}: list of jobs to be reported
    # format: 0-jobid, 1-partition, 2-jobname, 3-user, 4-job state,
    #         5-start time, 6-elapsed time, 7-end time,
    #         8-nodes allocated and 9-nodelist
    # --parsable2: fields delimited with '|', with no trailing delimiter

    action = "sacct -X {starttime} {endtime} {jobs_opt} " \
             "--format='jobid,partition,jobname,user,state,start,cputime,end,NNodes,NodeList' " \
              "--noheader --parsable2".format(starttime=start_time_opt,endtime=end_time_opt, jobs_opt=jobs_opt)

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header,service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve account information",error='Error creating task'), 400


        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=acct_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))

        aTask.start()
        task_url = "{KONG_URL}/tasks/{task_id}".format(KONG_URL=KONG_URL, task_id=task_id)

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        data = jsonify(description="Failed to retrieve account information",error=e)
        return data, 400
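
check_sacctTime is also not shown. sacct accepts several time encodings, so here is a conservative, hypothetical sketch that admits only the ISO-like YYYY-MM-DD[THH:MM[:SS]] form:

# Hypothetical sketch of check_sacctTime; the real helper may accept more of
# sacct's documented time formats.
import re

def check_sacctTime(t):
    return re.fullmatch(r"\d{4}-\d{2}-\d{2}(T\d{2}:\d{2}(:\d{2})?)?", t) is not None

print(check_sacctTime("2021-06-01T10:00:00"))  # True
print(check_sacctTime("next tuesday"))         # False
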
Example #18
def get_upload_unfinished_tasks():

    # cleanup upload dictionary
    global uploaded_files
    uploaded_files = {}

    app.logger.info(
        f"Staging Area Used: {staging.priv_url} - ObjectStorage Technology: {staging.get_object_storage()}"
    )

    try:
        # query Tasks microservice for previous tasks. Allow 30 seconds to answer

        # only unfinished upload processes
        status_code = [
            async_task.ST_URL_ASK, async_task.ST_URL_REC,
            async_task.ST_UPL_CFM, async_task.ST_DWN_BEG, async_task.ST_DWN_ERR
        ]
        retval = requests.get(f"{TASKS_URL}/taskslist",
                              json={
                                  "service": "storage",
                                  "status_code": status_code
                              },
                              timeout=30,
                              verify=(SSL_CRT if USE_SSL else False))

        if not retval.ok:
            app.logger.error(
                f"Error getting tasks from Tasks microservice: query failed with status {retval.status_code}, "
                f"STORAGE microservice will not be fully functional. Next try will be in {STORAGE_POLLING_INTERVAL} seconds"
            )
            return

        queue_tasks = retval.json()

        # queue_tasks structure: "tasks": {
        #                            task_{id1}: {..., data={} },
        #                            task_{id2}: {..., data={} } }
        # "data" is the field containing each task's payload

        queue_tasks = queue_tasks["tasks"]

        n_tasks = 0

        for key, task in queue_tasks.items():

            task = json.loads(task)

            # iterating over queue_tasks
            try:
                data = task["data"]

                # check whether the task is a never-finishing /xfer-external/upload:
                # a download from SWIFT to the filesystem that crashed before it
                # finished, so it can be re-initiated with /xfer-external/upload-finished;
                # for that, it is marked as erroneous

                if task["status"] == async_task.ST_DWN_BEG:
                    task["status"] = async_task.ST_DWN_ERR
                    task[
                        "description"] = "Storage has been restarted, process will be resumed"
                    headers = {}
                    headers[TRACER_HEADER] = data['trace_id']
                    update_task(task["hash_id"],
                                headers,
                                async_task.ST_DWN_ERR,
                                data,
                                is_json=True)

                uploaded_files[task["hash_id"]] = data

                n_tasks += 1

            except KeyError as e:
                app.logger.error(e)
                app.logger.error(task["data"])
                app.logger.error(key)

            except Exception as e:
                app.logger.error(data)
                app.logger.error(e)
                app.logger.error(type(e))

        app.logger.info(
            f"Unfinished upload tasks recovered from task persistence: {n_tasks}"
        )

    except Exception as e:
        app.logger.warning(
            "Error querying TASKS microservice: STORAGE microservice will not be fully functional"
        )
        app.logger.error(e)