def delete(reservation):
    """Delete the reservation called ``reservation`` on the requested machine.

    The machine is taken from the ``X-Machine-Name`` request header and the
    deletion is executed remotely through ``RESERVATION_CMD -d``.

    Returns a Flask response tuple: ``(body, 204)`` on success, otherwise
    ``(body, 400 | 404[, headers])`` where the extra header names the cause.
    """
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(error="Error deleting reservation", description="No machine name given"), 400

    # PUBLIC endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exist": "Machine does not exist"}
        return jsonify(error="Error deleting reservation"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # checking input data
    if not check_name(reservation):
        return jsonify(error="Error deleting reservation", description=f"'reservation' parameter format is not valid (value entered:'{reservation}')"), 400

    [headers, ID] = get_tracing_headers(request)

    # rsvmgmt -d reservationName
    action = f"ID={ID} timeout {TIMEOUT} {RESERVATION_CMD} -d '{reservation}'"

    # execute command
    retval = exec_remote_command(headers, system_name, system_addr, action)
    error_str = retval["msg"]

    if retval["error"] != 0:
        if retval["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(error="Error deleting reservation"), 400, header

        if retval["error"] == 124:
            header = {"X-Timeout": "Command has finished with timeout signal"}
            return jsonify(error="Error deleting reservation"), 400, header

        # in case of permission for other user
        if in_str(error_str, "Permission") or in_str(error_str, "SystemAdministrator"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(error="Error deleting reservation"), 404, header

        # otherwise, generic error
        error_str = cleanup_rsvmgmt_error(error_str)
        return jsonify(error="Error deleting reservation", description=error_str), 400

    output = retval["msg"]
    # "rsvmgmt: Reservation csstaff_32 removed" -> "Reservation csstaff_32 removed".
    # str.lstrip() removes a *set of characters*, not a prefix, so it could also
    # eat the first letters of the message; strip the literal prefix instead.
    prefix = "rsvmgmt: "
    if output.startswith(prefix):
        output = output[len(prefix):]

    data = jsonify(success=output)
    return data, 204
def cancel_job_task(headers, system_name, system_addr, action, task_id):
    """Run the scancel ``action`` remotely and record the outcome in the task.

    The task identified by ``task_id`` is updated to ``async_task.ERROR`` with
    an explanatory message, or to ``async_task.SUCCESS`` with the command
    output. Nothing is returned.
    """
    # exec scancel command
    resp = exec_remote_command(headers, system_name, system_addr, action)
    app.logger.info(resp)

    exit_code = resp["error"]
    output = resp["msg"]

    if exit_code == 210:
        # permission denied: the job to be canceled is owned by another user
        failure = "User does not have permission to cancel job"
    elif exit_code == -2:
        failure = "Machine is not available"
    elif exit_code != 0:
        failure = output
        if in_str(failure, "OPENSSH"):
            failure = "User does not have permissions to access machine"
    elif in_str(output, "error"):
        # scancel gives no error code for invalid or completed jobs, but with
        # -v the problem is still reported on stderr, so the "error" keyword
        # marks a failure.
        # message:  "scancel: error: Kill job error on job id 5: Invalid job id specified"
        # reported: "Kill job error on job id 5: Invalid job id specified"
        failure = output[(output.index("error") + 7):]
    else:
        update_task(task_id, headers, async_task.SUCCESS, output)
        return

    update_task(task_id, headers, async_task.ERROR, failure)
def acct_task(headers, system_name, system_addr, action, task_id):
    """Run the accounting ``action`` remotely and store the parsed jobs in the task.

    The command output is expected as records separated by ``$`` whose fields
    are separated by ``|``. On success the task receives a list with one dict
    per record; on failure it receives an error message. Nothing is returned.
    """
    # exec remote command
    resp = exec_remote_command(headers, system_name, system_addr, action)
    app.logger.info(resp)

    status = resp["error"]

    if status == -2:
        update_task(task_id, headers, async_task.ERROR, "Machine is not available")
        return

    if status != 0:
        message = resp["msg"]
        if in_str(message, "OPENSSH"):
            message = "User does not have permissions to access machine"
        update_task(task_id, headers, async_task.ERROR, message)
        return

    raw_output = resp["msg"]
    if len(raw_output) == 0:
        # nothing to report: succeed with an empty JSON object
        update_task(task_id, headers, async_task.SUCCESS, {}, True)
        return

    # key assigned to each '|'-separated column, in column order
    columns = ("jobid", "partition", "name", "user", "state",
               "start_time", "time", "time_left", "nodes", "nodelist")

    jobs = []
    for record in raw_output.split("$"):
        # output by sacct uses '|'
        values = record.split("|")
        jobs.append({key: values[position] for position, key in enumerate(columns)})

    # it is JSON data to be stored in Tasks, hence is_json=True
    update_task(task_id, headers, async_task.SUCCESS, jobs, is_json=True)
def get():
    """List the reservations of the machine named in ``X-Machine-Name``.

    Runs ``RESERVATION_CMD -l`` remotely and converts every reservation line
    into a dict whose keys are the lower-cased field names.

    Returns a Flask response tuple: ``(body, 200)`` with the list under
    ``success``, otherwise ``(body, 400 | 404[, headers])``.
    """
    # checks if machine name is set
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(error="Error listing reservation", description="No machine name given"), 400

    # PUBLIC endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exist": "Machine does not exist"}
        return jsonify(error="Error listing reservation"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    [headers, ID] = get_tracing_headers(request)

    # list reservations
    action = f"ID={ID} timeout {TIMEOUT} {RESERVATION_CMD} -l"

    # execute command
    retval = exec_remote_command(headers, system_name, system_addr, action)
    error_str = retval["msg"]

    if retval["error"] != 0:
        if retval["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(error="Error listing reservations"), 400, header

        if retval["error"] == 124:
            header = {"X-Timeout": "Command has finished with timeout signal"}
            return jsonify(error="Error listing reservations"), 400, header

        # in case of permission for other user, sudo returns:
        #
        # "We trust you have received the usual lecture from the local SystemAdministrator.
        #  It usually boils down to these three things:
        #    #1) Respect the privacy of others.
        #    #2) Think before you type.
        #    #3) With great power comes great responsibility.
        #  sudo: no tty present and no askpass program specified"
        if in_str(error_str, "Permission") or in_str(error_str, "SystemAdministrator"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(error="Error listing reservations"), 404, header

        # otherwise, generic error
        return jsonify(error="Error listing reservations", description=error_str), 400

    output = retval["msg"]
    # output should have this format:
    #
    # if some reservation:
    #   rsvmgmt: Current Reservations
    #   ---------------------------
    #   ReservationName=selvedas StartTime=2020-12-24T08:00:00 EndTime=2020-12-25T12:30:00 Duration=1-04:30:00 Nodes=nid0000[0-9] NodeCnt=10 CoreCnt=640 Features=knl PartitionName=normal Flags= TRES=cpu=2560 Users=(null) Accounts=csstaff Licenses=(null) State=INACTIVE BurstBuffer=(null) Watts=n/a
    #   ---------------------------
    #
    # if no reservation found:
    #   rsvmgmt: Current Reservations
    #   ---------------------------
    #   ---------------------------

    reservations = []

    # selects only what is between ----- lines
    output_list = output.split("$")[2:-1]

    for _output in output_list:
        # split by space
        _output = _output.split()
        if len(_output) == 1:
            # then no reservations
            break

        # otherwise this is the output list:
        # ['ReservationName=selvedas', 'StartTime=2020-12-24T08:00:00', ..., 'TRES=cpu=2560', ..., 'Watts=n/a']
        rsv_dict = {}
        for item in _output:
            # Split on the FIRST '=' only: values such as 'cpu=2560' contain
            # '=' themselves and were previously dropped.
            key, separator, value = item.partition("=")
            if not separator:
                # not a key=value token
                continue
            rsv_dict[key.lower()] = value

        reservations.append(rsv_dict)

    # return list
    data = jsonify(success=reservations)
    return data, 200
def acct():
    """Create an asynchronous task that retrieves accounting data with ``sacct``.

    Optional query parameters: ``starttime`` and ``endtime`` (ignored with a
    warning when ``check_sacctTime`` rejects them) and ``jobs`` (comma
    separated job list).

    Returns a Flask response tuple: ``(body, 200)`` with ``task_id`` and
    ``task_url`` when the task was created, otherwise
    ``(body, 400 | 404[, headers])``.
    """
    import shlex  # local import: only needed to quote the user-supplied job list

    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve account information", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve account information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve account information"), 404, header

    start_time_opt = ""
    end_time_opt = ""
    try:
        # check if starttime (--starttime=) param is set:
        starttime = request.args.get("starttime", "")
        if starttime != "":
            # check if starttime parameter is correctly encoded
            if check_sacctTime(starttime):
                start_time_opt = f" --starttime={starttime} "
            else:
                app.logger.warning("starttime wrongly encoded")

        # check if endtime (--endtime=) param is set:
        endtime = request.args.get("endtime", "")
        if endtime != "":
            # check if endtime parameter is correctly encoded
            if check_sacctTime(endtime):
                end_time_opt = f" --endtime={endtime} "
            else:
                app.logger.warning("endtime wrongly encoded")
    except Exception as e:
        # exception objects are not JSON serializable: report their text
        data = jsonify(description="Failed to retrieve account information", error=str(e))
        return data, 400

    # check optional parameter jobs=jobidA,jobidB,jobidC
    jobs_opt = ""
    jobs = request.args.get("jobs", "")
    if jobs != "":
        # The value is user input placed into a shell command line.
        # shlex.quote() leaves a plain "1,2,3" list untouched and neutralizes
        # anything containing shell metacharacters.
        jobs_opt = f" --jobs={shlex.quote(jobs)} "

    # sacct
    # -X so no step information is shown (ie: just jobname, not jobname.batch or jobname.0, etc)
    # --starttime={start_time_opt} starts accounting info
    # --endtime={end_time_opt} end accounting info
    # --jobs={job1,job2,job3} list of jobs to be reported
    # format: 0 - jobid  1-partition 2-jobname 3-user 4-job sTate,
    #         5 - start time, 6-elapsed time , 7-end time
    #         8 - nodes allocated and 9 - resources
    # --parsable2 = limits with | character not ending with it
    action = (f"sacct -X {start_time_opt} {end_time_opt} {jobs_opt} "
              "--format='jobid,partition,jobname,user,state,start,cputime,end,NNodes,NodeList' "
              "--noheader --parsable2")

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve account information", error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=acct_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        data = jsonify(description="Failed to retrieve account information", error=str(e))
        return data, 400
def cancel_job(jobid):
    """Create an asynchronous task that cancels SLURM job ``jobid`` with ``scancel``.

    Returns a Flask response tuple: ``(body, 200)`` with ``task_id`` and
    ``task_url`` when the task was created, otherwise
    ``(body, 400 | 404[, headers])``.
    """
    import shlex  # local import: only needed to quote the user-supplied job id

    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to delete job", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to delete job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to delete job"), 404, header

    app.logger.info(f"Cancel SLURM job={jobid} from {system_name} ({system_addr})")

    # scancel with verbose in order to show correctly the error.
    # jobid comes from the request and ends up in a shell command line, so it
    # is quoted; a plain job id is left unchanged by shlex.quote().
    action = f"scancel -v {shlex.quote(str(jobid))}"

    try:
        # obtain new task from TASKS microservice.
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to delete job", error='Error creating task'), 400

        # Mark the task as QUEUED *before* the worker starts, otherwise a fast
        # worker could record its final state first and have it overwritten.
        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=cancel_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        # exception objects are not JSON serializable: report their text
        data = jsonify(description="Failed to delete job", error=str(e))
        return data, 400
def list_job(jobid):
    """Create an asynchronous task that retrieves ``squeue`` data for ``jobid``.

    Returns a Flask response tuple: ``(body, 200)`` with ``task_id`` and
    ``task_url`` when the task was created, otherwise
    ``(body, 400 | 404[, headers])``.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve job information", error="Machine does not exists"), 400, header

    # check if jobid is a valid jobid for SLURM
    if not is_jobid(jobid):
        return jsonify(description="Failed to retrieve job information", error=f"{jobid} is not a valid job ID"), 400

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve job information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve job information"), 404, header

    username = get_username(auth_header)
    app.logger.info(f"Getting SLURM information of job={jobid} from {system_name} ({system_addr})")

    # format: jobid (i) partition (P) jobname (j) user (u) job sTate (T),
    #         job time (M), start time (S), left time (L)
    #         nodes allocated (D) and resources (R)
    # NOTE(review): %M precedes %S here, while list_job_task labels column 5
    # "start_time" and column 6 "time" - confirm which order is intended.
    action = f"squeue -u {username} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' --noheader -j {jobid}"

    try:
        # obtain new task from Tasks microservice
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information", error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation (pageSize=1, pageNumber=1)
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id, 1, 1))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        # exception objects are not JSON serializable: report their text
        data = jsonify(description="Failed to retrieve job information", error=str(e))
        return data, 400
def list_job_task(auth_header, system_name, system_addr, action, task_id, pageSize, pageNumber):
    """Run the squeue ``action`` remotely and store one page of jobs in the task.

    The command output is expected as records separated by ``$`` whose fields
    are separated by ``|``. ``pageNumber`` is zero-based; a value outside the
    available pages falls back to page 0. On success the task receives a dict
    mapping the in-page index (as a string) to the job description; on failure
    it receives an error message. Nothing is returned.
    """
    # exec command
    resp = exec_remote_command(auth_header, system_name, system_addr, action)
    app.logger.info(resp)

    # in case of error:
    if resp["error"] == -2:
        update_task(task_id, auth_header, async_task.ERROR, "Machine is not available")
        return

    # Any other non-zero code is a failure too (same rule as acct_task);
    # previously only code 1 was handled and the rest was parsed as job data.
    if resp["error"] != 0:
        err_msg = resp["msg"]
        if in_str(err_msg, "OPENSSH"):
            err_msg = "User does not have permissions to access machine"
        update_task(task_id, auth_header, async_task.ERROR, err_msg)
        return

    if len(resp["msg"]) == 0:
        # no jobs: succeed with an empty JSON object
        update_task(task_id, auth_header, async_task.SUCCESS, {}, True)
        return

    # on success:
    jobList = resp["msg"].split("$")
    app.logger.info("Size jobs: %d" % len(jobList))

    # pagination
    totalSize = len(jobList)
    pageNumber = float(pageNumber)
    pageSize = float(pageSize)

    totalPages = int(ceil(float(totalSize) / float(pageSize)))

    app.logger.info(f"Total Size: {totalSize}")
    app.logger.info(f"Total Pages: {totalPages}")

    if pageNumber < 0 or pageNumber > totalPages - 1:
        app.logger.warning(f"pageNumber ({pageNumber}) greater than total pages ({totalPages})")
        app.logger.warning("set to default")
        pageNumber = 0

    beg_reg = int(pageNumber * pageSize)
    # last index of the page; the parentheses matter: without them the
    # expression was pageNumber + pageSize - 1, wrong for every page > 0
    end_reg = int((pageNumber + 1) * pageSize - 1)

    app.logger.info(f"Initial reg {beg_reg}, final reg: {end_reg}")

    jobList = jobList[beg_reg:end_reg + 1]

    jobs = {}
    for job_index, job in enumerate(jobList):
        jobaux = job.split("|")
        jobinfo = {"jobid": jobaux[0], "partition": jobaux[1], "name": jobaux[2],
                   "user": jobaux[3], "state": jobaux[4], "start_time": jobaux[5],
                   "time": jobaux[6], "time_left": jobaux[7],
                   "nodes": jobaux[8], "nodelist": jobaux[9]}

        # now looking for log and err files location
        jobinfo = get_slurm_files(auth_header, system_name, system_addr, task_id, jobinfo, True)

        # add jobinfo to the result, keyed by its position within the page
        jobs[str(job_index)] = jobinfo

    update_task(task_id, auth_header, async_task.SUCCESS, jobs, True)
def list_jobs():
    """Create an asynchronous task that lists the user's jobs with ``squeue``.

    Optional query parameters: ``jobs`` (comma separated job ids, each checked
    with ``is_jobid``) and ``pageSize`` / ``pageNumber``. Pagination values are
    honoured only when both are given; ``pageSize`` must be 10, 25, 50 or 100,
    otherwise 25 is used. The default is page 0 with 25 jobs.

    Returns a Flask response tuple: ``(body, 200)`` with ``task_id`` and
    ``task_url`` when the task was created, otherwise
    ``(body, 400 | 404[, headers])``.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to retrieve jobs information", error="Machine does not exists"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to retrieve jobs information"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to retrieve jobs information"), 404, header

    username = get_username(auth_header)

    app.logger.info(f"Getting SLURM information of jobs from {system_name} ({system_addr})")

    # job list comma separated:
    jobs = request.args.get("jobs", None)
    pageSize = request.args.get("pageSize", None)
    pageNumber = request.args.get("pageNumber", None)

    if pageSize is not None and pageNumber is not None:
        try:
            pageNumber = int(pageNumber)
            pageSize = int(pageSize)

            if pageSize not in [10, 25, 50, 100]:
                pageSize = 25

        except ValueError:
            pageNumber = 0
            pageSize = 25
            app.logger.error("Wrong pageNumber and/or pageSize")
    else:
        # if not set, by default
        pageNumber = 0
        pageSize = 25

    # by default empty
    job_list = ""
    if jobs is not None:
        try:
            # check if input is correct:
            job_aux_list = jobs.split(",")
            if '' in job_aux_list:
                return jsonify(error="Jobs list wrong format", description="Failed to retrieve job information"), 400

            for jobid in job_aux_list:
                if not is_jobid(jobid):
                    return jsonify(error=f"{jobid} is not a valid job ID", description="Failed to retrieve job information"), 400

            job_list = f"--job={jobs}"
        except Exception:
            return jsonify(error="Jobs list wrong format", description="Failed to retrieve job information"), 400

    # format: jobid (i) partition (P) jobname (j) user (u) job sTate (T),
    #         job time (M), start time (S), left time (L)
    #         nodes allocated (D) and resources (R)
    action = f"squeue -u {username} {job_list} --format='%i|%P|%j|%u|%T|%M|%S|%L|%D|%R' --noheader"

    try:
        task_id = create_task(auth_header, service="compute")

        # if error in creating task:
        if task_id == -1:
            return jsonify(description="Failed to retrieve job information", error='Error creating task'), 400

        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=list_job_task,
                                 args=(auth_header, system_name, system_addr, action, task_id, pageSize, pageNumber))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"

        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 200

    except Exception as e:
        # exception objects are not JSON serializable: report their text
        data = jsonify(description="Failed to retrieve job information", error=str(e))
        return data, 400
def internal_operation(request, command):
    """Submit a ``cp``, ``mv``, ``rsync`` or ``rm`` operation as a job on STORAGE_JOBS_MACHINE.

    Form parameters read from ``request``: ``targetPath`` (always),
    ``sourcePath`` (cp / mv / rsync), and the optional ``jobName``, ``time``
    (default ``02:00:00``), ``stageOutJobId`` and ``account``.

    Returns a Flask response tuple: ``(body, 201)`` with ``task_id`` when the
    job was submitted, otherwise ``(body, 400 | 404 | status[, headers])``.
    """
    system_idx = SYSTEMS_PUBLIC.index(STORAGE_JOBS_MACHINE)
    # path checks below are run against the utilities address
    system_addr = SYS_INTERNALS_UTILITIES[system_idx]
    system_name = STORAGE_JOBS_MACHINE

    targetPath = request.form.get("targetPath", None)  # path to save file in cluster
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description=f"Error on {command} operation", error=f"'targetPath' {v}"), 400

    [headers, ID] = get_tracing_headers(request)

    # using actual_command to add options to check sanity of the command to be executed
    actual_command = ""
    if command in ['cp', 'mv', 'rsync']:
        sourcePath = request.form.get("sourcePath", None)  # path to get file in cluster
        v = validate_input(sourcePath)
        if v != "":
            return jsonify(description=f"Error on {command} operation", error=f"'sourcePath' {v}"), 400

        # checks if file to copy, move or rsync (targetPath) is a valid path
        # remove the last part of the path (after last "/" char) to check if the dir can be written by user
        _targetPath = targetPath.split("/")[:-1]
        _targetPath = "/".join(_targetPath)

        app.logger.info(f"_targetPath={_targetPath}")

        check_dir = is_valid_dir(_targetPath, headers, system_name, system_addr)
        if not check_dir["result"]:
            return jsonify(description="targetPath error"), 400, check_dir["headers"]

        # the source may be either a file or a directory
        check_file = is_valid_file(sourcePath, headers, system_name, system_addr)
        if not check_file["result"]:
            check_dir = is_valid_dir(sourcePath, headers, system_name, system_addr)
            if not check_dir["result"]:
                return jsonify(description="sourcePath error"), 400, check_dir["headers"]

        if command == "cp":
            actual_command = "cp --force -dR --preserve=all -- "
        elif command == "mv":
            actual_command = "mv --force -- "
        else:
            actual_command = "rsync -av -- "
    elif command == "rm":
        # for 'rm' there's no source, set empty to call exec_internal_command(...)
        # checks if file or dir to delete (targetPath) is a valid path or valid directory
        check_file = is_valid_file(targetPath, headers, system_name, system_addr)
        if not check_file["result"]:
            check_dir = is_valid_dir(targetPath, headers, system_name, system_addr)
            if not check_dir["result"]:
                return jsonify(description="targetPath error"), 400, check_dir["headers"]

        sourcePath = ""
        actual_command = "rm -rf -- "
    else:
        return jsonify(error=f"Command {command} not allowed"), 400

    # don't add tracing ID, we'll be executed by srun
    actual_command = f"{actual_command} '{sourcePath}' '{targetPath}'"

    jobName = request.form.get("jobName", "")  # jobName for SLURM
    if jobName == "":
        jobName = command + "-job"
        app.logger.info(f"jobName not found, setting default to: {jobName}")
    else:
        v = validate_input(jobName)
        if v != "":
            return jsonify(description="Invalid jobName", error=f"'jobName' {v}"), 400

    # job time, default is 2:00:00 H:M:s.
    # The default applies only when the parameter is absent; a bare `except:`
    # used to apply it on any failure, hiding unrelated errors.
    jobTime = request.form.get("time")
    if jobTime is None:
        jobTime = "02:00:00"
    elif not job_time.check_jobTime(jobTime):
        return jsonify(error="Not supported time format"), 400

    stageOutJobId = request.form.get("stageOutJobId", None)  # start after this JobId has finished
    if stageOutJobId is not None:
        v = validate_input(stageOutJobId)
        if v != "":
            return jsonify(description="Invalid stageOutJobId", error=f"'stageOutJobId' {v}"), 400

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(STORAGE_JOBS_MACHINE)
    system_addr = SYS_INTERNALS[system_idx]

    app.logger.info(f"USE_SLURM_ACCOUNT: {USE_SLURM_ACCOUNT}")

    # get "account" parameter, if not found, it is obtained from "id" command
    account = request.form.get("account")
    if account is not None:
        v = validate_input(account)
        if v != "":
            return jsonify(description="Invalid account", error=f"'account' {v}"), 400
    elif USE_SLURM_ACCOUNT:
        username = get_username(headers[AUTH_HEADER_NAME])
        id_command = f"ID={ID} timeout {UTILITIES_TIMEOUT} id -gn -- {username}"
        resp = exec_remote_command(headers, STORAGE_JOBS_MACHINE, system_addr, id_command)
        if resp["error"] != 0:
            retval = check_command_error(resp["msg"], resp["error"], f"{command} job")
            return jsonify(description=f"Failed to submit {command} job", error=retval["description"]), retval["status_code"], retval["header"]

        account = resp["msg"]
    # otherwise account stays None

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(headers, STORAGE_JOBS_MACHINE, system_addr, f"ID={ID} true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description=f"Failed to submit {command} job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description=f"Failed to submit {command} job"), 404, header

    retval = exec_internal_command(headers, actual_command, jobName, jobTime, stageOutJobId, account)

    # returns "error" key or "success" key
    try:
        retval["error"]  # KeyError here means the submission succeeded
        errmsg = retval["msg"]
        desc = retval["desc"]
        # headers values cannot contain "\n" strings
        return jsonify(error=desc), 400, {"X-Sbatch-Error": errmsg}
    except KeyError:
        success = retval["success"]
        task_id = retval["task_id"]
        return jsonify(success=success, task_id=task_id), 201
def submit_job_upload():
    """Create an asynchronous task that submits an uploaded batch file.

    The batch script is read from the ``file`` part of the request and handed
    to ``submit_job_task`` together with a per-task job directory.

    Returns a Flask response tuple: ``(body, 201)`` with ``task_id`` and
    ``task_url`` when the task was created, otherwise
    ``(body, 400 | 404 | 413[, headers])``.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job file", error="Machine does not exists"), 400, header

    # iterate over SYSTEMS_PUBLIC list and find the endpoint matching same order
    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job file"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job file"), 404, header

    job_base_fs = COMPUTE_BASE_FS[system_idx]

    try:
        # check if the post request has the file part
        if 'file' not in request.files:
            app.logger.error('No batch file part')
            error = jsonify(description="Failed to submit job file", error='No batch file part')
            return error, 400

        job_file = {'filename': secure_filename(request.files['file'].filename),
                    'content': request.files['file'].read()}

        # if user does not select file, browser also
        # submit an empty part without filename
        if job_file['filename'] == '':
            app.logger.error('No batch file selected')
            error = jsonify(description="Failed to submit job file", error='No batch file selected')
            return error, 400

    except RequestEntityTooLarge as too_large:
        app.logger.error(too_large.description)
        data = jsonify(description="Failed to submit job file", error=f"File is bigger than {MAX_FILE_SIZE} MB")
        return data, 413
    except Exception as e:
        # exception objects are not JSON serializable: report their text
        data = jsonify(description="Failed to submit job file", error=str(e))
        return data, 400

    task_id = create_task(auth_header, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job file", error='Error creating task'), 400

    # job directory named after the task id returned by the Tasks microservice
    tmpdir = f"{task_id}"

    username = get_username(auth_header)

    job_dir = f"{job_base_fs}/{username}/firecrest/{tmpdir}"

    app.logger.info(f"Job dir: {job_dir}")

    try:
        # Mark the task as QUEUED *before* the worker starts, otherwise a fast
        # worker could record its final state first and have it overwritten.
        update_task(task_id, auth_header, async_task.QUEUED)

        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_task,
                                 args=(auth_header, system_name, system_addr, job_file, job_dir, task_id))
        aTask.start()

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201

    except Exception as e:
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400
def submit_job_path_task(auth_header, system_name, system_addr, fileName, job_dir, task_id):
    """Submit ``fileName`` with ``sbatch`` from ``job_dir`` and record the result in the task.

    Storage scopes found in the token carried by ``auth_header`` are forwarded
    to sbatch through ``--firecrest=``. The task identified by ``task_id`` is
    updated to ``async_task.ERROR`` with a message, or to
    ``async_task.SUCCESS`` with the job id plus whatever ``get_slurm_files``
    adds. Nothing is returned.
    """
    # Defined up front so the command can still be built when the token cannot
    # be decoded (the name used to be unbound in that case -> NameError).
    scopes_parameters = ''
    try:
        # get scopes from token; [7:] skips the leading 7 characters of the header
        # NOTE(review): signature verification is disabled here - presumably the
        # token was already validated upstream; confirm.
        decoded = jwt.decode(auth_header[7:], verify=False)
        # scope: "openid profile email firecrest-tds.cscs.ch/storage/something"
        scopes = decoded['scope'].split(' ')

        # SCOPES syntax: id_service/microservice/parameter
        storage_scopes = []
        for s in scopes:
            s2 = s.split('/')
            if s2[0] == FIRECREST_SERVICE:
                if s2[1] == 'storage':
                    storage_scopes.append(s2[2])

        # assigned only after the whole scope list was parsed, so a failure
        # half-way never leaks a partial value into the command line
        if storage_scopes:
            scopes_parameters = '--firecrest=' + ','.join(storage_scopes)

        app.logger.info("scope parameters: " + scopes_parameters)

    except Exception as e:
        app.logger.error(type(e))
        app.logger.error(e.args)

    action = f"sbatch --chdir={job_dir} {scopes_parameters} -- {fileName}"

    resp = exec_remote_command(auth_header, system_name, system_addr, action)

    app.logger.info(resp)

    # in case of error:
    if resp["error"] != 0:
        if resp["error"] == -2:
            update_task(task_id, auth_header, async_task.ERROR, "Machine is not available")
            return

        err_msg = resp["msg"]
        if resp["error"] == 1 and in_str(err_msg, "OPENSSH"):
            err_msg = "User does not have permissions to access machine"
        update_task(task_id, auth_header, async_task.ERROR, err_msg)
        # stop here: without this return the failed task went on to be
        # overwritten with SUCCESS below
        return

    jobid = extract_jobid(resp["msg"])

    msg = {"result": "Job submitted", "jobid": jobid}

    # now looking for log and err files location
    job_extra_info = get_slurm_files(auth_header, system_name, system_addr, task_id, msg)

    update_task(task_id, auth_header, async_task.SUCCESS, job_extra_info, True)
def submit_job_path():
    """Submit a job from an sbatch file that already exists on the machine.

    Reads ``X-Machine-Name`` from the request headers and ``targetPath``
    (plus the optional ``account``) from the form data, checks that the
    machine and the file are reachable for the user, creates a "compute"
    task and starts ``submit_job_path_task`` in a background thread.

    Returns:
        A Flask response tuple ``(body, status[, headers])``:
        201 with ``task_id`` and ``task_url`` when the task was queued,
        400 for invalid input, an unavailable machine, an invalid file or a
        task-creation failure, 404 when the user lacks permissions.
    """
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job", error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job", error="Machine does not exists"), 400, header

    # SYSTEMS_PUBLIC, SYS_INTERNALS and USE_SPANK_PLUGIN are read with the
    # same index: the position of the machine name in SYSTEMS_PUBLIC.
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]
    use_plugin = USE_SPANK_PLUGIN[system_idx]

    targetPath = request.form.get("targetPath", None)
    v = validate_input(targetPath)
    if v != "":
        return jsonify(description="Failed to submit job", error=f"'targetPath' {v}"), 400

    # check "account parameter"
    account = request.form.get("account", None)
    if account is not None:
        v = validate_input(account)
        if v != "":
            return jsonify(description="Invalid account", error=f"'account' {v}"), 400

    headers, ID = get_tracing_headers(request)

    # check if machine is accessible by user:
    resp = exec_remote_command(headers, system_name, system_addr, f"ID={ID} true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job"), 404, header
        # Any other error code is not handled here; execution continues.

    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, headers, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(headers, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job", error='Error creating task'), 400

    # Derive the job directory by dropping the last path component (the file
    # name): "/home/testuser/test/sbatch.sh" -> ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # A trailing empty component (e.g. "/home/testuser/test//sbatch.sh") is
    # dropped as well. The emptiness guard avoids an IndexError when
    # targetPath contains no "/" at all.
    if job_dir_splitted and job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)

    try:
        # asynchronous task creation
        # NOTE(review): the worker receives account and use_plugin in addition
        # to the path arguments -- confirm the submit_job_path_task in use
        # accepts this argument list.
        aTask = threading.Thread(target=submit_job_path_task, name=ID,
                                 args=(headers, system_name, system_addr, targetPath,
                                       job_dir, account, use_plugin, task_id))
        aTask.start()

        update_task(task_id, headers, async_task.QUEUED, TASKS_URL)

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201
    except Exception as e:
        # An exception object is not JSON serializable; send its text instead
        # so that building the error response cannot fail itself.
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400
def post():
    """Create a reservation on the machine named in ``X-Machine-Name``.

    Validates the form fields ``reservation``, ``account``,
    ``numberOfNodes``, ``nodeType``, ``starttime`` and ``endtime`` (in that
    order), then runs ``RESERVATION_CMD -a`` on the machine through
    ``exec_remote_command``.

    Returns:
        A Flask response tuple ``(body, status[, headers])``:
        201 with the command output on success, 400 for missing or invalid
        input, an unavailable machine, a timeout (exit code 124) or any other
        command error, 404 when the command output reports a permission
        problem.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    # checks if machine name is set
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(error="Error creating reservation", description="No machine name given"), 400

    # PUBLIC endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exist": "Machine does not exist"}
        return jsonify(error="Error creating reservation"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # checking input data: (form field, validator, text used when invalid).
    # The order is the order in which problems are reported to the client.
    form_checks = (
        ("reservation", check_name, "parameter format is not valid"),
        ("account", check_name, "parameter format is not valid"),
        ("numberOfNodes", check_number, "parameter is not valid. It should be an integer > 0"),
        ("nodeType", check_name, "parameter format is not valid"),
        ("starttime", check_dateTime, "parameter format is not valid. It should be YYYY-MM-DDTHH:MM:SS"),
        ("endtime", check_dateTime, "parameter format is not valid. It should be YYYY-MM-DDTHH:MM:SS"),
    )
    form = {}
    for field, is_valid, problem in form_checks:
        try:
            value = request.form[field]
        except BadRequestKeyError:
            return jsonify(error="Error creating reservation",
                           description=f"'{field}' form data input missing"), 400
        if not is_valid(value):
            return jsonify(error="Error creating reservation",
                           description=f"'{field}' {problem} (value entered:'{value}')"), 400
        form[field] = value

    reservation = form["reservation"]
    account = form["account"]
    numberOfNodes = form["numberOfNodes"]
    nodeType = form["nodeType"]
    starttime = form["starttime"]
    endtime = form["endtime"]

    if not check_dateDiff(starttime, endtime):
        return jsonify(
            error="Error creating reservation",
            description=f"'endtime' occurs before 'starttime' (values entered: endtime='{endtime}' <= starttime='{starttime}')"), 400

    if not check_actualDate(starttime):
        return jsonify(
            error="Error creating reservation",
            description=f"'starttime' is in the past (values entered: starttime='{starttime}')"), 400

    # create a reservation
    # rsvmgmt -a unixGroupName numberOfNodes NodeType startDateTime endDateTime [optional reservationName]
    action = f"timeout {TIMEOUT} {RESERVATION_CMD} -a {account} {numberOfNodes} {nodeType} {starttime} {endtime} {reservation}"

    # execute command
    retval = exec_remote_command(auth_header, system_name, system_addr, action)

    error_str = retval["msg"]

    if retval["error"] != 0:
        if retval["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(error="Error creating reservation"), 400, header

        # 124 is tested as the "finished with timeout" exit code of the command
        if retval["error"] == 124:
            header = {"X-Timeout": "Command has finished with timeout signal"}
            return jsonify(error="Error creating reservation"), 400, header

        # in case of permission for other user
        if in_str(error_str, "Permission") or in_str(error_str, "SystemAdministrator"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(error="Error creating reservation"), 404, header

        # otherwise, generic error
        error_str = cleanup_rsvmgmt_error(error_str)
        return jsonify(error="Error creating reservation", description=error_str), 400

    # Reservation created: {reservation}
    return jsonify(success=retval["msg"]), 201
def put(reservation):
    """Update the reservation ``reservation`` on the machine in ``X-Machine-Name``.

    Validates the reservation name and the form fields ``numberOfNodes``,
    ``nodeType``, ``starttime`` and ``endtime`` (in that order), then runs
    ``RESERVATION_CMD -u`` on the machine through ``exec_remote_command``.

    Args:
        reservation: Name of the reservation to update; checked with
            ``check_name``.

    Returns:
        A Flask response tuple ``(body, status[, headers])``:
        200 with the command output on success, 400 for missing or invalid
        input, an unavailable machine, a timeout (exit code 124) or any other
        command error, 404 when the command output reports a permission
        problem.
    """
    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(error="Error updating reservation", description="No machine name given"), 400

    # PUBLIC endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exist": "Machine does not exist"}
        return jsonify(error="Error updating reservation"), 400, header

    # select index in the list corresponding with machine name
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # checking input data
    if not check_name(reservation):
        return jsonify(error="Error updating reservation",
                       description=f"'reservation' parameter format is not valid (value entered:'{reservation}')"), 400

    # (form field, validator, text used when invalid), in reporting order
    form_checks = (
        ("numberOfNodes", check_number, "parameter is not valid. It should be an integer > 0"),
        ("nodeType", check_name, "parameter format is not valid"),
        ("starttime", check_dateTime, "parameter format is not valid. It should be YYYY-MM-DDTHH:MM:SS"),
        ("endtime", check_dateTime, "parameter format is not valid. It should be YYYY-MM-DDTHH:MM:SS"),
    )
    form = {}
    for field, is_valid, problem in form_checks:
        try:
            value = request.form[field]
        except BadRequestKeyError:
            return jsonify(error="Error updating reservation",
                           description=f"'{field}' form data input missing"), 400
        if not is_valid(value):
            return jsonify(error="Error updating reservation",
                           description=f"'{field}' {problem} (value entered:'{value}')"), 400
        form[field] = value

    numberOfNodes = form["numberOfNodes"]
    nodeType = form["nodeType"]
    starttime = form["starttime"]
    endtime = form["endtime"]

    if not check_dateDiff(starttime, endtime):
        return jsonify(error="Error updating reservation",
                       description=f"'endtime' occurs before 'starttime' (values entered: endtime='{endtime}' <= starttime='{starttime}')"), 400

    if not check_actualDate(starttime):
        # This is the update endpoint, so report "updating" like every other
        # error returned by this function.
        return jsonify(error="Error updating reservation",
                       description=f"'starttime' is in the past (values entered: starttime='{starttime}')"), 400

    headers, ID = get_tracing_headers(request)

    # Update a reservation
    # rsvmgmt -u reservationName numberOfNodes NodeType StartDateTime EndDateTime
    action = f"ID={ID} timeout {TIMEOUT} {RESERVATION_CMD} -u '{reservation}' {numberOfNodes} {nodeType} {starttime} {endtime}"

    # execute command
    retval = exec_remote_command(headers, system_name, system_addr, action)

    error_str = retval["msg"]

    if retval["error"] != 0:
        if retval["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(error="Error updating reservation"), 400, header

        # 124 is tested as the "finished with timeout" exit code of the command
        if retval["error"] == 124:
            header = {"X-Timeout": "Command has finished with timeout signal"}
            return jsonify(error="Error updating reservation"), 400, header

        # in case of permission for other user
        if in_str(error_str, "Permission") or in_str(error_str, "SystemAdministrator"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(error="Error updating reservation"), 404, header

        # otherwise, generic error.
        # Messages may look like
        #   "timeout: ..." or "rsvmgmt: Error: You are not a member of the $1 project"
        # and only the part after those markers is reported. The markers are
        # removed as whole prefixes: str.lstrip() treats its argument as a set
        # of characters and would also eat leading letters of the message.
        for prefix in ("timeout:", "rsvmgmt:", "Error:"):
            error_str = error_str.lstrip()
            if error_str.startswith(prefix):
                error_str = error_str[len(prefix):]
        error_str = error_str.lstrip()

        return jsonify(error="Error updating reservation", description=error_str), 400

    # Reservation updated
    return jsonify(success=retval["msg"]), 200
def submit_job_path():
    """Submit a job from an sbatch file that already exists on the machine.

    Reads ``X-Machine-Name`` from the request headers and ``targetPath``
    from the form data, checks that the machine and the file are reachable
    for the user, creates a "compute" task and starts
    ``submit_job_path_task`` in a background thread.

    Returns:
        A Flask response tuple ``(body, status[, headers])``:
        201 with ``task_id`` and ``task_url`` when the task was queued,
        400 for missing input, an unavailable machine, an invalid file or a
        task-creation failure, 404 when the user lacks permissions.
    """
    auth_header = request.headers[AUTH_HEADER_NAME]

    try:
        system_name = request.headers["X-Machine-Name"]
    except KeyError:
        app.logger.error("No machinename given")
        return jsonify(description="Failed to submit job", error="No machine name given"), 400

    # public endpoints from Kong to users
    if system_name not in SYSTEMS_PUBLIC:
        header = {"X-Machine-Does-Not-Exists": "Machine does not exists"}
        return jsonify(description="Failed to submit job", error="Machine does not exists"), 400, header

    # SYSTEMS_PUBLIC and SYS_INTERNALS are read with the same index:
    # the position of the machine name in SYSTEMS_PUBLIC.
    system_idx = SYSTEMS_PUBLIC.index(system_name)
    system_addr = SYS_INTERNALS[system_idx]

    # check if machine is accessible by user:
    # exec test remote command
    resp = exec_remote_command(auth_header, system_name, system_addr, "true")

    if resp["error"] != 0:
        error_str = resp["msg"]
        if resp["error"] == -2:
            header = {"X-Machine-Not-Available": "Machine is not available"}
            return jsonify(description="Failed to submit job"), 400, header
        if in_str(error_str, "Permission") or in_str(error_str, "OPENSSH"):
            header = {"X-Permission-Denied": "User does not have permissions to access machine or path"}
            return jsonify(description="Failed to submit job"), 404, header
        # Any other error code is not handled here; execution continues.

    try:
        targetPath = request.form["targetPath"]
    except KeyError:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath is None:
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter not set in request")
        return data, 400

    if targetPath == "":
        data = jsonify(description="Failed to submit job", error="'targetPath' parameter value is empty")
        return data, 400

    # checks if targetPath is a valid path for this user in this machine
    check = is_valid_file(targetPath, auth_header, system_name, system_addr)

    if not check["result"]:
        return jsonify(description="Failed to submit job"), 400, check["headers"]

    # creates the async task related to the job submission
    task_id = create_task(auth_header, service="compute")
    # if error in creating task:
    if task_id == -1:
        return jsonify(description="Failed to submit job", error='Error creating task'), 400

    # Derive the job directory by dropping the last path component (the file
    # name): "/home/testuser/test/sbatch.sh" -> ['', 'home', 'testuser', 'test']
    job_dir_splitted = targetPath.split("/")[:-1]
    # A trailing empty component (e.g. "/home/testuser/test//sbatch.sh") is
    # dropped as well. The emptiness guard avoids an IndexError when
    # targetPath contains no "/" at all.
    if job_dir_splitted and job_dir_splitted[-1] == "":
        job_dir_splitted = job_dir_splitted[:-1]

    job_dir = "/".join(job_dir_splitted)

    try:
        # asynchronous task creation
        aTask = threading.Thread(target=submit_job_path_task,
                                 args=(auth_header, system_name, system_addr, targetPath, job_dir, task_id))
        aTask.start()

        update_task(task_id, auth_header, async_task.QUEUED, TASKS_URL)

        task_url = f"{KONG_URL}/tasks/{task_id}"
        data = jsonify(success="Task created", task_id=task_id, task_url=task_url)
        return data, 201
    except Exception as e:
        # An exception object is not JSON serializable; send its text instead
        # so that building the error response cannot fail itself.
        data = jsonify(description="Failed to submit job", error=str(e))
        return data, 400
def download_task(headers, system_name, system_addr, sourcePath, task_id):
    """Copy a file from a machine to the staging area and publish a temp URL.

    Progress and failures are reported on task ``task_id`` through
    ``update_task``; the function itself returns nothing.

    Steps, in order:
      1. renew the staging-area token,
      2. make sure a container named after the user exists,
      3. build the upload command and run it on the machine,
      4. create a temporary download URL and store it on the task,
      5. schedule deletion of the uploaded object.

    Args:
        headers: Request headers; ``headers[AUTH_HEADER_NAME]`` is used to
            obtain the user name, and the whole mapping is forwarded to
            ``update_task`` and ``exec_remote_command``.
        system_name: Machine name, forwarded to ``exec_remote_command``.
        system_addr: Machine address, forwarded to ``exec_remote_command``.
        sourcePath: Path of the file on the machine; its last "/"-separated
            component is used as the object name.
        task_id: Task to update; also used as the object prefix.
    """
    # everything after the last "/" (the whole path when there is none)
    object_name = sourcePath.rpartition("/")[2]

    # check if staging area token is valid
    if not staging.renew_token():
        update_task(task_id, headers, async_task.ERROR, "Staging area auth error")
        return

    # create container if it doesn't exist:
    container_name = get_username(headers[AUTH_HEADER_NAME])
    if not staging.is_container_created(container_name):
        if staging.create_container(container_name) == -1:
            failure = f"Could not create container {container_name} in Staging Area ({staging.get_object_storage()})"
            update_task(task_id, headers, async_task.ERROR, failure)
            return

    # the task id doubles as the object prefix in the container
    object_prefix = task_id
    upload_form = staging.create_upload_form(sourcePath, container_name, object_prefix,
                                             STORAGE_TEMPURL_EXP_TIME, STORAGE_MAX_FILE_SIZE)

    # tell Tasks that the upload begins:
    update_task(task_id, headers, async_task.ST_UPL_BEG)

    # upload starts:
    upload_result = exec_remote_command(headers, system_name, system_addr, upload_form["command"])

    # if upload to the staging area fails:
    if upload_result["error"] != 0:
        failure = f"Upload to Staging area has failed. Object: {object_name}"
        raw_error = upload_result["msg"]
        detail = ("User does not have permissions to access machine"
                  if in_str(raw_error, "OPENSSH") else raw_error)
        failure = f"{failure}. {detail}"

        app.logger.error(failure)
        update_task(task_id, headers, async_task.ST_UPL_ERR, failure)
        return

    # temp url for the file: valid for STORAGE_TEMPURL_EXP_TIME seconds
    temp_url = staging.create_temp_url(container_name, object_prefix, object_name,
                                       STORAGE_TEMPURL_EXP_TIME, internal=False)

    # if temp url creation failed:
    if temp_url is None:
        update_task(task_id, headers, async_task.ERROR,
                    f"Temp URL creation failed. Object: {object_name}")
        return

    # successfully created: store temp_url in the task with the upload-end status
    update_task(task_id, headers, async_task.ST_UPL_END, temp_url)

    # mark the object for deletion STORAGE_TEMPURL_EXP_TIME seconds from now
    delete_at = int(time.time()) + STORAGE_TEMPURL_EXP_TIME
    marked = staging.delete_object_after(containername=container_name,
                                         prefix=object_prefix,
                                         objectname=object_name,
                                         ttl=delete_at)

    if marked != 0:
        app.logger.error("Object couldn't be marked as X-Delete-At")
    else:
        app.logger.info(f"Setting {STORAGE_TEMPURL_EXP_TIME} [s] as X-Delete-At")