def post(self): args = self.post_parser.parse_args() vcName = args["vcName"] userName = args["userName"] database = args["database"] templateName = args["templateName"] if database == "master": if AuthorizationManager.HasAccess(userName, ResourceType.Cluster, "", Permission.Admin): scope = "master" else: return "access denied", 403 elif database == "vc": if AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin): scope = "vc:" + vcName else: return "access denied", 403 else: scope = "user:"******"Invalid JSON") dataHandler = DataHandler() ret = {} ret["result"] = dataHandler.UpdateTemplate(templateName, scope, json.dumps(template_json)) dataHandler.Close() return generate_response(ret)
def delete(self): args = self.delete_parser.parse_args() vcName = args["vcName"] userName = args["userName"] database = args["database"] templateName = args["templateName"] if database == "master": if AuthorizationManager.HasAccess(userName, ResourceType.Cluster, "", Permission.Admin): scope = "master" else: return "access denied", 403 elif database == "vc": if AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin): scope = "vc:" + vcName else: return "access denied", 403 else: scope = "user:"******"result"] = dataHandler.DeleteTemplate(templateName, scope) dataHandler.Close() return generate_response(ret)
def ListVCs(userName):
    """Return the VCs that ``userName`` may use, each tagged with an ``admin`` flag.

    Every entry from ``DataManager.ListVCs()`` on which the user holds
    ``Permission.User`` is kept; its ``admin`` key is set to the result of the
    ``Permission.Admin`` check. The VC dicts are modified in place.
    """
    visible = []
    for entry in DataManager.ListVCs():
        if not AuthorizationManager.HasAccess(userName, ResourceType.VC, entry["vcName"], Permission.User):
            continue
        entry['admin'] = AuthorizationManager.HasAccess(
            userName, ResourceType.VC, entry["vcName"], Permission.Admin)
        visible.append(entry)
    # web portal (client) can filter out Default VC
    return visible
def GetJobList(userName, vcName, jobOwner, num=None):
    """Return the job list for ``userName`` in ``vcName``; ``[]`` on any error.

    When ``jobOwner`` is ``"all"`` and the user holds ``Permission.Collaborator``
    on the VC, the result is ``GetUserPendingJobs(jobOwner, vcName)``.
    Otherwise it is the user's own pending jobs followed by the rows from
    ``DataHandler.GetJobList`` for that user (at most ``num``, using the
    status filter passed below). The ``jobMeta`` key is stripped from every
    returned job.

    Any exception is logged and swallowed. The data handler is closed on
    every path, including the error path.
    """
    dataHandler = None
    try:
        dataHandler = DataHandler()
        hasAccessOnAllJobs = AuthorizationManager.HasAccess(
            userName, ResourceType.VC, vcName, Permission.Collaborator)
        if jobOwner == "all" and hasAccessOnAllJobs:
            jobs = GetUserPendingJobs(jobOwner, vcName)
        else:
            jobs = [] + GetUserPendingJobs(userName, vcName)
            jobs = jobs + dataHandler.GetJobList(
                userName, vcName, num,
                "running,queued,scheduling,unapproved,pausing,paused",
                ("<>", "and"))
        for job in jobs:
            job.pop('jobMeta', None)
        return jobs
    except Exception as e:
        logger.error('Exception: %s', str(e))
        logger.warning("Fail to get job list for user %s, return empty list", userName)
        return []
    finally:
        # Previously the handler stayed open whenever the except branch ran.
        if dataHandler is not None:
            dataHandler.Close()
def GetJobDetail(userName, jobId):
    """Return the job record for ``jobId`` with its log attached, or ``None``.

    The record is returned only when exactly one job matches and the caller
    is either the job owner or holds ``Permission.Collaborator`` on the job's
    VC. ``jobDescription`` is removed from the record. ``job["log"]`` is set
    to the stored ``jobLog`` text (base64-decoded when ``isBase64`` says so),
    to ``""`` when no log is stored, or to ``"fail-to-get-logs"`` when reading
    the log raises.

    The data handler is closed even if a query raises.
    """
    job = None
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            candidate = jobs[0]
            if candidate["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, candidate["vcName"], Permission.Collaborator):
                job = candidate
                job["log"] = ""
                job.pop("jobDescription", None)
                try:
                    log = dataHandler.GetJobTextField(jobId, "jobLog")
                    try:
                        if isBase64(log):
                            log = base64.b64decode(log)
                    except Exception:
                        # Best effort: fall back to the raw stored text.
                        pass
                    if log is not None:
                        job["log"] = log
                except Exception:
                    job["log"] = "fail-to-get-logs"
    finally:
        dataHandler.Close()
    return job
def ListStorages(userName, vcName):
    """Return the storages of ``vcName``, or ``[]`` if the user lacks access.

    Requires ``Permission.User`` on the VC. The permission check now runs
    before a data handler is created, so a denied request opens none, and
    the handler is closed even if the query raises.
    """
    if not AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.User):
        return []
    dataHandler = DataHandler()
    try:
        return dataHandler.ListStorages(vcName)
    finally:
        dataHandler.Close()
def GetJobLog(userName, jobId):
    """Return ``{"log": ..., "cursor": None}`` for the job's stored log text.

    The log is returned only when exactly one job matches ``jobId`` and the
    caller is the job owner or holds ``Permission.Collaborator`` on the job's
    VC. The stored ``jobLog`` is base64-decoded when ``isBase64`` says so.
    In every other case (no access, no log, or an error while reading) the
    ``log`` value is an empty dict.

    The data handler is now closed on every path; the previous version never
    closed it.
    """
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                try:
                    log = dataHandler.GetJobTextField(jobId, "jobLog")
                    try:
                        if isBase64(log):
                            log = base64.b64decode(log)
                    except Exception:
                        # Best effort: fall back to the raw stored text.
                        pass
                    if log is not None:
                        return {
                            "log": log,
                            "cursor": None,
                        }
                except Exception:
                    # Deliberate best effort: a failed read yields the empty result.
                    pass
        return {
            "log": {},
            "cursor": None,
        }
    finally:
        dataHandler.Close()
def GetCommands(userName, jobId):
    """Return the commands recorded for ``jobId``, or ``[]``.

    Commands are returned only when exactly one job matches and the caller is
    the job owner or holds ``Permission.Collaborator`` on the job's VC.
    An unknown ``jobId`` now yields ``[]`` instead of raising ``IndexError``,
    and the data handler is closed even if a query raises.
    """
    commands = []
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                commands = dataHandler.GetCommands(jobId=jobId)
    finally:
        dataHandler.Close()
    return commands
def DeleteAce(userName, identityName, resourceType, resourceName):
    """Remove ``identityName``'s ACL entry on a resource.

    The caller must hold ``Permission.Admin`` on the resource. Returns the
    result of ``AuthorizationManager.DeleteAce`` on success, or the string
    ``"Access Denied!"`` otherwise.
    """
    # The ACL path is resolved before the permission check, as before.
    aclPath = AuthorizationManager.GetResourceAclPath(resourceName, resourceType)
    isAdmin = AuthorizationManager.HasAccess(userName, resourceType, resourceName, Permission.Admin)
    if not isAdmin:
        return "Access Denied!"
    return AuthorizationManager.DeleteAce(identityName, aclPath)
def ResumeJob(userName, jobId):
    """Move a paused job back to ``"unapproved"``; return the update result.

    Applies only when exactly one job matches, its status is ``"paused"``,
    and the caller is the job owner or holds ``Permission.Collaborator`` on
    the job's VC. Returns ``False`` when nothing was updated.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1 and jobs[0]["jobStatus"] == "paused":
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "unapproved")
    finally:
        dataHandler.Close()
    return ret
def UpdateStorage(userName, vcName, url, storageType, metadata, defaultMountPath):
    """Create or update a storage entry of ``vcName``.

    Requires ``Permission.Admin`` on the VC. Returns the result of
    ``DataHandler.UpdateStorage`` or the string ``"Access Denied!"``.

    The permission check now runs before a data handler is created, so a
    denied request opens none, and the handler is closed even if the update
    raises.
    """
    if not AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin):
        return "Access Denied!"
    dataHandler = DataHandler()
    try:
        return dataHandler.UpdateStorage(vcName, url, storageType, metadata, defaultMountPath)
    finally:
        dataHandler.Close()
def AddCommand(userName, jobId, command):
    """Attach ``command`` to job ``jobId``; return the result of the insert.

    Applies only when exactly one job matches and the caller is the job owner
    or holds ``Permission.Collaborator`` on the job's VC. Returns ``False``
    when nothing was added.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                ret = dataHandler.AddCommand(jobId, command)
    finally:
        dataHandler.Close()
    return ret
def PauseJob(userName, jobId):
    """Set the status of job ``jobId`` to ``"pausing"``; return the update result.

    Applies only when exactly one job matches and the caller is the job owner
    or holds ``Permission.Admin`` on the job's VC. Returns ``False`` when
    nothing was updated.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Admin):
                ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "pausing")
    finally:
        dataHandler.Close()
    return ret
def DeleteStorage(userName, vcName, url):
    """Delete the storage entry ``url`` of ``vcName``.

    Requires ``Permission.Admin`` on the VC. Returns the result of
    ``DataHandler.DeleteStorage`` or the string ``"Access Denied!"``.

    The permission check now runs before a data handler is created, so a
    denied request opens none, and the handler is closed even if the delete
    raises.
    """
    if not AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin):
        return "Access Denied!"
    dataHandler = DataHandler()
    try:
        return dataHandler.DeleteStorage(vcName, url)
    finally:
        dataHandler.Close()
def ApproveJob(userName, jobId):
    """Set job ``jobId`` to ``"queued"`` and invalidate its VC's job-list cache.

    The status update happens only when exactly one job matches and the
    caller holds ``Permission.Admin`` on the job's VC. Returns the update
    result, or ``False`` when nothing was updated.

    The cache for the first matching job's VC is invalidated whenever the
    lookup returned at least one job. An unknown ``jobId`` now returns
    ``False`` instead of raising ``IndexError`` on ``jobs[0]``. The data
    handler is closed even if a query raises.
    """
    ret = False
    jobs = []
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            if AuthorizationManager.HasAccess(userName, ResourceType.VC, jobs[0]["vcName"], Permission.Admin):
                ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "queued")
    finally:
        dataHandler.Close()
    if len(jobs) > 0:
        InvalidateJobListCache(jobs[0]["vcName"])
    return ret
def ApproveJob(userName, jobId):
    """Move an ``"unapproved"`` job to ``"queued"``; return the update result.

    Applies only when the job exists, its status is ``"unapproved"``, and the
    caller holds ``Permission.Admin`` on the job's VC. Returns ``False`` when
    nothing was updated.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        job = dataHandler.GetJobTextFields(jobId, ["vcName", "jobStatus"])
        if job is not None and job["jobStatus"] == "unapproved":
            if AuthorizationManager.HasAccess(userName, ResourceType.VC, job["vcName"], Permission.Admin):
                ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "queued")
    finally:
        dataHandler.Close()
    return ret
def GetCommands(userName, jobId):
    """Return the commands recorded for ``jobId``, or ``[]``.

    Commands are returned only when the job exists and the caller is the job
    owner or holds ``Permission.Collaborator`` on the job's VC.

    The data handler is closed even if a query raises.
    """
    commands = []
    dataHandler = DataHandler()
    try:
        job = dataHandler.GetJobTextFields(jobId, ["userName", "vcName"])
        if job is not None:
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                commands = dataHandler.GetCommands(jobId=jobId)
    finally:
        dataHandler.Close()
    return commands
def AddCommand(userName, jobId, command):
    """Attach ``command`` to job ``jobId``; return the result of the insert.

    Applies only when the job exists and the caller is the job owner or holds
    ``Permission.Collaborator`` on the job's VC. Returns ``False`` when
    nothing was added.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        job = dataHandler.GetJobTextFields(jobId, ["userName", "vcName"])
        if job is not None:
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Collaborator):
                ret = dataHandler.AddCommand(jobId, command)
    finally:
        dataHandler.Close()
    return ret
def ListVCs(userName):
    """Return the VCs on which ``userName`` holds ``Permission.User``.

    The VC list is read through a ``DataHandler``, which is closed even if
    the query or a permission check raises.
    """
    dataHandler = DataHandler()
    try:
        # todo : get other info (resource consumption, quota etc.) about VC?
        # web portal (client) can filter out Default VC
        return [
            vc for vc in dataHandler.ListVCs()
            if AuthorizationManager.HasAccess(userName, ResourceType.VC, vc["vcName"], Permission.User)
        ]
    finally:
        dataHandler.Close()
def post(self):
    """Create or update a job template in the requested scope.

    Query arguments: ``vcName``, ``userName``, ``database``, ``templateName``.
    ``database`` selects the scope:

    * ``"master"`` requires cluster ``Permission.Admin``; scope ``master``.
    * ``"vc"`` requires ``Permission.Admin`` on ``vcName``; scope ``vc:<vcName>``.
    * anything else uses the caller's own scope ``user:<userName>``.

    Returns ``("access denied", 403)`` when the permission check fails, a
    JSON ``result=False`` response when the request body is not JSON, and
    otherwise a JSON response whose ``result`` is the outcome of
    ``DataHandler.UpdateTemplate``.
    """
    parser = reqparse.RequestParser()
    for argName in ('vcName', 'userName', 'database', 'templateName'):
        parser.add_argument(argName, location="args")
    args = parser.parse_args()
    vcName = args["vcName"]
    userName = args["userName"]
    database = args["database"]
    templateName = args["templateName"]

    if database == 'master':
        if AuthorizationManager.HasAccess(userName, ResourceType.Cluster, "", Permission.Admin):
            scope = 'master'
        else:
            return 'access denied', 403
    elif database == 'vc':
        if AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.Admin):
            scope = 'vc:' + vcName
        else:
            return 'access denied', 403
    else:
        scope = 'user:' + userName

    template_json = request.json
    if template_json is None:
        return jsonify(result=False, message="Invalid JSON")

    ret = {}
    dataHandler = DataHandler()
    try:
        ret["result"] = dataHandler.UpdateTemplate(templateName, scope, json.dumps(template_json))
    finally:
        # Close the handler even when serialization or the update raises.
        dataHandler.Close()

    resp = jsonify(ret)
    resp.headers["Access-Control-Allow-Origin"] = "*"
    resp.headers["dataType"] = "json"
    return resp
def PauseJob(userName, jobId):
    """Set job ``jobId`` to ``"pausing"``; return the update result.

    Applies only when the job exists, its status is one of ``unapproved``,
    ``queued``, ``scheduling`` or ``running``, and the caller is the job
    owner or holds ``Permission.Admin`` on the job's VC. Returns ``False``
    when nothing was updated.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        job = dataHandler.GetJobTextFields(jobId, ["userName", "vcName", "jobStatus"])
        if job is not None and job["jobStatus"] in [
                "unapproved", "queued", "scheduling", "running"
        ]:
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Admin):
                ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "pausing")
    finally:
        dataHandler.Close()
    return ret
def KillJob(userName, jobId):
    """Mark job ``jobId`` (or its whole family) as ``"killing"``.

    Applies only when the job exists, its status is listed in the
    comma-separated ``pendingStatus``, and the caller is the job owner or
    holds ``Permission.Admin`` on the job's VC. When the job's ``isParent``
    is ``1`` the update is keyed on its ``familyToken`` instead of on
    ``jobId``. Returns the result of ``UpdateJobTextFields``, or ``False``
    when nothing was updated.

    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        job = dataHandler.GetJobTextFields(
            jobId, ["userName", "vcName", "jobStatus", "isParent", "familyToken"])
        if job is not None and job["jobStatus"] in pendingStatus.split(","):
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Admin):
                dataFields = {"jobStatus": "killing"}
                conditionFields = {"jobId": jobId}
                if job["isParent"] == 1:
                    conditionFields = {"familyToken": job["familyToken"]}
                ret = dataHandler.UpdateJobTextFields(conditionFields, dataFields)
    finally:
        dataHandler.Close()
    return ret
def KillJob(userName, jobId):
    """Mark job ``jobId`` (or every job of its family) as ``"killing"``.

    Applies only when exactly one job matches and the caller is the job owner
    or holds ``Permission.Admin`` on the job's VC. When the job's ``isParent``
    is ``1``, every job sharing its ``familyToken`` is updated and the result
    is truthy only if all updates succeeded. Returns ``False`` when nothing
    was updated.

    Every family member is now updated even after an earlier update failed;
    previously ``ret and Update(...)`` short-circuited and skipped the rest.
    The data handler is closed even if a query raises.
    """
    ret = False
    dataHandler = DataHandler()
    try:
        jobs = dataHandler.GetJob(jobId=jobId)
        if len(jobs) == 1:
            job = jobs[0]
            if job["userName"] == userName or AuthorizationManager.HasAccess(
                    userName, ResourceType.VC, job["vcName"], Permission.Admin):
                if job["isParent"] == 1:
                    ret = True
                    for currJob in dataHandler.GetJob(familyToken=job["familyToken"]):
                        # Run the update first so one failure does not skip the others.
                        updated = dataHandler.UpdateJobTextField(currJob["jobId"], "jobStatus", "killing")
                        ret = ret and updated
                else:
                    ret = dataHandler.UpdateJobTextField(jobId, "jobStatus", "killing")
    finally:
        dataHandler.Close()
    return ret
def GetVC(userName, vcName):
    """Return the VC record ``vcName`` enriched with GPU usage, or ``None``.

    ``None`` is returned when no VC of that name exists or the user lacks
    ``Permission.User`` on it. Otherwise the VC dict from
    ``DataManager.ListVCs()`` is extended in place with:

    * ``gpu_capacity``      - the VC quota,
    * ``gpu_used``          - GPUs of running, non-preemptible jobs,
    * ``gpu_unschedulable`` - the VC's share of cluster-unschedulable GPUs,
    * ``gpu_avaliable``     - capacity minus used minus unschedulable,
    * ``AvaliableJobNum``   - number of pending jobs returned for the VC,
    * ``node_status``       - copied from the cluster status,
    * ``user_status``       - per-user GPU usage, user names cut at ``"@"``.

    A job whose params carry no ``preemptionAllowed`` key is treated as
    non-preemptible instead of raising ``KeyError``.
    """
    ret = None
    clusterStatus, _ = DataManager.GetClusterStatus()
    clusterTotalRes = ResourceInfo(clusterStatus["gpu_capacity"])
    clusterReservedRes = ResourceInfo(clusterStatus["gpu_unschedulable"])
    user_status = {}

    vcList = DataManager.ListVCs()
    for vc in vcList:
        if vc["vcName"] == vcName and AuthorizationManager.HasAccess(userName, ResourceType.VC, vcName, Permission.User):
            vcTotalRes = ResourceInfo(json.loads(vc["quota"]))
            vcConsumedRes = ResourceInfo()
            jobs = DataManager.GetAllPendingJobs(vcName)
            for job in jobs:
                if job["jobStatus"] != "running":
                    continue
                username = job["userName"]
                jobParam = json.loads(base64.b64decode(job["jobParams"]))
                if "gpuType" not in jobParam or jobParam.get("preemptionAllowed", False):
                    continue
                gpuType = jobParam["gpuType"]
                gpuCount = GetJobTotalGpu(jobParam)  # computed once, used for both tallies
                vcConsumedRes.Add(ResourceInfo({gpuType: gpuCount}))
                if username not in user_status:
                    user_status[username] = ResourceInfo()
                user_status[username].Add(ResourceInfo({gpuType: gpuCount}))

            vcReservedRes = clusterReservedRes.GetFraction(vcTotalRes, clusterTotalRes)
            vcAvailableRes = ResourceInfo.Difference(
                ResourceInfo.Difference(vcTotalRes, vcConsumedRes), vcReservedRes)

            vc["gpu_capacity"] = vcTotalRes.ToSerializable()
            vc["gpu_used"] = vcConsumedRes.ToSerializable()
            vc["gpu_unschedulable"] = vcReservedRes.ToSerializable()
            vc["gpu_avaliable"] = vcAvailableRes.ToSerializable()
            vc["AvaliableJobNum"] = len(jobs)
            vc["node_status"] = clusterStatus["node_status"]
            vc["user_status"] = []
            # .items() works on both Python 2 and 3; iteritems() is Python 2 only.
            for user_name, user_gpu in user_status.items():
                # TODO: job_manager.getAlias should be put in a util file
                user_name = user_name.split("@")[0].strip()
                vc["user_status"].append({"userName": user_name, "userGPU": user_gpu.ToSerializable()})

            ret = vc
            break
    return ret
def GetJobDetailV2(userName, jobId):
    """Return the V2 job record for ``jobId``, or ``{}``.

    The record is returned only when exactly one job matches and the caller
    is the job owner or holds ``Permission.Collaborator`` on the job's VC.
    Exceptions are logged and swallowed; the data handler, if it was created,
    is always closed.
    """
    detail = {}
    handler = None
    try:
        handler = DataHandler()
        matches = handler.GetJobV2(jobId)
        if len(matches) == 1:
            candidate = matches[0]
            allowed = candidate["userName"] == userName or AuthorizationManager.HasAccess(
                userName, ResourceType.VC, candidate["vcName"], Permission.Collaborator)
            if allowed:
                detail = candidate
    except Exception as e:
        logger.error(
            "get job detail v2 exception for user: %s, jobId: %s, exception: %s",
            userName, jobId, str(e))
    finally:
        if handler is not None:
            handler.Close()
    return detail
def GetEndpoints(userName, jobId):
    """Return a list of endpoint summaries for job ``jobId``.

    Endpoints are listed only when the job exists and the caller is the job
    owner or holds ``Permission.Admin`` on the job's VC. Each summary copies
    ``id``, ``name``, ``username``, ``status``, ``hostNetwork`` and
    ``podName`` from the stored endpoint and adds ``domain`` from ``config``;
    ``podPort`` and ``nodeName`` are copied when present. A ``port`` is added
    only for running endpoints: the service ``port`` when ``hostNetwork`` is
    truthy, the ``nodePort`` otherwise.

    Exceptions are logged and swallowed; summaries collected before the
    failure are still returned. The data handler is always closed.
    """
    handler = DataHandler()
    summaries = []
    try:
        job = handler.GetJobTextFields(jobId, ["userName", "vcName", "endpoints"])
        authorized = job is not None and (
            job["userName"] == userName or AuthorizationManager.HasAccess(
                userName, ResourceType.VC, job["vcName"], Permission.Admin))
        if authorized:
            stored = job["endpoints"]
            endpoints = json.loads(stored) if stored is not None else {}
            for _, endpoint in endpoints.items():
                summary = {
                    "id": endpoint["id"],
                    "name": endpoint["name"],
                    "username": endpoint["username"],
                    "status": endpoint["status"],
                    "hostNetwork": endpoint["hostNetwork"],
                    "podName": endpoint["podName"],
                    "domain": config["domain"],
                }
                if "podPort" in endpoint:
                    summary["podPort"] = endpoint["podPort"]
                if endpoint["status"] == "running":
                    portKey = "port" if endpoint["hostNetwork"] else "nodePort"
                    firstPort = endpoint["endpointDescription"]["spec"]["ports"][0]
                    summary["port"] = int(firstPort[portKey])
                if "nodeName" in endpoint:
                    summary["nodeName"] = endpoint["nodeName"]
                summaries.append(summary)
    except Exception as e:
        logger.error("Get endpoint exception, ex: %s", str(e))
    finally:
        handler.Close()
    return summaries
def update_job_priorites(username, job_priorities):
    """Apply priority changes for several jobs at once.

    ``job_priorities`` maps job id to requested priority and is rewritten in
    place with the values returned by ``adjust_job_priority``. Only the job
    owner or a VC admin may change a job; jobs that cannot be found are
    skipped.

    Return shapes (callers must handle all three):
      * ``False`` as soon as one job is neither owned by ``username`` nor in
        a VC the user administers; nothing is written in that case.
      * ``(ret_code, pending)`` on success, where ``pending`` maps the ids of
        jobs whose status is listed in ``pendingStatus`` to their adjusted
        priority.
      * ``None`` when an exception was logged and swallowed.
    """
    handler = None
    try:
        handler = DataHandler()
        pending = {}
        for job_id in job_priorities:
            requested = job_priorities[job_id]
            job = handler.GetJobTextFields(
                job_id, ["userName", "vcName", "jobStatus"])
            if job is None:
                continue

            is_vc_admin = AuthorizationManager.HasAccess(
                username, ResourceType.VC, job["vcName"], Permission.Admin)
            # Fail the whole request on the first unauthorized entry.
            if job["userName"] != username and not is_vc_admin:
                return False

            # Adjust priority based on permission
            role = Permission.Admin if is_vc_admin else Permission.User
            job_priorities[job_id] = adjust_job_priority(requested, role)

            if job["jobStatus"] in pendingStatus.split(","):
                pending[job_id] = job_priorities[job_id]

        status = handler.update_job_priority(job_priorities)
        return status, pending
    except Exception as e:
        logger.error("Exception when updating job priorities: %s" % e)
    finally:
        if handler is not None:
            handler.Close()
def update_job_priorites(username, job_priorities):
    """Apply priority changes for several jobs at once.

    ``job_priorities`` maps job id to requested priority and is rewritten in
    place with the values returned by ``adjust_job_priority``. Only the job
    owner or a VC admin may change a job. Unknown job ids are logged and
    skipped; when several rows match one id the first is used.

    Returns ``False`` as soon as one job is neither owned by ``username`` nor
    in a VC the user administers, the result of
    ``DataHandler.update_job_priority`` on success, or ``None`` when an
    exception was logged and swallowed.
    """
    handler = None
    try:
        handler = DataHandler()
        for job_id in job_priorities:
            requested = job_priorities[job_id]
            matches = handler.GetJob(jobId=job_id)
            if len(matches) == 0:
                logger.warn("Update priority %s for non-existent job %s" %
                            (requested, job_id))
                continue
            if len(matches) > 1:
                logger.warn("Multiple job entries found that matches job %s. "
                            "Most likely a platform bug." % job_id)
            job = matches[0]

            is_vc_admin = AuthorizationManager.HasAccess(
                username, ResourceType.VC, job["vcName"], Permission.Admin)
            # Fail the whole request on the first unauthorized entry.
            if job["userName"] != username and not is_vc_admin:
                return False

            # Adjust priority based on permission
            role = Permission.Admin if is_vc_admin else Permission.User
            job_priorities[job_id] = adjust_job_priority(requested, role)

        status = handler.update_job_priority(job_priorities)
        return status
    except Exception as e:
        logger.error("Exception when updating job priorities: %s" % e)
    finally:
        if handler is not None:
            handler.Close()
def GetJobListV2(userName, vcName, jobOwner, num=None):
    """Return the V2 job list for ``userName`` in ``vcName``; ``{}`` on error.

    When ``jobOwner`` is ``"all"`` and the user holds
    ``Permission.Collaborator`` on the VC, the jobs of all users in a status
    listed in ``pendingStatus`` are returned. Otherwise the user's own jobs
    are returned with no status filter. Exceptions are logged and swallowed;
    the data handler, if it was created, is always closed.
    """
    listing = {}
    handler = None
    try:
        handler = DataHandler()
        # The permission is only consulted when all jobs were requested.
        seesEveryone = jobOwner == "all" and AuthorizationManager.HasAccess(
            userName, ResourceType.VC, vcName, Permission.Collaborator)
        if seesEveryone:
            listing = handler.GetJobListV2("all", vcName, num,
                                           pendingStatus, ("=", "or"))
        else:
            listing = handler.GetJobListV2(userName, vcName, num)
    except Exception as e:
        logger.error('get job list V2 Exception: user: %s, ex: %s', userName,
                     str(e))
    finally:
        if handler is not None:
            handler.Close()
    return listing
def _check_relative_dir(path, label):
    """Return an error message if ``path`` is not an acceptable relative directory.

    ``label`` ("job", "work" or "data") is interpolated into the message.
    Returns ``None`` when the path passes all checks.
    """
    if ".." in path:
        return "ERROR: '..' cannot be used in %s directory" % label
    if "\\." in path:
        return "ERROR: invalided %s directory" % label
    if path.startswith("/") or path.startswith("\\"):
        return "ERROR: %s directory should not start with '/' or '\\' " % label
    return None


def SubmitJob(jobParamsJsonStr):
    """Validate a job request, fill in defaults and store it.

    ``jobParamsJsonStr`` is parsed by ``LoadJobParams``. Returns a dict with
    either ``jobId`` (the job was stored) or ``error`` (a message).

    Steps, in order:
      1. ``jobName`` and ``vcName`` must be non-blank.
      2. Defaults: ``preemptionAllowed`` (False, else ``ToBool``), ``jobId``
         (a new UUID), ``resourcegpu`` (0, strings converted to int),
         ``familyToken`` (a new UUID), ``isParent`` (1), ``cmd`` ("").
      3. The submitting user needs ``Permission.User`` on the VC.
      4. ``jobPath``, ``workPath`` and ``dataPath`` are validated as relative
         paths, then normalized to forward slashes and collapsed with
         ``os.path.realpath``. ``jobPath`` and ``workPath`` are prefixed with
         the short user name unless they already start with it.
      5. If ``logDir`` is set, a companion ``tensorboard-<jobName>`` job is
         added first; then the job itself is added.

    A blank or empty ``familyToken`` is replaced by a fresh UUID. Previously
    an empty string was kept, so unrelated jobs shared one family. The data
    handler is closed even if storing a job raises.
    """
    ret = {}
    jobParams = LoadJobParams(jobParamsJsonStr)

    if "jobName" not in jobParams or len(jobParams["jobName"].strip()) == 0:
        ret["error"] = "ERROR: Job name cannot be empty"
        return ret
    if "vcName" not in jobParams or len(jobParams["vcName"].strip()) == 0:
        ret["error"] = "ERROR: VC name cannot be empty"
        return ret

    if "preemptionAllowed" not in jobParams:
        jobParams["preemptionAllowed"] = False
    else:
        jobParams["preemptionAllowed"] = ToBool(jobParams["preemptionAllowed"])

    if "jobId" not in jobParams or jobParams["jobId"] == "":
        jobParams["jobId"] = str(uuid.uuid4())

    if "resourcegpu" not in jobParams:
        jobParams["resourcegpu"] = 0
    # NOTE: ``basestring`` exists on Python 2 only; kept as in the original.
    if isinstance(jobParams["resourcegpu"], basestring):
        if len(jobParams["resourcegpu"].strip()) == 0:
            jobParams["resourcegpu"] = 0
        else:
            jobParams["resourcegpu"] = int(jobParams["resourcegpu"])

    # "".isspace() is False, so test the stripped value to catch empty tokens too.
    if "familyToken" not in jobParams or not jobParams["familyToken"].strip():
        jobParams["familyToken"] = str(uuid.uuid4())
    if "isParent" not in jobParams:
        jobParams["isParent"] = 1

    # Short user name used as the path prefix: drop "@domain" and "DOMAIN/".
    userName = jobParams["userName"]
    if "@" in userName:
        userName = userName.split("@")[0].strip()
    if "/" in userName:
        userName = userName.split("/")[1].strip()

    if not AuthorizationManager.HasAccess(jobParams["userName"], ResourceType.VC, jobParams["vcName"].strip(), Permission.User):
        ret["error"] = "Access Denied!"
        return ret

    if "cmd" not in jobParams:
        jobParams["cmd"] = ""

    if "jobPath" in jobParams and len(jobParams["jobPath"].strip()) > 0:
        # Remember the path as submitted; the tensorboard job reuses it.
        jobPath = jobParams["jobPath"]
        error = _check_relative_dir(jobParams["jobPath"], "job")
        if error is not None:
            ret["error"] = error
            return ret
        if not jobParams["jobPath"].startswith(userName):
            jobParams["jobPath"] = os.path.join(userName, jobParams["jobPath"])
    else:
        jobPath = userName + "/" + "jobs/" + time.strftime("%y%m%d") + "/" + jobParams["jobId"]
        jobParams["jobPath"] = jobPath

    if "workPath" not in jobParams or len(jobParams["workPath"].strip()) == 0:
        jobParams["workPath"] = "."
    error = _check_relative_dir(jobParams["workPath"], "work")
    if error is not None:
        ret["error"] = error
        return ret
    if not jobParams["workPath"].startswith(userName):
        jobParams["workPath"] = os.path.join(userName, jobParams["workPath"])

    if "dataPath" not in jobParams or len(jobParams["dataPath"].strip()) == 0:
        jobParams["dataPath"] = "."
    error = _check_relative_dir(jobParams["dataPath"], "data")
    if error is not None:
        ret["error"] = error
        return ret

    # Normalize separators, then collapse the path relative to "/" and drop
    # the leading slash again.
    for key in ("dataPath", "workPath", "jobPath"):
        jobParams[key] = jobParams[key].replace("\\", "/")
    for key in ("dataPath", "workPath", "jobPath"):
        jobParams[key] = os.path.realpath(os.path.join("/", jobParams[key]))[1:]

    dataHandler = DataHandler()
    try:
        if "logDir" in jobParams and len(jobParams["logDir"].strip()) > 0:
            tensorboardParams = jobParams.copy()

            # overwrite for distributed job
            if tensorboardParams.get("jobtrainingtype") == "PSDistJob":
                tensorboardParams["jobtrainingtype"] = "RegularJob"
                match = re.match('(.*)(/.*)', tensorboardParams["logDir"])
                if match is not None:
                    newDir = match.group(1) + "/worker0" + match.group(2)
                    prefix = match.group(1)
                    # Only redirect when the directory is not already under worker0.
                    if re.match('.*/worker0', prefix) is None:
                        tensorboardParams["logDir"] = newDir

            tensorboardParams["jobId"] = str(uuid.uuid4())
            tensorboardParams["jobName"] = "tensorboard-" + jobParams["jobName"]
            tensorboardParams["jobPath"] = jobPath
            tensorboardParams["jobType"] = "visualization"
            tensorboardParams["cmd"] = "tensorboard --logdir " + tensorboardParams["logDir"] + " --host 0.0.0.0"
            tensorboardParams["image"] = jobParams["image"]
            tensorboardParams["resourcegpu"] = 0
            tensorboardParams["interactivePort"] = "6006"

            if "error" not in ret:
                if not dataHandler.AddJob(tensorboardParams):
                    ret["error"] = "Cannot schedule tensorboard job."

        if "error" not in ret:
            if dataHandler.AddJob(jobParams):
                ret["jobId"] = jobParams["jobId"]
            else:
                ret["error"] = "Cannot schedule job. Cannot add job into database."
    finally:
        dataHandler.Close()
    return ret