def test_ParametricChain(self):
    """Submit one parametric job and follow the 3 jobs it is expected to generate.

    The generated jobs are checked by name, moved to 'Done', deleted in bulk,
    and finally each one is expected to report the 'Deleted' status.
    """
    submitter = WMSClient()
    stateUpdater = JobStateUpdateClient()
    monitor = JobMonitoringClient()

    # Build the parametric job and its XML description
    parametric = parametricJob()
    xmlDescription = createFile(parametric)

    # Submission returns the IDs of all generated jobs
    res = submitter.submitJob(parametric._toJDL(xmlFile=xmlDescription))
    self.assertTrue(res['OK'])
    generatedIDs = res['Value']
    self.assertEqual(len(generatedIDs), 3)

    # Every generated job carries its parameter index in its name
    res = monitor.getJobsParameters(generatedIDs, ['JobName'])
    self.assertTrue(res['OK'])
    foundNames = set(parameters['JobName'] for parameters in res['Value'].values())
    expectedNames = set('parametric_helloWorld_%s' % index for index in range(3))
    self.assertEqual(foundNames, expectedNames)

    for generatedID in generatedIDs:
        res = stateUpdater.setJobStatus(generatedID, 'Done', 'matching', 'source')
        self.assertTrue(res['OK'])

    # Bulk deletion only flags the jobs as deleted
    res = submitter.deleteJob(generatedIDs)
    self.assertTrue(res['OK'])

    for generatedID in generatedIDs:
        res = monitor.getJobStatus(generatedID)
        self.assertTrue(res['OK'])
        self.assertEqual(res['Value'], 'Deleted')
def initialize(self):
    """Read the agent options and instantiate the service clients.

    Each option falls back to the value already stored on the instance.

    :param self: self reference
    :return: S_OK()
    """
    # Shifter proxy. It is also the proxy used against the DIRAC File Catalog (DFC),
    # see the cleanContent method; this is possible because the
    # "UseServerCertificate" option is unset.
    self.shifterProxy = self.am_getOption("shifterProxy", self.shifterProxy)

    # Transformation types: an explicit agent option takes precedence over the
    # data-processing + data-manipulation types from Operations.
    self.dataProcTTypes = Operations().getValue("Transformations/DataProcessing", self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue("Transformations/DataManipulation", self.dataManipTTypes)
    configuredTypes = self.am_getOption("TransformationTypes", [])
    self.transformationTypes = sorted(configuredTypes or self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

    # Where to look for the transformation directories
    locations = self.am_getOption("DirectoryLocations", self.directoryLocations)
    self.directoryLocations = sorted(locations)
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

    # Metadata tag holding the transformation ID
    self.transfidmeta = self.am_getOption("TransfIDMeta", self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # Archive period, in days
    self.archiveAfter = self.am_getOption("ArchiveAfter", self.archiveAfter)
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # Storage element holding the transformation logs
    self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # Service clients: transformation, WMS, request, file catalog, job monitoring
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()
    self.jobMonitoringClient = JobMonitoringClient()

    return S_OK()
def __init__(self, *args, **kwargs):
    """Constructor: create the service clients and reset the CS option placeholders.

    All positional and keyword arguments are forwarded to AgentModule.
    """
    AgentModule.__init__(self, *args, **kwargs)

    # Service clients: replica manager, transformation, WMS, request, file catalog
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.requestClient = RequestClient()
    self.metadataClient = FileCatalogClient()

    # Placeholders for the CS options:
    # transformation types, directory locations, transformation metadata tag
    self.transformationTypes = self.directoryLocations = self.transfidmeta = None
    # archive period in days, active SEs, transformation log SE
    self.archiveAfter = self.activeStorages = self.logSE = None
    # enable/disable execution
    self.enableFlag = None
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None, destinationPlugin=None,
             ownerDN=None, ownerGroup=None):
    """Generate default objects for every collaborator that was not passed in.

    jobClass defaults to "DIRAC.Interfaces.API.Job.Job"; VOs can pass in their
    own extension of that class instead.

    When both ownerDN and ownerGroup are given, the default WMSClient is created
    with certificates and delegated to that owner.
    """
    super(WorkflowTasks, self).__init__(transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    # Delegation requires both halves of the owner identity
    useCertificates = bool(ownerDN) and bool(ownerGroup)

    if submissionClient:
        self.submissionClient = submissionClient
    else:
        self.submissionClient = WMSClient(useCertificates=useCertificates,
                                          delegatedDN=ownerDN,
                                          delegatedGroup=ownerGroup)

    self.jobMonitoringClient = jobMonitoringClient if jobMonitoringClient else JobMonitoringClient()
    self.jobClass = jobClass if jobClass else Job
    self.opsH = opsH if opsH else Operations()

    # The two plugin names are read from Operations only when not given explicitly
    if outputDataModule:
        self.outputDataModule = outputDataModule
    else:
        self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")

    if destinationPlugin:
        self.destinationPlugin = destinationPlugin
    else:
        self.destinationPlugin = self.opsH.getValue('Transformations/DestinationPlugin', 'BySE')

    # Plugin object slots, left empty here
    self.destinationPlugin_o = None
    self.outputDataModule_o = None
def test_matcher(self):
    """Submit a job, queue it by hand and check that the Matcher hands out a job.

    A proper DN must be inserted in the dictionaries below to run the test.
    """
    resourceDescription = {'OwnerGroup': 'prod',
                           'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo',
                           'ReleaseVersion': 'blabla',
                           'VirtualOrganization': 'LHCB',
                           'PilotInfoReportedFlag': 'True',
                           'PilotBenchmark': 'anotherPilot',
                           'LHCbPlatform': 'CERTO',
                           'Site': 'DIRAC.Jenkins.org',
                           'CPUTime': 86400}
    matcher = RPCClient('WorkloadManagement/Matcher')
    JobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination('DIRAC.Jenkins.org')
    job.setInputData('/a/bbb')
    job.setType('User')
    jobDescription = createFile(job)
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    # assertTrue replaces the deprecated assert_ alias; the message makes failures readable
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = res['Value']

    res = JobStateUpdate.setJobStatus(jobID, 'Waiting', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # Insert the job in the task queue directly
    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup': 'prod',
                 'Setup': 'dirac-JenkinsSetup',
                 'CPUTime': 86400}
    res = tqDB.insertJob(jobID, tqDefDict, 10)
    self.assertTrue(res['OK'], res.get('Message'))

    res = matcher.requestJob(resourceDescription)
    # Single-argument print(...) gives the same output under Python 2 and Python 3
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))

    wmsClient.deleteJob(jobID)
def initialize(self):
    """Read the agent options (with hard-coded defaults) and create the service clients.

    :param self: self reference
    :return: S_OK()
    """
    # Shifter proxy
    self.am_setOption('shifterProxy', 'DataManager')

    # Transformation types: an explicit agent option takes precedence over the
    # data-processing + data-manipulation types from Operations.
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes)
    configuredTypes = self.am_getOption('TransformationTypes', [])
    self.transformationTypes = sorted(configuredTypes or self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

    # Where to look for the transformation directories
    locations = self.am_getOption('DirectoryLocations', ['TransformationDB', 'MetadataCatalog'])
    self.directoryLocations = sorted(locations)
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

    # Metadata tag holding the transformation ID
    self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # Archive period, in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', 7)
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    # Active storage elements
    self.activeStorages = sorted(self.am_getOption('ActiveSEs', []))
    self.log.info("Will check the following storage elements: %s" % str(self.activeStorages))

    # Storage element holding the transformation logs
    self.logSE = Operations().getValue('/LogStorage/LogSE', 'LogSE')
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # Execution switch (open question from the original author: should this be
    # the CS option Status with 'Active' as default instead?)
    self.enableFlag = self.am_getOption('EnableFlag', 'True')

    # Service clients: transformation, WMS, request, file catalog
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.reqClient = ReqClient()
    self.metadataClient = FileCatalogClient()

    return S_OK()
def deleteJobsByStatus(self, condDict, delay=False):
    """Sets the job status to "DELETED" for jobs in condDict.

    The jobs are first passed to deleteJobOversizedSandbox; the ones it reports
    as failed are left out. The rest are deleted owner by owner through a
    WMSClient delegated to that owner.

    :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
    :param int delay: days of delay (False, the default, is passed through to _getJobsList as is)
    :returns: S_OK/S_ERROR; S_ERROR names the owners whose jobs could not be deleted
    """
    res = self._getJobsList(condDict, delay)
    if not res["OK"]:
        return res
    jobList = res["Value"]
    if not jobList:
        return S_OK()

    self.log.notice("Attempting to delete jobs", "(%d for %s)" % (len(jobList), condDict))

    result = self.deleteJobOversizedSandbox(jobList)  # This might set a request
    if not result["OK"]:
        self.log.error("Cannot schedule removal of oversized sandboxes", result["Message"])
        return result

    # Leave out the jobs whose oversized sandbox removal failed
    failedJobs = set(result["Value"][JobStatus.FAILED])
    jobList = [job for job in jobList if job not in failedJobs]
    if not jobList:
        return S_OK()

    ownerJobsDict = self._getOwnerJobsDict(jobList)

    failedOwners = []
    for owner, jobsList in ownerJobsDict.items():
        # The owner key is "<DN>;<group>"
        ownerParts = owner.split(";")
        ownerDN, ownerGroup = ownerParts[0], ownerParts[1]
        self.log.verbose(
            "Attempting to delete jobs", "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
        wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup)
        result = wmsClient.deleteJob(jobsList)
        if not result["OK"]:
            self.log.error(
                "Could not delete jobs",
                "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), result["Message"]),
            )
            # Keep going: one owner's failure must not block the others
            failedOwners.append("%s : %s" % (ownerDN, ownerGroup))

    if failedOwners:
        return S_ERROR("Could not delete jobs for: %s" % ", ".join(failedOwners))
    return S_OK()
def createJob():
    """Submit a hello-world job and return its job ID as an int.

    Fails with an AssertionError carrying the service message when the
    submission is not OK.
    """
    helloJob = helloWorldJob()
    xmlDescription = createFile(helloJob)

    submitter = WMSClient()
    submission = submitter.submitJob(helloJob._toJDL(xmlFile=xmlDescription))
    assert submission['OK'], submission['Message']

    return int(submission['Value'])
def __sendKillCommand(self, job):
    """Send a kill signal to the job such that it cannot continue running.

    The kill is issued through a WMSClient delegated to the job owner; failures
    are only logged.

    :param int job: ID of job to send kill command
    """
    dnResult = self.jobDB.getJobAttribute(job, 'OwnerDN')
    groupResult = self.jobDB.getJobAttribute(job, 'OwnerGroup')

    # Without both owner attributes there is nobody to delegate to
    if not (dnResult['OK'] and groupResult['OK']):
        self.log.error("Failed to get ownerDN or Group for job:",
                       "%s: %s, %s" % (job, dnResult.get('Message', ''), groupResult.get('Message', '')))
        return

    delegatedClient = WMSClient(useCertificates=True,
                                delegatedDN=dnResult['Value'],
                                delegatedGroup=groupResult['Value'])
    killResult = delegatedClient.killJob(job)
    if not killResult['OK']:
        self.log.error("Failed to send kill command to job", "%s: %s" % (job, killResult['Message']))
def __init__(self, *args, **kwargs):
    """Constructor: set up the WMS submission client and the workflow task manager.

    All arguments are forwarded to TaskManagerAgentBase.
    """
    TaskManagerAgentBase.__init__(self, *args, **kwargs)

    self.submissionClient = WMSClient()
    self.taskManager = WorkflowTasks(transClient=self.transClient,
                                     submissionClient=self.submissionClient)
    self.shifterProxy = 'ProductionManager'

    # The agent option wins; otherwise use the data-processing types from Operations
    self.transType = self.am_getOption('TransType', []) or Operations().getValue(
        'Transformations/DataProcessing', ['MCSimulation', 'Merge'])
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Generate default objects for every collaborator that was not passed in.

    jobClass defaults to "DIRAC.Interfaces.API.Job.Job"; VOs can pass in their
    own extension of that class instead. The default classes are imported only
    when they are actually needed.
    """
    super(WorkflowTasks, self).__init__(transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    if submissionClient:
        self.submissionClient = submissionClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
        self.submissionClient = WMSClient()

    if jobMonitoringClient:
        self.jobMonitoringClient = jobMonitoringClient
    else:
        from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
        self.jobMonitoringClient = JobMonitoringClient()

    # The output data module name comes from the global configuration when not given
    if outputDataModule:
        self.outputDataModule = outputDataModule
    else:
        self.outputDataModule = gConfig.getValue("/DIRAC/VOPolicy/OutputDataModule", "")

    if jobClass:
        self.jobClass = jobClass
    else:
        from DIRAC.Interfaces.API.Job import Job
        self.jobClass = Job

    if opsH:
        self.opsH = opsH
    else:
        from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
        self.opsH = Operations()
def test_matcher(self):
    """Submit a job, force it to Waiting, queue it by hand and ask the Matcher for a job.

    A proper DN must be inserted below to run the test.
    """
    submitter = WMSClient()

    # Job pinned to the test site
    helloJob = helloWorldJob()
    helloJob.setDestination("DIRAC.Jenkins.ch")
    helloJob.setInputData("/a/bbb")
    helloJob.setType("User")
    xmlDescription = createFile(helloJob)

    res = submitter.submitJob(helloJob._toJDL(xmlFile=xmlDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    submittedID = res["Value"]

    # Force the status update
    res = JobStateUpdateClient().setJobStatus(submittedID, JobStatus.WAITING, "matching", "source", None, True)
    self.assertTrue(res["OK"], res.get("Message"))

    # Put the job into a task queue directly
    queueDefinition = {
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "OwnerGroup": "prod",
        "Setup": "dirac-JenkinsSetup",
        "CPUTime": 86400,
    }
    res = TaskQueueDB().insertJob(submittedID, queueDefinition, 10)
    self.assertTrue(res["OK"], res.get("Message"))

    # What the requesting resource advertises to the Matcher
    resourceDescription = {
        "OwnerGroup": "prod",
        "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
        "DIRACVersion": "pippo",
        "GridCE": "some.grid.ce.org",
        "ReleaseVersion": "blabla",
        "VirtualOrganization": "LHCb",
        "PilotInfoReportedFlag": "True",
        "PilotBenchmark": "anotherPilot",
        "Site": "DIRAC.Jenkins.ch",
        "CPUTime": 86400,
    }
    res = MatcherClient().requestJob(resourceDescription)
    print(res)
    self.assertTrue(res["OK"], res.get("Message"))

    submitter.deleteJob(submittedID)
def __init__(self, transClient=None, logger=None, submissionClient=None, jobMonitoringClient=None,
             outputDataModule=None, jobClass=None, opsH=None):
    """Generate default objects for every collaborator that was not passed in.

    jobClass defaults to "DIRAC.Interfaces.API.Job.Job"; VOs can pass in their
    own extension of that class instead.
    """
    super(WorkflowTasks, self).__init__(transClient, logger or gLogger.getSubLogger('WorkflowTasks'))

    self.submissionClient = submissionClient if submissionClient else WMSClient()
    self.jobMonitoringClient = jobMonitoringClient if jobMonitoringClient else JobMonitoringClient()
    self.jobClass = jobClass if jobClass else Job
    self.opsH = opsH if opsH else Operations()

    # Read from Operations only when no module name was given explicitly
    if outputDataModule:
        self.outputDataModule = outputDataModule
    else:
        self.outputDataModule = self.opsH.getValue("Transformations/OutputDataModule", "")
def initialize(self):
    """Create the service clients and read the agent options, applying defaults.

    :return: S_OK()
    """
    # Service clients
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.requestClient = RequestClient()
    self.metadataClient = FileCatalogClient()
    self.storageUsageClient = StorageUsageClient()

    # Default proxy: the one defined under /Operations/Shifter/DataManager.
    # The shifterProxy option in the Configuration can be used to change it.
    self.am_setOption('shifterProxy', 'DataManager')

    defaultTypes = ['MCSimulation', 'DataReconstruction', 'DataStripping',
                    'MCStripping', 'Merge', 'Replication']
    self.transformationTypes = sortList(self.am_getOption('TransformationTypes', defaultTypes))
    gLogger.info("Will consider the following transformation types: %s" % str(self.transformationTypes))

    defaultLocations = ['TransformationDB', 'StorageUsage', 'MetadataCatalog']
    self.directoryLocations = sortList(self.am_getOption('DirectoryLocations', defaultLocations))
    gLogger.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))

    self.transfidmeta = self.am_getOption('TransfIDMeta', "TransformationID")
    gLogger.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)

    # Archive period, in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', 7)
    gLogger.info("Will archive Completed transformations after %d days" % self.archiveAfter)

    self.activeStorages = sortList(self.am_getOption('ActiveSEs', []))
    gLogger.info("Will check the following storage elements: %s" % str(self.activeStorages))

    self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
    gLogger.info("Will remove logs found on storage element: %s" % self.logSE)

    return S_OK()
def main():
    """Reset the Failed jobs of a user group, optionally restricted to one site.

    Usage: resetjobs.py <user group> [<site>]

    At most ``maxJobs`` jobs are reset per run; the script has to be rerun to
    process the remainder. Exits with status 1 when the user group is missing
    or the job list cannot be retrieved.
    """
    # Upper bound on the number of jobs reset in a single run
    maxJobs = 500

    nParams = len(sys.argv) - 1
    if nParams < 1:
        # Single-argument print(...) gives the same output under Python 2 and Python 3
        print("At least one parameter (user group, e.g. dune_user) expected, got %s !" % nParams)
        print("Usage: resetjobs.py <user group> -or- resetjobs.py <user group> <site>")
        print("Example: ./resetjobs.py dune_user LCG.UKI-LT2-IC-HEP.uk")
        print("Only available to dirac_admin.")
        sys.exit(1)

    # Selection dictionary sent to the job monitoring service
    jobFilter = {'OwnerGroup': str(sys.argv[1]), 'Status': 'Failed'}
    if len(sys.argv) == 3:
        jobFilter['Site'] = str(sys.argv[2])
    print(jobFilter)

    rpcClient = RPCClient("WorkloadManagement/JobMonitoring")
    jobs = rpcClient.getJobs(jobFilter)
    if not jobs["OK"]:
        print("Could not retrieve jobs.")
        sys.exit(1)

    job_ids = jobs["Value"]
    print("%s matching jobs found." % len(job_ids))
    if len(job_ids) > maxJobs:
        # The original message said "delete more", but this script resets jobs
        print("Will reset the first %d jobs, please rerun script to reset more." % maxJobs)

    wmsClient = WMSClient()
    for jobid in job_ids[:maxJobs]:
        res = wmsClient.resetJob(int(jobid))
        if not res['OK']:
            print("Could not reset job %s" % jobid)
def __init__(self, agentName, baseAgentName=False, properties=None):
    """ c'tor

    :param self: self reference
    :param str agentName: name of agent
    :param bool baseAgentName: forwarded unchanged to AgentModule
    :param dict properties: forwarded to AgentModule; a fresh empty dict is used when omitted
    """
    # A dict() default in the signature would be shared by every instance created
    # without this argument, so the empty dict is built per call instead.
    if properties is None:
        properties = {}
    AgentModule.__init__(self, agentName, baseAgentName, properties)

    # Service clients
    # replica manager
    self.replicaManager = ReplicaManager()
    # transformation client
    self.transClient = TransformationClient()
    # wms client
    self.wmsClient = WMSClient()
    # request client
    self.requestClient = RequestClient()
    # file catalog client
    self.metadataClient = FileCatalogClient()
    # storage usage client
    self.storageUsageClient = StorageUsageClient()

    # Placeholders for CS options
    # transformation types
    self.transformationTypes = None
    # directory locations
    self.directoryLocations = None
    # transformation metadata
    self.transfidmeta = None
    # archive period in days
    self.archiveAfter = None
    # active SEs
    self.activeStorages = None
    # transformation log SEs
    self.logSE = None
    # enable/disable execution
    self.enableFlag = None
def test_ParametricChain(self):
    """Submit one parametric job and follow the 3 jobs it is expected to generate.

    The generated jobs are checked by name, moved to CHECKING, deleted in bulk,
    and finally each one is expected to report the DELETED status.
    """
    submitter = WMSClient()
    stateUpdater = JobStateUpdateClient()
    monitor = JobMonitoringClient()

    # Build the parametric job and its XML description
    parametric = parametricJob()
    xmlDescription = createFile(parametric)

    # Submission returns the IDs of all generated jobs
    outcome = submitter.submitJob(parametric._toJDL(xmlFile=xmlDescription))
    self.assertTrue(outcome["OK"], outcome.get("Message"))
    generatedIDs = outcome["Value"]
    self.assertEqual(len(generatedIDs), 3, msg="Got %s" % str(generatedIDs))

    # Every generated job carries its parameter index in its name
    outcome = monitor.getJobsParameters(generatedIDs, ["JobName"])
    self.assertTrue(outcome["OK"], outcome.get("Message"))
    foundNames = set(parameters["JobName"] for parameters in outcome["Value"].values())
    expectedNames = set("parametric_helloWorld_%s" % index for index in range(3))
    self.assertEqual(foundNames, expectedNames)

    for generatedID in generatedIDs:
        outcome = stateUpdater.setJobStatus(generatedID, JobStatus.CHECKING, "checking", "source")
        self.assertTrue(outcome["OK"], outcome.get("Message"))

    outcome = submitter.deleteJob(generatedIDs)
    self.assertTrue(outcome["OK"], outcome.get("Message"))
    print(outcome)

    for generatedID in generatedIDs:
        outcome = monitor.getJobsStatus(generatedID)
        self.assertTrue(outcome["OK"], outcome.get("Message"))
        self.assertEqual(outcome["Value"][generatedID]["Status"], JobStatus.DELETED,
                         msg="Got %s" % str(outcome["Value"]))
shutil.copyfileobj(file.file, tFile) file.file.close() tFile.close() fileNameList.append(name) except Exception,x: exception_counter = 1 c.result = {"success":"false","error":"An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)} if len(fileNameList) > 0 and exception_counter == 0: sndBox = "InputSandbox = {\"" + "\",\"".join(fileNameList) + "\"};" else: sndBox = "" if exception_counter == 0: jdl = jdl + sndBox from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient jobManager = WMSClient(getRPCClient("WorkloadManagement/JobManager"), getRPCClient("WorkloadManagement/SandboxStore"), getTransferClient("WorkloadManagement/SandboxStore")) jdl = str(jdl) gLogger.info("J D L : ",jdl) try: result = jobManager.submitJob(jdl) if result["OK"]: c.result = {"success":"true","result":result["Value"]} else: c.result = {"success":"false","error":result["Message"]} except Exception,x: c.result = {"success":"false","error":"An EXCEPTION happens during job submittion: %s" % str(x)} if clearFS: shutil.rmtree(storePath) return c.result ################################################################################
class JobLaunchpadHandler(WebHandler):
    """Web handler of the job launchpad: proxy check, launchpad options and job submission."""

    AUTH_PROPS = "authenticated"

    def __init__(self, *args, **kwargs):
        """Cache the user name, group and VO of the current session."""
        super(JobLaunchpadHandler, self).__init__(*args, **kwargs)
        sessionData = self.getSessionData()
        self.user = sessionData['user'].get('username', '')
        self.group = sessionData['user'].get('group', '')
        self.vo = getVOForGroup(self.group)

    def web_getProxyStatus(self):
        """Write the proxy status of the current user to the response."""
        self.write(self.__getProxyStatus())

    def __getProxyStatus(self, secondsOverride=None):
        """Tell whether the session user has a long enough proxy in the ProxyManager.

        The required lifetime is read from /Registry/DefaultProxyLifeTime
        (default: 24 hours + 1 minute).

        :param secondsOverride: currently NOT used.
            NOTE(review): web_jobSubmit passes 86460 here; confirm whether it is
            meant to take precedence over the configured lifetime.
        :return: dict with "success" and either "result" ("true"/"false") or "error"
        """
        from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient
        proxyManager = ProxyManagerClient()
        userData = self.getSessionData()
        group = str(userData["user"]["group"])
        if group == "visitor":
            return {"success": "false", "error": "User is anonymous or is not registered in the system"}
        userDN = str(userData["user"]["DN"])
        defaultSeconds = 24 * 3600 + 60  # 24H + 1min
        validSeconds = gConfig.getValue("/Registry/DefaultProxyLifeTime", defaultSeconds)
        gLogger.info("\033[0;31m userHasProxy(%s, %s, %s) \033[0m" % (userDN, group, validSeconds))
        result = proxyManager.userHasProxy(userDN, group, validSeconds)
        # This log used to sit after the return statements and was never reached
        gLogger.info("\033[0;31m PROXY: \033[0m", result)
        if not result["OK"]:
            return {"success": "false", "error": "false"}
        return {"success": "true", "result": "true" if result["Value"] else "false"}

    def __getPlatform(self):
        """Return the option names under /Resources/Computing/OSCompatibility, or False on failure."""
        gLogger.info("start __getPlatform")
        path = "/Resources/Computing/OSCompatibility"
        result = gConfig.getOptionsDict(path)
        gLogger.debug(result)
        if not result["OK"]:
            return False
        platform = list(result["Value"])
        gLogger.debug("platform: %s" % platform)
        gLogger.info("end __getPlatform")
        return platform

    def __getOptionsFromCS(self, path="/WebApp/Launchpad/Options", delimiter=","):
        """Read the options under ``path`` recursively.

        Plain options are split on ``delimiter`` into lists; each subsection
        becomes a nested entry built the same way.

        :return: dict of options, or [] when ``path`` cannot be read
        """
        gLogger.info("start __getOptionsFromCS")

        result = gConfig.getOptionsDict(path)
        gLogger.always(result)
        if not result["OK"]:
            return []

        options = result["Value"]
        for name in list(options):
            options[name] = options[name].split(delimiter)

        result = gConfig.getSections(path)
        if result["OK"]:
            for section in result["Value"]:
                options[section] = self.__getOptionsFromCS(path + '/' + section, delimiter)

        gLogger.always("options: %s" % options)
        gLogger.info("end __getOptionsFromCS")
        return options

    def web_getLaunchpadOpts(self):
        """Write the launchpad default parameters and the predefined parameter sets.

        Each default is a ``[flag, value]`` pair; values found in the CS replace
        the hard-coded ones, and CS-only options are added with flag 0.
        """
        defaultParams = {"JobName": [1, 'DIRAC'],
                         "Executable": [1, "/bin/ls"],
                         "Arguments": [1, "-ltrA"],
                         "OutputSandbox": [1, "std.out, std.err"],
                         "JobGroup": [0, "Unknown"],
                         "InputData": [0, ""],
                         "OutputData": [0, ""],
                         "OutputSE": [0, "DIRAC-USER"],
                         "OutputPath": [0, ""],
                         "CPUTime": [0, "86400"],
                         "Site": [0, ""],
                         "BannedSite": [0, ""],
                         "Platform": [0, "Linux_x86_64_glibc-2.12"],
                         "Priority": [0, "5"],
                         "StdError": [0, "std.err"],
                         "StdOutput": [0, "std.out"],
                         "Parameters": [0, "0"],
                         "ParameterStart": [0, "0"],
                         "ParameterStep": [0, "1"],
                         "ParameterFactor": [0, "0"]}

        delimiter = gConfig.getValue("/WebApp/Launchpad/ListSeparator", ',')
        options = self.__getOptionsFromCS(delimiter=delimiter)

        gLogger.debug("Combined options from CS: %s" % options)
        gLogger.info("end __getLaunchpadOpts")

        # Update the defaults with the first value of every option found in the CS.
        # Subsections come back as nested dicts (or []) and carry no single value,
        # so they are skipped instead of failing on the [0] lookup.
        for key in options:
            values = options[key]
            if not isinstance(values, list) or not values:
                continue
            if key not in defaultParams:
                defaultParams[key] = [0, ""]
            defaultParams[key][1] = values[0]

        # Predefined sets of launchpad parameter values, per VO
        obj = Operations(vo=self.vo)
        predefinedSets = {}

        launchpadSections = obj.getSections("Launchpad")
        if launchpadSections['OK']:
            for section in launchpadSections["Value"]:
                sectionOptions = obj.getOptionsDict("Launchpad/" + section)
                # Logged instead of pretty-printed to the server's stdout
                gLogger.debug("Launchpad/%s options: %s" % (section, sectionOptions))
                predefinedSets[section] = sectionOptions["Value"] if sectionOptions['OK'] else {}

        self.write({"success": "true", "result": defaultParams, "predefinedSets": predefinedSets})

    def __canRunJobs(self):
        """Return True when the session user holds the NormalUser property."""
        data = self.getSessionData()
        return "NormalUser" in data["user"].get("properties", [])

    @asyncGen
    def web_jobSubmit(self):
        """Build a JDL from the request arguments and uploaded files, and submit it.

        The response is always a dict with "success" and either "result" (the
        submission value) or "error".
        """
        # Local imports, as in the rest of this handler
        import os
        import shutil

        if not self.__canRunJobs():
            self.finish({"success": "false", "error": "You are not allowed to run the jobs"})
            return
        proxy = yield self.threadTask(self.__getProxyStatus, 86460)
        if proxy["success"] == "false" or proxy["result"] == "false":
            self.finish({"success": "false",
                         "error": "You can not run a job: your proxy is valid less then 24 hours"})
            return

        # Split the request arguments into LFNs ("lfnField*") and plain JDL parameters
        params = {}
        lfns = []
        for name, values in self.request.arguments.items():
            try:
                value = values[0]
            except (IndexError, TypeError):
                continue  # argument without a usable value: ignore it
            if not value:
                continue
            if name.startswith("lfnField"):
                if value.strip():
                    lfns.append("LFN:" + value)
            else:
                params[name] = value

        jdl = ""
        for item in params:
            if item == "OutputSandbox":
                jdl = jdl + str(item) + " = {" + str(params[item]) + "};"
            # elif, not if: otherwise OutputSandbox falls into the final else and
            # is written a second time as a quoted string
            elif item == "Parameters":
                try:
                    parameters = int(params[item])
                    jdl = jdl + str(item) + " = \"" + str(parameters) + "\";"
                except (TypeError, ValueError):
                    # Not an integer: it has to be a {...} vector
                    parameters = str(params[item])
                    if parameters.find("{") >= 0 and parameters.find("}") >= 0:
                        parameters = parameters.rstrip("}")
                        parameters = parameters.lstrip("{")
                        if len(parameters) > 0:
                            jdl = jdl + str(item) + " = {" + parameters + "};"
                        else:
                            self.finish({"success": "false", "error": "Parameters vector has zero length"})
                            return
                    else:
                        self.finish({"success": "false",
                                     "error": "Parameters must be an integer or a vector. Example: 4 or {1,2,3,4}"})
                        return
            else:
                jdl = jdl + str(item) + " = \"" + str(params[item]) + "\";"

        # Uploaded files that actually carry a file name
        store = []
        for key in self.request.files:
            try:
                uploaded = self.request.files[key][0]
                if uploaded.filename:
                    gLogger.info("\033[0;31m file - %s \033[0m " % uploaded.filename)
                    store.append(uploaded)
            except (IndexError, AttributeError):
                pass  # malformed upload entry: ignore it

        gLogger.info("\033[0;31m *** %s \033[0m " % params)

        clearFS = False  # Clear directory flag
        fileNameList = []
        exception_counter = 0
        callback = {}

        if len(store) > 0:  # If there is a file(s) in sandbox
            clearFS = True
            storePath = tempfile.mkdtemp(prefix='DIRAC_')
            try:
                for fileObj in store:
                    # Only the base name is kept: the file name comes from the
                    # client and must not be able to point outside storePath
                    name = os.path.join(storePath, os.path.basename(fileObj.filename))
                    with open(name, 'wb') as tFile:
                        tFile.write(fileObj.body)
                    fileNameList.append(name)
            except Exception as x:
                exception_counter = 1
                callback = {"success": "false",
                            "error": "An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)}

        if ((len(fileNameList) > 0) or (len(lfns) > 0)) and exception_counter == 0:
            sndBox = "InputSandbox = {\"" + "\",\"".join(fileNameList + lfns) + "\"};"
        else:
            sndBox = ""

        if exception_counter == 0:
            jdl = jdl + sndBox
            from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
            jobManager = WMSClient(useCertificates=True, timeout=1800)
            jdl = str(jdl)
            gLogger.info("J D L : ", jdl)
            try:
                result = yield self.threadTask(jobManager.submitJob, jdl)
                if result["OK"]:
                    callback = {"success": "true", "result": result["Value"]}
                else:
                    callback = {"success": "false", "error": result["Message"]}
            except Exception as x:
                callback = {"success": "false",
                            "error": "An EXCEPTION happens during job submittion: %s" % str(x)}

        # Remove the temporary sandbox directory; a cleanup failure must not hide the answer
        if clearFS:
            shutil.rmtree(storePath, ignore_errors=True)

        # Send the outcome (success or error) back to the client
        self.finish(callback)
def __failStalledJobs(self, failedTime):
    """Change the Stalled status to Failed for jobs long in the Stalled status.

    A Stalled job is failed when its pilot is not Running, or when its latest
    update is more than ``failedTime`` seconds old. Failed jobs get a kill
    command and their accounting is sent. Jobs already Failed for one of the
    two stalled reasons but not yet accounted are accounted afterwards.

    :param failedTime: maximum time, in seconds, since the latest update
    :return: S_OK(number of jobs set to Failed), or the failing selectJobs result
    """
    selection = self.jobDB.selectJobs({'Status': 'Stalled'})
    if not selection['OK']:
        return selection
    stalledJobs = selection['Value']

    nFailed = 0
    pilotLostReason = "Job stalled: pilot not running"
    tooLongReason = 'Stalling for more than %d sec' % failedTime
    stalledReasons = (pilotLostReason, tooLongReason)

    if stalledJobs:
        self.log.info('%s Stalled jobs will be checked for failure' % (len(stalledJobs)))

        for job in stalledJobs:
            failureReason = False

            # First criterion: the pilot of the job is lost
            pilotResult = self.__getJobPilotStatus(job)
            if not pilotResult['OK']:
                self.log.error('Failed to get pilot status', pilotResult['Message'])
                continue

            if pilotResult['Value'] != "Running":
                failureReason = pilotLostReason
            else:
                # Second criterion: no update for too long
                updateResult = self.__getLatestUpdateTime(job)
                if not updateResult['OK']:
                    self.log.error('Failed to get job update time', updateResult['Message'])
                    continue
                if toEpoch() - updateResult['Value'] > failedTime:
                    failureReason = tooLongReason

            if not failureReason:
                continue

            # Kill the job in case it is not really dead, mark it Failed, account it
            WMSClient().killJob(job)
            self.__updateJobStatus(job, 'Failed', failureReason)
            nFailed += 1
            accountingResult = self.__sendAccounting(job)
            if not accountingResult['OK']:
                self.log.error('Failed to send accounting', accountingResult['Message'])

    nRecovered = 0

    for reason in stalledReasons:
        lastResult = self.jobDB.selectJobs({
            'Status': 'Failed',
            'MinorStatus': reason,
            'AccountedFlag': 'False'
        })
        if not lastResult['OK']:
            return lastResult
        unaccounted = lastResult['Value']
        if unaccounted:
            self.log.info('%s Stalled jobs will be Accounted' % (len(unaccounted)))
            for job in unaccounted:
                lastResult = self.__sendAccounting(job)
                if not lastResult['OK']:
                    self.log.error('Failed to send accounting', lastResult['Message'])
                    continue
                nRecovered += 1
        # lastResult is now the outcome of the last accounting attempt (if any):
        # when that one failed, the remaining reasons are not processed
        if not lastResult['OK']:
            break

    if nFailed:
        self.log.info('%d jobs set to Failed' % nFailed)
    if nRecovered:
        self.log.info('%d jobs properly Accounted' % nRecovered)
    return S_OK(nFailed)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit several jobs (different sites, types, inputs) and query the monitoring.

    One hello-world job is submitted per (destination, input data, job type)
    combination. The JobMonitoring getters are then checked, the last
    submitted job is updated through JobStateUpdate, and all jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination(dest)
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
                jobIDs.append(jobID)

    res = jobMonitor.getSites()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']),
                    msg="Got %s" % str(res['Value']))
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])],
                    msg="Got %s" % str(res['Value']))
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'matching'])],
                    msg="Got %s" % str(res['Value']))

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(
            res['Value'].get('Received') + res['Value'].get('Waiting')
            >= len(dests) * len(lfnss) * len(types))
    except TypeError:
        # One of the two counters is absent (None): nothing to compare
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # The updates below act on the last submitted job only
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit a batch of jobs, then exercise JobMonitoring and JobStateUpdate.

    One hello-world job is submitted for every (input data, job type) pair,
    all with destination DIRAC.Jenkins.ch. The monitoring getters are queried,
    the last submitted job is driven through several status updates, and
    finally all submitted jobs are deleted.
    """
    wms = WMSClient()
    monitor = JobMonitoringClient()
    stateUpdate = JobStateUpdateClient()

    def assertOK(reply):
        # Fail with the returned "Message" when the call was not successful
        self.assertTrue(reply["OK"], reply.get("Message"))

    def assertSummary(status, minorStatus=None):
        # Re-read the summary of the last submitted job and compare it
        summary = monitor.getJobSummary(int(jobID))
        assertOK(summary)
        self.assertEqual(summary["Value"]["Status"], status)
        if minorStatus is not None:
            self.assertEqual(summary["Value"]["MinorStatus"], minorStatus)

    # --- submission
    inputDataSets = [["/a/1.txt", "/a/2.txt"], ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
    types = ["User", "Test"]
    submitted = []
    for inputData in inputDataSets:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination("DIRAC.Jenkins.ch")
            job.setInputData(inputData)
            job.setType(jobType)
            jobDescription = createFile(job)
            reply = wms.submitJob(job._toJDL(xmlFile=jobDescription))
            assertOK(reply)
            jobID = reply["Value"]
            submitted.append(jobID)

    # --- global monitoring getters
    reply = monitor.getSites()
    print(reply)
    assertOK(reply)
    knownSites = {"ANY", "DIRAC.Jenkins.ch", "Site"}
    self.assertTrue(set(reply["Value"]) <= knownSites, msg="Got %s" % reply["Value"])

    reply = monitor.getJobTypes()
    assertOK(reply)
    foundTypes = sorted(reply["Value"])
    self.assertEqual(foundTypes, sorted(types), msg="Got %s" % str(sorted(reply["Value"])))

    reply = monitor.getApplicationStates()
    assertOK(reply)
    self.assertEqual(reply["Value"], ["app status", "Unknown"], msg="Got %s" % str(reply["Value"]))

    # these three are only checked for success
    for getter in (monitor.getOwners, monitor.getOwnerGroup, monitor.getProductionIds):
        assertOK(getter())

    # --- job groups, with and without a cut-off date
    reply = monitor.getJobGroups()
    assertOK(reply)
    groupsNoDate = reply["Value"]

    reply = monitor.getJobGroups(None, datetime.datetime.utcnow())
    assertOK(reply)
    groupsUntilNow = reply["Value"]
    self.assertEqual(groupsNoDate, groupsUntilNow)

    oneYearAgo = datetime.datetime.utcnow() - datetime.timedelta(days=365)
    reply = monitor.getJobGroups(None, oneYearAgo)
    assertOK(reply)
    groupsUntilLastYear = reply["Value"]
    self.assertTrue(set(groupsUntilLastYear).issubset(set(groupsUntilNow)), groupsUntilLastYear)

    # --- states
    reply = monitor.getStates()
    assertOK(reply)
    acceptedStates = [
        [JobStatus.RECEIVED],
        sorted([JobStatus.RECEIVED, JobStatus.KILLED]),
    ]
    self.assertTrue(sorted(reply["Value"]) in acceptedStates, reply["Value"])

    reply = monitor.getMinorStates()
    assertOK(reply)
    acceptedMinorStates = [
        ["Job accepted"],
        sorted(["Job accepted", "Job Rescheduled"]),
        sorted(["Job accepted", "Marked for termination"]),
    ]
    self.assertTrue(sorted(reply["Value"]) in acceptedMinorStates, reply["Value"])

    # --- job lists and summaries
    reply = monitor.getJobs()
    assertOK(reply)
    submittedAsStrings = {str(oneID) for oneID in submitted}
    self.assertTrue(submittedAsStrings <= set(reply["Value"]), reply["Value"])

    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    assertOK(monitor.getJobsSummary(submitted))
    assertOK(monitor.getJobPageSummaryWeb({}, [], 0, 100))

    # --- status updates on the last submitted job
    reply = stateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                "Status": JobStatus.CHECKING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            }
        },
        False,
    )
    assertOK(reply)
    assertSummary(JobStatus.CHECKING, "MinorStatus")

    reply = stateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)): {
                "Status": JobStatus.WAITING,
                "MinorStatus": "MinorStatus",
                "Source": "Unknown",
            },
            str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)): {
                "Status": JobStatus.MATCHED,
                "MinorStatus": "MinorStatus-matched",
                "Source": "Unknown",
            },
        },
        False,
    )
    assertOK(reply)
    assertSummary(JobStatus.MATCHED, "MinorStatus-matched")

    # setting a parameter must leave the status untouched
    assertOK(stateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]}))
    assertSummary(JobStatus.MATCHED, "MinorStatus-matched")

    assertOK(stateUpdate.setJobAttribute(jobID, "Status", JobStatus.RUNNING))
    assertSummary(JobStatus.RUNNING)

    # delete the jobs - this will just set its status to "deleted"
    wms.deleteJob(submitted)
class JobLaunchpadHandler(WebHandler):
    """Web handler for the job Launchpad.

    Exposes the proxy status of the session user, the default Launchpad
    parameters (optionally overlaid with values from the Configuration
    Service) and the job submission entry point.
    """

    AUTH_PROPS = "authenticated"

    def web_getProxyStatus(self):
        """Write the proxy status of the session user to the response."""
        self.write(self.__getProxyStatus())

    def __getProxyStatus(self, secondsOverride=None):
        """Tell whether the session user has a long enough proxy uploaded.

        The required validity is read from /Registry/DefaultProxyLifeTime and
        defaults to 24 hours plus one minute.

        :param secondsOverride: accepted for interface compatibility.
            NOTE(review): this value is never used; callers pass 86460 but
            the CS value wins - confirm whether it should take precedence.
        :return: dict with "success" ("true"/"false") and either "result"
                 ("true"/"false") or "error"
        """
        from DIRAC.FrameworkSystem.Client.ProxyManagerClient import ProxyManagerClient

        proxyManager = ProxyManagerClient()

        userData = self.getSessionData()

        group = str(userData["user"]["group"])

        if group == "visitor":
            return {
                "success": "false",
                "error": "User is anonymous or is not registered in the system"
            }

        userDN = str(userData["user"]["DN"])

        defaultSeconds = 24 * 3600 + 60  # 24H + 1min
        validSeconds = gConfig.getValue("/Registry/DefaultProxyLifeTime", defaultSeconds)

        gLogger.info("\033[0;31m userHasProxy(%s, %s, %s) \033[0m" % (userDN, group, validSeconds))

        result = proxyManager.userHasProxy(userDN, group, validSeconds)
        # This log used to sit after the return statements and never ran
        gLogger.info("\033[0;31m PROXY: \033[0m", str(result))

        if not result["OK"]:
            return {"success": "false", "error": "false"}
        if result["Value"]:
            return {"success": "true", "result": "true"}
        return {"success": "true", "result": "false"}

    def __getPlatform(self):
        """Return the platform names found under /Resources/Computing/OSCompatibility.

        :return: list of option names, or False when the CS lookup fails
        """
        gLogger.info("start __getPlatform")

        path = "/Resources/Computing/OSCompatibility"
        result = gConfig.getOptionsDict(path)

        gLogger.debug(result)

        if not result["OK"]:
            return False

        platformDict = result["Value"]
        platform = list(platformDict)

        gLogger.debug("platform: %s" % platform)
        gLogger.info("end __getPlatform")
        return platform

    def __getOptionsFromCS(self, path="/Website/Launchpad/Options", delimiter=","):
        """Read the options below ``path`` from the CS, recursing into sub-sections.

        :param str path: CS section to read
        :param str delimiter: separator used to split each option value
        :return: dict mapping each option name to its list of values and each
                 sub-section name to the dict built for it; [] when the
                 options lookup fails
        """
        gLogger.info("start __getOptionsFromCS")

        result = gConfig.getOptionsDict(path)

        gLogger.always(result)
        if not result["OK"]:
            return []

        options = result["Value"]
        for name in list(options):
            options[name] = options[name].split(delimiter)

        result = gConfig.getSections(path)
        if result["OK"]:
            for section in result["Value"]:
                options[section] = self.__getOptionsFromCS(path + '/' + section, delimiter)

        gLogger.always("options: %s" % options)
        gLogger.info("end __getOptionsFromCS")
        return options

    def __applyOptionsFromCS(self, defaultParams):
        """Overlay the Launchpad options from the CS onto ``defaultParams`` in place.

        For every CS option the first value becomes the default value of the
        parameter; parameters unknown to ``defaultParams`` are added with
        flag 0. Entries that are not a non-empty list (sub-sections, failed
        lookups) are skipped instead of raising.

        :param dict defaultParams: parameter name -> [flag, default value]
        :return: the options as read from the CS
        """
        delimiter = gConfig.getValue("/Website/Launchpad/ListSeparator", ',')
        options = self.__getOptionsFromCS(delimiter=delimiter)
        for key in options:
            values = options[key]
            if not isinstance(values, list) or not values:
                continue
            if key not in defaultParams:
                defaultParams[key] = [0, ""]
            defaultParams[key][1] = values[0]
        return options

    def __getPredefinedSets(self):
        """Return the predefined parameter sets found under Operations "Launchpad".

        :return: dict section name -> options dict ({} when the section
                 options cannot be read)
        """
        operations = Operations()
        predefinedSets = {}

        launchpadSections = operations.getSections("Launchpad")
        if launchpadSections['OK']:
            for section in launchpadSections["Value"]:
                predefinedSets[section] = {}
                sectionOptions = operations.getOptionsDict("Launchpad/" + section)
                gLogger.debug("Launchpad/%s options: %s" % (section, sectionOptions))
                if sectionOptions['OK']:
                    predefinedSets[section] = sectionOptions["Value"]
        return predefinedSets

    @asyncGen
    def web_getLaunchpadSetupWithLFNs(self):
        """Write the Launchpad setup for Eiscat with pre-selected LFNs as input data.

        The LFNs come as a comma-separated list in the "path" request
        argument. For every experiment folder (the LFN cut at its fifth "/")
        the file catalog is searched for files with metadata type=info, and
        every hit containing "rtg_def.m" is added to the input data. The
        calling JS client uses the returned setup to open a new Launchpad.
        """
        # on the fly file catalog for advanced launchpad
        if not hasattr(self, 'fc'):
            userData = self.getSessionData()
            group = str(userData["user"]["group"])
            vo = getVOForGroup(group)
            self.fc = FileCatalog(vo=vo)

        self.set_header('Content-type', 'text/plain')
        arguments = self.request.arguments
        gLogger.always("submit: incoming arguments %s to getLaunchpadSetupWithLFNs" % arguments)
        lfnStr = str(arguments['path'][0])
        lfnList = lfnStr.split(',')

        # checks if the experiments folder in lfn list has a rtg_def.m file at some subfolder
        gLogger.always("submit: checking if some rtg_def.m")
        processed = set()
        metaDict = {'type': 'info'}
        # iterate over a fresh split: lfnList grows while we search
        for lfn in lfnStr.split(','):
            # position of the fifth "/" in the LFN
            pos_relative = lfn.find("/")
            for _ in range(4):
                pos_relative = lfn.find("/", pos_relative + 1)
            experiment_lfn = lfn[0:pos_relative]
            if experiment_lfn in processed:
                continue
            processed.add(experiment_lfn)
            gLogger.always("checking rtg_def.m in %s" % experiment_lfn)
            result = self.fc.findFilesByMetadata(metaDict, path=str(experiment_lfn))
            gLogger.debug("findFilesByMetadata result: %s" % result)
            if not result['OK'] or not result['Value']:
                # a successful but empty lookup carries no "Message"
                gLogger.error("Failed to get type info from %s, %s" %
                              (experiment_lfn, result.get("Message", "no matching files")))
                continue
            for candidate_lfn in result['Value']:
                if candidate_lfn.find('rtg_def.m') > 0:
                    lfnList.append(candidate_lfn)

        ptlfn = ', '.join(lfnList)

        defaultParams = {
            "JobName": [1, 'Eiscat'],
            "Executable": [1, "/bin/ls"],
            "Arguments": [1, "-ltrA"],
            "OutputSandbox": [1, "std.out, std.err"],
            "InputData": [1, ptlfn],
            "OutputData": [0, ""],
            "OutputSE": [1, "EISCAT-disk"],
            "OutputPath": [0, ""],
            "CPUTime": [0, "86400"],
            "Site": [0, ""],
            "BannedSite": [0, ""],
            "Platform": [0, "Linux_x86_64_glibc-2.5"],
            "Priority": [0, "5"],
            "StdError": [0, "std.err"],
            "StdOutput": [0, "std.out"],
            "Parameters": [0, "0"],
            "ParameterStart": [0, "0"],
            "ParameterStep": [0, "1"]
        }

        options = self.__applyOptionsFromCS(defaultParams)
        gLogger.debug("Options from CS: %s" % options)
        gLogger.info("end __getLaunchpadOpts")
        gLogger.info("Default params + override from /Website/Launchpad/OptionsOverride -> %s" % defaultParams)

        self.write({
            "success": "true",
            "result": defaultParams,
            "predefinedSets": self.__getPredefinedSets()
        })

    def web_getLaunchpadOpts(self):
        """Write the default Launchpad parameters, CS overrides and predefined sets."""
        defaultParams = {
            "JobName": [1, 'DIRAC'],
            "Executable": [1, "/bin/ls"],
            "Arguments": [1, "-ltrA"],
            "OutputSandbox": [1, "std.out, std.err"],
            "InputData": [0, ""],
            "OutputData": [0, ""],
            "OutputSE": [0, "DIRAC-USER"],
            "OutputPath": [0, ""],
            "CPUTime": [0, "86400"],
            "Site": [0, ""],
            "BannedSite": [0, ""],
            "Platform": [0, "Linux_x86_64_glibc-2.5"],
            "Priority": [0, "5"],
            "StdError": [0, "std.err"],
            "StdOutput": [0, "std.out"],
            "Parameters": [0, "0"],
            "ParameterStart": [0, "0"],
            "ParameterStep": [0, "1"]
        }

        options = self.__applyOptionsFromCS(defaultParams)
        gLogger.debug("Combined options from CS: %s" % options)
        gLogger.info("end __getLaunchpadOpts")

        self.write({
            "success": "true",
            "result": defaultParams,
            "predefinedSets": self.__getPredefinedSets()
        })

    def __canRunJobs(self):
        """Return True when the session user has the "NormalUser" property."""
        user = self.getSessionData()["user"]
        return "properties" in user and "NormalUser" in user["properties"]

    @asyncGen
    def web_jobSubmit(self):
        """Build a JDL from the request and submit it through the WMS.

        Request arguments starting with "lfnField" become "LFN:" entries of
        the input sandbox, every other non-empty argument becomes a JDL
        attribute, and uploaded files are saved to a temporary directory and
        added to the input sandbox. The request is always finished with a
        dict holding "success" and either "result" or "error".
        """
        import os
        import shutil

        # self.set_header('Content-type', "text/html")
        # Otherwise the browser would offer you to download a JobSubmit file
        if not self.__canRunJobs():
            self.finish({
                "success": "false",
                "error": "You are not allowed to run the jobs"
            })
            return
        proxy = yield self.threadTask(self.__getProxyStatus, 86460)
        if proxy["success"] == "false" or proxy["result"] == "false":
            self.finish({
                "success": "false",
                "error": "You can not run a job: your proxy is valid less then 24 hours"
            })
            return

        jdl = ""
        params = {}
        lfns = []

        for name, values in self.request.arguments.items():
            value = values[0] if values else ""
            if len(value) == 0:
                continue
            if name[:8] == "lfnField":
                if len(value.strip()) > 0:
                    lfns.append("LFN:" + value)
            else:
                params[name] = value

        for item in params:
            # NOTE(review): OutputSandbox is also written by the final else
            # branch below, so it appears twice in the JDL - confirm intended.
            if item == "OutputSandbox":
                jdl = jdl + str(item) + " = {" + str(params[item]) + "};"
            if item == "Parameters":
                try:
                    parameters = int(params[item])
                except (TypeError, ValueError):
                    # not an integer: must be a non-empty {...} vector
                    parameters = str(params[item])
                    if parameters.find("{") >= 0 and parameters.find("}") >= 0:
                        parameters = parameters.rstrip("}")
                        parameters = parameters.lstrip("{")
                        if len(parameters) > 0:
                            jdl = jdl + str(item) + " = {" + parameters + "};"
                        else:
                            self.finish({
                                "success": "false",
                                "error": "Parameters vector has zero length"
                            })
                            return
                    else:
                        self.finish({
                            "success": "false",
                            "error": "Parameters must be an integer or a vector. Example: 4 or {1,2,3,4}"
                        })
                        return
                else:
                    jdl = jdl + str(item) + " = \"" + str(parameters) + "\";"
            else:
                jdl = jdl + str(item) + " = \"" + str(params[item]) + "\";"

        store = []
        for uploads in self.request.files.values():
            if uploads and uploads[0].filename:
                gLogger.info("\033[0;31m file - %s \033[0m " % uploads[0].filename)
                store.append(uploads[0])

        gLogger.info("\033[0;31m *** %s \033[0m " % params)

        storePath = None  # temporary sandbox directory, when files were uploaded
        fileNameList = []
        saveFailed = False
        callback = {}

        if len(store) > 0:  # If there is a file(s) in sandbox
            storePath = tempfile.mkdtemp(prefix='DIRAC_')
            try:
                for fileObj in store:
                    # keep only the base name: the client-supplied file name
                    # must not be able to point outside the temporary directory
                    name = os.path.join(storePath, os.path.basename(fileObj.filename))
                    with open(name, 'wb') as tFile:
                        tFile.write(fileObj.body)
                    fileNameList.append(name)
            except Exception as x:
                saveFailed = True
                callback = {
                    "success": "false",
                    "error": "An EXCEPTION happens during saving your sandbox file(s): %s" % str(x)
                }

        if not saveFailed:
            if len(fileNameList) > 0 or len(lfns) > 0:
                jdl = jdl + "InputSandbox = {\"" + "\",\"".join(fileNameList + lfns) + "\"};"

            from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient

            jobManager = WMSClient(useCertificates=True, timeout=1800)
            jdl = str(jdl)
            gLogger.info("J D L : ", jdl)
            try:
                result = yield self.threadTask(jobManager.submitJob, jdl)
                if result["OK"]:
                    callback = {"success": "true", "result": result["Value"]}
                else:
                    callback = {"success": "false", "error": result["Message"]}
            except Exception as x:
                callback = {
                    "success": "false",
                    "error": "An EXCEPTION happens during job submittion: %s" % str(x)
                }

        # The uploaded copies are no longer needed once submission was attempted
        if storePath is not None:
            shutil.rmtree(storePath, ignore_errors=True)
        self.finish(callback)
def __init__(self):
    """Initialise the TaskBase part, then create the WMS and job monitoring clients."""
    # The base class initialiser runs first, before any attribute is set here
    TaskBase.__init__(self)
    # WMSClient instance kept as ``submissionClient``
    self.submissionClient = WMSClient()
    # JobMonitoringClient instance kept as ``jobMonitoringClient``
    self.jobMonitoringClient = JobMonitoringClient()
def test_JobStateUpdateAndJobMonitoringMultuple(self):
    """Submit several jobs (different types and inputs) and query the monitoring.

    One hello-world job is submitted per (input data, job type) pair, all
    with destination DIRAC.Jenkins.ch. The JobMonitoring getters are then
    checked, the last submitted job is updated through JobStateUpdate, and
    all jobs are deleted.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    jobIDs = []
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for lfns in lfnss:
        for jobType in types:
            job = helloWorldJob()
            job.setDestination('DIRAC.Jenkins.ch')
            job.setInputData(lfns)
            job.setType(jobType)
            jobDescription = createFile(job)
            res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
            self.assertTrue(res['OK'], res.get('Message'))
            jobID = res['Value']
            jobIDs.append(jobID)

    res = jobMonitor.getSites()
    print(res)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'}, msg="Got %s" % res['Value'])
    res = jobMonitor.getJobTypes()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(sorted(res['Value']), sorted(types), msg="Got %s" % str(sorted(res['Value'])))
    res = jobMonitor.getApplicationStates()
    self.assertTrue(res['OK'], res.get('Message'))
    # report the sorted values, not the sorted characters of their repr
    self.assertEqual(sorted(res['Value']), sorted(['Unknown']), msg="Got %s" % str(sorted(res['Value'])))

    res = jobMonitor.getOwners()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getOwnerGroup()
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getProductionIds()
    self.assertTrue(res['OK'], res.get('Message'))

    # job groups: no date, up to now, up to one year ago
    res = jobMonitor.getJobGroups()
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_empty = res['Value']
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanNow = res['Value']
    self.assertEqual(resJG_empty, resJG_olderThanNow)
    res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow() - datetime.timedelta(days=365))
    self.assertTrue(res['OK'], res.get('Message'))
    resJG_olderThanOneYear = res['Value']
    self.assertTrue(set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))

    res = jobMonitor.getStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Received'], sorted(['Received', 'Waiting'])])
    res = jobMonitor.getMinorStates()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(sorted(res['Value']) in [['Job accepted'], sorted(['Job accepted', 'Job Rescheduled'])])

    res = jobMonitor.getJobs()
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
    # res = jobMonitor.getCounters(attrList)
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getCurrentJobCounters()
    self.assertTrue(res['OK'], res.get('Message'))
    try:
        self.assertTrue(
            res['Value'].get('Received') + res['Value'].get('Waiting') >= len(lfnss) * len(types))
    except TypeError:
        # One of the two counters is absent (None): nothing to compare
        pass
    res = jobMonitor.getJobsSummary(jobIDs)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
    self.assertTrue(res['OK'], res.get('Message'))

    # The updates below act on the last submitted job only
    res = jobStateUpdate.setJobStatusBulk(
        jobID,
        {
            str(datetime.datetime.utcnow()): {
                'Status': 'Running',
                'MinorStatus': 'MinorStatus',
                'ApplicationStatus': 'ApplicationStatus',
                'Source': 'Unknown'
            }
        })
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob(jobIDs)
def test_FullChain(self):
    """Drive one job through the whole WMS chain.

    The test will
      - call all the WMSClient methods that will end up calling all the
        JobManager service methods
      - use the JobMonitoring to verify few properties
      - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wms = WMSClient()
    monitor = JobMonitoringClient()
    stateUpdate = JobStateUpdateClient()

    def assertOK(reply):
        # Fail with the returned 'Message' when the call was not successful
        self.assertTrue(reply['OK'], reply.get('Message'))

    def assertJobStatus(expected):
        # Read the current job status back from the monitoring and compare it
        reply = monitor.getJobStatus(jobID)
        assertOK(reply)
        self.assertEqual(reply['Value'], expected, msg="Got %s" % str(reply['Value']))

    # create and submit the job
    job = helloWorldJob()
    jobDescription = createFile(job)
    submission = wms.submitJob(job._toJDL(xmlFile=jobDescription))
    assertOK(submission)
    self.assertTrue(isinstance(submission['Value'], int), msg="Got %s" % type(submission['Value']))
    self.assertEqual(submission['Value'], submission['JobID'],
                     msg="Got %s, expected %s" % (str(submission['Value']), submission['JobID']))
    jobID = submission['Value']

    # updating the status
    assertOK(stateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source'))

    # reset, then reschedule the job
    assertOK(wms.resetJob(jobID))
    assertOK(wms.rescheduleJob(jobID))
    assertJobStatus('Received')

    minorReply = monitor.getJobsMinorStatus([jobID])
    assertOK(minorReply)
    expectedMinor = {jobID: {'MinorStatus': 'Job Rescheduled', 'JobID': jobID}}
    self.assertEqual(minorReply['Value'], expectedMinor, msg="Got %s" % str(minorReply['Value']))

    appReply = monitor.getJobsApplicationStatus([jobID])
    assertOK(appReply)
    expectedApp = {jobID: {'ApplicationStatus': 'Unknown', 'JobID': jobID}}
    self.assertEqual(appReply['Value'], expectedApp, msg="Got %s" % str(appReply['Value']))

    # updating the status again
    assertOK(stateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source'))

    # kill the job
    assertOK(wms.killJob(jobID))
    assertJobStatus('Killed')

    # updating the status once more
    assertOK(stateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source'))

    # kill the job: this time it won't kill... it's done!
    assertOK(wms.killJob(jobID))
    assertJobStatus('Done')

    # delete the job - this will just set its status to "deleted"
    assertOK(wms.deleteJob(jobID))
    assertJobStatus('Deleted')
def test_JobStateUpdateAndJobMonitoring(self):
    """Verify the JobStateUpdate and JobMonitoring functions on a single job.

    A hello-world job is submitted, updated through the JobStateUpdate
    service (status, parameters, application status, site, heartbeat) and
    read back through the JobMonitoring client after each step.
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    jobID = int(res['Value'])
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL(jobID, True)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobJDL(jobID, False)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobsParameters([jobID], [])
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {})
    res = jobMonitor.getJobsParameters([jobID], ['Owner'])
    self.assertTrue(res['OK'], res.get('Message'))

    # Adding stuff
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'), ('par2', 'par2Value')])
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status', 'source')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.setJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    # res = jobStateUpdate.unsetJobFlag()
    # self.assertTrue(res['OK'], res.get('Message'))
    res = jobStateUpdate.setJobSite(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    # res = jobMonitor.traceJobParameter('Site', 1, 'Status')
    # self.assertTrue(res['OK'], res.get('Message'))

    # now checking few things
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Running')
    res = jobMonitor.getJobParameter(jobID, 'par1')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value'})
    res = jobMonitor.getJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'})
    res = jobMonitor.getJobAttribute(jobID, 'Site')
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Site')
    res = jobMonitor.getJobAttributes(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['JobName'], 'helloWorld')
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    self.assertEqual(res['Value']['Status'], 'Running')
    res = jobMonitor.getJobHeartBeatData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getInputData(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], [])
    res = jobMonitor.getJobPrimarySummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getAtticJobParameters(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # bulk status change: only Status and MinorStatus are expected to move
    res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus', 'Unknown')
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobSummary(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value']['Status'], 'Done')
    self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
    self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
    res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'}, {'boh': 'boh'})
    self.assertTrue(res['OK'], res.get('Message'))

    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob(jobID)
def removeDeletedJobs(self):
    """Fully remove jobs that are already in status "DELETED", unless there are still requests.

    Sandboxes are unassigned from all DELETED jobs first. Jobs whose request
    is not in a final state, or whose request cannot be deleted, are kept.
    The remaining jobs are removed owner by owner, using a WMSClient
    delegated to that owner's DN and group.

    :returns: S_OK when there was nothing to remove or every removal
              succeeded; the failed result of an intermediate call; or
              S_ERROR when the removal failed for at least one owner
    """
    res = self._getJobsList({"Status": JobStatus.DELETED})
    if not res["OK"]:
        return res
    jobList = res["Value"]
    if not jobList:
        self.log.info("No jobs to remove")
        return S_OK()

    self.log.info("Unassigning sandboxes from soon to be deleted jobs", "(%d)" % len(jobList))
    result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
    if not result["OK"]:
        self.log.error("Cannot unassign jobs to sandboxes", result["Message"])
        return result

    self.log.info("Attempting to remove deleted jobs", "(%d)" % len(jobList))

    # remove from jobList those that have still Operations to do in RMS
    reqClient = ReqClient()
    res = reqClient.getRequestIDsForJobs(jobList)
    if not res["OK"]:
        return res
    if res["Value"]["Successful"]:
        notFinal = set()
        # Check whether these requests are in a final status
        for job, reqID in res["Value"]["Successful"].items():
            # A failed status lookup yields None, which is not final: the job is kept
            if reqClient.getRequestStatus(reqID).get("Value") not in Request.FINAL_STATES:
                # Keep that job
                notFinal.add(job)
            else:
                # Remove the request, if failed, keep the job
                res1 = reqClient.deleteRequest(reqID)
                if not res1["OK"]:
                    notFinal.add(job)
        if notFinal:
            self.log.info(
                "Some jobs won't be removed, as still having Requests not in final status",
                "(n=%d)" % len(notFinal))
            jobList = list(set(jobList) - notFinal)
    if not jobList:
        return S_OK()

    ownerJobsDict = self._getOwnerJobsDict(jobList)

    failedOwners = 0
    for owner, jobsList in ownerJobsDict.items():
        # the owner key has the form "<DN>;<group>"
        ownerParts = owner.split(";")
        ownerDN = ownerParts[0]
        ownerGroup = ownerParts[1]
        self.log.verbose(
            "Attempting to remove jobs",
            "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
        wmsClient = WMSClient(useCertificates=True, delegatedDN=ownerDN, delegatedGroup=ownerGroup)
        result = wmsClient.removeJob(jobsList)
        if not result["OK"]:
            self.log.error(
                "Could not remove jobs",
                "for %s : %s (n=%d) : %s" % (ownerDN, ownerGroup, len(jobsList), result["Message"]),
            )
            # keep going: the other owners' jobs can still be removed
            failedOwners += 1

    if failedOwners:
        # Give callers a message to report instead of an empty error
        return S_ERROR("Could not remove jobs for %d owner(s)" % failedOwners)

    return S_OK()
def test_FullChain(self):
    """Drive one job through the whole WMS chain.

    This test will
      - call all the WMSClient methods that will end up calling all the
        JobManager service methods
      - use the JobMonitoring to verify few properties
      - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res['OK'], res.get('Message'))
    # The type of res['Value'] and its equality with res['JobID'] are
    # deliberately not checked in this version of the test
    jobID = res['Value']

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Received', msg="Got %s" % str(res['Value']))

    # updating the status again
    res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Killed', msg="Got %s" % str(res['Value']))

    # updating the status once more
    res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
    self.assertTrue(res['OK'], res.get('Message'))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    # this time it won't kill... it's done!
    self.assertEqual(res['Value'], 'Done', msg="Got %s" % str(res['Value']))

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    res = jobMonitor.getJobStatus(jobID)
    self.assertTrue(res['OK'], res.get('Message'))
    self.assertEqual(res['Value'], 'Deleted', msg="Got %s" % str(res['Value']))
def test_FullChain(self):
    """This test will

    - call all the WMSClient methods that will end up calling all the JobManager service methods
    - use the JobMonitoring to verify few properties
    - finally delete the job, which only sets its status to "deleted"
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = JobStateUpdateClient()

    # create the job
    job = helloWorldJob()
    jobDescription = createFile(job)

    # submit the job
    res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertIsInstance(res["Value"], int, msg="Got %s" % type(res["Value"]))
    # The job ID is returned both as "Value" and as "JobID"
    self.assertEqual(res["Value"], res["JobID"], msg="Got %s, expected %s" % (str(res["Value"]), res["JobID"]))
    jobID = res["Value"]

    # updating the status
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "Executing Minchiapp", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # reset the job
    res = wmsClient.resetJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))

    # reschedule the job
    res = wmsClient.rescheduleJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.RECEIVED, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsMinorStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"MinorStatus": "Job Rescheduled"}}, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsApplicationStatus([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"], {jobID: {"ApplicationStatus": "Unknown"}}, msg="Got %s" % str(res["Value"]))
    res = jobMonitor.getJobsStates([jobID])
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(
        res["Value"],
        {
            jobID: {
                "Status": JobStatus.RECEIVED,
                "MinorStatus": "Job Rescheduled",
                "ApplicationStatus": "Unknown",
            }
        },
        msg="Got %s" % str(res["Value"]),
    )

    # updating the status again, one state at a time up to "matched"
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING, "checking", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.WAITING, "waiting", "source")
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobStateUpdate.setJobStatus(jobID, JobStatus.MATCHED, "matched", "source")
    self.assertTrue(res["OK"], res.get("Message"))

    # kill the job
    res = wmsClient.killJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.KILLED, msg="Got %s" % str(res["Value"]))

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    res = jobMonitor.getJobsStatus(jobID)
    self.assertTrue(res["OK"], res.get("Message"))
    self.assertEqual(res["Value"][jobID]["Status"], JobStatus.DELETED, msg="Got %s" % str(res["Value"]))