def bulkQueueJobs(listOfSites, *jobSpecDicts): """ _bulkQueueJobs_ For a list of jobs all going to the same site(s) add them to the job queue. For each job spec a dictionary should be provided containing: "JobSpecId" "JobSpecFile" "JobType" "WorkflowSpecId" "WorkflowPriority" A list of site names or se names should be provided. All jobs will be queued for that list of sites """ try: Session.connect() Session.start_transaction() jobQ = JobQueueDB() #jobQ.loadSiteMatchData() jobQ.insertJobSpecsForSites(listOfSites, *jobSpecDicts) logging.info("Job List Queued for sites: %s" % listOfSites) Session.commit_all() Session.close_all() except Exception, ex: msg = "Failed to queue JobSpecs:\n" msg += str(ex) logging.error(msg) Session.rollback() Session.close_all()
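# A minimal usage sketch for bulkQueueJobs above. All values (SE names,
# spec ids, file paths) are illustrative assumptions, not taken from the
# original source.
if __name__ == '__main__':
    exampleJob = {
        "JobSpecId": "ExampleWorkflow-Processing-1",
        "JobSpecFile": "/tmp/specs/ExampleWorkflow-Processing-1.xml",
        "JobType": "Processing",
        "WorkflowSpecId": "ExampleWorkflow",
        "WorkflowPriority": 100,
        }
    # queue the same spec for a list of (hypothetical) SE names:
    bulkQueueJobs(["srm.example-site-a.ch", "srm.example-site-b.gov"],
                  exampleJob)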
def startComponent(self):
    """
    _startComponent_

    Start the component and subscribe to messages
    """
    self.ms = MessageService()
    # register
    self.ms.registerAs("RelValInjector")
    # subscribe to messages
    self.ms.subscribeTo("RelValInjector:StartDebug")
    self.ms.subscribeTo("RelValInjector:EndDebug")
    self.ms.subscribeTo("RelValInjector:Inject")
    self.ms.subscribeTo("JobSuccess")
    self.ms.subscribeTo("GeneralJobFailure")
    self.ms.subscribeTo("RelValInjector:Poll")
    self.ms.publish("RelValInjector:Poll", "", self.args['PollInterval'])
    self.ms.commit()
    while True:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        msgType, payload = self.ms.get()   # avoid shadowing builtin type()
        self.ms.commit()
        logging.debug("RelValInjector: %s, %s" % (msgType, payload))
        self.__call__(msgType, payload)
        Session.commit_all()
        Session.close_all()
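# The while-loop above hands every message to self.__call__. A minimal
# sketch of such a dispatcher is shown below; the handler bodies
# (inject(), poll()) are assumptions based on the subscriptions
# registered in startComponent, not the component's actual code.
def __call__(self, messageType, payload):
    """
    _operator()_

    Route an incoming message to the matching handler (sketch).
    """
    if messageType == "RelValInjector:StartDebug":
        logging.getLogger().setLevel(logging.DEBUG)
    elif messageType == "RelValInjector:EndDebug":
        logging.getLogger().setLevel(logging.INFO)
    elif messageType == "RelValInjector:Inject":
        self.inject(payload)          # hypothetical handler
    elif messageType == "RelValInjector:Poll":
        self.poll()                   # hypothetical handler
        # re-arm the poll cycle
        self.ms.publish("RelValInjector:Poll", "",
                        self.args['PollInterval'])
        self.ms.commit()
    else:
        logging.debug("Unhandled message: %s" % messageType)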
def migrate(): """ Function to migrate data from StatTracker to ProdMon """ print "Connecting to database" Session.set_database(dbConfig) Session.connect(sessionID=db_id) #Session.start_transaction() print "Make way for new data" wipeDB() print "Load StatTracker data..." loadStatTrackerDB() print "Migrating workflows..." migrateWorkflows() print "Migrating jobs..." print "You may ignore the database warnings... (probably)" migrateJobs() #Session.execute("COMMIT") Session.commit_all() Session.close_all() print "Migration Successful" return
def dropMode(): """ _dropMode_ Remove a site from the ResourceControlDB """ if site == None: msg = "--site option not provided" raise RuntimeError, msg msg = "Dropping Site named: %s\n" % site Session.set_database(dbConfig) Session.connect() Session.start_transaction() resCon = ResourceControlDB() try: resCon.dropSite(site) Session.commit_all() Session.close_all() except Exception, ex: msg += "Error dropping site:\n" msg += str(ex) Session.rollback() Session.close_all() print msg sys.exit(1)
def index(self, workflow):
    """
    Render a stacked bar graph of job states for the given workflow.
    """
    errHtml = "<html><body><h2>No Graph Tools installed!!!</h2>\n "
    errHtml += "</body></html>"
    try:
        from graphtool.graphs.common_graphs import StackedBarGraph
    except ImportError:
        return errHtml
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    procStatus = {}
    mergeStatus = {}
    for state in _States:
        procStatus[state] = len(
            WEUtil.jobsForWorkflow(workflow, "Processing", state))
        mergeStatus[state] = len(
            WEUtil.jobsForWorkflow(workflow, "Merge", state))
    Session.commit_all()
    Session.close_all()
    pngfile = os.path.join(self.workingDir,
                           "%s-WorkflowGraph.png" % workflow)
    pngfileUrl = "%s?filepath=%s" % (self.imageServer, pngfile)
    data = {"Processing": procStatus, "Merge": mergeStatus}
    metadata = {"title": "Job States for %s" % workflow}
    # open in binary mode: the graph tool writes PNG (binary) data
    plotfile = open(pngfile, 'wb')
    SBG = StackedBarGraph()
    SBG(data, plotfile, metadata)
    plotfile.close()
    html = "<html><body><img src=\"%s\"></body></html>" % pngfileUrl
    return html
def retrieveSites(self): """ _retrieveSites_ Return a list of all sites from the ResourceControl DB and stores them in this object for access by the plugins """ Session.set_database(dbConfig) Session.connect() Session.start_transaction() resCon = ResourceControlDB() siteNames = resCon.siteNames() for site in siteNames: siteData = resCon.getSiteData(site) self.allSites[site] = siteData siteIndex = siteData['SiteIndex'] if siteData['Active'] == True: self.activeSites.append(site) self.siteThresholds[site] = resCon.siteThresholds(siteIndex) self.siteAttributes[site] = resCon.siteAttributes(siteIndex) self.sitePerformance[site] = \ selectRcSitePerformance(siteIndex, self.performanceInterval) del resCon self.jq = JobQueueDB() self.sitejobs = self.jq.countQueuedActiveJobs() Session.commit_all() Session.close_all() return
def __call__(self):
    """
    Query the ProdAgent DB for counts of in-progress jobs by type
    """
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    # one COUNT query per job type, instead of five hand-written
    # copies of the same statement
    counts = {}
    for jobType in ("Processing", "Merge", "CleanUp",
                    "LogCollect", "Repack"):
        sqlStr = """
        SELECT COUNT(id) FROM we_Job
        WHERE job_type='%s' AND status='inProgress';
        """ % jobType
        Session.execute(sqlStr)
        counts[jobType] = Session.fetchone()[0]
    Session.close_all()
    # Total covers the payload job types only; CleanUp and
    # LogCollect are reported separately but not totalled.
    self['Total'] = (counts['Processing'] + counts['Merge']
                     + counts['Repack'])
    self['Processing'] = counts['Processing']
    self['Merge'] = counts['Merge']
    self['CleanUp'] = counts['CleanUp']
    self['LogCollect'] = counts['LogCollect']
    self['Repack'] = counts['Repack']
    return
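# Usage sketch: the object above fills itself like a dictionary when
# invoked. The class name JobStatusMonitor is hypothetical; only the
# __call__ method appears in the original source.
#
#   status = JobStatusMonitor()
#   status()                      # runs the queries above
#   print status['Total'], status['Processing'], status['Merge']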
def testC(self): print("""\nEmit partial cleanup events to test the partialCleanupHandler""") Session.set_database(dbConfig) Session.connect() Session.start_transaction() for i in xrange(0,self.jobSpecs): payload="jobSpec"+str(i)+",SubmitJob,jobSpec"+str(i) self.ms.publish("PartialJobCleanup", payload) self.ms.commit() Session.commit_all() Session.close_all() print("""\nSleep for several seconds""") time.sleep(3)
def testG(self): print("""\nEmit failure cleanup events to test the failureCleanupHandler""") Session.set_database(dbConfig) Session.connect() Session.start_transaction() for i in xrange(0,self.failureJobSpecs): payload="failureJobSpec"+str(i) #print('publishing FailureCleanup for failureJobSpec'+str(i)) self.ms.publish("FailureCleanup", payload) self.ms.commit() Session.commit_all() Session.close_all() print("""\nSleep for several seconds""") time.sleep(3)
def testA(self): print("""\npublish events to turn JobCleanup logging on""") try: Session.set_database(dbConfig) Session.connect() Session.start_transaction() self.ms.publish("JobCleanup:StartDebug", "none") self.ms.commit() Session.commit_all() Session.close_all() except StandardError, ex: msg = "Failed testA:\n" msg += str(ex) self.fail(msg)
def reQueueJob(jobs_spec_id): """ Put job back in queue - generally used after a failure """ Session.connect() Session.start_transaction() jobQ = JobQueueDB() result = jobQ.reQueueJob(jobs_spec_id) Session.commit_all() Session.close_all() return result
def getSiteForReleasedJob(job_spec_id): """ get site index for given job """ Session.connect() Session.start_transaction() jobQ = JobQueueDB() result = jobQ.getSiteForReleasedJob(job_spec_id) Session.commit_all() Session.close_all() return result
def testF(self):
    print("""\nSet the job cache (used for failure job cleanup)""")
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        for i in xrange(0,self.failureJobSpecs):
            JobState.register("failureJobSpec"+str(i),"Processing",2,2)
            JobState.create("failureJobSpec"+str(i),self.location+"/failureJobSpecDir_"+str(i))
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testF:\n"
        msg += str(ex)
        self.fail(msg)
def testE(self):
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        for i in xrange(0,self.jobSpecs):
            for k in xrange(0,self.flags):
                self.trigger.setFlag("jobCleanupTrigger"+str(i),\
                    "jobSpec"+str(i),"flag"+str(k))
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testE:\n"
        msg += str(ex)
        self.fail(msg)
def setUp(self): if not TriggerUnitTests._triggerSet: Session.set_database(dbConfig) Session.connect() Session.start_transaction() print "\n**************Start TriggerUnitTests**********" self.ms=MessageService() self.ms.registerAs("TriggerTest") self.trigger=TriggerAPI(self.ms) self.triggers=5 self.jobspecs=5 self.flags=5 TriggerUnitTests._triggerSet=True Session.commit_all() Session.close_all()
def queueLength(jobType = None): """ _queueLength_ Get the number of jobs in the queue, optionally distinguishing by type """ Session.connect() Session.start_transaction() jobQ = JobQueueDB() length = jobQ.queueLength(jobType) Session.commit_all() Session.close_all() return length
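# Usage sketch for queueLength: counting all queued jobs and then only
# the merge jobs. The printed labels are illustrative.
if __name__ == '__main__':
    print "All queued jobs:   %s" % queueLength()
    print "Queued merge jobs: %s" % queueLength("Merge")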
def findMatchedJobs(self, constraint): """ _findMatchedJobs_ Method that finds jobs matching the constraint provided and stores the list in self.matchedJobs """ Session.set_database(dbConfig) Session.connect() Session.start_transaction() jobQ = JobQueueDB() if constraint['site'] != None: # // # // site based job match #// site = int(constraint['site']) #jobQ.loadSiteMatchData() jobIndices = jobQ.retrieveJobsAtSites(constraint['count'], constraint["type"], constraint['workflow'], * [site]) jobs = jobQ.retrieveJobDetails(*jobIndices) [ x.__setitem__("Site", site) for x in jobs ] else: # // # // non-site based job match #// jobIndices = jobQ.retrieveJobs(constraint['count'], constraint["type"], constraint['workflow']) jobs = jobQ.retrieveJobDetails(*jobIndices) [ x.__setitem__("Site", None) for x in jobs ] Session.commit_all() Session.close_all() logging.info("Matched %s jobs for constraint %s" % ( len(jobs), constraint)) self.matchedJobs = jobs return
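# A sketch of the constraint dictionary findMatchedJobs expects, based
# on the keys it reads above ('site', 'count', 'type', 'workflow').
# The values themselves are illustrative assumptions.
exampleConstraint = {
    "count": 50,           # maximum number of jobs to match
    "type": "Processing",  # job type, or None for any type
    "workflow": None,      # restrict to one workflow id, or None
    "site": None,          # site index for site-based matching, or None
    }
# prioritiser.findMatchedJobs(exampleConstraint)   # hypothetical caller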
def index(self):
    """
    Render an HTML table of uncollected job logs, grouped by
    workflow and storage element.
    """
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    logs = getUnCollectedLogDetails()
    # find site pfns
    ses = set()
    for wf, details in logs.items():
        for se in details.keys():
            ses.add(se)
    sitesPFNMapping = self.getSitePFNMapping(ses)
    # now format html
    html = """<html><body><h2>Job Logs</h2>\n """
    html += "<table>\n"
    html += " <tr><th>Workflow</th><th>SE</th><th>Log</th></tr>\n"
    for wf, details in logs.items():
        # use a distinct name for the inner loop so the details
        # mapping is not rebound while it is being iterated
        for site, siteLogs in details.items():
            for log in siteLogs:
                html += " <tr><th>%s</th><th>%s</th><th>%s</th></tr>\n" % (wf, site, \
                    self.formatSRMcommand(sitesPFNMapping, site, log))
    html += "</table>\n"
    html += """</body></html>"""
    Session.commit_all()
    Session.close_all()
    return html
def testD(self):
    print("""\nCreate and set triggers to activate job cleanup""")
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        for i in xrange(0,self.jobSpecs):
            for k in xrange(0,self.flags):
                self.trigger.addFlag("jobCleanupTrigger"+str(i),\
                    "jobSpec"+str(i),"flag"+str(k))
            self.trigger.setAction("jobSpec"+str(i),\
                "jobCleanupTrigger"+str(i),"jobCleanAction")
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testD:\n"
        msg += str(ex)
        self.fail(msg)
def wrapperFuction(*args, **dictArgs): """ _wrapperFuction_ """ try: Session.connect() Session.start_transaction() reValue = dbfunc(*args, **dictArgs) Session.commit_all() Session.close_all() return reValue except Exception, ex: msg = "Error: %s\n" % str(ex) logging.error(msg) Session.rollback() Session.close_all()
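# wrapperFuction above reads like the inner closure of a decorator that
# gives DB API functions automatic session handling. A sketch of such an
# enclosing decorator follows; the name connectToDB is an assumption,
# only the inner function appears in the original source.
def connectToDB(dbfunc):
    """
    Decorator sketch: run dbfunc inside its own database session,
    committing on success and rolling back on error.
    """
    def wrapperFuction(*args, **dictArgs):
        try:
            Session.connect()
            Session.start_transaction()
            reValue = dbfunc(*args, **dictArgs)
            Session.commit_all()
            Session.close_all()
            return reValue
        except Exception, ex:
            logging.error("Error: %s\n" % str(ex))
            Session.rollback()
            Session.close_all()
    return wrapperFuction

# Usage sketch (hypothetical function):
# @connectToDB
# def countJobs():
#     Session.execute("SELECT COUNT(*) FROM we_Job")
#     return Session.fetchone()[0]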
def testH(self):
    # NOTE: this early return disables the cleanup test below;
    # remove it to re-enable purging of the prodagent database.
    return
    print("""\nCleanup the prodagent database""")
    print("\nSleep for 20 seconds to let the cleanup component "
          "receive the messages")
    time.sleep(20)
    try:
        Session.set_database(dbConfig)
        Session.connect()
        Session.start_transaction()
        JobState.purgeStates()
        self.ms.purgeMessages()
        Session.commit_all()
        Session.close_all()
    except StandardError, ex:
        msg = "Failed testH:\n"
        msg += str(ex)
        self.fail(msg)
def removeHoldByWorkflow(workflowID):
    """
    _removeHoldByWorkflow_

    Change the status of all jobs in the JobQueue with a particular
    workflow ID from "held" to "new" so that they will eventually be
    released.
    """
    try:
        Session.connect()
        Session.start_transaction()
        jobQ = JobQueueDB()
        jobQ.removeHoldForWorkflow(workflowID)
        Session.commit_all()
        Session.close_all()
    except Exception, ex:
        msg = "Failed to remove hold on jobs for workflow %s:\n" % workflowID
        msg += str(ex)
        logging.error(msg)
        Session.rollback()
        Session.close_all()
def releaseJobs(siteIndex = None, *jobDefs): """ _releaseJobs_ Flag jobs as released so that they can be removed from the queue """ logging.debug("releasing jobDefs: %s for site %s" % (str(jobDefs), siteIndex)) indices = [ x['JobIndex'] for x in jobDefs ] logging.debug("releasing indices: %s" % indices) Session.connect() Session.start_transaction() jobQ = JobQueueDB() jobQ.flagAsReleased(siteIndex, *indices) Session.commit_all() Session.close_all() return
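# Usage sketch for releaseJobs: each jobDef is a dictionary carrying at
# least a 'JobIndex' key, as read above. The indices and site index are
# illustrative values.
if __name__ == '__main__':
    exampleDefs = [{"JobIndex": 101}, {"JobIndex": 102}]
    releaseJobs(3, *exampleDefs)   # 3 = hypothetical site index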
def index(self): Session.set_database(dbConfig) Session.connect() Session.start_transaction() html = """<html><body><h2>ProdAgent Workflows </h2>\n """ for owner in _Owners: workflowList = WEUtil.listWorkflowsByOwner(owner) html += "<h4>Workflow Owner: %s</h4>\n<ul>\n" % owner for workflow in workflowList: html += "<li><a href=\"%s?workflow=%s\">%s</a></li>\n" % ( self.graphmon, workflow, workflow) html += "</ul>\n" html += """</body></html>""" Session.commit_all() Session.close_all() return html
def index(self): Session.set_database(dbConfig) Session.connect() Session.start_transaction() html = """<html><body><h2>JobQueue State </h2>\n """ jobQueue = JobQueueDB() jobQueue.loadSiteMatchData() releasedProcJobs = jobQueue.retrieveReleasedJobs(1000000, "Processing") queuedProcJobs = jobQueue.retrieveJobs(1000000, "Processing") releasedMrgJobs = jobQueue.retrieveReleasedJobs(1000000, "Merge") queuedMrgJobs = jobQueue.retrieveJobs(1000000, "Merge") html += "<table>\n" html += " <tr><th>Job Type</th><th>Status</th><th>Total</th></tr>\n" html += " <tr><td>Processing</td><td>Queued</td>" html += "<td>%s</td></tr>\n" % len(queuedProcJobs) html += " <tr><td>Processing</td><td>Released</td>" html += "<td>%s</td></tr>\n" % len(releasedProcJobs) html += " <tr><td>Merge</td><td>Queued</td>" html += "<td>%s</td></tr>\n" % len(queuedMrgJobs) html += " <tr><td>Merge</td><td>Released</td>" html += "<td>%s</td></tr>\n" % len(releasedMrgJobs) html += "</table>\n" html += """</body></html>""" Session.commit_all() Session.close_all() return html
def newMode(): """ _newMode_ Add a new site with some standard default thresholds """ if site == None: msg = "--site option not provided" raise RuntimeError, msg if ceName == None: msg = "--ce-name option not provided. Warning, this is not supported" msg += " by all ResourceMonitor / Submitter combinations\n" print(msg) if seName == None: msg = "--se-name option not provided" raise RuntimeError, msg msg = "Adding New Site named: %s\n" % site Session.set_database(dbConfig) Session.connect() Session.start_transaction() active = True if deactivate != None: active = False resCon = ResourceControlDB() try: siteIndex = resCon.newSite(site, seName, ceName, active) except Exception, ex: msg += "Error adding new site:\n%s\n" % str(ex) Session.rollback() Session.close_all() print msg sys.exit(1)
def fail():
    """
    Roll back all outstanding transactions and close all sessions.
    """
    Session.rollback_all()
    Session.close_all()
def finish():
    """
    Commit all outstanding transactions and close all sessions.
    """
    Session.commit_all()
    Session.close_all()
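# fail() and finish() pair naturally around a unit of database work. A
# minimal sketch of the intended calling pattern; runInSession and
# dbwork are hypothetical names, not part of the original source.
def runInSession(dbwork):
    """
    Run dbwork() inside a session, committing on success and rolling
    back on any error.
    """
    Session.connect()
    Session.start_transaction()
    try:
        result = dbwork()
        finish()    # commit and close on success
        return result
    except Exception, ex:
        logging.error("DB operation failed: %s" % str(ex))
        fail()      # roll back and close on error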
JobState.submit("jobClassID1") # retries=0, racers=1; self.assertEqual(JobState.general("jobClassID1"), {'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/1somewhere', 'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress', 'MaxRetries': 3L, 'JobType': 'Processing'}) JobState.runFailure("jobClassID1","jobInstanceID1.1", "some.location1.1","job/Report/Location1.1.xml") JobState.submit("jobClassID1") except StandardError, ex: msg = "Failed State Change TestA:\n" msg += str(ex) self.fail(msg) Session.commit_all() Session.close_all() def testB(self): """change state test""" try: JobState.register("jobClassID2","Processing",2,1,"myWorkflowID") JobState.create("jobClassID2","cacheDir/location/2somewhere") JobState.inProgress("jobClassID2") # retries=racers=0 self.assertEqual(JobState.general("jobClassID2"), {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 2, 'JobType': 'Processing'}) JobState.submit("jobClassID2")
def findMatchedJobs(self, constraint):
    """
    _findMatchedJobs_

    Method that finds jobs matching the constraint provided and
    stores the list in self.matchedJobs
    """
    logging.debug("LCGAdvanced findMatchedJobs started.")
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    jobQ = JobQueueDB()
    jobs = []
    ## check if JobSubmitter still needs to process jobs
    sqlStr = '''
    SELECT count(*) FROM ms_process,ms_message
    WHERE ( ms_process.procid = ms_message.dest
    AND ms_process.name IN ('JobSubmitter','JobCreator'));
    '''
    Session.execute(sqlStr)
    result = Session.fetchall()
    js_is_ok = True
    ## allowed number of pending messages (some may be unrelated
    ## messages for JobSubmitter). In principle this should also check
    ## for JobCreator messages, as CreateJob messages result in
    ## SubmitJob messages. Take the number of jobs the JobSubmitter
    ## can handle in one ResourceMonitor:Poll interval.
    allowed_nr_of_ms = 600
    if int(result[0][0]) > allowed_nr_of_ms:
        js_is_ok = False
        msg = "LCGAdvanced: JobSubmitter still needs to process "
        msg += str(result[0][0])
        msg += " messages, which is more than the allowed number of "
        msg += str(allowed_nr_of_ms)
        msg += ". Currently not releasing anything."
        logging.info(msg)
        Session.commit_all()
        Session.close_all()
        return
    # skip the advanced matching for non-cmssw jobs; close this
    # session first, the parent method opens its own
    if constraint['type'] in ('CleanUp', 'LogCollect'):
        Session.commit_all()
        Session.close_all()
        return PrioritiserInterface.findMatchedJobs(self, constraint)
    # TODO: is the per-workflow maximum already taken into account by
    # other methods, and how does PrioritiserInterface handle it?
    constraint['workflow'] = constraintID2WFname(constraint['workflow'])
    if js_is_ok and (constraint['site'] != None):
        # site based job match
        site = int(constraint['site'])
        jobIndices = jobQ.retrieveJobsAtSitesNotWorkflowSitesMax(
            constraint['count'],
            constraint['type'],
            constraint['workflow'],
            * [site])
        jobs = jobQ.retrieveJobDetails(*jobIndices)
        [ x.__setitem__("Site", site) for x in jobs ]
    else:
        ## non-site based matching not implemented yet
        pass
    Session.commit_all()
    Session.close_all()
    logging.info("LCGAdvanced: Matched %s jobs for constraint %s" % (
        len(jobs), constraint))
    self.matchedJobs = jobs
    return
def setUp(self):
    Session.set_database(dbConfig)
    Session.connect()
    Session.start_transaction()
    if not ComponentServerTest._triggerSet:
        print "\n****Start ComponentServerTest (JobCleanup)*******"
        # we use this for event publication.
        self.ms = MessageService()
        self.ms.registerAs("JobCleanupTest")
        self.jobSpecs = 1000
        self.location = '/tmp/prodagent/components/JobCleanup/cacheDirs'
        self.failureJobSpecs = 1000
        self.flags = 5
        self.trigger = TriggerAPI(self.ms)
        # create some directories in tmp
        print('\nCreating directories in the /tmp area to serve '+ \
              'as job cache dirs')
        for i in xrange(0, self.jobSpecs):
            jobDir = self.location + '/jobSpecDir_' + str(i)
            os.makedirs(jobDir)
            # create some files (some of which should not be deleted
            # by a partial cleanup)
            for name in ('JobSpec.xml', 'FrameworkJobReport.xml',
                         'JobTarFile.tar.gz', 'Pretend2BeADir1.txt',
                         'Pretend2BeADir2.txt'):
                open(jobDir + '/' + name, 'w').close()
        # create job caches that need to be tarred and then removed:
        for i in xrange(0, self.failureJobSpecs):
            failureDir = self.location + '/failureJobSpecDir_' + str(i)
            os.makedirs(failureDir)
            for name in ('JobSpec.xml', 'FrameworkJobReport.xml',
                         'JobTarFile.tar.gz', 'aFile.txt'):
                open(failureDir + '/' + name, 'w').close()
            for sub, name in (('aDir1', 'File.txt'),
                              ('aDir2', 'aFile.txt'),
                              ('aDir3', 'aFile.txt')):
                os.makedirs(failureDir + '/' + sub)
                open(failureDir + '/' + sub + '/' + name, 'w').close()
        ComponentServerTest._triggerSet = True
    Session.commit_all()
    Session.close_all()