def testB(self):
    """change state test"""
    try:
        JobState.register("jobClassID2", "Processing", 2, 1, "myWorkflowID")
        JobState.create("jobClassID2", "cacheDir/location/2somewhere")
        JobState.inProgress("jobClassID2")
        # retries=0, racers=0
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
        JobState.submit("jobClassID2")
        # retries=0, racers=1
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
        JobState.runFailure("jobClassID2", "jobInstanceID2.1",
                            "some.location2.1", "job/Report/Location2.1.xml")
        # retries=1, racers=0
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 1, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
        JobState.submit("jobClassID2")
        # retries=1, racers=1
        self.assertEqual(JobState.general("jobClassID2"),
            {'Retries': 1, 'CacheDirLocation': 'cacheDir/location/2somewhere',
             'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress',
             'MaxRetries': 2, 'JobType': 'Processing'})
    except StandardError, ex:
        msg = "Failed State Change TestB:\n"
        msg += str(ex)
        self.fail(msg)
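# The state-change tests in this module exercise the JobState life cycle
# recorded in the database: register -> create -> inProgress -> submit,
# where createFailure/submitFailure/runFailure increment the Retries count
# and each submit increments the Racers count up to MaxRacers.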
def testI(self):
    JobState.register("jobClassID10", "Processing", 8, 2, "myWorkflowID")
    # retries=0, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 0, 'CacheDirLocation': None,
         'MaxRacers': 2, 'Racers': 0, 'State': 'register',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.createFailure("jobClassID10")
    # retries=1, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 1, 'CacheDirLocation': None,
         'MaxRacers': 2, 'Racers': 0, 'State': 'register',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.createFailure("jobClassID10")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': None,
         'MaxRacers': 2, 'Racers': 0, 'State': 'register',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.create("jobClassID10", "cacheDir/location/10somewhere")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'create',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.inProgress("jobClassID10")
    # retries=2, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.submitFailure("jobClassID10")
    # retries=3, racers=0
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.submit("jobClassID10")
    # retries=3, racers=1
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.submitFailure("jobClassID10")
    # retries=4, racers=1
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress',
         'MaxRetries': 8, 'JobType': 'Processing'})
    JobState.submit("jobClassID10")
    # retries=4, racers=2
    self.assertEqual(JobState.general("jobClassID10"),
        {'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere',
         'MaxRacers': 2, 'Racers': 2, 'State': 'inProgress',
         'MaxRetries': 8, 'JobType': 'Processing'})
    # on purpose we introduce an error: MaxRacers is 2, so a third
    # concurrent submit must be rejected.
    try:
        JobState.submit("jobClassID10")
    except ProdException, ex:
        print('>>>Test succeeded for exception 1/1 in testI of JobState_t.py\n')
def handleEvent(self, payload):
    """
    The payload for a cleanup handler is a job id.
    """
    if self.failureArchive == None:
        logging.error("No Failure Archive set: Cannot Archive Job:\n %s" % payload)
        return
    try:
        logging.debug(">FailureCleanupHandler< archiving " + \
                      "information for jobspec: " + str(payload))
        try:
            os.makedirs(self.failureArchive)
        except:
            # the archive directory already exists
            pass
        cacheDirLocation = JobState.general(str(payload))['CacheDirLocation']
        logging.debug(">FailureCleanupHandler< archiving and removing directory: " + \
                      cacheDirLocation)
        # NOTE: check what this does when it is repeated (e.g. after a crash):
        # 'w:gz' truncates an existing archive, so a repeated run recreates it.
        tar = tarfile.open(self.failureArchive + '/' + str(payload) + '.tar.gz', 'w:gz')
        short_root = cacheDirLocation.split('/')[-1]
        tar.add(cacheDirLocation, short_root)
        tar.close()
        try:
            for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(cacheDirLocation)
        except Exception, ex:
            logging.debug(">FailureCleanupHandler< WARNING job cleanup: " + str(ex))
        JobState.cleanout(str(payload))
        Job.remove(str(payload))
        logging.debug(">FailureCleanupHandler< archive completed for jobspecID: " + \
                      str(payload))
    except Exception, ex:
        # do not let archiving problems propagate; just log them
        logging.error(">FailureCleanupHandler< ERROR job cleanup: " + str(ex))
def testD(self):
    """change state test"""
    try:
        JobState.register("jobClassID4", "Processing", 6, 2, "myWorkflowID")
        JobState.create("jobClassID4", "cacheDir/location/4somewhere")
        JobState.inProgress("jobClassID4")
        # retries=0, racers=0
        self.assertEqual(JobState.general("jobClassID4"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/4somewhere',
             'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 6, 'JobType': 'Processing'})
    except StandardError, ex:
        msg = "Failed State Change TestD:\n"
        msg += str(ex)
        self.fail(msg)
def testK(self):
    jobIDs = []
    for i in xrange(0, 20):
        JobState.register("jobClassID_0." + str(i), "Processing", 30, 1)
        JobState.register("jobClassID_1." + str(i), "Processing", 30, 1, "myWorkflowID1")
        JobState.register("jobClassID_2." + str(i), "Processing", 30, 1, "myWorkflowID2")
        JobState.register("jobClassID_3." + str(i), "Processing", 30, 1, "myWorkflowID3")
        jobIDs.append("jobClassID_1." + str(i))
        jobIDs.append("jobClassID_2." + str(i))
        jobIDs.append("jobClassID_3." + str(i))
    JobState.setMaxRetries(jobIDs, 2)
    self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'], 2)
    JobState.setMaxRetries("jobClassID_1.1", 3)
    self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'], 3)
    jobIDs = JobState.retrieveJobIDs("myWorkflowID1")
    self.assertEqual(len(jobIDs), 20)
    jobIDs = JobState.retrieveJobIDs(["myWorkflowID1", "myWorkflowID2", "myWorkflowID3"])
    self.assertEqual(len(jobIDs), 60)
    jobs = JobState.rangeGeneral(0, 10)
    print(str(jobs))
def handleEvent(self, payload):
    """
    The payload of a partial cleanup handler is a job id and the event
    (plus payload) it needs to emit afterwards.
    """
    payloads = payload.split(',')
    jobId = payloads[0]
    nextEvent = payloads[1]
    nextPayload = payloads[2]
    delay = 0
    if len(payloads) == 4:
        delay = payloads[3]
    try:
        logging.debug(">PartialCleanupHandler< removing cached files " + \
                      "for jobspec: " + str(jobId))
        cacheDirLocation = JobState.general(str(jobId))['CacheDirLocation']
        logging.debug(">PartialCleanupHandler< starting remove in: " + cacheDirLocation)
        try:
            for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                for name in files:
                    # check if the file is an .xml or .tar.gz file;
                    # if so, do not remove it.
                    # NOTE: should use reg. exp. here.
                    isSaved = False
                    # we only keep files that are in the top dir.
                    # if we are in the top dir we check for certain extensions.
                    if root == cacheDirLocation:
                        extensions = ['.xml', '.tar.gz']
                        for extension in extensions:
                            pos1 = name.rfind(extension)
                            pos2 = len(name) - len(extension)
                            if pos1 == pos2:
                                isSaved = True
                                break
                    if not isSaved:
                        try:
                            os.remove(os.path.join(root, name))
                        except Exception, ex:
                            logging.debug(">PartialCleanupHandler< WARNING " + \
                                          " partial job cleanup: " + str(ex))
                    else:
                        logging.debug(">PartialCleanupHandler< not removing: " + name)
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
        except Exception, ex:
            logging.debug(">PartialCleanupHandler< WARNING partial job cleanup: " + \
                          str(ex))
    except Exception, ex:
        logging.debug(">PartialCleanupHandler< ERROR partial job cleanup: " + \
                      str(ex))
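# A minimal sketch (not part of the handler above) of how the extension
# check could be expressed with str.endswith instead of the rfind
# arithmetic; the handler's own NOTE suggests a regular expression, so
# this is just one possible alternative. 'isKeptFile' is a hypothetical
# helper name used only for illustration.
def isKeptFile(name, extensions=('.xml', '.tar.gz')):
    """Return True if the file name ends with one of the kept extensions."""
    return name.endswith(extensions)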
def getJobCache(self, jobSpecId):
    """
    _getJobCache_

    Lookup a job cache for the job spec Id provided
    """
    try:
        stateInfo = JobState.general(jobSpecId)
    except Exception, ex:
        msg = "ERROR: Can't get JobCache for %s\n" % jobSpecId
        msg += str(ex)
        logging.warning(msg)
        stateInfo = {}
def killJob(self, jobSpecId, erase=False):
    """
    Arguments:

      JobSpecId -- the job id.
      erase -- remove job info from BOSS database

    Return:

      none
    """
    # jobSpecId is job['name'] for BossLite
    # Fabio
    logging.info("BossLiteKiller.killJob(%s)" % jobSpecId)

    # verify that the job exists
    try:
        stateInfo = JobState.general(jobSpecId)
    except StandardError, ex:
        msg = "Cannot retrieve JobState Information for %s\n" % jobSpecId
        msg += str(ex)
        logging.error(msg)
        raise InvalidJobException, msg
def handleError(self, payload):
    jobId = payload
    generalInfo = JobState.general(jobId)
    delay = int(self.args['DelayFactor']) * (int(generalInfo['Retries'] + 1))
    delay = convertSeconds(delay)
    logging.debug(">CreateFailureHandler<: re-creating with delay " + \
                  "(h:m:s) " + str(delay))
    try:
        JobState.createFailure(jobId)
        logging.debug(">CreateFailureHandler<: Registered " + \
                      "a create failure, " \
                      "publishing a create event")
        self.publishEvent("CreateJob", (jobId), delay)
    except ProdException, ex:
        if ex["ErrorNr"] == 3013:
            logging.debug(">CreateFailureHandler<: Registered " + \
                          "a create failure. " + \
                          "Maximum number of retries reached! " + \
                          "Submitting a general failure and cleanup job event")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", (jobId))
            self.publishEvent("GeneralJobFailure", (jobId))
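# A minimal sketch of the back-off computation used above, assuming that
# convertSeconds renders a number of seconds as an "h:m:s" string (the
# real helper lives elsewhere in the code base and may differ);
# 'formatDelay' is a hypothetical stand-in used only for illustration.
def formatDelay(delayFactor, retries):
    seconds = int(delayFactor) * (int(retries) + 1)
    hours, rest = divmod(seconds, 3600)
    minutes, secs = divmod(rest, 60)
    return "%02d:%02d:%02d" % (hours, minutes, secs)

# e.g. formatDelay(30, 2) -> "00:01:30"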
        # register again (illegal):
        try:
            JobState.register("jobClassID1", "Processing", 3, 1, "myWorkflowID")
            print('>>>Test ERROR \n')
        except ProdException, ex:
            print('>>>Test succeeded for exception 2/3 in testA of JobState_t.py\n')

        try:
            # illegal state transitions:
            JobState.inProgress("jobClassID1")
        except ProdException, ex:
            print('>>>Test succeeded for exception 3/3 in testA of JobState_t.py\n')

        JobState.create("jobClassID1", "cacheDir/location/1somewhere")
        JobState.inProgress("jobClassID1")
        # retries=0, racers=0
        self.assertEqual(JobState.general("jobClassID1"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/1somewhere',
             'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress',
             'MaxRetries': 3, 'JobType': 'Processing'})
        JobState.submit("jobClassID1")
        # retries=0, racers=1
        self.assertEqual(JobState.general("jobClassID1"),
            {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/1somewhere',
             'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress',
             'MaxRetries': 3, 'JobType': 'Processing'})
        JobState.runFailure("jobClassID1", "jobInstanceID1.1",
                            "some.location1.1", "job/Report/Location1.1.xml")
        JobState.submit("jobClassID1")
    except StandardError, ex:
        msg = "Failed State Change TestA:\n"
        msg += str(ex)
        self.fail(msg)
def handleError(self, payload):
    """
    The payload of a job failure is a url to the job report
    """
    jobReportUrl = payload

    # prepare to retrieve the job report file.
    # NOTE: we assume that the report file has a relative unique name
    # NOTE: if that is not the case we need to add a unique identifier to it.
    slash = jobReportUrl.rfind('/')
    fileName = jobReportUrl[slash + 1:]
    urllib.urlretrieve(jobReportUrl, \
                       self.args['jobReportLocation'] + '/' + fileName)
    logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)

    jobReport = readJobReport(self.args['jobReportLocation'] + '/' + fileName)
    # NOTE: is this the right way to extract the job id?
    jobId = jobReport[0].jobSpecId
    logging.debug(">RunFailureHandler<:Retrieving jobId from job report " + \
                  "(used to dynamically load error handler) " \
                  "jobId=" + str(jobId))

    # create the jobReportLocation jobId hierarchy if it does not exist.
    pipe = os.popen("mkdir -p " + self.args['jobReportLocation'] + '/' + jobId)
    pipe.close()

    # move the report file to this new location.
    pipe = os.popen("mv " + self.args['jobReportLocation'] + '/' + fileName + " " + \
                    self.args['jobReportLocation'] + '/' + jobId)
    logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " \
                  + self.args['jobReportLocation'] + '/' + jobId)
    pipe.close()

    reportLocation = self.args['jobReportLocation'] + '/' + \
                     jobId + '/' + fileName

    generalInfo = JobState.general(jobId)

    # a submit event with delay
    delay = int(self.args['DelayFactor']) * (int(generalInfo['Retries'] + 1))
    delay = convertSeconds(delay)
    logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) " + \
                  str(delay))

    if self.args['ReportAction'] == 'move':
        # count how many files are in the dir (to generate unique ids
        # when moving files)
        try:
            lastID = len(os.listdir(os.path.dirname(payload)))
            target = os.path.join(os.path.dirname(payload), \
                                  os.path.basename(payload).split('.')[0] + \
                                  str(lastID) + '.xml')
            logging.debug('Moving file: ' + payload + ' to: ' + target)
            shutil.move(payload, target)
        except:
            pass

    try:
        JobState.runFailure(jobId, jobReportLocation=reportLocation)

        # check the cache dir size. If it is beyond the threshold, purge it.
        dirSizeBytes = dirSize(generalInfo['CacheDirLocation'], 0, 0, 0)
        dirSizeMegaBytes = convertSize(dirSizeBytes, 'm')
        logging.debug(">RunFailureHandler<:Cache dir. size is " + \
                      str(dirSizeMegaBytes) + " MB. Maximum allowed is " + \
                      str(self.maxCacheDirSizeMB) + " MB ")
        jobspecfile = "%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'], jobId)

        # if necessary a partial cleanup is done first, which publishes the
        # proper follow-up event once it has finished.
        # retrieve the number of retries and publish
        if float(dirSizeMegaBytes) > float(self.maxCacheDirSizeMB):
            newPayload = jobId + ",SubmitJob," + jobId + "," + str(delay)
            logging.debug(">RunFailureHandler<: Reached maximum cache size. " + \
                          "Performing partial cache cleanup first.")
            self.publishEvent("PartialJobCleanup", newPayload, delay)
        else:
            logging.debug(">RunFailureHandler<:Registered " + \
                          "a job run failure, " \
                          "publishing a submit job event")
            if self.args['QueueFailures']:
                JobQueueAPI.reQueueJob(jobId)
            else:
                self.publishEvent("SubmitJob", jobspecfile, delay)
    except ProdException, ex:
        if ex["ErrorNr"] == 3013:
            logging.debug(">RunFailureHandler<:Registered " + \
                          "a job run failure. " + \
                          "Maximum number of retries reached! " + \
                          "Submitting a failure job and cleanup event")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", (jobId))
            self.publishEvent("GeneralJobFailure", (jobId))
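# A minimal sketch of the cache-size check performed above, assuming that
# dirSize walks the cache directory and convertSize(bytes, 'm') returns
# the size in megabytes; 'cacheDirSizeMB' is a hypothetical helper used
# only for illustration, not the actual utility functions.
import os

def cacheDirSizeMB(cacheDir):
    totalBytes = 0
    for root, dirs, files in os.walk(cacheDir):
        for name in files:
            totalBytes += os.path.getsize(os.path.join(root, name))
    return totalBytes / (1024.0 * 1024.0)

# usage: trigger a partial cleanup when the cache grows beyond the limit
# if cacheDirSizeMB(generalInfo['CacheDirLocation']) > float(self.maxCacheDirSizeMB):
#     self.publishEvent("PartialJobCleanup", newPayload, delay)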