def testB(self): """change state test""" try: JobState.register("jobClassID2","Processing",2,1,"myWorkflowID") JobState.create("jobClassID2","cacheDir/location/2somewhere") JobState.inProgress("jobClassID2") # retries=racers=0 self.assertEqual(JobState.general("jobClassID2"), {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 2, 'JobType': 'Processing'}) JobState.submit("jobClassID2") # retries0,=racers=1 self.assertEqual(JobState.general("jobClassID2"),{'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 2, 'JobType': 'Processing'}) JobState.runFailure("jobClassID2","jobInstanceID2.1", "some.location2.1","job/Report/Location2.1.xml") # retries= 1, racers=0 self.assertEqual(JobState.general("jobClassID2"), {'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 2, 'Retries': 1, 'JobType': 'Processing'}) JobState.submit("jobClassID2") # retries= 1, racers=1 self.assertEqual(JobState.general("jobClassID2"),{'Retries': 1L, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress', 'MaxRetries': 2L, 'JobType': 'Processing'}) except StandardError, ex: msg = "Failed State Change TestB:\n" msg += str(ex) self.fail(msg)
def testE(self):
    # Drives a job (maxRetries=2, maxRacers=2) through
    # register/create/inProgress/submit, then injects run failures until the
    # failure count exceeds the allowed retries, which JobState is expected
    # to report by raising ProdException on the second runFailure.
    #
    # NOTE(review): the outer try: below has no matching except/finally in
    # this visible chunk -- the handler presumably follows in the part of the
    # file not shown here; confirm before restructuring this method.
    try:
        JobState.register("jobClassID5","Processing",2,2,"myWorkflowID")
        JobState.create("jobClassID5","cacheDir/location/5somewhere")
        JobState.inProgress("jobClassID5")
        JobState.submit("jobClassID5")
        # now introduce some failures until we have more failures
        # then retries (this raises an error)
        JobState.runFailure("jobClassID5","jobInstanceID5.1",
            "some.location5.1","job/Report/Location5.1.xml")
        try:
            # this second failure should exceed the retry budget and raise
            JobState.runFailure("jobClassID5","jobInstanceID5.2",
                "some.location5.1","job/Report/Location5.1.xml")
        except ProdException, ex:
            # expected path: the exception is the success condition here
            # NOTE(review): if no exception is raised the test continues
            # silently -- there is no self.fail() guarding this; confirm
            # whether that is intentional.
            print('>>>Test succeeded for exception 1/1 in testE of JobState_t.py\n')
        JobState.finished("jobClassID5")
JobState.inProgress("jobClassID1") except ProdException, ex: print('>>>Test succeeded for exception 3/3 in testA of JobState_t.py\n') JobState.create("jobClassID1","cacheDir/location/1somewhere") JobState.inProgress("jobClassID1") # retries=racers=0; self.assertEqual(JobState.general("jobClassID1"), {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/1somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 3, 'JobType': 'Processing'} ) JobState.submit("jobClassID1") # retries=0, racers=1; self.assertEqual(JobState.general("jobClassID1"), {'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/1somewhere', 'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress', 'MaxRetries': 3L, 'JobType': 'Processing'}) JobState.runFailure("jobClassID1","jobInstanceID1.1", "some.location1.1","job/Report/Location1.1.xml") JobState.submit("jobClassID1") except StandardError, ex: msg = "Failed State Change TestA:\n" msg += str(ex) self.fail(msg) Session.commit_all() Session.close_all() def testB(self): """change state test""" try: JobState.register("jobClassID2","Processing",2,1,"myWorkflowID") JobState.create("jobClassID2","cacheDir/location/2somewhere")
def handleError(self,payload):
    """
    Handle a job run failure.

    The payload of a job failure is a url to the job report. The report is
    downloaded into self.args['jobReportLocation'], filed under the job's
    id, and then either a delayed resubmission, a partial cache cleanup, or
    (when retries are exhausted) a failure/cleanup event chain is published.

    NOTE(review): the trailing except ProdException block only handles
    ErrorNr 3013; any continuation (e.g. an else: raise) would be outside
    this visible chunk -- confirm against the full file.
    """
    jobReportUrl= payload
    # prepare to retrieve the job report file.
    # NOTE: we assume that the report file has a relative unique name
    # NOTE: if that is not the case we need to add a unique identifier to it.
    slash = jobReportUrl.rfind('/')
    fileName = jobReportUrl[slash+1:]
    urllib.urlretrieve(jobReportUrl, \
        self.args['jobReportLocation']+'/'+fileName)
    logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)
    jobReport=readJobReport(self.args['jobReportLocation']+'/'+fileName)
    #NOTE: is this the right way to extract the job id.
    jobId=jobReport[0].jobSpecId
    logging.debug(">RunFailureHandler<:Retrieving jobId from job report "+\
        "(used to dynamically load error handler) " \
        "jobId="+str(jobId))
    # create the jobReportLocation jobId hierarchy if not exists.
    # NOTE(review): shelling out via os.popen with string concatenation
    # breaks on paths containing spaces or shell metacharacters; a
    # subprocess list call (or os.makedirs) would be safer -- confirm the
    # inputs are always shell-safe.
    pipe=os.popen("mkdir -p "+self.args['jobReportLocation']+'/'+jobId)
    pipe.close()
    # move the report file to this new location.
    pipe=os.popen("mv "+self.args['jobReportLocation']+'/'+fileName+" "+ \
        self.args['jobReportLocation']+'/'+jobId)
    logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " \
        +self.args['jobReportLocation']+'/'+jobId)
    pipe.close()
    reportLocation=self.args['jobReportLocation']+'/'+ \
        jobId+'/'+fileName
    generalInfo=JobState.general(jobId)
    # a submit event with delay: DelayFactor scaled linearly by the number
    # of retries already consumed (Retries+1 so the first retry is delayed
    # by one DelayFactor, not zero).
    delay=int(self.args['DelayFactor'])*(int(generalInfo['Retries']+1))
    delay=convertSeconds(delay)
    logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) "+\
        str(delay))
    if self.args['ReportAction'] == 'move' :
        # count how many files are in the dir (to generate unique ids
        # when moving files
        try:
            lastID = len(os.listdir(os.path.dirname(payload)))
            target = os.path.join(os.path.dirname(payload),\
                os.path.basename(payload).split('.')[0] +\
                str(lastID) +\
                '.xml')
            logging.debug('Moving file: '+ payload + ' to: ' + target)
            shutil.move(payload,target)
        except:
            # NOTE(review): bare except silently swallows every error from
            # the rename (including KeyboardInterrupt on Python 2) --
            # presumably a deliberate best-effort move, but confirm; at
            # minimum the failure should probably be logged.
            pass
    try:
        JobState.runFailure(jobId,jobReportLocation= reportLocation)
        # check the cache dir size. If it is beyond the threshold, purge it.
        dirSizeBytes=dirSize(generalInfo['CacheDirLocation'],0,0,0)
        dirSizeMegaBytes=convertSize(dirSizeBytes,'m')
        logging.debug(">RunFailureHandler<:Cache dir. size is "+\
            str(dirSizeMegaBytes)+" MB. Maximum allowed is "+\
            str(self.maxCacheDirSizeMB)+" MB ")
        jobspecfile="%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'],jobId)
        # if necessary first a partial cleanup is done, which after it
        # is finished publishes the proper event.
        # retrieve the number of retries and publish
        if(float(dirSizeMegaBytes)>float(self.maxCacheDirSizeMB)):
            # cache too large: ask for a partial cleanup first; the cleanup
            # component re-publishes the embedded SubmitJob for us.
            newPayload=jobId+",SubmitJob,"+jobId+","+str(delay)
            logging.debug(">RunFailureHandler<: Reached maximum cache size. "+\
                "Performing partial cache cleanup first.")
            self.publishEvent("PartialJobCleanup",newPayload,delay)
        else:
            logging.debug(">RunFailureHandler<:Registered "+\
                "a job run failure,"\
                "publishing a submit job event")
            # either hand the job back to the job queue or resubmit directly
            if self.args['QueueFailures']:
                JobQueueAPI.reQueueJob(jobId)
            else:
                self.publishEvent("SubmitJob",jobspecfile,delay)
    except ProdException,ex:
        # ErrorNr 3013: per the log message below, the job has reached its
        # maximum number of retries -- give up and publish the failure chain.
        if(ex["ErrorNr"]==3013):
            logging.debug(">RunFailureHandler<:Registered "+\
                "a job run failure "+ \
                "Maximum number of retries reached!" +\
                " Submitting a failure job and cleanup event ")
            JobState.failed(jobId)
            # (jobId) is just jobId -- the parentheses do not make a tuple
            self.publishEvent("FailureCleanup",(jobId))
            self.publishEvent("GeneralJobFailure",(jobId))