def handleError(self, payload):
    """
    Handle a job creation failure.

    The payload is used directly as the job id. A create failure is
    registered with JobState and a "CreateJob" event is published with a
    delay of DelayFactor * (Retries + 1), passed through convertSeconds.

    If registering the failure raises a ProdException with error number
    3013 (logged here as "Maximum number of retries reached!"), the job is
    marked as failed and "FailureCleanup" and "GeneralJobFailure" events
    are published instead. Any other ProdException is logged and not
    re-raised.
    """
    jobId = payload
    generalInfo = JobState.general(jobId)

    # Convert the retry counter to int *before* adding one, so a counter
    # that arrives as a string does not break the addition.
    retries = int(generalInfo['Retries'])
    delay = int(self.args['DelayFactor']) * (retries + 1)
    delay = convertSeconds(delay)
    logging.debug(">CreateFailureHandler<: re-creating with delay " +
                  " (h:m:s) " + str(delay))

    try:
        JobState.createFailure(jobId)
        logging.debug(">CreateFailureHandler<: Registered " +
                      "a create failure,"
                      "publishing a create event")
        self.publishEvent("CreateJob", jobId, delay)
    except ProdException as ex:
        if ex["ErrorNr"] == 3013:
            logging.debug(">CreateFailureHandler<: Registered " +
                          "a create failure " +
                          "Maximum number of retries reached!" +
                          " Submitting a general failure and cleanup job event ")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", jobId)
            self.publishEvent("GeneralJobFailure", jobId)
        else:
            # Previously dropped without a trace; keep not re-raising but
            # leave evidence in the log.
            logging.error(">CreateFailureHandler<: Unexpected error while " +
                          "handling a create failure for job " +
                          str(jobId) + ": " + str(ex))
def handleError(self, payload):
    """
    The payload of a job failure is a url to the job report.

    Steps performed:
      1. Download the job report into args['jobReportLocation'].
      2. Read the job id from the first entry of the parsed report.
      3. Move the report into a per-job sub directory
         (<jobReportLocation>/<jobId>/<fileName>).
      4. If args['ReportAction'] is 'move', rename the file referenced by
         the payload to a name suffixed with the number of entries in its
         directory (best effort).
      5. Register the run failure with JobState and either publish a
         "PartialJobCleanup" event (cache dir above maxCacheDirSizeMB),
         re-queue the job (args['QueueFailures'] is true) or publish a
         delayed "SubmitJob" event.

    If a ProdException with error number 3013 is raised in step 5 (logged
    here as "Maximum number of retries reached!"), the job is marked as
    failed and "FailureCleanup" and "GeneralJobFailure" events are
    published. Any other ProdException is logged and not re-raised.
    """
    jobReportUrl = payload
    reportBase = self.args['jobReportLocation']

    # prepare to retrieve the job report file.
    # NOTE: we assume that the report file has a relative unique name
    # NOTE: if that is not the case we need to add a unique identifier to it.
    slash = jobReportUrl.rfind('/')
    fileName = jobReportUrl[slash+1:]
    downloadedReport = reportBase + '/' + fileName
    urllib.urlretrieve(jobReportUrl, downloadedReport)
    logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)
    jobReport = readJobReport(downloadedReport)

    # NOTE: is this the right way to extract the job id.
    jobId = jobReport[0].jobSpecId
    logging.debug(">RunFailureHandler<:Retrieving jobId from job report " +
                  "(used to dynamically load error handler) "
                  "jobId=" + str(jobId))

    jobReportDir = reportBase + '/' + jobId
    reportLocation = jobReportDir + '/' + fileName

    # Create the jobReportLocation/jobId hierarchy if it does not exist and
    # move the report file there. The file name comes from the URL and the
    # job id from the downloaded report, so neither is interpolated into a
    # shell command; the filesystem calls are used directly instead.
    try:
        if not os.path.isdir(jobReportDir):
            os.makedirs(jobReportDir)
        shutil.move(downloadedReport, reportLocation)
    except (EnvironmentError, shutil.Error) as ex:
        # The shell commands this replaces never raised on failure, so the
        # handler keeps going; the problem is recorded instead of hidden.
        logging.error(">RunFailureHandler<: Could not move job report " +
                      downloadedReport + " to " + reportLocation +
                      ": " + str(ex))
    logging.debug(">RunFailureHandler<:Moving job report to permanent storage: "
                  + jobReportDir)

    generalInfo = JobState.general(jobId)

    # a submit event with delay
    # Convert the retry counter to int *before* adding one, so a counter
    # that arrives as a string does not break the addition.
    retries = int(generalInfo['Retries'])
    delay = int(self.args['DelayFactor']) * (retries + 1)
    delay = convertSeconds(delay)
    logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) " +
                  str(delay))

    if self.args['ReportAction'] == 'move':
        # count how many files are in the dir (to generate unique ids
        # when moving files)
        try:
            payloadDir = os.path.dirname(payload)
            lastID = len(os.listdir(payloadDir))
            target = os.path.join(
                payloadDir,
                os.path.basename(payload).split('.')[0] +
                str(lastID) +
                '.xml')
            logging.debug('Moving file: ' + payload + ' to: ' + target)
            shutil.move(payload, target)
        except (EnvironmentError, shutil.Error) as ex:
            # Best effort: a payload that is not a local path cannot be
            # listed or moved. Do not abort the handler, but log why.
            logging.debug(">RunFailureHandler<: Could not move " +
                          payload + ": " + str(ex))

    try:
        JobState.runFailure(jobId, jobReportLocation=reportLocation)

        # check the cache dir size. If it is beyond the threshold, purge it.
        dirSizeBytes = dirSize(generalInfo['CacheDirLocation'], 0, 0, 0)
        dirSizeMegaBytes = convertSize(dirSizeBytes, 'm')
        logging.debug(">RunFailureHandler<:Cache dir. size is " +
                      str(dirSizeMegaBytes) + " MB. Maximum allowed is " +
                      str(self.maxCacheDirSizeMB) + " MB ")
        jobspecfile = "%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'], jobId)

        # if necessary first a partial cleanup is done, which after it
        # is finished publishes the proper event.
        if float(dirSizeMegaBytes) > float(self.maxCacheDirSizeMB):
            # NOTE(review): this follow-up SubmitJob payload carries the job
            # id, while the direct SubmitJob below carries the job spec file
            # path - confirm both are accepted by the consumer.
            newPayload = jobId + ",SubmitJob," + jobId + "," + str(delay)
            logging.debug(">RunFailureHandler<: Reached maximum cache size. " +
                          "Performing partial cache cleanup first.")
            self.publishEvent("PartialJobCleanup", newPayload, delay)
        else:
            logging.debug(">RunFailureHandler<:Registered " +
                          "a job run failure,"
                          "publishing a submit job event")
            if self.args['QueueFailures']:
                JobQueueAPI.reQueueJob(jobId)
            else:
                self.publishEvent("SubmitJob", jobspecfile, delay)
    except ProdException as ex:
        if ex["ErrorNr"] == 3013:
            logging.debug(">RunFailureHandler<:Registered " +
                          "a job run failure " +
                          "Maximum number of retries reached!" +
                          " Submitting a failure job and cleanup event ")
            JobState.failed(jobId)
            self.publishEvent("FailureCleanup", jobId)
            self.publishEvent("GeneralJobFailure", jobId)
        else:
            # Previously dropped without a trace; keep not re-raising but
            # leave evidence in the log.
            logging.error(">RunFailureHandler<: Unexpected error while " +
                          "handling a run failure for job " +
                          str(jobId) + ": " + str(ex))