Example #1
    def handleError(self, payload):
         """
         The payload of a create failure is the job id.
         """
         jobId = payload
         generalInfo = JobState.general(jobId)

         # back off linearly: DelayFactor seconds per retry made so far
         delay = int(self.args['DelayFactor']) * (int(generalInfo['Retries']) + 1)
         delay = convertSeconds(delay)
         logging.debug(">CreateFailureHandler<: re-creating with delay " + \
                       "(h:m:s) " + str(delay))
         try:
              JobState.createFailure(jobId)

              logging.debug(">CreateFailureHandler<: Registered " + \
                            "a create failure, " \
                            "publishing a create event")
              self.publishEvent("CreateJob", jobId, delay)
         except ProdException, ex:
              if ex["ErrorNr"] == 3013:
                  logging.debug(">CreateFailureHandler<: Registered " + \
                                "a create failure. " + \
                                "Maximum number of retries reached! " + \
                                "Submitting a general failure and cleanup job event")
                  JobState.failed(jobId)
                  self.publishEvent("FailureCleanup", jobId)
                  self.publishEvent("GeneralJobFailure", jobId)
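
For reference, the handler above backs off linearly: the configured DelayFactor is multiplied by the number of retries already made plus one, and the result is run through a convertSeconds helper before it is logged as "(h:m:s)" and passed to publishEvent. convertSeconds is not shown on this page; a minimal, hypothetical sketch, assuming it simply formats a raw second count as an "hh:mm:ss" string, could look like:

    def convertSeconds(seconds):
        # Hypothetical sketch: format a raw second count as "hh:mm:ss",
        # matching the "(h:m:s)" wording of the log messages above.
        seconds = int(seconds)
        hours, remainder = divmod(seconds, 3600)
        minutes, secs = divmod(remainder, 60)
        return "%02d:%02d:%02d" % (hours, minutes, secs)

For example, with DelayFactor set to 100 and two retries already registered, the delay would be 300 seconds, i.e. "00:05:00".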
Example #2
    def handleError(self, payload):
         """
         The payload of a job failure is a URL to the job report.
         """
         jobReportUrl = payload
         # prepare to retrieve the job report file.
         # NOTE: we assume that the report file has a relatively unique name;
         # NOTE: if that is not the case we need to add a unique identifier to it.
         fileName = os.path.basename(jobReportUrl)
         logging.debug(">RunFailureHandler<:Retrieving job report from %s" % jobReportUrl)
         urllib.urlretrieve(jobReportUrl, \
              self.args['jobReportLocation'] + '/' + fileName)

         jobReport = readJobReport(self.args['jobReportLocation'] + '/' + fileName)
         # NOTE: is this the right way to extract the job id?
         jobId = jobReport[0].jobSpecId
         logging.debug(">RunFailureHandler<:Retrieving jobId from job report " + \
                       "(used to dynamically load the error handler) " \
                       "jobId=" + str(jobId))

         # create the jobReportLocation/jobId hierarchy if it does not exist,
         # using the standard library instead of shelling out to mkdir and mv.
         jobIdDir = self.args['jobReportLocation'] + '/' + jobId
         if not os.path.exists(jobIdDir):
              os.makedirs(jobIdDir)
         # move the report file to this new location.
         logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " + \
                       jobIdDir)
         shutil.move(self.args['jobReportLocation'] + '/' + fileName, jobIdDir)


         reportLocation = self.args['jobReportLocation'] + '/' + \
                          jobId + '/' + fileName
         generalInfo = JobState.general(jobId)
         # prepare a submit event with a linearly growing delay
         delay = int(self.args['DelayFactor']) * (int(generalInfo['Retries']) + 1)
         delay = convertSeconds(delay)
         logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) " + \
                       str(delay))

         if self.args['ReportAction'] == 'move':
            # count how many files are in the dir (to generate unique ids
            # when moving files)
            try:
               lastID = len(os.listdir(os.path.dirname(payload)))
               target = os.path.join(os.path.dirname(payload), \
                        os.path.basename(payload).split('.')[0] + \
                        str(lastID) + \
                        '.xml')
               logging.debug('Moving file: ' + payload + ' to: ' + target)
               shutil.move(payload, target)
            except (OSError, IOError, shutil.Error):
               # moving the report is best effort: log the problem and go on
               logging.warning('Could not move report file ' + payload)
         try:
              JobState.runFailure(jobId, jobReportLocation=reportLocation)

              # check the cache dir size. If it is beyond the threshold, purge it.
              dirSizeBytes = dirSize(generalInfo['CacheDirLocation'], 0, 0, 0)
              dirSizeMegaBytes = convertSize(dirSizeBytes, 'm')
              logging.debug(">RunFailureHandler<:Cache dir. size is " + \
                            str(dirSizeMegaBytes) + " MB. Maximum allowed is " + \
                            str(self.maxCacheDirSizeMB) + " MB")
              jobspecfile = "%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'], jobId)
              # if necessary a partial cleanup is done first; once it finishes
              # it publishes the proper follow-up event.

              # decide whether to purge the cache first or to act on the
              # failure right away
              if float(dirSizeMegaBytes) > float(self.maxCacheDirSizeMB):
                  # payload: job id plus the SubmitJob follow-up and its delay
                  newPayload = jobId + ",SubmitJob," + jobId + "," + str(delay)
                  logging.debug(">RunFailureHandler<: Reached maximum cache size. " + \
                      "Performing partial cache cleanup first.")
                  self.publishEvent("PartialJobCleanup", newPayload, delay)
              else:
                  logging.debug(">RunFailureHandler<:Registered " + \
                                "a job run failure, " \
                                "publishing a submit job event")
                  if self.args['QueueFailures']:
                      JobQueueAPI.reQueueJob(jobId)
                  else:
                      self.publishEvent("SubmitJob", jobspecfile, delay)

         except ProdException, ex:
              if ex["ErrorNr"] == 3013:
                  logging.debug(">RunFailureHandler<:Registered " + \
                                "a job run failure. " + \
                                "Maximum number of retries reached! " + \
                                "Submitting a failure and cleanup job event")
                  JobState.failed(jobId)
                  self.publishEvent("FailureCleanup", jobId)
                  self.publishEvent("GeneralJobFailure", jobId)
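
The cache-size check in this second example leans on two more helpers that are not shown on this page, dirSize and convertSize. Judging only from the call sites, dirSize(path, 0, 0, 0) walks a directory tree and returns its total size in bytes, and convertSize(bytes, 'm') scales a byte count to megabytes. A rough, hypothetical sketch under those assumptions:

    import os

    def dirSize(path, total, depth, maxDepth):
        # Hypothetical sketch: recursively sum the sizes (in bytes) of all
        # files under path; the extra arguments mirror the call site above.
        for entry in os.listdir(path):
            full = os.path.join(path, entry)
            if os.path.isfile(full):
                total += os.path.getsize(full)
            elif os.path.isdir(full):
                total = dirSize(full, total, depth + 1, maxDepth)
        return total

    def convertSize(sizeBytes, unit):
        # Hypothetical sketch: convert a byte count to 'k', 'm' or 'g'.
        factor = {'k': 1024.0, 'm': 1024.0 ** 2, 'g': 1024.0 ** 3}
        return sizeBytes / factor[unit]

With helpers like these, the handler compares float(dirSizeMegaBytes) against the configured maxCacheDirSizeMB and, when the cache has grown too large, publishes a PartialJobCleanup event whose payload carries the SubmitJob follow-up, so the job is still re-submitted once the purge has finished.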