Example #1
    def testB(self):
        """change state test"""
        try:
         JobState.register("jobClassID2","Processing",2,1,"myWorkflowID")
         JobState.create("jobClassID2","cacheDir/location/2somewhere")
         JobState.inProgress("jobClassID2")

         # retries=racers=0
         self.assertEqual(JobState.general("jobClassID2"), {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 2, 'JobType': 'Processing'})

         JobState.submit("jobClassID2")

         # retries=0, racers=1
         self.assertEqual(JobState.general("jobClassID2"),{'Retries': 0, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 2, 'JobType': 'Processing'})

         JobState.runFailure("jobClassID2","jobInstanceID2.1",
              "some.location2.1","job/Report/Location2.1.xml")

         # retries= 1, racers=0
         self.assertEqual(JobState.general("jobClassID2"),
              {'CacheDirLocation': 'cacheDir/location/2somewhere', 
               'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 
               'MaxRetries': 2, 'Retries': 1, 'JobType': 'Processing'})

         JobState.submit("jobClassID2")

         # retries= 1, racers=1
         self.assertEqual(JobState.general("jobClassID2"),{'Retries': 1L, 'CacheDirLocation': 'cacheDir/location/2somewhere', 'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress', 'MaxRetries': 2L, 'JobType': 'Processing'})

        except StandardError, ex:
            msg = "Failed State Change TestB:\n"
            msg += str(ex)
            self.fail(msg)
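The test above pins down the Retries/Racers bookkeeping: submit() occupies a racer slot, while runFailure() frees the slot and counts one retry. A minimal in-memory sketch of that bookkeeping (purely illustrative; the real JobState keeps these counters in a database) could look like this:

    class CounterSketch:
        """Illustrative stand-in for the Retries/Racers counters."""
        def __init__(self, maxRetries, maxRacers):
            self.retries = 0
            self.racers = 0
            self.maxRetries = maxRetries
            self.maxRacers = maxRacers

        def submit(self):
            # a submit occupies one racer slot, bounded by MaxRacers
            if self.racers >= self.maxRacers:
                raise RuntimeError("maximum number of racers reached")
            self.racers += 1

        def runFailure(self):
            # a run failure frees the racer slot and counts one retry
            self.racers -= 1
            self.retries += 1
            if self.retries > self.maxRetries:
                raise RuntimeError("maximum number of retries reached")

    counters = CounterSketch(maxRetries=2, maxRacers=1)
    counters.submit()        # Racers 0 -> 1, as after the first submit above
    counters.runFailure()    # Racers 1 -> 0, Retries 0 -> 1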
Example #2
    def testI(self):
         JobState.register("jobClassID10","Processing",8,2,"myWorkflowID")
         #retries=racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 0, 'CacheDirLocation': None, 'MaxRacers': 2, 'Racers': 0, 'State': 'register', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.createFailure("jobClassID10")
         #retries=1, racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 1, 'CacheDirLocation': None, 'MaxRacers': 2, 'Racers': 0, 'State': 'register', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.createFailure("jobClassID10")
         #retries=2, racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 2, 'CacheDirLocation': None, 'MaxRacers': 2, 'Racers': 0, 'State': 'register', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.create("jobClassID10","cacheDir/location/10somewhere")
         #retries=2, racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 0, 'State': 'create', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.inProgress("jobClassID10")
         #retries=2, racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 2, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.submitFailure("jobClassID10")
         #retries=3, racer=0
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.submit("jobClassID10")
         #retries=3, racer=1
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 3, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.submitFailure("jobClassID10")
         #retries=4, racer=1
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 1, 'State': 'inProgress', 'MaxRetries': 8, 'JobType': 'Processing'})
         JobState.submit("jobClassID10")
         #retries=4, racer=2
         self.assertEqual(JobState.general("jobClassID10"),{'Retries': 4, 'CacheDirLocation': 'cacheDir/location/10somewhere', 'MaxRacers': 2, 'Racers': 2, 'State': 'inProgress', 'MaxRetries': 8, 'JobType': 'Processing'})

         # on purpose we introduce an error:
         try:
             JobState.submit("jobClassID10")
         except ProdException, ex:
             print('>>>Test succeeded for exception 1/1 in testI of JobState_t.py\n')
 def handleEvent(self,payload):
      """
      The payload for a cleanup handler is a job id.
      """
      if self.failureArchive == None:
          logging.error("No Failure Archive set: Cannot Archive Job:\n %s" % payload)
          return
      try:
          logging.debug(">FailureCleanupHandler< archiving  "+\
                         "information for jobspec: "+str(payload))
          try:
              os.makedirs(self.failureArchive)
          except:
              pass
          cacheDirLocation=JobState.general(str(payload))['CacheDirLocation']
          logging.debug(">FailureCleanupHandler< archiving and removing directory: "+cacheDirLocation)
          #NOTE: check what this does when it is repeated (e.g. after a crash)
          tar=tarfile.open(self.failureArchive+'/'+str(payload)+'.tar.gz','w:gz')
          short_root=cacheDirLocation.split('/')[-1]
          tar.add(cacheDirLocation,short_root)
          tar.close()
          try:
              for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                  for name in files:
                      os.remove(os.path.join(root, name))
                  for name in dirs:
                      os.rmdir(os.path.join(root, name))
              os.rmdir(cacheDirLocation)
          except Exception,ex:
              logging.debug(">FailureCleanupHandler< WARNING job cleanup: "+str(ex))
          JobState.cleanout(str(payload))
          Job.remove(str(payload))
          logging.debug(">FailureCleanupHandler< archived completed for jobspecID: "+str(payload))
Example #4
    def testD(self):
        """change state test"""
        try:
         JobState.register("jobClassID4","Processing",6,2,"myWorkflowID")
         JobState.create("jobClassID4","cacheDir/location/4somewhere")
         JobState.inProgress("jobClassID4")

         # retries=racers=0
         self.assertEqual(JobState.general("jobClassID4"),{'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/4somewhere', 'MaxRacers': 2L, 'Racers': 0L, 'State': 'inProgress', 'MaxRetries': 6L, 'JobType': 'Processing'})
Example #5
 def testK(self):
     jobIDs=[]
     for i in xrange(0,20):
         JobState.register("jobClassID_0."+str(i),"Processing",30,1)
         JobState.register("jobClassID_1."+str(i),"Processing",30,1,"myWorkflowID1")
         JobState.register("jobClassID_2."+str(i),"Processing",30,1,"myWorkflowID2")
         JobState.register("jobClassID_3."+str(i),"Processing",30,1,"myWorkflowID3")
         jobIDs.append("jobClassID_1."+str(i))
         jobIDs.append("jobClassID_2."+str(i))
         jobIDs.append("jobClassID_3."+str(i))
     JobState.setMaxRetries(jobIDs,2)
     self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'],2)
     JobState.setMaxRetries("jobClassID_1.1",3)
     self.assertEqual(JobState.general("jobClassID_1.1")['MaxRetries'],3)
     jobIDs=JobState.retrieveJobIDs("myWorkflowID1")
     self.assertEqual(len(jobIDs),20)
     jobIDs=JobState.retrieveJobIDs(["myWorkflowID1","myWorkflowID2","myWorkflowID3"])
     self.assertEqual(len(jobIDs),60)
     jobs=JobState.rangeGeneral(0,10)
     print(str(jobs))
    def handleEvent(self,payload):
         """
         The payload of a partial cleanup handler is a job id and
          the event (plus payload) it needs to emit afterwards.
         """
         payloads=payload.split(',')
         jobId=payloads[0]
         nextEvent=payloads[1]
         nextPayload=payloads[2]
         delay=0
         if len(payloads)==4:
             delay=payloads[3]

         try:
             logging.debug(">PartialCleanupHandler< removing cached files "+\
                            "for jobspec: "+str(jobId))
             cacheDirLocation=JobState.general(str(jobId))['CacheDirLocation']
             logging.debug(">PartialCleanupHandler< starting remove in: "+cacheDirLocation)
             try:
                 for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
                     for name in files:
                         # check if file is an .xml or .tar.gz file 
                         # if so do not remove.
                         # NOTE: should use reg. exp. here.
                         isSaved=False 
                         # we only keep files that are in the top dir.
                          # if we are in the top dir we check for certain extensions.
                         if root==cacheDirLocation:
                             extensions=['.xml','.tar.gz']
                             for extension in extensions:
                                 pos1=name.rfind(extension)
                                 pos2=len(name)-len(extension)
                                 if(pos1==pos2):
                                     isSaved=True
                                     break

                         if not isSaved:
                             try:
                                 os.remove(os.path.join(root, name))
                             except Exception,ex:
                                 logging.debug(">PartialCleanupHandler< WARNING "+\
                                     " partial job cleanup: "+str(ex))
                         else:
                             logging.debug(">PartialCleanupHandler< not removing: "+name)
                     for name in dirs:
                         os.rmdir(os.path.join(root, name))
             except Exception,ex:
                 logging.debug(">PartialCleanupHandler< WARNING partial job cleanup: "+\
                     str(ex))
         except Exception,ex:
             logging.debug(">PartialCleanupHandler< ERROR partial job cleanup: "+\
                 str(ex))
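The extension test in the loop above (the code itself notes a regular expression would be cleaner) can be written with str.endswith, which accepts a tuple of suffixes. A small stand-alone sketch of the same rule: keep top-level .xml and .tar.gz files and remove everything else:

    import os

    KEEP_EXTENSIONS = ('.xml', '.tar.gz')

    def partial_cleanup(cacheDirLocation):
        """Remove everything except protected top-level files."""
        for root, dirs, files in os.walk(cacheDirLocation, topdown=False):
            for name in files:
                # files directly in the top dir with a protected extension are kept
                if root == cacheDirLocation and name.endswith(KEEP_EXTENSIONS):
                    continue
                try:
                    os.remove(os.path.join(root, name))
                except OSError as ex:
                    print("WARNING partial cleanup: %s" % ex)
            for name in dirs:
                # only succeeds for directories emptied above
                try:
                    os.rmdir(os.path.join(root, name))
                except OSError:
                    pass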
Example #7
    def getJobCache(self, jobSpecId):
        """
        _getJobCache_

        Lookup a job cache for the job spec Id provided

        """
        try:
            stateInfo = JobState.general(jobSpecId)
        except Exception, ex:
            msg = "ERROR: Cant get JobCache for %s\n" % jobSpecId
            msg += str(ex)
            logging.warning(msg)
            stateInfo = {}
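The snippet is cut off after the fallback to an empty dict. Presumably the method goes on to read the cache directory out of stateInfo; a purely hypothetical continuation (the key name matches the dictionaries returned by JobState.general elsewhere on this page, the rest is an assumption) might be:

        # hypothetical continuation, not the original method body:
        jobCache = stateInfo.get('CacheDirLocation', None)
        if jobCache is None:
            logging.warning("No cache directory recorded for %s" % jobSpecId)
        return jobCache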
Example #8
    def killJob(self, jobSpecId, erase=False):
        """

        Arguments:

          JobSpecId -- the job id.
          erase -- remove job info from BOSS database

        Return:

          none

        """
        # jobSpecId is job['name'] for BossLite # Fabio
        logging.info("BossLiteKiller.killJob(%s)" % jobSpecId)

        # verify that the job exists
        try:
            stateInfo = JobState.general(jobSpecId)
        except StandardError, ex:
            msg = "Cannot retrieve JobState Information for %s\n" % jobSpecId
            msg += str(ex)
            logging.error(msg)
            raise InvalidJobException, msg
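A hedged usage sketch for this method, assuming a configured killer instance (the variable name and job id are illustrative) and relying on the InvalidJobException raised above when JobState knows nothing about the job:

    try:
        killer.killJob("jobClassID4", erase=True)
    except InvalidJobException as ex:
        logging.error("job not known to JobState: %s" % ex)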
    def handleError(self,payload):
         jobId  = payload
         generalInfo=JobState.general(jobId)

         delay=int(self.args['DelayFactor'])*(int(generalInfo['Retries']+1))
         delay=convertSeconds(delay)
         logging.debug(">CreateFailureHandler<: re-creating with delay "+\
             " (h:m:s) "+str(delay))
         try:
              JobState.createFailure(jobId)

              logging.debug(">CreateFailureHandler<: Registered "+\
                            "a create failure,"\
                            "publishing a create event")
              self.publishEvent("CreateJob",(jobId),delay)
         except ProdException,ex:
              if(ex["ErrorNr"]==3013):
                  logging.debug(">CreateFailureHandler<: Registered "+\
                  "a create failure "+ \
                  "Maximum number of retries reached!" +\
                  " Submitting a general failure and cleanup job event ")
                  JobState.failed(jobId)
                  self.publishEvent("FailureCleanup",(jobId))
                  self.publishEvent("GeneralJobFailure",(jobId))
Example #10
         # register again (illegal):
         try:
             JobState.register("jobClassID1","Processing",3,1,"myWorkflowID")
             print('>>>Test ERROR \n')
         except ProdException, ex:
             print('>>>Test succeeded for exception 2/3 in testA of JobState_t.py\n')
         try:
            # illegal state transitions:
            JobState.inProgress("jobClassID1")
         except ProdException, ex:
            print('>>>Test succeeded for exception 3/3 in testA of JobState_t.py\n')
         JobState.create("jobClassID1","cacheDir/location/1somewhere")
         JobState.inProgress("jobClassID1")
         # retries=racers=0;
         self.assertEqual(JobState.general("jobClassID1"), {'Retries': 0, 'CacheDirLocation': 'cacheDir/location/1somewhere', 'MaxRacers': 1, 'Racers': 0, 'State': 'inProgress', 'MaxRetries': 3, 'JobType': 'Processing'}
)


         JobState.submit("jobClassID1")

         # retries=0, racers=1;
         self.assertEqual(JobState.general("jobClassID1"), {'Retries': 0L, 'CacheDirLocation': 'cacheDir/location/1somewhere', 'MaxRacers': 1L, 'Racers': 1L, 'State': 'inProgress', 'MaxRetries': 3L, 'JobType': 'Processing'})

         JobState.runFailure("jobClassID1","jobInstanceID1.1",
              "some.location1.1","job/Report/Location1.1.xml")
         JobState.submit("jobClassID1")
        except StandardError, ex:
            msg = "Failed State Change TestA:\n"
            msg += str(ex)
            self.fail(msg)
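Besides the counters, the test shows the transition guard: registering a job twice or calling inProgress before create raises ProdException, while submit is only legal from inProgress and leaves the state unchanged. A minimal in-memory sketch of that guard (again illustrative, not the database-backed ProdAgent code):

    class TransitionSketch:
        """Illustrative transition guard for register -> create -> inProgress."""
        _next = {'register': 'create', 'create': 'inProgress'}

        def __init__(self):
            self.state = 'register'

        def create(self):
            self._advance('create')

        def inProgress(self):
            self._advance('inProgress')

        def submit(self):
            # submit is only legal once the job is inProgress; the state stays put
            if self.state != 'inProgress':
                raise RuntimeError("submit from state %s is illegal" % self.state)

        def _advance(self, newState):
            if self._next.get(self.state) != newState:
                raise RuntimeError("illegal transition %s -> %s" % (self.state, newState))
            self.state = newState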
Example #11
    def handleError(self,payload):
         """
         The payload of a job failure is a url to the job report
         """
         jobReportUrl= payload
         # prepare to retrieve the job report file.
         # NOTE: we assume that the report file has a relatively unique name
         # NOTE: if that is not the case we need to add a unique identifier to it.
         slash = jobReportUrl.rfind('/')
         fileName = jobReportUrl[slash+1:]
         urllib.urlretrieve(jobReportUrl, \
              self.args['jobReportLocation']+'/'+fileName)
         logging.debug(">RunFailureHandler<:Retrieving job report from %s " % jobReportUrl)

         jobReport=readJobReport(self.args['jobReportLocation']+'/'+fileName)
         #NOTE: is this the right way to extract the job id?
         jobId=jobReport[0].jobSpecId
         logging.debug(">RunFailureHandler<:Retrieving jobId from job report "+\
                       "(used to dynamically load error handler) " \
                       "jobId="+str(jobId))

         # create the jobReportLocation/jobId hierarchy if it does not exist.
         pipe=os.popen("mkdir -p "+self.args['jobReportLocation']+'/'+jobId)
         pipe.close()
         # move the report file to this new location.
         pipe=os.popen("mv "+self.args['jobReportLocation']+'/'+fileName+" "+ \
                       self.args['jobReportLocation']+'/'+jobId)
         logging.debug(">RunFailureHandler<:Moving job report to permanent storage: " \
                       +self.args['jobReportLocation']+'/'+jobId)
         pipe.close()


         reportLocation=self.args['jobReportLocation']+'/'+ \
                           jobId+'/'+fileName
         generalInfo=JobState.general(jobId)
         # a submit event with delay
         delay=int(self.args['DelayFactor'])*(int(generalInfo['Retries']+1))
         delay=convertSeconds(delay) 
         logging.debug(">RunFailureHandler<: re-submitting with delay (h:m:s) "+\
                      str(delay))

         if self.args['ReportAction'] == 'move' :
            # count how many files are in the dir (to generate unique ids
            # when moving files)
            try:
               lastID = len(os.listdir(os.path.dirname(payload)))
               target = os.path.join(os.path.dirname(payload),\
               os.path.basename(payload).split('.')[0] +\
               str(lastID) +\
               '.xml')
               logging.debug('Moving file: '+ payload + ' to: ' + target)
               shutil.move(payload,target)
            except:
               pass
         try:
              JobState.runFailure(jobId,jobReportLocation= reportLocation)

              # check the cache dir size. If it is beyond the threshold, purge it.
              dirSizeBytes=dirSize(generalInfo['CacheDirLocation'],0,0,0)
              dirSizeMegaBytes=convertSize(dirSizeBytes,'m')
              logging.debug(">RunFailureHandler<:Cache dir. size is "+\
                            str(dirSizeMegaBytes)+" MB. Maximum allowed is "+\
                            str(self.maxCacheDirSizeMB)+" MB ")
              jobspecfile="%s/%s-JobSpec.xml" % (generalInfo['CacheDirLocation'],jobId)
              # if necessary a partial cleanup is done first, which publishes
              # the proper event after it is finished.

              # retrieve the number of retries and publish
              if(float(dirSizeMegaBytes)>float(self.maxCacheDirSizeMB)):
                  newPayload=jobId+",SubmitJob,"+jobId+","+str(delay)
                  logging.debug(">RunFailureHandler<: Reached maximum cache size. "+\
                      "Performing partial cache cleanup first.")
                  self.publishEvent("PartialJobCleanup",newPayload,delay)
              else:
                  logging.debug(">RunFailureHandler<:Registered "+\
                                "a job run failure,"\
                                "publishing a submit job event")
                  if self.args['QueueFailures']:
                      JobQueueAPI.reQueueJob(jobId)
                  else:
                      self.publishEvent("SubmitJob",jobspecfile,delay)

         except ProdException,ex:
              if(ex["ErrorNr"]==3013): 
                  logging.debug(">RunFailureHandler<:Registered "+\
                  "a job run failure "+ \
                  "Maximum number of retries reached!" +\
                  " Submitting a failure job and cleanup event ")
                  JobState.failed(jobId)
                  self.publishEvent("FailureCleanup",(jobId))
                  self.publishEvent("GeneralJobFailure",(jobId))