Beispiel #1
0
 def initialize(self):
     """ Initialize the agent.

     Sets the "PollingTime" option to 60 (presumably seconds -- TODO confirm
     against the agent framework), creates the OverlaySystemClient and
     JobMonitoringClient instances that are stored as ``self.ovc`` and
     ``self.jobmon``, and returns ``S_OK()``.
     """
     self.am_setOption("PollingTime", 60)
     # Clients are created once here and kept on the instance.
     self.ovc = OverlaySystemClient()
     self.jobmon = JobMonitoringClient()
     return S_OK()
 def initialize(self):
   """ Initialize the agent.

   Sets the "PollingTime" option to 60 (unit not visible here, presumably
   seconds -- TODO confirm), then stores a new OverlaySystemClient in
   ``self.ovc`` and a new JobMonitoringClient in ``self.jobmon``.

   :returns: ``S_OK()``
   """
   self.am_setOption( "PollingTime", 60 )
   # Client objects are built once and kept as instance attributes.
   self.ovc = OverlaySystemClient()
   self.jobmon = JobMonitoringClient()
   return S_OK()
Beispiel #3
0
class ResetCounters(AgentModule):
    """ Agent that re-synchronises the number of jobs recorded for every site.

    Some sites are not updated properly, so once in a while the correct number
    of jobs has to be restored. The figures do not need to be exact; it is
    enough to clear some of the jobs.
    """

    def initialize(self):
        """ Set the polling time option and create the two service clients.

        :returns: ``S_OK()``
        """
        self.am_setOption("PollingTime", 60)
        self.ovc = OverlaySystemClient()
        self.jobmon = JobMonitoringClient()
        return S_OK()

    def execute(self):
        """ Cycle entry point, called by the Agent Reactor.

        Fetches the list of sites from the overlay system, asks the job
        monitoring for the current counters of jobs whose application status
        is 'Getting overlay files' at each site, and pushes the 'Running'
        count per site (0 when there is no such entry) back to the overlay
        system. Sites whose counter query fails are left out of the update.

        :returns: the failed result of ``getSites`` or ``setJobsAtSites``,
                  otherwise ``S_OK()``
        """
        sitesResult = self.ovc.getSites()
        if not sitesResult['OK']:
            return sitesResult

        siteNames = sitesResult['Value']
        gLogger.info("Will update info for sites %s" % siteNames)

        runningPerSite = {}
        for siteName in siteNames:
            selection = {"Site": siteName, "ApplicationStatus": 'Getting overlay files'}
            countersResult = self.jobmon.getCurrentJobCounters(selection)
            if countersResult['OK']:
                counters = countersResult['Value']
                # No 'Running' entry means no job is currently in that state.
                runningPerSite[siteName] = counters['Running'] if 'Running' in counters else 0

        gLogger.info("Setting new values %s" % runningPerSite)
        updateResult = self.ovc.setJobsAtSites(runningPerSite)
        if updateResult['OK']:
            return S_OK()

        gLogger.error(updateResult['Message'])
        return updateResult
class ResetCounters ( AgentModule ):
  """ Reset the number of jobs at all sites: some sites are not updated properly, so
  once in a while it's needed to restore the correct number of jobs.
  It does not need to be exact, but enough to clear some of the jobs.
  """
  def initialize(self):
    """ Initialize the agent.

    Sets the "PollingTime" option to 60 and creates the OverlaySystemClient
    and JobMonitoringClient instances used by :meth:`execute`.

    :returns: ``S_OK()``
    """
    self.am_setOption( "PollingTime", 60 )
    self.ovc = OverlaySystemClient()
    self.jobmon = JobMonitoringClient()
    return S_OK()

  def execute(self):
    """ This is called by the Agent Reactor.

    For every site known to the overlay system, look up the job counters of
    jobs with application status 'Getting overlay files' and store the
    'Running' count (0 if absent) as the new number of jobs at that site.

    :returns: the failed result of ``getSites`` or ``setJobsAtSites`` if one
              of them fails, ``S_OK()`` otherwise
    """
    res = self.ovc.getSites()
    if not res['OK']:
      return res
    sitedict = {}
    sites = res['Value']
    gLogger.info("Will update info for sites %s" % sites)
    for site in sites:
      attribdict = {"Site" : site, "ApplicationStatus": 'Getting overlay files'}
      res = self.jobmon.getCurrentJobCounters(attribdict)
      if not res['OK']:
        # Best effort: a site whose counters cannot be read is left untouched.
        continue
      # dict.has_key() no longer exists in Python 3; get() works on 2 and 3.
      sitedict[site] = res['Value'].get('Running', 0)
    gLogger.info("Setting new values %s" % sitedict)
    res = self.ovc.setJobsAtSites(sitedict)
    if not res['OK']:
      gLogger.error(res['Message'])
      return res

    return S_OK()
Beispiel #5
0
  def __getFilesLocaly(self):
    """ Download the overlay background files into a local directory.

    The method proceeds in four stages:

    1. Compute how many background files are needed from ``BXOverlay``,
       ``ggtohadint``, the number of signal events per job and the number of
       events per background file, capped by the ``/Overlay`` option
       ``MaxNbFilesToGet`` (default 20).
    2. Poll ``OverlaySystemClient.canRun`` for this site every 60 s until it
       answers positively. The watchdog is disabled while waiting. The wait
       is abandoned after more than 10 consecutive errors or more than 300
       negative answers.
    3. Create ``./overlayinput_<metaEventType>``, change into it and fetch
       randomly chosen, not yet tried entries of ``self.lfns``. Site specific
       copy methods are tried first for a few sites, with the DataManager as
       fallback. Each failed download is followed by roughly three minutes of
       CPU burning before the next attempt.
    4. Remove leftover ``*.sh`` scripts, log the directory content, change
       back to ``self.curdir`` and report ``jobDone`` to the overlay system.

    :returns: ``S_OK()`` when the requested number of files was obtained,
              ``S_ERROR`` when there are not enough events, the number of
              signal events is unknown, waiting for a slot failed, or the
              files could not all be downloaded
    """
    numberofeventstoget = ceil(self.BXOverlay * self.ggtohadint)
    nbfiles = len(self.lfns)
    availableevents = nbfiles * self.nbofeventsperfile
    if availableevents < numberofeventstoget:
      return S_ERROR("Number of %s events available is less than requested" % ( self.BkgEvtType ))

    if not self.NbSigEvtsPerJob:
      # Derive the number of signal events from the signal input files
      self.NbSigEvtsPerJob = self.nbinputsigfile * self.nbsigeventsperfile
    if not self.NbSigEvtsPerJob:
      return S_ERROR('Could not determine the number of signal events per job')
    LOG.verbose("There are %s signal event" % self.NbSigEvtsPerJob)
    # Now determine how many files are needed to cover all signal events
    totnboffilestoget = int(ceil(self.NbSigEvtsPerJob * numberofeventstoget / self.nbofeventsperfile))

    # Limit ourself to some configuration maximum
    levels = [self.machine, self.energytouse, self.detectormodel, self.BkgEvtType]
    maxNbFilesToGet = getOptionValue(ops=self.ops, basePath="/Overlay", optionName="MaxNbFilesToGet", defaultValue=20,
                                     levels=levels)
    totnboffilestoget = min(totnboffilestoget, maxNbFilesToGet)

    self.__disableWatchDog()
    overlaymon = OverlaySystemClient()
    # Now need to check that there are not that many concurrent jobs getting the overlay at the same time
    error_count = 0  # consecutive failed canRun calls
    count = 0        # number of "not yet" answers, one per minute of waiting
    while True:
      if error_count > 10:
        LOG.error('OverlayDB returned too many errors')
        return S_ERROR('Failed to get number of concurrent overlay jobs')

      res = overlaymon.canRun(self.site)
      if not res['OK']:
        error_count += 1
        time.sleep(60)
        continue
      error_count = 0
      if res['Value']:
        break

      count += 1
      if count > 300:
        return S_ERROR("Waited too long: 5h, so marking job as failed")
      if count % 10 == 0:
        self.setApplicationStatus("Overlay standby number %s" % count)
      time.sleep(60)

    self.__enableWatchDog()

    self.setApplicationStatus('Getting overlay files')

    LOG.info('Will obtain %s files for overlay' % totnboffilestoget)

    os.mkdir("./overlayinput_" + self.metaEventType)
    os.chdir("./overlayinput_" + self.metaEventType)
    filesobtained = []
    usednumbers = set()  # indices into self.lfns that were already tried
    fail = False
    fail_count = 0

    max_fail_allowed = self.ops.getValue("/Overlay/MaxFailedAllowed", 20)
    while len(filesobtained) != totnboffilestoget:
      if fail_count > max_fail_allowed:
        fail = True
        break

      # Every candidate file has been tried and the target is still not
      # reached: no untried index is left, so drawing again would loop
      # forever. Make sure the job fails instead.
      if len(usednumbers) >= nbfiles:
        LOG.error("Tried all %s available files but obtained only %s of the %s needed"
                  % (nbfiles, len(filesobtained), totnboffilestoget))
        fail = True
        break

      fileindex = random.randrange(nbfiles)
      if fileindex in usednumbers:
        continue

      usednumbers.add(fileindex)

      triedDataManager = False

      if self.site == 'LCG.CERN.ch':
        res = self.getEOSFile(self.lfns[fileindex])
      elif self.site == 'LCG.IN2P3-CC.fr':
        res = self.getLyonFile(self.lfns[fileindex])
      elif self.site == 'LCG.UKI-LT2-IC-HEP.uk':
        res = self.getImperialFile(self.lfns[fileindex])
      elif self.site == 'LCG.RAL-LCG2.uk':
        res = self.getRALFile(self.lfns[fileindex])
      elif self.site == 'LCG.KEK.jp':
        res = self.getKEKFile(self.lfns[fileindex])
      else:
        self.__disableWatchDog()
        res = self.datMan.getFile(self.lfns[fileindex])
        triedDataManager = True

      # In case the specific copying did not work (mostly because the files do
      # not exist locally) try again to get the file via the DataManager
      if (not res['OK']) and (not triedDataManager):
        res = self.datMan.getFile(self.lfns[fileindex])

      if not res['OK']:
        LOG.warn('Could not obtain %s' % self.lfns[fileindex])
        fail_count += 1
        # Wait for a random time around 3 minutes
        LOG.verbose("Waste happily some CPU time (on average 3 minutes)")
        resWaste = wasteCPUCycles(60 * random.gauss(3, 0.1))
        if not resWaste['OK']:
          LOG.error("Could not waste as much CPU time as wanted, but whatever!")
        continue

      filesobtained.append(self.lfns[fileindex])

    # Remove all scripts remaining
    scripts = glob.glob("*.sh")
    for script in scripts:
      os.remove(script)

    # Print the file list
    mylist = os.listdir(os.getcwd())
    LOG.info("List of Overlay files:")
    LOG.info("\n".join(mylist))
    os.chdir(self.curdir)
    # Tell the overlay system that this job no longer occupies a download slot
    res = overlaymon.jobDone(self.site)
    if not res['OK']:
      LOG.error("Could not declare the job as finished getting the files")
    if fail:
      LOG.error("Did not manage to get all files needed, too many errors")
      return S_ERROR("Failed to get files")
    LOG.info('Got all files needed.')
    return S_OK()