# Example 1
  def __getFilesLocaly(self):
    """Download the overlay background files needed by this job.

    Computes how many background files are required to cover the signal
    events, waits until the overlay service allows another concurrent
    download, then fetches randomly selected background files into a
    dedicated sub-directory. Site-specific transfer helpers are tried
    first, with the DataManager as fallback.

    :returns: S_OK() when all files were obtained, S_ERROR(...) otherwise
    """
    ## Number of background events needed per signal event
    numberofeventstoget = ceil(self.BXOverlay * self.ggtohadint)
    nbfiles = len(self.lfns)
    availableevents = nbfiles * self.nbofeventsperfile
    if availableevents < numberofeventstoget:
      return S_ERROR("Number of %s events available is less than requested" % ( self.BkgEvtType ))

    if not self.NbSigEvtsPerJob:
      ##Compute Nsignal events
      self.NbSigEvtsPerJob = self.nbinputsigfile * self.nbsigeventsperfile
    if not self.NbSigEvtsPerJob:
      return S_ERROR('Could not determine the number of signal events per job')
    self.log.verbose("There are %s signal event" % self.NbSigEvtsPerJob)
    ##Now determine how many files are needed to cover all signal events
    totnboffilestoget = int(ceil(self.NbSigEvtsPerJob * numberofeventstoget / self.nbofeventsperfile))

    ## Leftover debug print statements replaced with proper verbose logging
    self.log.verbose("totnboffilestoget=%s" % totnboffilestoget)
    self.log.verbose("numberofeventstoget=%s" % numberofeventstoget)
    self.log.verbose("NbSigEvtsPerJob=%s" % self.NbSigEvtsPerJob)
    self.log.verbose("numberofeventsperfile=%s" % self.nbofeventsperfile)

    ##Limit ourself to some configuration maximum
    maxNbFilesToGet = self.ops.getValue("/Overlay/MaxNbFilesToGet", 20)
    if totnboffilestoget > maxNbFilesToGet:
      totnboffilestoget = maxNbFilesToGet

    self.__disableWatchDog()
    overlaymon = RPCClient('Overlay/Overlay', timeout=60)
    ##Now need to check that there are not that many concurrent jobs getting the overlay at the same time
    error_count = 0
    count = 0
    while True:
      if error_count > 10:
        self.log.error('OverlayDB returned too many errors')
        return S_ERROR('Failed to get number of concurrent overlay jobs')

      res = overlaymon.canRun(self.site)
      if not res['OK']:
        ## Service hiccup: retry after a minute, give up after 10 failures in a row
        error_count += 1
        time.sleep(60)
        continue
      error_count = 0
      if res['Value']:
        break
      ## Not allowed to run yet: stand by, up to 300 minutes (5 hours)
      count += 1
      if count > 300:
        return S_ERROR("Waited too long: 5h, so marking job as failed")
      if count % 10 == 0:
        self.setApplicationStatus("Overlay standby number %s" % count)
      time.sleep(60)

    self.__enableWatchDog()

    self.setApplicationStatus('Getting overlay files')

    self.log.info('Will obtain %s files for overlay' % totnboffilestoget)

    os.mkdir("./overlayinput_" + self.BkgEvtType)
    os.chdir("./overlayinput_" + self.BkgEvtType)
    filesobtained = []
    usednumbers = []
    fail = False
    fail_count = 0

    max_fail_allowed = self.ops.getValue("/Overlay/MaxFailedAllowed", 20)
    while len(filesobtained) != totnboffilestoget:
      if fail_count > max_fail_allowed:
        fail = True
        break

      ## Pick a background file at random, never reusing an index
      fileindex = random.randrange(nbfiles)
      if fileindex not in usednumbers:

        usednumbers.append(fileindex)

        triedDataManager = False

        ## Use the site-specific accessor when one exists
        if self.site == 'LCG.CERN.ch':
          res = self.getCASTORFile(self.lfns[fileindex])
        elif self.site == 'LCG.IN2P3-CC.fr':
          res = self.getLyonFile(self.lfns[fileindex])
        elif self.site == 'LCG.UKI-LT2-IC-HEP.uk':
          res = self.getImperialFile(self.lfns[fileindex])
        elif self.site == 'LCG.RAL-LCG2.uk':
          res = self.getRALFile(self.lfns[fileindex])
        elif self.site == 'LCG.KEK.jp':
          res = self.getKEKFile(self.lfns[fileindex])
        else:
          self.__disableWatchDog()
          res = self.datMan.getFile(self.lfns[fileindex])
          triedDataManager = True

        #in case the specific copying did not work (mostly because the files do
        #not exist locally) try again to get the file via the DataManager
        if (not res['OK']) and (not triedDataManager):
          res = self.datMan.getFile(self.lfns[fileindex])

        if not res['OK']:
          self.log.warn('Could not obtain %s' % self.lfns[fileindex])
          fail_count += 1
          continue

        filesobtained.append(self.lfns[fileindex])

      ##If no file could be obtained, need to make sure the job fails
      if len(usednumbers) == nbfiles and not filesobtained:
        fail = True
        break

      if len(filesobtained) < totnboffilestoget:
        ##Now wait for a random time around 3 minutes
        ###Actually, waste CPU time !!!
        self.log.verbose("Waste happily some CPU time (on average 3 minutes)")
        res = wasteCPUCycles(60 * random.gauss(3, 0.1))
        if not res['OK']:
          self.log.error("Could not waste as much CPU time as wanted, but whatever!")

    ## Remove all scripts remaining
    scripts = glob.glob("*.sh")
    for script in scripts:
      os.remove(script)

    ##Print the file list
    mylist = os.listdir(os.getcwd())
    self.log.info("List of Overlay files:")
    self.log.info("\n".join(mylist))
    os.chdir(self.curdir)
    res = overlaymon.jobDone(self.site)
    if not res['OK']:
      self.log.error("Could not declare the job as finished getting the files")
    if fail:
      self.log.error("Did not manage to get all files needed, too many errors")
      return S_ERROR("Failed to get files")
    self.log.info('Got all files needed.')
    return S_OK()
# Example 2
  def __getFilesLocaly(self):
    """Download the overlay background files needed by this job.

    Computes how many background files are required to cover the signal
    events, waits until the overlay service allows another concurrent
    download, then fetches randomly selected background files into a
    dedicated sub-directory. Site-specific transfer helpers are tried
    first, with the ReplicaManager as fallback.

    :returns: S_OK() when all files were obtained, S_ERROR(...) otherwise
    """
    ## Number of background events needed per signal event
    numberofeventstoget = ceil(self.BXOverlay * self.ggtohadint)
    nbfiles = len(self.lfns)
    availableevents = nbfiles * self.nbofeventsperfile
    if availableevents < numberofeventstoget:
      return S_ERROR("Number of %s events available is less than requested" % ( self.BkgEvtType ))

    if not self.NbSigEvtsPerJob:
      ##Compute Nsignal events
      self.NbSigEvtsPerJob = self.nbinputsigfile * self.nbsigeventsperfile
    if not self.NbSigEvtsPerJob:
      return S_ERROR('Could not determine the number of signal events per job')
    self.log.verbose("There are %s signal event" % self.NbSigEvtsPerJob)
    ##Now determine how many files are needed to cover all signal events
    totnboffilestoget = int(ceil(self.NbSigEvtsPerJob * numberofeventstoget / self.nbofeventsperfile))

    ##Limit ourself to some configuration maximum
    maxNbFilesToGet = self.ops.getValue("/Overlay/MaxNbFilesToGet", 20)
    if totnboffilestoget > maxNbFilesToGet:
      totnboffilestoget = maxNbFilesToGet

    ## Drop a marker file so the watchdog does not kill us while we wait
    if not os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
      ## 'file' is a deprecated alias for 'open'; use open() within a context manager
      with open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w') as fdesc:
        fdesc.write('Dont look at cpu')
    overlaymon = RPCClient('Overlay/Overlay', timeout=60)
    ##Now need to check that there are not that many concurrent jobs getting the overlay at the same time
    error_count = 0
    count = 0
    while True:
      if error_count > 10:
        ## Fixed typo in the log message ("too any" -> "too many")
        self.log.error('OverlayDB returned too many errors')
        return S_ERROR('Failed to get number of concurrent overlay jobs')

      res = overlaymon.canRun(self.site)
      if not res['OK']:
        ## Service hiccup: retry after a minute, give up after 10 failures in a row
        error_count += 1
        time.sleep(60)
        continue
      error_count = 0
      if res['Value']:
        break
      ## Not allowed to run yet: stand by, up to 300 minutes (5 hours)
      count += 1
      if count > 300:
        return S_ERROR("Waited too long: 5h, so marking job as failed")
      if count % 10 == 0:
        self.setApplicationStatus("Overlay standby number %s" % count)
      time.sleep(60)

    ## Re-enable the watchdog check before the actual downloads
    if os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
      os.remove('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK')

    self.setApplicationStatus('Getting overlay files')

    self.log.info('Will obtain %s files for overlay' % totnboffilestoget)

    os.mkdir("./overlayinput_" + self.BkgEvtType)
    os.chdir("./overlayinput_" + self.BkgEvtType)
    filesobtained = []
    usednumbers = []
    fail = False
    fail_count = 0

    max_fail_allowed = self.ops.getValue("/Overlay/MaxFailedAllowed", 20)
    while len(filesobtained) != totnboffilestoget:
      if fail_count > max_fail_allowed:
        fail = True
        break

      ##Now wait for a random time around 3 minutes
      ###Actually, waste CPU time !!!
      self.log.verbose("Waste happily some CPU time (on average 3 minutes)")
      res = WasteCPUCycles(60 * random.gauss(3, 0.1))
      if not res['OK']:
        self.log.error("Could not waste as much CPU time as wanted, but whatever!")

      ## Pick a background file at random, never reusing an index
      fileindex = random.randrange(nbfiles)
      if fileindex not in usednumbers:

        usednumbers.append(fileindex)

        isDefault = False

        ## Use the site-specific accessor when one exists
        if self.site == 'LCG.CERN.ch':
          res = self.getCASTORFile(self.lfns[fileindex])
        elif self.site == 'LCG.IN2P3-CC.fr':
          res = self.getLyonFile(self.lfns[fileindex])
        elif self.site == 'LCG.UKI-LT2-IC-HEP.uk':
          res = self.getImperialFile(self.lfns[fileindex])
        elif self.site == 'LCG.RAL-LCG2.uk':
          res = self.getRALFile(self.lfns[fileindex])
        else:
          if not os.path.exists('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK'):
            with open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w') as fdesc:
              fdesc.write('Dont look at cpu')
          res = self.rm.getFile(self.lfns[fileindex])
          isDefault = True

        # Tue Jun 28 14:21:03 CEST 2011
        # Temporarily for Imperial College site until dCache is fixed

        if (not res['OK']) and (not isDefault) and \
          (self.site in ['LCG.UKI-LT2-IC-HEP.uk', 'LCG.IN2P3-CC.fr', 'LCG.CERN.ch']):
          res = self.rm.getFile(self.lfns[fileindex])

        if not res['OK']:
          self.log.warn('Could not obtain %s' % self.lfns[fileindex])
          fail_count += 1
          continue

        ## The ReplicaManager returns OK with a per-LFN 'Failed' dict: check it too
        if 'Failed' in res['Value']:
          if len(res['Value']['Failed']):
            self.log.warn('Could not obtain %s' % self.lfns[fileindex])
            fail_count += 1
            continue
        filesobtained.append(self.lfns[fileindex])

      ##If no file could be obtained, need to make sure the job fails
      if len(usednumbers) == nbfiles and not filesobtained:
        fail = True
        break

    ## Remove all scripts remaining
    scripts = glob.glob("*.sh")
    for script in scripts:
      os.remove(script)

    ##Print the file list
    mylist = os.listdir(os.getcwd())
    self.log.info("List of Overlay files:")
    self.log.info("\n".join(mylist))
    os.chdir(self.curdir)
    res = overlaymon.jobDone(self.site)
    if not res['OK']:
      self.log.error("Could not declare the job as finished getting the files")
    if fail:
      self.log.error("Did not manage to get all files needed, too many errors")
      return S_ERROR("Failed to get files")
    self.log.info('Got all files needed.')
    return S_OK()
# Example 3
  def __getFilesLocaly(self):
    """Download the overlay background files needed by this job.

    Computes how many background files are required to cover the signal
    events, waits until the overlay service allows another concurrent
    download, then fetches randomly selected background files into a
    dedicated sub-directory. Site-specific transfer helpers are tried
    first, with the DataManager as fallback.

    :returns: S_OK() when all files were obtained, S_ERROR(...) otherwise
    """
    ## Number of background events needed per signal event
    numberofeventstoget = ceil(self.BXOverlay * self.ggtohadint)
    nbfiles = len(self.lfns)
    availableevents = nbfiles * self.nbofeventsperfile
    if availableevents < numberofeventstoget:
      return S_ERROR("Number of %s events available is less than requested" % ( self.BkgEvtType ))

    if not self.NbSigEvtsPerJob:
      ##Compute Nsignal events
      self.NbSigEvtsPerJob = self.nbinputsigfile * self.nbsigeventsperfile
    if not self.NbSigEvtsPerJob:
      return S_ERROR('Could not determine the number of signal events per job')
    self.log.verbose("There are %s signal event" % self.NbSigEvtsPerJob)
    ##Now determine how many files are needed to cover all signal events
    totnboffilestoget = int(ceil(self.NbSigEvtsPerJob * numberofeventstoget / self.nbofeventsperfile))

    ##Limit ourself to some configuration maximum, resolved per machine/energy/detector/background
    levels = [self.machine, self.energytouse, self.detectormodel, self.BkgEvtType]
    maxNbFilesToGet = getOptionValue(ops=self.ops, basePath="/Overlay", optionName="MaxNbFilesToGet", defaultValue=20,
                                     levels=levels)

    if totnboffilestoget > maxNbFilesToGet:
      totnboffilestoget = maxNbFilesToGet

    self.__disableWatchDog()
    overlaymon = RPCClient('Overlay/Overlay', timeout=60)
    ##Now need to check that there are not that many concurrent jobs getting the overlay at the same time
    error_count = 0
    count = 0
    while True:
      if error_count > 10:
        self.log.error('OverlayDB returned too many errors')
        return S_ERROR('Failed to get number of concurrent overlay jobs')

      res = overlaymon.canRun(self.site)
      if not res['OK']:
        ## Service hiccup: retry after a minute, give up after 10 failures in a row
        error_count += 1
        time.sleep(60)
        continue
      error_count = 0
      if res['Value']:
        break
      ## Not allowed to run yet: stand by, up to 300 minutes (5 hours)
      count += 1
      if count > 300:
        return S_ERROR("Waited too long: 5h, so marking job as failed")
      if count % 10 == 0:
        self.setApplicationStatus("Overlay standby number %s" % count)
      time.sleep(60)

    self.__enableWatchDog()

    self.setApplicationStatus('Getting overlay files')

    self.log.info('Will obtain %s files for overlay' % totnboffilestoget)

    os.mkdir("./overlayinput_" + self.metaEventType)
    os.chdir("./overlayinput_" + self.metaEventType)
    filesobtained = []
    usednumbers = []
    fail = False
    fail_count = 0

    max_fail_allowed = self.ops.getValue("/Overlay/MaxFailedAllowed", 20)
    while len(filesobtained) != totnboffilestoget:
      if fail_count > max_fail_allowed:
        fail = True
        break

      ## Pick a background file at random, never reusing an index
      fileindex = random.randrange(nbfiles)
      if fileindex not in usednumbers:

        usednumbers.append(fileindex)

        triedDataManager = False

        ## Use the site-specific accessor when one exists
        if self.site == 'LCG.CERN.ch':
          res = self.getEOSFile(self.lfns[fileindex])
        elif self.site == 'LCG.IN2P3-CC.fr':
          res = self.getLyonFile(self.lfns[fileindex])
        elif self.site == 'LCG.UKI-LT2-IC-HEP.uk':
          res = self.getImperialFile(self.lfns[fileindex])
        elif self.site == 'LCG.RAL-LCG2.uk':
          res = self.getRALFile(self.lfns[fileindex])
        elif self.site == 'LCG.KEK.jp':
          res = self.getKEKFile(self.lfns[fileindex])
        else:
          self.__disableWatchDog()
          res = self.datMan.getFile(self.lfns[fileindex])
          triedDataManager = True

        #in case the specific copying did not work (mostly because the files do
        #not exist locally) try again to get the file via the DataManager
        if (not res['OK']) and (not triedDataManager):
          res = self.datMan.getFile(self.lfns[fileindex])

        if not res['OK']:
          self.log.warn('Could not obtain %s' % self.lfns[fileindex])
          fail_count += 1
          continue

        filesobtained.append(self.lfns[fileindex])
        ## Leftover debug print replaced with proper verbose logging
        self.log.verbose("Files obtained so far: %s" % filesobtained)

      ##If no file could be obtained, need to make sure the job fails
      if len(usednumbers) == nbfiles and not filesobtained:
        fail = True
        break

      if len(filesobtained) < totnboffilestoget:
        ##Now wait for a random time around 3 minutes
        ###Actually, waste CPU time !!!
        self.log.verbose("Waste happily some CPU time (on average 3 minutes)")
        res = wasteCPUCycles(60 * random.gauss(3, 0.1))
        if not res['OK']:
          self.log.error("Could not waste as much CPU time as wanted, but whatever!")

    ## Remove all scripts remaining
    scripts = glob.glob("*.sh")
    for script in scripts:
      os.remove(script)

    ##Print the file list
    mylist = os.listdir(os.getcwd())
    self.log.info("List of Overlay files:")
    self.log.info("\n".join(mylist))
    os.chdir(self.curdir)
    res = overlaymon.jobDone(self.site)
    if not res['OK']:
      self.log.error("Could not declare the job as finished getting the files")
    if fail:
      self.log.error("Did not manage to get all files needed, too many errors")
      return S_ERROR("Failed to get files")
    self.log.info('Got all files needed.')
    return S_OK()