Example #1
 def getMaxAtomSize(self, effectiveSize=False, getNumEvents=False):
     # number of files per job if defined
     if not self.isMerging:
         nFilesPerJob = self.taskSpec.getNumFilesPerJob()
     else:
         nFilesPerJob = self.taskSpec.getNumFilesPerMergeJob()
     nEventsPerJob = None
     if nFilesPerJob == None:
         # number of events per job
         if not self.isMerging:
             nEventsPerJob = self.taskSpec.getNumEventsPerJob()
         else:
             nEventsPerJob = self.taskSpec.getNumEventsPerMergeJob()
         if nEventsPerJob == None:
             nFilesPerJob = 1
     # grouping with boundaryID
     useBoundary = self.taskSpec.useGroupWithBoundaryID()
     # LB
     respectLB = self.taskSpec.respectLumiblock()
     maxAtomSize = 0
     while True:
         if not self.isMerging:
             maxNumFiles = self.taskSpec.getMaxNumFilesPerJob()
         else:
             maxNumFiles = self.taskSpec.getMaxNumFilesPerMergeJob()
         # get one subchunk
         subChunk = self.getSubChunk(None,
                                     nFilesPerJob=nFilesPerJob,
                                     nEventsPerJob=nEventsPerJob,
                                     useBoundary=useBoundary,
                                     respectLB=respectLB,
                                     maxNumFiles=maxNumFiles)
         if subChunk == None:
             break
         # get size
         tmpAtomSize = 0
         for tmpDatasetSpec, tmpFileSpecList in subChunk:
             if (effectiveSize
                     or getNumEvents) and not tmpDatasetSpec.isMaster():
                 continue
             for tmpFileSpec in tmpFileSpecList:
                 if effectiveSize:
                     tmpAtomSize += JediCoreUtils.getEffectiveFileSize(
                         tmpFileSpec.fsize, tmpFileSpec.startEvent,
                         tmpFileSpec.endEvent, tmpFileSpec.nEvents)
                 elif getNumEvents:
                     tmpAtomSize += tmpFileSpec.getEffectiveNumEvents()
                 else:
                     tmpAtomSize += tmpFileSpec.fsize
         if maxAtomSize < tmpAtomSize:
             maxAtomSize = tmpAtomSize
     # reset counters
     self.resetUsedCounters()
     # return
     return maxAtomSize
Example #2
 def getMaxAtomSize(self,effectiveSize=False,getNumEvents=False):
     # number of files per job if defined
     if not self.isMerging:
         nFilesPerJob = self.taskSpec.getNumFilesPerJob()
     else:
         nFilesPerJob = self.taskSpec.getNumFilesPerMergeJob()
     nEventsPerJob = None
     if nFilesPerJob == None:
         # number of events per job
         if not self.isMerging:
             nEventsPerJob = self.taskSpec.getNumEventsPerJob()
         else:
             nEventsPerJob = self.taskSpec.getNumEventsPerMergeJob()
         if nEventsPerJob == None:
             nFilesPerJob = 1
     # grouping with boundaryID
     useBoundary = self.taskSpec.useGroupWithBoundaryID()    
     # LB
     respectLB = self.taskSpec.respectLumiblock()
     maxAtomSize = 0    
     while True:
         if not self.isMerging:
             maxNumFiles = self.taskSpec.getMaxNumFilesPerJob()
         else:
             maxNumFiles = self.taskSpec.getMaxNumFilesPerMergeJob()
         # get one subchunk
         subChunk = self.getSubChunk(None,nFilesPerJob=nFilesPerJob,
                                     nEventsPerJob=nEventsPerJob,
                                     useBoundary=useBoundary,
                                     respectLB=respectLB,
                                     maxNumFiles=maxNumFiles)
         if subChunk == None:
             break
         # get size
         tmpAtomSize = 0
         for tmpDatasetSpec,tmpFileSpecList in subChunk:
             if (effectiveSize or getNumEvents) and not tmpDatasetSpec.isMaster():
                 continue
             for tmpFileSpec in tmpFileSpecList:
                 if effectiveSize:
                     tmpAtomSize += JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize,tmpFileSpec.startEvent,
                                                                       tmpFileSpec.endEvent,tmpFileSpec.nEvents)
                 elif getNumEvents:
                     tmpAtomSize += tmpFileSpec.getEffectiveNumEvents()
                 else:
                     tmpAtomSize += tmpFileSpec.fsize
         if maxAtomSize < tmpAtomSize:
             maxAtomSize = tmpAtomSize
     # reset counters
     self.resetUsedCounters()
     # return
     return maxAtomSize
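
Examples #1 and #2 carry the same getMaxAtomSize logic with different line wrapping. The method scans every remaining subchunk and returns the size of the largest one: raw bytes by default, event-range-weighted bytes with effectiveSize=True, or event counts with getNumEvents=True. A minimal usage sketch follows; inputChunk and maxInputSizeOnSite are illustrative assumptions, not names taken from the original code.

 # hypothetical caller: reject work whose smallest possible job
 # already exceeds an assumed per-site input limit
 maxInputSizeOnSite = 100 * 1024 * 1024 * 1024  # assumed 100 GB cap
 maxAtomBytes = inputChunk.getMaxAtomSize()
 maxAtomEffBytes = inputChunk.getMaxAtomSize(effectiveSize=True)
 maxAtomEvents = inputChunk.getMaxAtomSize(getNumEvents=True)
 if maxAtomBytes > maxInputSizeOnSite:
     print "smallest subchunk ({0} bytes) exceeds the cap".format(maxAtomBytes)
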
Example #3
 def wrappedMain(self):
     while True:
         proc = multiprocessing.Process(target=self.target, args=self.args)
         proc.start()
         pid = proc.pid
         while True:
             try:
                 proc.join(20)
                 if not JediCoreUtils.checkProcess(pid):
                     timeNow = datetime.datetime.utcnow()
                     print "{0} {1}: INFO    pid={2} not exist".format(
                         str(timeNow), self.__class__.__name__, pid)
                     break
             except:
                 timeNow = datetime.datetime.utcnow()
                 errType, errValue = sys.exc_info()[:2]
                 print "{0} {1}: INFO    failed to check pid={2} with {3} {4}".format(
                     str(timeNow), self.__class__.__name__, pid, errType,
                     errValue)
Example #4
 def wrappedMain(self):
     while True:
         proc = multiprocessing.Process(target=self.target,
                                        args=self.args)
         proc.start()
         pid = proc.pid
         while True:
             try:
                 proc.join(20)
                 if not JediCoreUtils.checkProcess(pid):
                     timeNow = datetime.datetime.utcnow()
                     print "{0} {1}: INFO    pid={2} not exist".format(str(timeNow),
                                                                       self.__class__.__name__,
                                                                       pid)
                     break
             except:
                 timeNow = datetime.datetime.utcnow()
                 errType,errValue = sys.exc_info()[:2]
                 print "{0} {1}: INFO    failed to check pid={2} with {3} {4}".format(str(timeNow),
                                                                                      self.__class__.__name__,
                                                                                      pid,errType,errValue)
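
Examples #3 and #4 are likewise one wrappedMain body in two wrappings. The idiom is a process watchdog: launch the worker in a child process, call join(20) so the parent wakes every 20 seconds, probe whether the pid is still alive, and respawn once it is gone. Below is a self-contained sketch of that idiom under the assumption that JediCoreUtils.checkProcess boils down to a POSIX liveness probe; that detail is guessed, not taken from the original.

 import os
 import sys
 import datetime
 import multiprocessing

 def checkAlive(pid):
     # assumed stand-in for JediCoreUtils.checkProcess: on POSIX,
     # signal 0 probes a pid without delivering anything
     try:
         os.kill(pid, 0)
         return True
     except OSError:
         return False

 def watchdog(target, args):
     # respawn the worker whenever its process disappears
     while True:
         proc = multiprocessing.Process(target=target, args=args)
         proc.start()
         pid = proc.pid
         while True:
             try:
                 proc.join(20)  # wake up every 20 seconds to poll
                 if not checkAlive(pid):
                     print "{0}: pid={1} does not exist, respawning".format(
                         datetime.datetime.utcnow(), pid)
                     break
             except Exception:
                 errType, errValue = sys.exc_info()[:2]
                 print "failed to check pid={0} with {1} {2}".format(
                     pid, errType, errValue)
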
Example #5
 def getSubChunk(self,
                 siteName,
                 maxNumFiles=None,
                 maxSize=None,
                 sizeGradients=0,
                 sizeIntercepts=0,
                 nFilesPerJob=None,
                 multiplicand=1,
                 walltimeGradient=0,
                 maxWalltime=0,
                 nEventsPerJob=None,
                 useBoundary=None,
                 sizeGradientsPerInSize=None,
                 maxOutSize=None,
                 coreCount=1,
                 respectLB=False,
                 corePower=None,
                 dynNumEvents=False,
                 maxNumEventRanges=None,
                 multiplicity=None,
                 splitByFields=None,
                 tmpLog=None,
                 useDirectIO=False):
     # check if there are unused files/events
     if not self.checkUnused():
         return None
     # protection against unreasonable values
     if nFilesPerJob == 0:
         nFilesPerJob = None
     if nEventsPerJob == 0:
         nEventsPerJob = None
     # set default max number of files
     if maxNumFiles == None:
         maxNumFiles = 50
     # set default max number of event ranges
     if maxNumEventRanges == None:
         maxNumEventRanges = 20
     # set default max size
     if maxSize == None and nFilesPerJob == None and nEventsPerJob == None:
         # 20 GB at most by default
         maxSize = 20 * 1024 * 1024 * 1024
     # set default output size
     minOutSize = self.defaultOutputSize
     # set default max number of events
     maxNumEvents = None
     # ignore negative walltime gradient
     if walltimeGradient < 0:
         walltimeGradient = 0
     # overwrite parameters when nFiles/EventsPerJob is used
     if nFilesPerJob != None and not dynNumEvents:
         maxNumFiles = nFilesPerJob
         if not respectLB:
             multiplicand = nFilesPerJob
     if nEventsPerJob != None:
         maxNumEvents = nEventsPerJob
     # split with boundaryID
     splitWithBoundaryID = False
     if useBoundary != None:
         splitWithBoundaryID = True
         if useBoundary['inSplit'] == 2:
             # unset max values to split only with boundaryID
             maxNumFiles = None
             maxSize = None
             maxWalltime = 0
             maxNumEvents = None
             multiplicand = 1
     # get site when splitting per site
     if siteName != None:
         siteCandidate = self.siteCandidates[siteName]
     # use event ratios
     useEventRatio = self.useEventRatioForSec()
     # start splitting
     inputNumFiles = 0
     inputNumEvents = 0
     fileSize = 0
     firstLoop = True
     firstMaster = True
     inputFileMap = {}
     expWalltime = 0
     nextStartEvent = None
     boundaryID = None
     newBoundaryID = False
     eventJump = False
     nSecFilesMap = {}
     nSecEventsMap = {}
     numMaster = 0
     outSizeMap = {}
     lumiBlockNr = None
     newLumiBlockNr = False
     siteAvailable = True
     inputFileSet = set()
     fieldStr = None
     while (maxNumFiles == None or (not dynNumEvents and inputNumFiles <= maxNumFiles) or \
                (dynNumEvents and len(inputFileSet) <= maxNumFiles and inputNumFiles <= maxNumEventRanges)) \
             and (maxSize == None or (maxSize != None and fileSize <= maxSize)) \
             and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
             and (maxNumEvents == None or (maxNumEvents != None and inputNumEvents <= maxNumEvents)) \
             and (maxOutSize == None or self.getOutSize(outSizeMap) <= maxOutSize):
         # get one file (or one file group for MP) from master
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         if not self.masterDataset.datasetID in outSizeMap:
             outSizeMap[self.masterDataset.datasetID] = 0
         boundaryIDs = set()
         primaryHasEvents = False
         for tmpFileSpec in self.masterDataset.Files[
                 datasetUsage['used']:datasetUsage['used'] + multiplicand]:
             # check start event to keep continuity
             if (maxNumEvents != None
                     or dynNumEvents) and tmpFileSpec.startEvent != None:
                 if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                     eventJump = True
                     break
             # check boundaryID
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                     and useBoundary['inSplit'] != 3:
                 newBoundaryID = True
                 break
             # check LB
             if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                 newLumiBlockNr = True
                 break
             # check field
             if splitByFields != None:
                 tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                 if fieldStr == None:
                     fieldStr = tmpFieldStr
                 elif tmpFieldStr != fieldStr:
                     newBoundaryID = True
                     break
             # check for distributed datasets
             if self.masterDataset.isDistributed() and siteName != None and \
                     not siteCandidate.isAvailableFile(tmpFileSpec):
                 siteAvailable = False
                 break
             if not inputFileMap.has_key(self.masterDataset.datasetID):
                 inputFileMap[self.masterDataset.datasetID] = []
             inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
             inputFileSet.add(tmpFileSpec.lfn)
             datasetUsage['used'] += 1
             numMaster += 1
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(
                 tmpFileSpec.fsize, tmpFileSpec.startEvent,
                 tmpFileSpec.endEvent, tmpFileSpec.nEvents)
             # get num of events
             effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
             # sum
             inputNumFiles += 1
             if self.taskSpec.outputScaleWithEvents():
                 fileSize += long(sizeGradients * effectiveNumEvents)
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     if not useDirectIO:
                         fileSize += long(tmpFileSpec.fsize)
                 outSizeMap[self.masterDataset.datasetID] += long(
                     sizeGradients * effectiveNumEvents)
             else:
                 fileSize += long(sizeGradients * effectiveFsize)
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     if not useDirectIO:
                         fileSize += long(tmpFileSpec.fsize)
                 outSizeMap[self.masterDataset.datasetID] += long(
                     sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 fileSize += long(effectiveFsize * sizeGradientsPerInSize)
                 outSizeMap[self.masterDataset.datasetID] += long(
                     effectiveFsize * sizeGradientsPerInSize)
             # sum offset only for the first master
             if firstMaster:
                 fileSize += sizeIntercepts
             # walltime
             if self.taskSpec.useHS06():
                 if firstMaster:
                     expWalltime += self.taskSpec.baseWalltime
                 tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(
                     coreCount)
                 if not corePower in [None, 0]:
                     tmpExpWalltime /= corePower
                 if self.taskSpec.cpuEfficiency == 0:
                     tmpExpWalltime = 0
                 else:
                     tmpExpWalltime /= float(
                         self.taskSpec.cpuEfficiency) / 100.0
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 expWalltime += long(tmpExpWalltime)
             else:
                 tmpExpWalltime = walltimeGradient * effectiveFsize / float(
                     coreCount)
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 expWalltime += long(tmpExpWalltime)
             # the number of events
             if (
                     maxNumEvents != None or useEventRatio
             ) and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 primaryHasEvents = True
                 inputNumEvents += (tmpFileSpec.endEvent -
                                    tmpFileSpec.startEvent + 1)
                 # set next start event
                 nextStartEvent = tmpFileSpec.endEvent + 1
                 if nextStartEvent == tmpFileSpec.nEvents:
                     nextStartEvent = 0
             # boundaryID
             if splitWithBoundaryID:
                 boundaryID = tmpFileSpec.boundaryID
                 if not boundaryID in boundaryIDs:
                     boundaryIDs.add(boundaryID)
             # LB
             if respectLB:
                 lumiBlockNr = tmpFileSpec.lumiBlockNr
             firstMaster = False
         # get files from secondaries
         firstSecondary = True
         for datasetSpec in self.secondaryDatasetList:
             if not datasetSpec.datasetID in outSizeMap:
                 outSizeMap[datasetSpec.datasetID] = 0
             if datasetSpec.isNoSplit():
                 # every job uses dataset without splitting
                 if firstLoop:
                     datasetUsage = self.datasetMap[datasetSpec.datasetID]
                     for tmpFileSpec in datasetSpec.Files:
                         if not inputFileMap.has_key(datasetSpec.datasetID):
                             inputFileMap[datasetSpec.datasetID] = []
                         inputFileMap[datasetSpec.datasetID].append(
                             tmpFileSpec)
                         # sum
                         if not useDirectIO:
                             fileSize += tmpFileSpec.fsize
                         if sizeGradientsPerInSize != None:
                             fileSize += (tmpFileSpec.fsize *
                                          sizeGradientsPerInSize)
                             outSizeMap[datasetSpec.datasetID] += (
                                 tmpFileSpec.fsize * sizeGradientsPerInSize)
                         datasetUsage['used'] += 1
             else:
                 if not nSecFilesMap.has_key(datasetSpec.datasetID):
                     nSecFilesMap[datasetSpec.datasetID] = 0
                 # get number of files to be used for the secondary
                 nSecondary = datasetSpec.getNumFilesPerJob()
                 if nSecondary != None and firstLoop == False:
                     # read files only in the first bunch when number of files per job is specified
                     continue
                 if nSecondary == None:
                     nSecondary = datasetSpec.getNumMultByRatio(
                         numMaster) - nSecFilesMap[datasetSpec.datasetID]
                     if (datasetSpec.getEventRatio() != None
                             and inputNumEvents > 0) or (
                                 splitWithBoundaryID
                                 and useBoundary['inSplit'] != 3):
                         # set large number to get all associated secondary files
                         nSecondary = 10000
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                 # reset nUsed
                 if datasetSpec.isReusable(
                 ) and datasetUsage['used'] + nSecondary > len(
                         datasetSpec.Files):
                     datasetUsage['used'] = 0
                 for tmpFileSpec in datasetSpec.Files[
                         datasetUsage['used']:datasetUsage['used'] +
                         nSecondary]:
                     # check boundaryID
                     if (splitWithBoundaryID or (useBoundary != None and useBoundary['inSplit'] == 3 and datasetSpec.getRatioToMaster() > 1)) \
                             and boundaryID != None and \
                             not (boundaryID == tmpFileSpec.boundaryID or tmpFileSpec.boundaryID in boundaryIDs):
                         break
                     # check for distributed datasets
                     if datasetSpec.isDistributed() and siteName != None and \
                             not siteCandidate.isAvailableFile(tmpFileSpec):
                         break
                     # check ratio
                     if not datasetSpec.datasetID in nSecEventsMap:
                         nSecEventsMap[datasetSpec.datasetID] = 0
                     if datasetSpec.getEventRatio(
                     ) != None and inputNumEvents > 0:
                         if float(
                                 nSecEventsMap[datasetSpec.datasetID]
                         ) / float(inputNumEvents
                                   ) >= datasetSpec.getEventRatio():
                             break
                     if not inputFileMap.has_key(datasetSpec.datasetID):
                         inputFileMap[datasetSpec.datasetID] = []
                     inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                     # sum
                     if not useDirectIO:
                         fileSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         fileSize += (tmpFileSpec.fsize *
                                      sizeGradientsPerInSize)
                         outSizeMap[datasetSpec.datasetID] += (
                             tmpFileSpec.fsize * sizeGradientsPerInSize)
                     datasetUsage['used'] += 1
                     nSecFilesMap[datasetSpec.datasetID] += 1
                     # the number of events
                     if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                         if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                             inputNumEvents += (tmpFileSpec.endEvent -
                                                tmpFileSpec.startEvent + 1)
                         elif tmpFileSpec.nEvents != None:
                             inputNumEvents += tmpFileSpec.nEvents
                     if tmpFileSpec.nEvents != None:
                         nSecEventsMap[
                             datasetSpec.datasetID] += tmpFileSpec.nEvents
                 # use only the first secondary
                 firstSecondary = False
         # unset first loop flag
         firstLoop = False
         # check if there are unused files/events
         if not self.checkUnused():
             break
         # break if nFilesPerJob is used as multiplicand
         if nFilesPerJob != None and not respectLB:
             break
         # boundaryID is changed
         if newBoundaryID:
             break
         # LB is changed
         if newLumiBlockNr:
             break
         # event jump
         if eventJump:
             break
         # distributed files are unavailable
         if not siteAvailable:
             break
         primaryHasEvents = False
         # check master in the next loop
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         newInputNumFiles = inputNumFiles
         newInputNumEvents = inputNumEvents
         newFileSize = fileSize
         newExpWalltime = expWalltime
         newNextStartEvent = nextStartEvent
         newNumMaster = numMaster
         terminateFlag = False
         newOutSizeMap = copy.copy(outSizeMap)
         newBoundaryIDs = set()
         newInputFileSet = copy.copy(inputFileSet)
         if not self.masterDataset.datasetID in newOutSizeMap:
             newOutSizeMap[self.masterDataset.datasetID] = 0
         for tmpFileSpec in self.masterDataset.Files[
                 datasetUsage['used']:datasetUsage['used'] + multiplicand]:
             # check continuity of event
             if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 primaryHasEvents = True
                 newInputNumEvents += (tmpFileSpec.endEvent -
                                       tmpFileSpec.startEvent + 1)
                 # continuity of event is broken
                 if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                     # no files in the next loop
                     if newInputNumFiles == 0:
                         terminateFlag = True
                     break
                 newNextStartEvent = tmpFileSpec.endEvent + 1
             # check boundary
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                     and useBoundary['inSplit'] != 3:
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # check LB
             if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # check field
             if splitByFields != None:
                 tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                 if tmpFieldStr != fieldStr:
                     # no files in the next loop
                     if newInputNumFiles == 0:
                         terminateFlag = True
                     break
             # check for distributed datasets
             if self.masterDataset.isDistributed() and siteName != None and \
                     not siteCandidate.isAvailableFile(tmpFileSpec):
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(
                 tmpFileSpec.fsize, tmpFileSpec.startEvent,
                 tmpFileSpec.endEvent, tmpFileSpec.nEvents)
             # get num of events
             effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
             newInputNumFiles += 1
             newNumMaster += 1
             newInputFileSet.add(tmpFileSpec.lfn)
             if self.taskSpec.outputScaleWithEvents():
                 newFileSize += long(sizeGradients * effectiveNumEvents)
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     if not useDirectIO:
                         newFileSize += long(tmpFileSpec.fsize)
                 newOutSizeMap[self.masterDataset.datasetID] += long(
                     sizeGradients * effectiveNumEvents)
             else:
                 newFileSize += long(sizeGradients * effectiveFsize)
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     if not useDirectIO:
                         newFileSize += long(tmpFileSpec.fsize)
                 newOutSizeMap[self.masterDataset.datasetID] += long(
                     sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 newFileSize += long(effectiveFsize *
                                     sizeGradientsPerInSize)
                 newOutSizeMap[self.masterDataset.datasetID] += long(
                     effectiveFsize * sizeGradientsPerInSize)
             if self.taskSpec.useHS06():
                 tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(
                     coreCount)
                 if not corePower in [None, 0]:
                     tmpExpWalltime /= corePower
                 if self.taskSpec.cpuEfficiency == 0:
                     tmpExpWalltime = 0
                 else:
                     tmpExpWalltime /= float(
                         self.taskSpec.cpuEfficiency) / 100.0
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 newExpWalltime += long(tmpExpWalltime)
             else:
                 tmpExpWalltime = walltimeGradient * effectiveFsize / float(
                     coreCount)
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 newExpWalltime += long(tmpExpWalltime)
             # boundaryID
             if splitWithBoundaryID:
                 newBoundaryIDs.add(tmpFileSpec.boundaryID)
         # check secondaries
         firstSecondary = True
         for datasetSpec in self.secondaryDatasetList:
             if not datasetSpec.datasetID in newOutSizeMap:
                 newOutSizeMap[datasetSpec.datasetID] = 0
             if not datasetSpec.isNoSplit(
             ) and datasetSpec.getNumFilesPerJob() == None:
                 # check boundaryID
                 if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                         and useBoundary['inSplit'] != 3:
                     break
                 newNumSecondary = datasetSpec.getNumMultByRatio(
                     newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                for tmpFileSpec in datasetSpec.Files[
                        datasetUsage['used']:datasetUsage['used'] +
                        newNumSecondary]:
                     # check boundaryID
                     if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                             and not tmpFileSpec.boundaryID in boundaryIDs and not tmpFileSpec.boundaryID in newBoundaryIDs:
                         break
                     if not useDirectIO:
                         newFileSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         newFileSize += (tmpFileSpec.fsize *
                                         sizeGradientsPerInSize)
                         newOutSizeMap[datasetSpec.datasetID] += (
                             tmpFileSpec.fsize * sizeGradientsPerInSize)
                     # the number of events
                     if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                         if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                             newInputNumEvents += (tmpFileSpec.endEvent -
                                                   tmpFileSpec.startEvent +
                                                   1)
                         elif tmpFileSpec.nEvents != None:
                             newInputNumEvents += tmpFileSpec.nEvents
                 firstSecondary = False
         # termination
         if terminateFlag:
             break
         # check
         newOutSize = self.getOutSize(newOutSizeMap)
         if (maxNumFiles != None and ((not dynNumEvents and newInputNumFiles > maxNumFiles) \
                                          or (dynNumEvents and (len(newInputFileSet) > maxNumFiles or newInputNumFiles > maxNumEventRanges)))) \
                 or (maxSize != None and newFileSize > maxSize) \
                 or (maxSize != None and newOutSize < minOutSize and maxSize-minOutSize < newFileSize-newOutSize) \
                 or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                 or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                 or (maxOutSize != None and self.getOutSize(newOutSizeMap) > maxOutSize):
             break
     # reset nUsed for repeated datasets
     for tmpDatasetID, datasetUsage in self.datasetMap.iteritems():
         tmpDatasetSpec = datasetUsage['datasetSpec']
         if tmpDatasetSpec.isRepeated():
             if len(tmpDatasetSpec.Files) > 0:
                 datasetUsage['used'] %= len(tmpDatasetSpec.Files)
     # make copy to return
     returnList = []
     for tmpDatasetID, inputFileList in inputFileMap.iteritems():
         tmpRetList = []
         for tmpFileSpec in inputFileList:
             # split per site or get atomic subchunk
             if siteName != None:
                 # make copy to individually set locality
                 newFileSpec = copy.copy(tmpFileSpec)
                 # set locality
                 newFileSpec.locality = siteCandidate.getFileLocality(
                     tmpFileSpec)
                 if newFileSpec.locality == 'remote':
                     newFileSpec.sourceName = siteCandidate.remoteSource
                 # append
                 tmpRetList.append(newFileSpec)
             else:
                 # getting atomic subchunk
                 tmpRetList.append(tmpFileSpec)
         # add to return map
         tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)
         returnList.append((tmpDatasetSpec, tmpRetList))
     # return
     return returnList
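
getSubChunk sizes everything through JediCoreUtils.getEffectiveFileSize(fsize, startEvent, endEvent, nEvents), whose body does not appear in these examples. Judging purely from the call sites, it plausibly charges a job only the fraction of the file its event range actually reads. The sketch below spells out that assumption; it is a guess at the behavior, not the library's implementation.

 def getEffectiveFileSize(fsize, startEvent, endEvent, nEvents):
     # assumed behavior, inferred from the call sites above: a job reading
     # only events [startEvent, endEvent] of an nEvents-event file is
     # charged the corresponding fraction of fsize
     if fsize is None or nEvents in (None, 0):
         return fsize
     if startEvent is None or endEvent is None:
         return fsize
     usedEvents = endEvent - startEvent + 1
     return float(fsize) * usedEvents / float(nEvents)
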
Example #6
 def getSubChunk(self,siteName,maxNumFiles=None,maxSize=None,
                 sizeGradients=0,sizeIntercepts=0,
                 nFilesPerJob=None,multiplicand=1,
                 walltimeGradient=0,maxWalltime=0,
                 nEventsPerJob=None,useBoundary=None,
                 sizeGradientsPerInSize=None,
                 maxOutSize=None,
                 coreCount=1,
                 respectLB=False,
                 corePower=None,
                 dynNumEvents=False,
                 maxNumEventRanges=None,
                 multiplicity=None,
                 splitByFields=None, 
                 tmpLog=None,
                 useDirectIO=False,
                 maxDiskSize=None):
     # check if there are unused files/events
     if not self.checkUnused():
         return None
     # protection against unreasonable values
     if nFilesPerJob == 0:
         nFilesPerJob = None
     if nEventsPerJob == 0:
         nEventsPerJob = None
     # set default max number of files
     if maxNumFiles == None:
         maxNumFiles = 50
     # set default max number of event ranges
     if maxNumEventRanges == None:
         maxNumEventRanges = 20
     # set default max size    
     if maxSize == None and nFilesPerJob == None and nEventsPerJob == None:
         # 20 GB at most by default
         maxSize = 20 * 1024 * 1024 * 1024
     # set default output size
     minOutSize = self.defaultOutputSize 
     # set default max number of events
     maxNumEvents = None
     # ignore negative walltime gradient
     if walltimeGradient < 0:
         walltimeGradient = 0
     # overwrite parameters when nFiles/EventsPerJob is used
     if nFilesPerJob != None and not dynNumEvents:
         maxNumFiles  = nFilesPerJob
         if not respectLB:
             multiplicand = nFilesPerJob
     if nEventsPerJob != None:
         maxNumEvents = nEventsPerJob
     # split with boundaryID
     splitWithBoundaryID = False    
     if useBoundary != None:
         splitWithBoundaryID = True
         if useBoundary['inSplit'] == 2:
             # unset max values to split only with boundaryID 
             maxNumFiles = None
             maxSize = None
             maxWalltime = 0
             maxNumEvents = None
             multiplicand = 1
     # get site when splitting per site
     if siteName != None:
         siteCandidate = self.siteCandidates[siteName]
     # use event ratios
     useEventRatio = self.useEventRatioForSec()
     # start splitting
     inputNumFiles  = 0
     inputNumEvents = 0 
     fileSize       = 0
     firstLoop      = True
     firstMaster    = True
     inputFileMap   = {}
     expWalltime    = 0
     nextStartEvent = None
     boundaryID     = None
     newBoundaryID  = False
     eventJump      = False
     nSecFilesMap   = {}
     nSecEventsMap  = {}
     numMaster      = 0
     outSizeMap     = {}
     lumiBlockNr    = None
     newLumiBlockNr = False
     siteAvailable  = True
     inputFileSet   = set()
     fieldStr       = None
     diskSize       = 0
     while (maxNumFiles == None or (not dynNumEvents and inputNumFiles <= maxNumFiles) or \
                (dynNumEvents and len(inputFileSet) <= maxNumFiles and inputNumFiles <= maxNumEventRanges)) \
             and (maxSize == None or (maxSize != None and fileSize <= maxSize)) \
             and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
             and (maxNumEvents == None or (maxNumEvents != None and inputNumEvents <= maxNumEvents)) \
             and (maxOutSize == None or self.getOutSize(outSizeMap) <= maxOutSize) \
             and (maxDiskSize is None or diskSize <= maxDiskSize):
         # get one file (or one file group for MP) from master
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         if not self.masterDataset.datasetID in outSizeMap:
             outSizeMap[self.masterDataset.datasetID] = 0
         boundaryIDs = set()
         primaryHasEvents = False
         for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
             # check start event to keep continuity
             if (maxNumEvents != None or dynNumEvents) and tmpFileSpec.startEvent != None:
                 if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                     eventJump = True
                     break
             # check boundaryID
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                     and useBoundary['inSplit'] != 3:
                 newBoundaryID = True
                 break
             # check LB
             if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                 newLumiBlockNr = True
                 break
             # check field
             if splitByFields != None:
                 tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                 if fieldStr == None:
                     fieldStr = tmpFieldStr
                 elif tmpFieldStr != fieldStr:
                     newBoundaryID = True
                     break
             # check for distributed datasets
             if self.masterDataset.isDistributed() and siteName != None and \
                     not siteCandidate.isAvailableFile(tmpFileSpec):
                 siteAvailable = False
                 break
             if not inputFileMap.has_key(self.masterDataset.datasetID):
                 inputFileMap[self.masterDataset.datasetID] = []
             inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
             inputFileSet.add(tmpFileSpec.lfn)
             datasetUsage['used'] += 1
             numMaster += 1
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize,tmpFileSpec.startEvent,
                                                                 tmpFileSpec.endEvent,tmpFileSpec.nEvents)
             # get num of events
             effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
             # sum
             inputNumFiles += 1
             if self.taskSpec.outputScaleWithEvents():
                 tmpOutSize = long(sizeGradients * effectiveNumEvents)
                 fileSize += tmpOutSize
                 diskSize += tmpOutSize
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     fileSize += long(tmpFileSpec.fsize)    
                     if not useDirectIO:
                         diskSize += long(tmpFileSpec.fsize)                                
                 outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
             else:
                 tmpOutSize = long(sizeGradients * effectiveFsize)
                 fileSize += tmpOutSize
                 diskSize += tmpOutSize
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     fileSize += long(tmpFileSpec.fsize)
                     if not useDirectIO:
                         diskSize += long(tmpFileSpec.fsize)
                 outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 tmpOutSize = long(effectiveFsize * sizeGradientsPerInSize)
                 fileSize += tmpOutSize
                 diskSize += tmpOutSize
                 outSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
             # sum offset only for the first master
             if firstMaster:
                 fileSize += sizeIntercepts
             # walltime
             if self.taskSpec.useHS06():
                 if firstMaster:
                     expWalltime += self.taskSpec.baseWalltime
                 tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                 if not corePower in [None,0]:
                     tmpExpWalltime /= corePower
                 if self.taskSpec.cpuEfficiency == 0:
                     tmpExpWalltime = 0
                 else:
                     tmpExpWalltime /= float(self.taskSpec.cpuEfficiency)/100.0
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 expWalltime += long(tmpExpWalltime)
             else:
                 tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 expWalltime += long(tmpExpWalltime)
             # the number of events
             if (maxNumEvents != None or useEventRatio) and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 primaryHasEvents = True
                 inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                 # set next start event
                 nextStartEvent = tmpFileSpec.endEvent + 1
                 if nextStartEvent == tmpFileSpec.nEvents:
                     nextStartEvent = 0
             # boundaryID
             if splitWithBoundaryID:
                 boundaryID = tmpFileSpec.boundaryID
                 if not boundaryID in boundaryIDs:
                     boundaryIDs.add(boundaryID)
             # LB
             if respectLB:
                 lumiBlockNr = tmpFileSpec.lumiBlockNr
             firstMaster = False
         # get files from secondaries
         firstSecondary = True
         for datasetSpec in self.secondaryDatasetList:
             if not datasetSpec.datasetID in outSizeMap:
                 outSizeMap[datasetSpec.datasetID] = 0
             if datasetSpec.isNoSplit():
                 # every job uses dataset without splitting
                 if firstLoop:
                     datasetUsage = self.datasetMap[datasetSpec.datasetID]
                     for tmpFileSpec in datasetSpec.Files:
                         if not inputFileMap.has_key(datasetSpec.datasetID):
                             inputFileMap[datasetSpec.datasetID] = []
                         inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                         # sum
                         fileSize += tmpFileSpec.fsize
                         if not useDirectIO:
                             diskSize += tmpFileSpec.fsize
                         if sizeGradientsPerInSize != None:
                             tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                             fileSize += tmpOutSize
                             diskSize += tmpOutSize
                             outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                         datasetUsage['used'] += 1
             else:
                 if not nSecFilesMap.has_key(datasetSpec.datasetID):
                     nSecFilesMap[datasetSpec.datasetID] = 0
                 # get number of files to be used for the secondary
                 nSecondary = datasetSpec.getNumFilesPerJob()
                 if nSecondary != None and firstLoop == False:
                     # read files only in the first bunch when number of files per job is specified
                     continue
                 if nSecondary == None:
                     nSecondary = datasetSpec.getNumMultByRatio(numMaster) - nSecFilesMap[datasetSpec.datasetID]
                     if (datasetSpec.getEventRatio() != None and inputNumEvents > 0) or (splitWithBoundaryID and useBoundary['inSplit'] != 3):
                         # set large number to get all associated secondary files
                         nSecondary = 10000
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                 # reset nUsed
                 if datasetSpec.isReusable() and datasetUsage['used']+nSecondary > len(datasetSpec.Files):
                     datasetUsage['used'] = 0
                 for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                     # check boundaryID
                     if (splitWithBoundaryID or (useBoundary != None and useBoundary['inSplit'] == 3 and datasetSpec.getRatioToMaster() > 1)) \
                             and boundaryID != None and \
                             not (boundaryID == tmpFileSpec.boundaryID or tmpFileSpec.boundaryID in boundaryIDs):
                         break
                     # check for distributed datasets
                     if datasetSpec.isDistributed() and siteName != None and \
                             not siteCandidate.isAvailableFile(tmpFileSpec):
                         break
                     # check ratio
                     if not datasetSpec.datasetID in nSecEventsMap:
                         nSecEventsMap[datasetSpec.datasetID] = 0
                     if datasetSpec.getEventRatio() != None and inputNumEvents > 0:
                         if float(nSecEventsMap[datasetSpec.datasetID]) / float(inputNumEvents) >= datasetSpec.getEventRatio():
                             break
                     if not inputFileMap.has_key(datasetSpec.datasetID):
                         inputFileMap[datasetSpec.datasetID] = []
                     inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                     # sum
                     fileSize += tmpFileSpec.fsize
                     if not useDirectIO:
                         diskSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                         fileSize += tmpOutSize
                         diskSize += tmpOutSize
                         outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                     datasetUsage['used'] += 1
                     nSecFilesMap[datasetSpec.datasetID] += 1
                     # the number of events
                     if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                         if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                             inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                         elif tmpFileSpec.nEvents != None:
                             inputNumEvents += tmpFileSpec.nEvents
                     if tmpFileSpec.nEvents != None:
                         nSecEventsMap[datasetSpec.datasetID] += tmpFileSpec.nEvents
                 # use only the first secondary
                 firstSecondary = False
         # unset first loop flag
         firstLoop = False
         # check if there are unused files/events
         if not self.checkUnused():
             break
         # break if nFilesPerJob is used as multiplicand
         if nFilesPerJob != None and not respectLB:
             break
         # boundaryID is changed
         if newBoundaryID:
             break
         # LB is changed
         if newLumiBlockNr:
             break
         # event jump
         if eventJump:
             break
         # distributed files are unavailable
         if not siteAvailable:
             break
         primaryHasEvents = False
         # check master in the next loop
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         newInputNumFiles  = inputNumFiles
         newInputNumEvents = inputNumEvents
         newFileSize       = fileSize
         newExpWalltime    = expWalltime
         newNextStartEvent = nextStartEvent
         newNumMaster      = numMaster
         terminateFlag     = False
         newOutSizeMap     = copy.copy(outSizeMap)
         newBoundaryIDs    = set()
         newInputFileSet   = copy.copy(inputFileSet)
         newDiskSize       = diskSize
         if not self.masterDataset.datasetID in newOutSizeMap:
             newOutSizeMap[self.masterDataset.datasetID] = 0
         for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
             # check continuity of event
             if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 primaryHasEvents = True
                 newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                 # continuity of event is broken
                 if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                     # no files in the next loop
                     if newInputNumFiles == 0: 
                         terminateFlag = True
                     break
                 newNextStartEvent = tmpFileSpec.endEvent + 1
             # check boundary
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                     and useBoundary['inSplit'] != 3:
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # check LB
             if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # check field
             if splitByFields != None:
                 tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                 if tmpFieldStr != fieldStr:
                     # no files in the next loop
                     if newInputNumFiles == 0:
                         terminateFlag = True
                     break
             # check for distributed datasets
             if self.masterDataset.isDistributed() and siteName != None and \
                     not siteCandidate.isAvailableFile(tmpFileSpec):
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize,tmpFileSpec.startEvent,
                                                                 tmpFileSpec.endEvent,tmpFileSpec.nEvents)
             # get num of events
             effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
             newInputNumFiles += 1
             newNumMaster += 1
             newInputFileSet.add(tmpFileSpec.lfn)
             if self.taskSpec.outputScaleWithEvents():
                 tmpOutSize = long(sizeGradients * effectiveNumEvents)
                 newFileSize += tmpOutSize
                 newDiskSize += tmpOutSize
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     newFileSize += long(tmpFileSpec.fsize)
                     if not useDirectIO:
                         newDiskSize += long(tmpFileSpec.fsize)
                 newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
             else:
                 tmpOutSize = long(sizeGradients * effectiveFsize)
                 newFileSize += tmpOutSize
                 newDiskSize += tmpOutSize
                 if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                     newFileSize += long(tmpFileSpec.fsize)
                     if not useDirectIO:
                         newDiskSize += long(tmpFileSpec.fsize)
                 newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 tmpOutSize = long(effectiveFsize * sizeGradientsPerInSize)
                 newFileSize += tmpOutSize
                 newDiskSize += tmpOutSize
                 newOutSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
             if self.taskSpec.useHS06():
                 tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                 if not corePower in [None,0]:
                     tmpExpWalltime /= corePower
                 if self.taskSpec.cpuEfficiency == 0:
                     tmpExpWalltime = 0
                 else:
                     tmpExpWalltime /= float(self.taskSpec.cpuEfficiency)/100.0
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 newExpWalltime += long(tmpExpWalltime)
             else:
                 tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                 if multiplicity != None:
                     tmpExpWalltime /= float(multiplicity)
                 newExpWalltime += long(tmpExpWalltime)
             # boundaryID
             if splitWithBoundaryID:
                 newBoundaryIDs.add(tmpFileSpec.boundaryID)
         # check secondaries
         firstSecondary = True
         for datasetSpec in self.secondaryDatasetList:
             if not datasetSpec.datasetID in newOutSizeMap:
                 newOutSizeMap[datasetSpec.datasetID] = 0
             if not datasetSpec.isNoSplit() and datasetSpec.getNumFilesPerJob() == None:
                 # check boundaryID
                 if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                         and useBoundary['inSplit'] != 3:
                     break
                 newNumSecondary = datasetSpec.getNumMultByRatio(newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                 for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+newNumSecondary]:
                     # check boundaryID
                     if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                             and not tmpFileSpec.boundaryID in boundaryIDs and not tmpFileSpec.boundaryID in newBoundaryIDs:
                         break
                     newFileSize += tmpFileSpec.fsize
                     if not useDirectIO:
                         newDiskSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                         newFileSize += tmpOutSize
                         newDiskSize += tmpOutSize
                         newOutSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                     # the number of events
                     if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                         if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                             newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                         elif tmpFileSpec.nEvents != None:
                             newInputNumEvents += tmpFileSpec.nEvents
                 firstSecondary = False
         # termination            
         if terminateFlag:
             break
         # check
         newOutSize = self.getOutSize(newOutSizeMap)
         if (maxNumFiles != None and ((not dynNumEvents and newInputNumFiles > maxNumFiles) \
                                          or (dynNumEvents and (len(newInputFileSet) > maxNumFiles or newInputNumFiles > maxNumEventRanges)))) \
                 or (maxSize != None and newFileSize > maxSize) \
                 or (maxSize != None and newOutSize < minOutSize and maxSize-minOutSize < newFileSize-newOutSize) \
                 or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                 or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                 or (maxOutSize != None and self.getOutSize(newOutSizeMap) > maxOutSize) \
                 or (maxDiskSize is not None and newDiskSize > maxDiskSize):
             break
     # reset nUsed for repeated datasets
     for tmpDatasetID,datasetUsage in self.datasetMap.iteritems():
         tmpDatasetSpec = datasetUsage['datasetSpec']
         if tmpDatasetSpec.isRepeated():
             if len(tmpDatasetSpec.Files) > 0:
                 datasetUsage['used'] %= len(tmpDatasetSpec.Files)
     # make copy to return
     returnList = []
     for tmpDatasetID,inputFileList in inputFileMap.iteritems():
         tmpRetList = []
         for tmpFileSpec in inputFileList:
             # split per site or get atomic subchunk
             if siteName != None:
                 # make copy to individually set locality
                 newFileSpec = copy.copy(tmpFileSpec)
                 # set locality
                 newFileSpec.locality = siteCandidate.getFileLocality(tmpFileSpec)
                 if newFileSpec.locality == 'remote':
                     newFileSpec.sourceName = siteCandidate.remoteSource
                 # append
                 tmpRetList.append(newFileSpec)
             else:
                 # getting atomic subchunk
                 tmpRetList.append(tmpFileSpec)
         # add to return map    
         tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)    
         returnList.append((tmpDatasetSpec,tmpRetList))
     # return
     return returnList
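
Both getSubChunk variants follow the same protocol: each call advances the 'used' counters in the dataset map and returns one subchunk as a list of (datasetSpec, fileSpecList) pairs, or None once checkUnused() fails. Example #6 additionally enforces maxDiskSize, accumulating diskSize separately from fileSize so that direct-IO input is not charged against local disk. A hypothetical driver loop follows, echoing the one in getMaxAtomSize; the site name and limits are placeholders, not values from the original.

 subChunks = []
 while True:
     subChunk = inputChunk.getSubChunk('SOME_SITE',  # placeholder site
                                       nFilesPerJob=5,
                                       maxDiskSize=10 * 1024 * 1024 * 1024,
                                       useDirectIO=True)
     if subChunk is None:
         break
     subChunks.append(subChunk)
     for datasetSpec, fileSpecList in subChunk:
         print "datasetID={0}: {1} files".format(datasetSpec.datasetID,
                                                 len(fileSpecList))
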
Example #7
 def getSubChunk(self,siteName,maxNumFiles=None,maxSize=None,
                 sizeGradients=0,sizeIntercepts=0,
                 nFilesPerJob=None,multiplicand=1,
                 walltimeGradient=0,maxWalltime=0,
                 nEventsPerJob=None,useBoundary=None,
                 sizeGradientsPerInSize=None,
                 maxOutSize=None,
                 tmpLog=None):
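     # greedily build one subchunk, i.e. a list of (datasetSpec, fileSpec list)
     # pairs, from the unused files of the master and secondary datasets,
     # stopping before any size/walltime/event limit would be exceeded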
     # check if there are unused files/events
     if not self.checkUnused():
         return None
     # set default max number of files
     if maxNumFiles == None:
         # 20 files at most by default
         maxNumFiles = 20
     # set default max size    
     if maxSize == None:     
         # 20 GB at most by default
         maxSize = 20 * 1024 * 1024 * 1024
     # set default min output size to 2GB + 500MB (safety margin for merging)
     minOutSize = 2500 * 1024 * 1024
     # set default max number of events
     maxNumEvents = None
     # overwrite parameters when nFiles/EventsPerJob is used
     if nFilesPerJob != None:
         maxNumFiles  = nFilesPerJob
         multiplicand = nFilesPerJob
     if nEventsPerJob != None:
         maxNumEvents = nEventsPerJob
     # split with boundaryID
     splitWithBoundaryID = False    
     if useBoundary != None:
         splitWithBoundaryID = True
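         # useBoundary['inSplit'] appears to select the boundary policy:
         # 2 = split on boundaryID alone (all other limits are unset below),
         # 3 = ignore boundaryID changes in the master dataset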
         if useBoundary['inSplit'] == 2:
             # unset max values to split only with boundaryID 
             maxNumFiles = None
             maxSize = None
             maxWalltime = 0
             maxNumEvents = None
             multiplicand = 1
     # get site when splitting per site
     if siteName != None:
         siteCandidate = self.siteCandidates[siteName]
     # start splitting
     inputNumFiles  = 0
     inputNumEvents = 0 
     fileSize       = 0
     firstLoop      = True
     firstMaster    = True
     inputFileMap   = {}
     expWalltime    = 0
     nextStartEvent = None
     boundaryID     = None
     newBoundaryID  = False
     nSecFilesMap   = {}
     numMaster      = 0
     outSize        = 0
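     # greedy loop: commit files in groups of `multiplicand`, then dry-run the
     # next group on the new* counters further below to decide whether another
     # iteration would exceed any limit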
     while (maxNumFiles == None or inputNumFiles <= maxNumFiles) \
             and (maxSize == None or fileSize <= maxSize) \
             and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
             and (maxNumEvents == None or inputNumEvents <= maxNumEvents) \
             and (maxOutSize == None or outSize <= maxOutSize):
         # get one file (or one file group for MP) from master
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
             # check start event to keep continuity
             if maxNumEvents != None and tmpFileSpec.startEvent != None:
                 if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                     break
             # check boundaryID
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                     and useBoundary['inSplit'] != 3:
                 newBoundaryID = True
                 break
             if not inputFileMap.has_key(self.masterDataset.datasetID):
                 inputFileMap[self.masterDataset.datasetID] = []
             inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
             datasetUsage['used'] += 1
             numMaster += 1
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize,tmpFileSpec.startEvent,
                                                                 tmpFileSpec.endEvent,tmpFileSpec.nEvents)
             # sum
             inputNumFiles += 1
             fileSize += long(tmpFileSpec.fsize + sizeGradients * effectiveFsize)
             outSize += long(sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 fileSize += long(effectiveFsize * sizeGradientsPerInSize)
                 outSize += long(effectiveFsize * sizeGradientsPerInSize)
             # sum offset only for the first master
             if firstMaster:
                 fileSize += sizeIntercepts
             firstMaster = False
             # walltime
             expWalltime += long(walltimeGradient * effectiveFsize)
             # the number of events
             if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                 # set next start event
                 nextStartEvent = tmpFileSpec.endEvent + 1
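                 # wrap to event 0 once the end of the file is reached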
                 if nextStartEvent == tmpFileSpec.nEvents:
                     nextStartEvent = 0
             # boundaryID
             if splitWithBoundaryID:
                 boundaryID = tmpFileSpec.boundaryID
         # get files from secondaries
         for datasetSpec in self.secondaryDatasetList:
             if datasetSpec.isNoSplit():
                 # every job uses dataset without splitting
                 if firstLoop:
                     datasetUsage = self.datasetMap[datasetSpec.datasetID]
                     for tmpFileSpec in datasetSpec.Files:
                         if not inputFileMap.has_key(datasetSpec.datasetID):
                             inputFileMap[datasetSpec.datasetID] = []
                         inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                         # sum
                         fileSize += tmpFileSpec.fsize
                         if sizeGradientsPerInSize != None:
                             fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                             outSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                         datasetUsage['used'] += 1
             else:
                 if not nSecFilesMap.has_key(datasetSpec.datasetID):
                     nSecFilesMap[datasetSpec.datasetID] = 0
                 # get number of files to be used for the secondary
                 nSecondary = datasetSpec.getNumFilesPerJob()
                 if nSecondary != None and firstLoop == False:
                     # read files only in the first bunch when number of files per job is specified
                     continue
                 if nSecondary == None:
                     nSecondary = datasetSpec.getNumMultByRatio(numMaster) - nSecFilesMap[datasetSpec.datasetID]
                     if splitWithBoundaryID and useBoundary['inSplit'] != 3:
                         # set large number to get all associated secondary files
                         nSecondary = 10000
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                 for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                     # check boundaryID
                     if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                         break
                     if not inputFileMap.has_key(datasetSpec.datasetID):
                         inputFileMap[datasetSpec.datasetID] = []
                     inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                     # sum
                     fileSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                         outSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                     datasetUsage['used'] += 1
                     nSecFilesMap[datasetSpec.datasetID] += 1
         # unset first loop flag
         firstLoop = False
         # check if there are unused files/events
         if not self.checkUnused():
             break
         # break if nFilesPerJob is used
         if nFilesPerJob != None:
             break
         # boundaryID has changed
         if newBoundaryID:
             break
         # check master in the next loop
         datasetUsage = self.datasetMap[self.masterDataset.datasetID]
         newInputNumFiles  = inputNumFiles
         newInputNumEvents = inputNumEvents
         newFileSize       = fileSize
         newExpWalltime    = expWalltime
         newNextStartEvent = nextStartEvent
         newNumMaster      = numMaster
         terminateFlag     = False
         newOutSize        = outSize
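         # look-ahead: replay the next group of master files on the snapshot
         # counters above without consuming them, so the loop stops before a
         # limit is actually exceeded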
         for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
             # check continuity of event
             if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                 newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                 # continuity of event is broken
                 if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                     # no files in the next loop
                     if newInputNumFiles == 0: 
                         terminateFlag = True
                     break
                 newNextStartEvent = tmpFileSpec.endEvent + 1
             # check boundary
             if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                 # no files in the next loop
                 if newInputNumFiles == 0:
                     terminateFlag = True
                 break
             # get effective file size
             effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize,tmpFileSpec.startEvent,
                                                                 tmpFileSpec.endEvent,tmpFileSpec.nEvents)
             newInputNumFiles += 1
             newNumMaster += 1
             newFileSize += long(tmpFileSpec.fsize + sizeGradients * effectiveFsize)
             newOutSize += long(sizeGradients * effectiveFsize)
             if sizeGradientsPerInSize != None:
                 newFileSize += long(effectiveFsize * sizeGradientsPerInSize)
                 newOutSize += long(effectiveFsize * sizeGradientsPerInSize)
             newExpWalltime += long(walltimeGradient * effectiveFsize)
         # check secondaries
         for datasetSpec in self.secondaryDatasetList:
             if not datasetSpec.isNoSplit() and datasetSpec.getNumFilesPerJob() == None:
                 # check boundaryID against the last master file examined above
                 if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                     break
                 newNumSecondary = datasetSpec.getNumMultByRatio(newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                 datasetUsage = self.datasetMap[datasetSpec.datasetID]
                 for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+newNumSecondary]:
                     # check boundaryID
                     if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                         break
                     newFileSize += tmpFileSpec.fsize
                     if sizeGradientsPerInSize != None:
                         newFileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
         # termination            
         if terminateFlag:
             break
         # check
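         # the minOutSize term reserves room in the maxSize budget for output:
         # stop once the input part (newFileSize - newOutSize) would exceed
         # maxSize - minOutSize while the predicted output is still below the
         # minOutSize floor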
         if (maxNumFiles != None and newInputNumFiles > maxNumFiles) \
                 or (maxSize != None and newFileSize > maxSize) \
                 or (maxSize != None and newOutSize < minOutSize and maxSize-minOutSize < newFileSize-newOutSize) \
                 or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                 or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                 or (maxOutSize != None and newOutSize > maxOutSize):
             break
     # reset nUsed for repeated datasets
     for tmpDatasetID,datasetUsage in self.datasetMap.iteritems():
         tmpDatasetSpec = datasetUsage['datasetSpec']
         if tmpDatasetSpec.isRepeated():
             if len(tmpDatasetSpec.Files) > 0:
                 datasetUsage['used'] %= len(tmpDatasetSpec.Files)
     # make copy to return
     returnList = []
     for tmpDatasetID,inputFileList in inputFileMap.iteritems():
         tmpRetList = []
         for tmpFileSpec in inputFileList:
             # split per site or get atomic subchunk
             if siteName != None:
                 # make copy to individually set locality
                 newFileSpec = copy.copy(tmpFileSpec)
                 # set locality
                 newFileSpec.locality = siteCandidate.getFileLocality(tmpFileSpec)
                 if newFileSpec.locality == 'remote':
                     newFileSpec.sourceName = siteCandidate.remoteSource
                 # append
                 tmpRetList.append(newFileSpec)
             else:
                 # atomic subchunk; use the file spec as-is
                 tmpRetList.append(tmpFileSpec)
         # add to return list
         tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)    
         returnList.append((tmpDatasetSpec,tmpRetList))
     # return
     return returnList