def getMaxAtomSize(self, effectiveSize=False, getNumEvents=False):
    # number of files per job if defined
    if not self.isMerging:
        nFilesPerJob = self.taskSpec.getNumFilesPerJob()
    else:
        nFilesPerJob = self.taskSpec.getNumFilesPerMergeJob()
    nEventsPerJob = None
    if nFilesPerJob == None:
        # number of events per job
        if not self.isMerging:
            nEventsPerJob = self.taskSpec.getNumEventsPerJob()
        else:
            nEventsPerJob = self.taskSpec.getNumEventsPerMergeJob()
        if nEventsPerJob == None:
            nFilesPerJob = 1
    # grouping with boundaryID
    useBoundary = self.taskSpec.useGroupWithBoundaryID()
    # LB
    respectLB = self.taskSpec.respectLumiblock()
    maxAtomSize = 0
    while True:
        if not self.isMerging:
            maxNumFiles = self.taskSpec.getMaxNumFilesPerJob()
        else:
            maxNumFiles = self.taskSpec.getMaxNumFilesPerMergeJob()
        # get one subchunk
        subChunk = self.getSubChunk(None, nFilesPerJob=nFilesPerJob,
                                    nEventsPerJob=nEventsPerJob,
                                    useBoundary=useBoundary,
                                    respectLB=respectLB,
                                    maxNumFiles=maxNumFiles)
        if subChunk == None:
            break
        # get size
        tmpAtomSize = 0
        for tmpDatasetSpec, tmpFileSpecList in subChunk:
            if (effectiveSize or getNumEvents) and not tmpDatasetSpec.isMaster():
                continue
            for tmpFileSpec in tmpFileSpecList:
                if effectiveSize:
                    tmpAtomSize += JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                      tmpFileSpec.endEvent, tmpFileSpec.nEvents)
                elif getNumEvents:
                    tmpAtomSize += tmpFileSpec.getEffectiveNumEvents()
                else:
                    tmpAtomSize += tmpFileSpec.fsize
        if maxAtomSize < tmpAtomSize:
            maxAtomSize = tmpAtomSize
    # reset counters
    self.resetUsedCounters()
    # return
    return maxAtomSize
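# A minimal usage sketch for getMaxAtomSize, assuming "inputChunk" is an
# instance of the enclosing chunk class (the instance name is hypothetical,
# not from this excerpt):
maxBytes = inputChunk.getMaxAtomSize()                       # largest subchunk by raw input size
maxEffBytes = inputChunk.getMaxAtomSize(effectiveSize=True)  # size weighted by event ranges
maxEvents = inputChunk.getMaxAtomSize(getNumEvents=True)     # largest subchunk by event count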
def wrappedMain(self):
    while True:
        # run the real main in a child process
        proc = multiprocessing.Process(target=self.target, args=self.args)
        proc.start()
        pid = proc.pid
        while True:
            try:
                # wake up every 20 seconds to poll the child process
                proc.join(20)
                if not JediCoreUtils.checkProcess(pid):
                    timeNow = datetime.datetime.utcnow()
                    print "{0} {1}: INFO pid={2} not exist".format(str(timeNow), self.__class__.__name__, pid)
                    break
            except:
                timeNow = datetime.datetime.utcnow()
                errType, errValue = sys.exc_info()[:2]
                print "{0} {1}: INFO failed to check pid={2} with {3} {4}".format(str(timeNow),
                                                                                  self.__class__.__name__,
                                                                                  pid, errType, errValue)
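# A hedged wiring sketch for wrappedMain (everything below except the
# self.target/self.args attribute names is hypothetical, not from the source).
# The inner loop's proc.join(20) returns every 20 seconds so
# JediCoreUtils.checkProcess(pid) can poll the child, and once the pid is
# gone the outer "while True" respawns it:
def mainBody(vo):
    pass                                 # hypothetical long-running daemon body

agent = SomeWatchedAgent()               # hypothetical class providing wrappedMain
agent.target = mainBody                  # attributes referenced by wrappedMain
agent.args = ('atlas',)
agent.wrappedMain()                      # blocks forever, restarting mainBody on exit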
def getSubChunk(self, siteName, maxNumFiles=None, maxSize=None,
                sizeGradients=0, sizeIntercepts=0,
                nFilesPerJob=None, multiplicand=1,
                walltimeGradient=0, maxWalltime=0,
                nEventsPerJob=None, useBoundary=None,
                sizeGradientsPerInSize=None,
                maxOutSize=None,
                coreCount=1,
                respectLB=False,
                corePower=None,
                dynNumEvents=False,
                maxNumEventRanges=None,
                multiplicity=None,
                splitByFields=None,
                tmpLog=None,
                useDirectIO=False):
    # check if there are unused files/events
    if not self.checkUnused():
        return None
    # protection against unreasonable values
    if nFilesPerJob == 0:
        nFilesPerJob = None
    if nEventsPerJob == 0:
        nEventsPerJob = None
    # set default max number of files
    if maxNumFiles == None:
        maxNumFiles = 50
    # set default max number of event ranges
    if maxNumEventRanges == None:
        maxNumEventRanges = 20
    # set default max size
    if maxSize == None and nFilesPerJob == None and nEventsPerJob == None:
        # 20 GB at most by default
        maxSize = 20 * 1024 * 1024 * 1024
    # set default output size
    minOutSize = self.defaultOutputSize
    # set default max number of events
    maxNumEvents = None
    # ignore negative walltime gradient
    if walltimeGradient < 0:
        walltimeGradient = 0
    # overwrite parameters when nFiles/EventsPerJob is used
    if nFilesPerJob != None and not dynNumEvents:
        maxNumFiles = nFilesPerJob
        if not respectLB:
            multiplicand = nFilesPerJob
    if nEventsPerJob != None:
        maxNumEvents = nEventsPerJob
    # split with boundaryID
    splitWithBoundaryID = False
    if useBoundary != None:
        splitWithBoundaryID = True
        if useBoundary['inSplit'] == 2:
            # unset max values to split only with boundaryID
            maxNumFiles = None
            maxSize = None
            maxWalltime = 0
            maxNumEvents = None
            multiplicand = 1
    # get site when splitting per site
    if siteName != None:
        siteCandidate = self.siteCandidates[siteName]
    # use event ratios
    useEventRatio = self.useEventRatioForSec()
    # start splitting
    inputNumFiles = 0
    inputNumEvents = 0
    fileSize = 0
    firstLoop = True
    firstMaster = True
    inputFileMap = {}
    expWalltime = 0
    nextStartEvent = None
    boundaryID = None
    newBoundaryID = False
    eventJump = False
    nSecFilesMap = {}
    nSecEventsMap = {}
    numMaster = 0
    outSizeMap = {}
    lumiBlockNr = None
    newLumiBlockNr = False
    siteAvailable = True
    inputFileSet = set()
    fieldStr = None
    while (maxNumFiles == None or (not dynNumEvents and inputNumFiles <= maxNumFiles) or \
           (dynNumEvents and len(inputFileSet) <= maxNumFiles and inputNumFiles <= maxNumEventRanges)) \
            and (maxSize == None or (maxSize != None and fileSize <= maxSize)) \
            and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
            and (maxNumEvents == None or (maxNumEvents != None and inputNumEvents <= maxNumEvents)) \
            and (maxOutSize == None or self.getOutSize(outSizeMap) <= maxOutSize):
        # get one file (or one file group for MP) from master
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        if not self.masterDataset.datasetID in outSizeMap:
            outSizeMap[self.masterDataset.datasetID] = 0
        boundaryIDs = set()
        primaryHasEvents = False
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check start event to keep continuity
            if (maxNumEvents != None or dynNumEvents) and tmpFileSpec.startEvent != None:
                if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                    eventJump = True
                    break
            # check boundaryID
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                    and useBoundary['inSplit'] != 3:
                newBoundaryID = True
                break
            # check LB
            if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                newLumiBlockNr = True
                break
            # check field
            if splitByFields != None:
                tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                if fieldStr == None:
                    fieldStr = tmpFieldStr
                elif tmpFieldStr != fieldStr:
                    newBoundaryID = True
                    break
            # check for distributed datasets
            if self.masterDataset.isDistributed() and siteName != None and \
                    not siteCandidate.isAvailableFile(tmpFileSpec):
                siteAvailable = False
                break
            if not inputFileMap.has_key(self.masterDataset.datasetID):
                inputFileMap[self.masterDataset.datasetID] = []
            inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
            inputFileSet.add(tmpFileSpec.lfn)
            datasetUsage['used'] += 1
            numMaster += 1
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            # get num of events
            effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
            # sum
            inputNumFiles += 1
            if self.taskSpec.outputScaleWithEvents():
                fileSize += long(sizeGradients * effectiveNumEvents)
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    if not useDirectIO:
                        fileSize += long(tmpFileSpec.fsize)
                outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
            else:
                fileSize += long(sizeGradients * effectiveFsize)
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    if not useDirectIO:
                        fileSize += long(tmpFileSpec.fsize)
                outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                fileSize += long(effectiveFsize * sizeGradientsPerInSize)
                outSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
            # sum offset only for the first master
            if firstMaster:
                fileSize += sizeIntercepts
            # walltime
            if self.taskSpec.useHS06():
                if firstMaster:
                    expWalltime += self.taskSpec.baseWalltime
                tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                if not corePower in [None, 0]:
                    tmpExpWalltime /= corePower
                if self.taskSpec.cpuEfficiency == 0:
                    tmpExpWalltime = 0
                else:
                    tmpExpWalltime /= float(self.taskSpec.cpuEfficiency) / 100.0
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                expWalltime += long(tmpExpWalltime)
            else:
                tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                expWalltime += long(tmpExpWalltime)
            # the number of events
            if (maxNumEvents != None or useEventRatio) and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                primaryHasEvents = True
                inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # set next start event
                nextStartEvent = tmpFileSpec.endEvent + 1
                if nextStartEvent == tmpFileSpec.nEvents:
                    nextStartEvent = 0
            # boundaryID
            if splitWithBoundaryID:
                boundaryID = tmpFileSpec.boundaryID
                if not boundaryID in boundaryIDs:
                    boundaryIDs.add(boundaryID)
            # LB
            if respectLB:
                lumiBlockNr = tmpFileSpec.lumiBlockNr
            firstMaster = False
        # get files from secondaries
        firstSecondary = True
        for datasetSpec in self.secondaryDatasetList:
            if not datasetSpec.datasetID in outSizeMap:
                outSizeMap[datasetSpec.datasetID] = 0
            if datasetSpec.isNoSplit():
                # every job uses dataset without splitting
                if firstLoop:
                    datasetUsage = self.datasetMap[datasetSpec.datasetID]
                    for tmpFileSpec in datasetSpec.Files:
                        if not inputFileMap.has_key(datasetSpec.datasetID):
                            inputFileMap[datasetSpec.datasetID] = []
                        inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                        # sum
                        if not useDirectIO:
                            fileSize += tmpFileSpec.fsize
                        if sizeGradientsPerInSize != None:
                            fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                            outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        datasetUsage['used'] += 1
            else:
                if not nSecFilesMap.has_key(datasetSpec.datasetID):
                    nSecFilesMap[datasetSpec.datasetID] = 0
                # get number of files to be used for the secondary
                nSecondary = datasetSpec.getNumFilesPerJob()
                if nSecondary != None and firstLoop == False:
                    # read files only in the first bunch when number of files per job is specified
                    continue
                if nSecondary == None:
                    nSecondary = datasetSpec.getNumMultByRatio(numMaster) - nSecFilesMap[datasetSpec.datasetID]
                    if (datasetSpec.getEventRatio() != None and inputNumEvents > 0) or \
                            (splitWithBoundaryID and useBoundary['inSplit'] != 3):
                        # set large number to get all associated secondary files
                        nSecondary = 10000
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                # reset nUsed
                if datasetSpec.isReusable() and datasetUsage['used'] + nSecondary > len(datasetSpec.Files):
                    datasetUsage['used'] = 0
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if (splitWithBoundaryID or (useBoundary != None and useBoundary['inSplit'] == 3 and datasetSpec.getRatioToMaster() > 1)) \
                            and boundaryID != None and \
                            not (boundaryID == tmpFileSpec.boundaryID or tmpFileSpec.boundaryID in boundaryIDs):
                        break
                    # check for distributed datasets
                    if datasetSpec.isDistributed() and siteName != None and \
                            not siteCandidate.isAvailableFile(tmpFileSpec):
                        break
                    # check ratio
                    if not datasetSpec.datasetID in nSecEventsMap:
                        nSecEventsMap[datasetSpec.datasetID] = 0
                    if datasetSpec.getEventRatio() != None and inputNumEvents > 0:
                        if float(nSecEventsMap[datasetSpec.datasetID]) / float(inputNumEvents) >= datasetSpec.getEventRatio():
                            break
                    if not inputFileMap.has_key(datasetSpec.datasetID):
                        inputFileMap[datasetSpec.datasetID] = []
                    inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                    # sum
                    if not useDirectIO:
                        fileSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                    datasetUsage['used'] += 1
                    nSecFilesMap[datasetSpec.datasetID] += 1
                    # the number of events
                    if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                        if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                            inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                        elif tmpFileSpec.nEvents != None:
                            inputNumEvents += tmpFileSpec.nEvents
                    if tmpFileSpec.nEvents != None:
                        nSecEventsMap[datasetSpec.datasetID] += tmpFileSpec.nEvents
                # use only the first secondary
                firstSecondary = False
        # unset first loop flag
        firstLoop = False
        # check if there are unused files/events
        if not self.checkUnused():
            break
        # break if nFilesPerJob is used as multiplicand
        if nFilesPerJob != None and not respectLB:
            break
        # boundaryID is changed
        if newBoundaryID:
            break
        # LB is changed
        if newLumiBlockNr:
            break
        # event jump
        if eventJump:
            break
        # distributed files are unavailable
        if not siteAvailable:
            break
        primaryHasEvents = False
        # check master in the next loop
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        newInputNumFiles = inputNumFiles
        newInputNumEvents = inputNumEvents
        newFileSize = fileSize
        newExpWalltime = expWalltime
        newNextStartEvent = nextStartEvent
        newNumMaster = numMaster
        terminateFlag = False
        newOutSizeMap = copy.copy(outSizeMap)
        newBoundaryIDs = set()
        newInputFileSet = copy.copy(inputFileSet)
        if not self.masterDataset.datasetID in newOutSizeMap:
            newOutSizeMap[self.masterDataset.datasetID] = 0
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check continuity of event
            if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                primaryHasEvents = True
                newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # continuity of event is broken
                if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                    # no files in the next loop
                    if newInputNumFiles == 0:
                        terminateFlag = True
                    break
                newNextStartEvent = tmpFileSpec.endEvent + 1
            # check boundary
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                    and useBoundary['inSplit'] != 3:
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # check LB
            if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # check field
            if splitByFields != None:
                tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                if tmpFieldStr != fieldStr:
                    # no files in the next loop
                    if newInputNumFiles == 0:
                        terminateFlag = True
                    break
            # check for distributed datasets
            if self.masterDataset.isDistributed() and siteName != None and \
                    not siteCandidate.isAvailableFile(tmpFileSpec):
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            # get num of events
            effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
            newInputNumFiles += 1
            newNumMaster += 1
            newInputFileSet.add(tmpFileSpec.lfn)
            if self.taskSpec.outputScaleWithEvents():
                newFileSize += long(sizeGradients * effectiveNumEvents)
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    if not useDirectIO:
                        newFileSize += long(tmpFileSpec.fsize)
                newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
            else:
                newFileSize += long(sizeGradients * effectiveFsize)
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    if not useDirectIO:
                        newFileSize += long(tmpFileSpec.fsize)
                newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                newFileSize += long(effectiveFsize * sizeGradientsPerInSize)
                newOutSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
            if self.taskSpec.useHS06():
                tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                if not corePower in [None, 0]:
                    tmpExpWalltime /= corePower
                if self.taskSpec.cpuEfficiency == 0:
                    tmpExpWalltime = 0
                else:
                    tmpExpWalltime /= float(self.taskSpec.cpuEfficiency) / 100.0
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                newExpWalltime += long(tmpExpWalltime)
            else:
                tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                newExpWalltime += long(tmpExpWalltime)
            # boundaryID
            if splitWithBoundaryID:
                newBoundaryIDs.add(tmpFileSpec.boundaryID)
        # check secondaries
        firstSecondary = True
        for datasetSpec in self.secondaryDatasetList:
            if not datasetSpec.datasetID in newOutSizeMap:
                newOutSizeMap[datasetSpec.datasetID] = 0
            if not datasetSpec.isNoSplit() and datasetSpec.getNumFilesPerJob() == None:
                # check boundaryID
                if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                        and useBoundary['inSplit'] != 3:
                    break
                newNumSecondary = datasetSpec.getNumMultByRatio(newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                # note: the slice below reuses nSecondary carried over from the main loop above
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                            and not tmpFileSpec.boundaryID in boundaryIDs and not tmpFileSpec.boundaryID in newBoundaryIDs:
                        break
                    if not useDirectIO:
                        newFileSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        newFileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        newOutSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                    # the number of events
                    if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                        if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                            newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                        elif tmpFileSpec.nEvents != None:
                            newInputNumEvents += tmpFileSpec.nEvents
                firstSecondary = False
        # termination
        if terminateFlag:
            break
        # check
        newOutSize = self.getOutSize(newOutSizeMap)
        if (maxNumFiles != None and ((not dynNumEvents and newInputNumFiles > maxNumFiles) \
                or (dynNumEvents and (len(newInputFileSet) > maxNumFiles or newInputNumFiles > maxNumEventRanges)))) \
                or (maxSize != None and newFileSize > maxSize) \
                or (maxSize != None and newOutSize < minOutSize and maxSize - minOutSize < newFileSize - newOutSize) \
                or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                or (maxOutSize != None and self.getOutSize(newOutSizeMap) > maxOutSize):
            break
    # reset nUsed for repeated datasets
    for tmpDatasetID, datasetUsage in self.datasetMap.iteritems():
        tmpDatasetSpec = datasetUsage['datasetSpec']
        if tmpDatasetSpec.isRepeated():
            if len(tmpDatasetSpec.Files) > 0:
                datasetUsage['used'] %= len(tmpDatasetSpec.Files)
    # make copy to return
    returnList = []
    for tmpDatasetID, inputFileList in inputFileMap.iteritems():
        tmpRetList = []
        for tmpFileSpec in inputFileList:
            # split per site or get atomic subchunk
            if siteName != None:
                # make copy to individually set locality
                newFileSpec = copy.copy(tmpFileSpec)
                # set locality
                newFileSpec.locality = siteCandidate.getFileLocality(tmpFileSpec)
                if newFileSpec.locality == 'remote':
                    newFileSpec.sourceName = siteCandidate.remoteSource
                # append
                tmpRetList.append(newFileSpec)
            else:
                # getting atomic subchunk
                tmpRetList.append(tmpFileSpec)
        # add to return map
        tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)
        returnList.append((tmpDatasetSpec, tmpRetList))
    # return
    return returnList
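# A hedged call sketch for getSubChunk ("inputChunk" and the site name are
# hypothetical). Each returned subchunk is a list of (datasetSpec, fileSpecs)
# pairs, one entry per input dataset, sized to fit every cap passed in:
subChunk = inputChunk.getSubChunk('CERN-PROD',
                                  nFilesPerJob=10,
                                  maxSize=5 * 1024 * 1024 * 1024,
                                  maxWalltime=24 * 60 * 60)
if subChunk is not None:
    for datasetSpec, fileSpecList in subChunk:
        print datasetSpec.datasetID, len(fileSpecList)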
# variant of getSubChunk extended with maxDiskSize/diskSize accounting (see note after the function)
def getSubChunk(self, siteName, maxNumFiles=None, maxSize=None,
                sizeGradients=0, sizeIntercepts=0,
                nFilesPerJob=None, multiplicand=1,
                walltimeGradient=0, maxWalltime=0,
                nEventsPerJob=None, useBoundary=None,
                sizeGradientsPerInSize=None,
                maxOutSize=None,
                coreCount=1,
                respectLB=False,
                corePower=None,
                dynNumEvents=False,
                maxNumEventRanges=None,
                multiplicity=None,
                splitByFields=None,
                tmpLog=None,
                useDirectIO=False,
                maxDiskSize=None):
    # check if there are unused files/events
    if not self.checkUnused():
        return None
    # protection against unreasonable values
    if nFilesPerJob == 0:
        nFilesPerJob = None
    if nEventsPerJob == 0:
        nEventsPerJob = None
    # set default max number of files
    if maxNumFiles == None:
        maxNumFiles = 50
    # set default max number of event ranges
    if maxNumEventRanges == None:
        maxNumEventRanges = 20
    # set default max size
    if maxSize == None and nFilesPerJob == None and nEventsPerJob == None:
        # 20 GB at most by default
        maxSize = 20 * 1024 * 1024 * 1024
    # set default output size
    minOutSize = self.defaultOutputSize
    # set default max number of events
    maxNumEvents = None
    # ignore negative walltime gradient
    if walltimeGradient < 0:
        walltimeGradient = 0
    # overwrite parameters when nFiles/EventsPerJob is used
    if nFilesPerJob != None and not dynNumEvents:
        maxNumFiles = nFilesPerJob
        if not respectLB:
            multiplicand = nFilesPerJob
    if nEventsPerJob != None:
        maxNumEvents = nEventsPerJob
    # split with boundaryID
    splitWithBoundaryID = False
    if useBoundary != None:
        splitWithBoundaryID = True
        if useBoundary['inSplit'] == 2:
            # unset max values to split only with boundaryID
            maxNumFiles = None
            maxSize = None
            maxWalltime = 0
            maxNumEvents = None
            multiplicand = 1
    # get site when splitting per site
    if siteName != None:
        siteCandidate = self.siteCandidates[siteName]
    # use event ratios
    useEventRatio = self.useEventRatioForSec()
    # start splitting
    inputNumFiles = 0
    inputNumEvents = 0
    fileSize = 0
    firstLoop = True
    firstMaster = True
    inputFileMap = {}
    expWalltime = 0
    nextStartEvent = None
    boundaryID = None
    newBoundaryID = False
    eventJump = False
    nSecFilesMap = {}
    nSecEventsMap = {}
    numMaster = 0
    outSizeMap = {}
    lumiBlockNr = None
    newLumiBlockNr = False
    siteAvailable = True
    inputFileSet = set()
    fieldStr = None
    diskSize = 0
    while (maxNumFiles == None or (not dynNumEvents and inputNumFiles <= maxNumFiles) or \
           (dynNumEvents and len(inputFileSet) <= maxNumFiles and inputNumFiles <= maxNumEventRanges)) \
            and (maxSize == None or (maxSize != None and fileSize <= maxSize)) \
            and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
            and (maxNumEvents == None or (maxNumEvents != None and inputNumEvents <= maxNumEvents)) \
            and (maxOutSize == None or self.getOutSize(outSizeMap) <= maxOutSize) \
            and (maxDiskSize is None or diskSize <= maxDiskSize):
        # get one file (or one file group for MP) from master
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        if not self.masterDataset.datasetID in outSizeMap:
            outSizeMap[self.masterDataset.datasetID] = 0
        boundaryIDs = set()
        primaryHasEvents = False
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check start event to keep continuity
            if (maxNumEvents != None or dynNumEvents) and tmpFileSpec.startEvent != None:
                if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                    eventJump = True
                    break
            # check boundaryID
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                    and useBoundary['inSplit'] != 3:
                newBoundaryID = True
                break
            # check LB
            if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                newLumiBlockNr = True
                break
            # check field
            if splitByFields != None:
                tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                if fieldStr == None:
                    fieldStr = tmpFieldStr
                elif tmpFieldStr != fieldStr:
                    newBoundaryID = True
                    break
            # check for distributed datasets
            if self.masterDataset.isDistributed() and siteName != None and \
                    not siteCandidate.isAvailableFile(tmpFileSpec):
                siteAvailable = False
                break
            if not inputFileMap.has_key(self.masterDataset.datasetID):
                inputFileMap[self.masterDataset.datasetID] = []
            inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
            inputFileSet.add(tmpFileSpec.lfn)
            datasetUsage['used'] += 1
            numMaster += 1
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            # get num of events
            effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
            # sum
            inputNumFiles += 1
            if self.taskSpec.outputScaleWithEvents():
                tmpOutSize = long(sizeGradients * effectiveNumEvents)
                fileSize += tmpOutSize
                diskSize += tmpOutSize
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    fileSize += long(tmpFileSpec.fsize)
                    if not useDirectIO:
                        diskSize += long(tmpFileSpec.fsize)
                outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
            else:
                tmpOutSize = long(sizeGradients * effectiveFsize)
                fileSize += tmpOutSize
                diskSize += tmpOutSize
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    fileSize += long(tmpFileSpec.fsize)
                    if not useDirectIO:
                        diskSize += long(tmpFileSpec.fsize)
                outSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                tmpOutSize = long(effectiveFsize * sizeGradientsPerInSize)
                fileSize += tmpOutSize
                diskSize += tmpOutSize
                outSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
            # sum offset only for the first master
            if firstMaster:
                fileSize += sizeIntercepts
            # walltime
            if self.taskSpec.useHS06():
                if firstMaster:
                    expWalltime += self.taskSpec.baseWalltime
                tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                if not corePower in [None, 0]:
                    tmpExpWalltime /= corePower
                if self.taskSpec.cpuEfficiency == 0:
                    tmpExpWalltime = 0
                else:
                    tmpExpWalltime /= float(self.taskSpec.cpuEfficiency) / 100.0
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                expWalltime += long(tmpExpWalltime)
            else:
                tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                expWalltime += long(tmpExpWalltime)
            # the number of events
            if (maxNumEvents != None or useEventRatio) and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                primaryHasEvents = True
                inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # set next start event
                nextStartEvent = tmpFileSpec.endEvent + 1
                if nextStartEvent == tmpFileSpec.nEvents:
                    nextStartEvent = 0
            # boundaryID
            if splitWithBoundaryID:
                boundaryID = tmpFileSpec.boundaryID
                if not boundaryID in boundaryIDs:
                    boundaryIDs.add(boundaryID)
            # LB
            if respectLB:
                lumiBlockNr = tmpFileSpec.lumiBlockNr
            firstMaster = False
        # get files from secondaries
        firstSecondary = True
        for datasetSpec in self.secondaryDatasetList:
            if not datasetSpec.datasetID in outSizeMap:
                outSizeMap[datasetSpec.datasetID] = 0
            if datasetSpec.isNoSplit():
                # every job uses dataset without splitting
                if firstLoop:
                    datasetUsage = self.datasetMap[datasetSpec.datasetID]
                    for tmpFileSpec in datasetSpec.Files:
                        if not inputFileMap.has_key(datasetSpec.datasetID):
                            inputFileMap[datasetSpec.datasetID] = []
                        inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                        # sum
                        fileSize += tmpFileSpec.fsize
                        if not useDirectIO:
                            diskSize += tmpFileSpec.fsize
                        if sizeGradientsPerInSize != None:
                            tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                            fileSize += tmpOutSize
                            diskSize += tmpOutSize
                            outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        datasetUsage['used'] += 1
            else:
                if not nSecFilesMap.has_key(datasetSpec.datasetID):
                    nSecFilesMap[datasetSpec.datasetID] = 0
                # get number of files to be used for the secondary
                nSecondary = datasetSpec.getNumFilesPerJob()
                if nSecondary != None and firstLoop == False:
                    # read files only in the first bunch when number of files per job is specified
                    continue
                if nSecondary == None:
                    nSecondary = datasetSpec.getNumMultByRatio(numMaster) - nSecFilesMap[datasetSpec.datasetID]
                    if (datasetSpec.getEventRatio() != None and inputNumEvents > 0) or \
                            (splitWithBoundaryID and useBoundary['inSplit'] != 3):
                        # set large number to get all associated secondary files
                        nSecondary = 10000
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                # reset nUsed
                if datasetSpec.isReusable() and datasetUsage['used'] + nSecondary > len(datasetSpec.Files):
                    datasetUsage['used'] = 0
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if (splitWithBoundaryID or (useBoundary != None and useBoundary['inSplit'] == 3 and datasetSpec.getRatioToMaster() > 1)) \
                            and boundaryID != None and \
                            not (boundaryID == tmpFileSpec.boundaryID or tmpFileSpec.boundaryID in boundaryIDs):
                        break
                    # check for distributed datasets
                    if datasetSpec.isDistributed() and siteName != None and \
                            not siteCandidate.isAvailableFile(tmpFileSpec):
                        break
                    # check ratio
                    if not datasetSpec.datasetID in nSecEventsMap:
                        nSecEventsMap[datasetSpec.datasetID] = 0
                    if datasetSpec.getEventRatio() != None and inputNumEvents > 0:
                        if float(nSecEventsMap[datasetSpec.datasetID]) / float(inputNumEvents) >= datasetSpec.getEventRatio():
                            break
                    if not inputFileMap.has_key(datasetSpec.datasetID):
                        inputFileMap[datasetSpec.datasetID] = []
                    inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                    # sum
                    fileSize += tmpFileSpec.fsize
                    if not useDirectIO:
                        diskSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        fileSize += tmpOutSize
                        diskSize += tmpOutSize
                        outSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                    datasetUsage['used'] += 1
                    nSecFilesMap[datasetSpec.datasetID] += 1
                    # the number of events
                    if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                        if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                            inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                        elif tmpFileSpec.nEvents != None:
                            inputNumEvents += tmpFileSpec.nEvents
                    if tmpFileSpec.nEvents != None:
                        nSecEventsMap[datasetSpec.datasetID] += tmpFileSpec.nEvents
                # use only the first secondary
                firstSecondary = False
        # unset first loop flag
        firstLoop = False
        # check if there are unused files/events
        if not self.checkUnused():
            break
        # break if nFilesPerJob is used as multiplicand
        if nFilesPerJob != None and not respectLB:
            break
        # boundaryID is changed
        if newBoundaryID:
            break
        # LB is changed
        if newLumiBlockNr:
            break
        # event jump
        if eventJump:
            break
        # distributed files are unavailable
        if not siteAvailable:
            break
        primaryHasEvents = False
        # check master in the next loop
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        newInputNumFiles = inputNumFiles
        newInputNumEvents = inputNumEvents
        newFileSize = fileSize
        newExpWalltime = expWalltime
        newNextStartEvent = nextStartEvent
        newNumMaster = numMaster
        terminateFlag = False
        newOutSizeMap = copy.copy(outSizeMap)
        newBoundaryIDs = set()
        newInputFileSet = copy.copy(inputFileSet)
        newDiskSize = diskSize
        if not self.masterDataset.datasetID in newOutSizeMap:
            newOutSizeMap[self.masterDataset.datasetID] = 0
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check continuity of event
            if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                primaryHasEvents = True
                newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # continuity of event is broken
                if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                    # no files in the next loop
                    if newInputNumFiles == 0:
                        terminateFlag = True
                    break
                newNextStartEvent = tmpFileSpec.endEvent + 1
            # check boundary
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                    and useBoundary['inSplit'] != 3:
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # check LB
            if respectLB and lumiBlockNr != None and lumiBlockNr != tmpFileSpec.lumiBlockNr:
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # check field
            if splitByFields != None:
                tmpFieldStr = tmpFileSpec.extractFieldsStr(splitByFields)
                if tmpFieldStr != fieldStr:
                    # no files in the next loop
                    if newInputNumFiles == 0:
                        terminateFlag = True
                    break
            # check for distributed datasets
            if self.masterDataset.isDistributed() and siteName != None and \
                    not siteCandidate.isAvailableFile(tmpFileSpec):
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            # get num of events
            effectiveNumEvents = tmpFileSpec.getEffectiveNumEvents()
            newInputNumFiles += 1
            newNumMaster += 1
            newInputFileSet.add(tmpFileSpec.lfn)
            if self.taskSpec.outputScaleWithEvents():
                tmpOutSize = long(sizeGradients * effectiveNumEvents)
                newFileSize += tmpOutSize
                newDiskSize += tmpOutSize
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    newFileSize += long(tmpFileSpec.fsize)
                    if not useDirectIO:
                        newDiskSize += long(tmpFileSpec.fsize)
                newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveNumEvents)
            else:
                tmpOutSize = long(sizeGradients * effectiveFsize)
                newFileSize += tmpOutSize
                newDiskSize += tmpOutSize
                if not dynNumEvents or tmpFileSpec.lfn not in inputFileSet:
                    newFileSize += long(tmpFileSpec.fsize)
                    if not useDirectIO:
                        newDiskSize += long(tmpFileSpec.fsize)
                newOutSizeMap[self.masterDataset.datasetID] += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                tmpOutSize = long(effectiveFsize * sizeGradientsPerInSize)
                newFileSize += tmpOutSize
                newDiskSize += tmpOutSize
                newOutSizeMap[self.masterDataset.datasetID] += long(effectiveFsize * sizeGradientsPerInSize)
            if self.taskSpec.useHS06():
                tmpExpWalltime = walltimeGradient * effectiveNumEvents / float(coreCount)
                if not corePower in [None, 0]:
                    tmpExpWalltime /= corePower
                if self.taskSpec.cpuEfficiency == 0:
                    tmpExpWalltime = 0
                else:
                    tmpExpWalltime /= float(self.taskSpec.cpuEfficiency) / 100.0
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                newExpWalltime += long(tmpExpWalltime)
            else:
                tmpExpWalltime = walltimeGradient * effectiveFsize / float(coreCount)
                if multiplicity != None:
                    tmpExpWalltime /= float(multiplicity)
                newExpWalltime += long(tmpExpWalltime)
            # boundaryID
            if splitWithBoundaryID:
                newBoundaryIDs.add(tmpFileSpec.boundaryID)
        # check secondaries
        firstSecondary = True
        for datasetSpec in self.secondaryDatasetList:
            if not datasetSpec.datasetID in newOutSizeMap:
                newOutSizeMap[datasetSpec.datasetID] = 0
            if not datasetSpec.isNoSplit() and datasetSpec.getNumFilesPerJob() == None:
                # check boundaryID
                if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                        and useBoundary['inSplit'] != 3:
                    break
                newNumSecondary = datasetSpec.getNumMultByRatio(newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                # note: the slice below reuses nSecondary carried over from the main loop above
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                            and not tmpFileSpec.boundaryID in boundaryIDs and not tmpFileSpec.boundaryID in newBoundaryIDs:
                        break
                    newFileSize += tmpFileSpec.fsize
                    if not useDirectIO:
                        newDiskSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        tmpOutSize = (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        newFileSize += tmpOutSize
                        newDiskSize += tmpOutSize
                        newOutSizeMap[datasetSpec.datasetID] += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                    # the number of events
                    if firstSecondary and maxNumEvents != None and not primaryHasEvents:
                        if tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                            newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                        elif tmpFileSpec.nEvents != None:
                            newInputNumEvents += tmpFileSpec.nEvents
                firstSecondary = False
        # termination
        if terminateFlag:
            break
        # check
        newOutSize = self.getOutSize(newOutSizeMap)
        if (maxNumFiles != None and ((not dynNumEvents and newInputNumFiles > maxNumFiles) \
                or (dynNumEvents and (len(newInputFileSet) > maxNumFiles or newInputNumFiles > maxNumEventRanges)))) \
                or (maxSize != None and newFileSize > maxSize) \
                or (maxSize != None and newOutSize < minOutSize and maxSize - minOutSize < newFileSize - newOutSize) \
                or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                or (maxOutSize != None and self.getOutSize(newOutSizeMap) > maxOutSize) \
                or (maxDiskSize is not None and newDiskSize > maxDiskSize):
            break
    # reset nUsed for repeated datasets
    for tmpDatasetID, datasetUsage in self.datasetMap.iteritems():
        tmpDatasetSpec = datasetUsage['datasetSpec']
        if tmpDatasetSpec.isRepeated():
            if len(tmpDatasetSpec.Files) > 0:
                datasetUsage['used'] %= len(tmpDatasetSpec.Files)
    # make copy to return
    returnList = []
    for tmpDatasetID, inputFileList in inputFileMap.iteritems():
        tmpRetList = []
        for tmpFileSpec in inputFileList:
            # split per site or get atomic subchunk
            if siteName != None:
                # make copy to individually set locality
                newFileSpec = copy.copy(tmpFileSpec)
                # set locality
                newFileSpec.locality = siteCandidate.getFileLocality(tmpFileSpec)
                if newFileSpec.locality == 'remote':
                    newFileSpec.sourceName = siteCandidate.remoteSource
                # append
                tmpRetList.append(newFileSpec)
            else:
                # getting atomic subchunk
                tmpRetList.append(tmpFileSpec)
        # add to return map
        tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)
        returnList.append((tmpDatasetSpec, tmpRetList))
    # return
    return returnList
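# The variant above differs from the previous one only in the
# maxDiskSize/diskSize bookkeeping: fileSize keeps counting every input byte,
# while diskSize counts only bytes that actually land on the worker's local
# disk, so inputs read via direct I/O (useDirectIO=True) are excluded from the
# disk budget while the estimated outputs still count. A hedged call sketch
# ("inputChunk" and the site name are hypothetical):
subChunk = inputChunk.getSubChunk('CERN-PROD',
                                  maxDiskSize=10 * 1024 * 1024 * 1024,
                                  useDirectIO=True)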
# simpler variant of getSubChunk without dynamic-event, direct I/O, or lumi-block support
def getSubChunk(self, siteName, maxNumFiles=None, maxSize=None,
                sizeGradients=0, sizeIntercepts=0,
                nFilesPerJob=None, multiplicand=1,
                walltimeGradient=0, maxWalltime=0,
                nEventsPerJob=None, useBoundary=None,
                sizeGradientsPerInSize=None,
                maxOutSize=None,
                tmpLog=None):
    # check if there are unused files/events
    if not self.checkUnused():
        return None
    # set default max number of files
    if maxNumFiles == None:
        # 20 files at most by default
        maxNumFiles = 20
    # set default max size
    if maxSize == None:
        # 20 GB at most by default
        maxSize = 20 * 1024 * 1024 * 1024
    # set default output size 2 GB + 500 MB (safety margin for merging)
    minOutSize = 2500 * 1024 * 1024
    # set default max number of events
    maxNumEvents = None
    # overwrite parameters when nFiles/EventsPerJob is used
    if nFilesPerJob != None:
        maxNumFiles = nFilesPerJob
        multiplicand = nFilesPerJob
    if nEventsPerJob != None:
        maxNumEvents = nEventsPerJob
    # split with boundaryID
    splitWithBoundaryID = False
    if useBoundary != None:
        splitWithBoundaryID = True
        if useBoundary['inSplit'] == 2:
            # unset max values to split only with boundaryID
            maxNumFiles = None
            maxSize = None
            maxWalltime = 0
            maxNumEvents = None
            multiplicand = 1
    # get site when splitting per site
    if siteName != None:
        siteCandidate = self.siteCandidates[siteName]
    # start splitting
    inputNumFiles = 0
    inputNumEvents = 0
    fileSize = 0
    firstLoop = True
    firstMaster = True
    inputFileMap = {}
    expWalltime = 0
    nextStartEvent = None
    boundaryID = None
    newBoundaryID = False
    nSecFilesMap = {}
    numMaster = 0
    outSize = 0
    while (maxNumFiles == None or inputNumFiles <= maxNumFiles) \
            and (maxSize == None or (maxSize != None and fileSize <= maxSize)) \
            and (maxWalltime <= 0 or expWalltime <= maxWalltime) \
            and (maxNumEvents == None or (maxNumEvents != None and inputNumEvents <= maxNumEvents)) \
            and (maxOutSize == None or outSize <= maxOutSize):
        # get one file (or one file group for MP) from master
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check start event to keep continuity
            if maxNumEvents != None and tmpFileSpec.startEvent != None:
                if nextStartEvent != None and nextStartEvent != tmpFileSpec.startEvent:
                    break
            # check boundaryID
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID \
                    and useBoundary['inSplit'] != 3:
                newBoundaryID = True
                break
            if not inputFileMap.has_key(self.masterDataset.datasetID):
                inputFileMap[self.masterDataset.datasetID] = []
            inputFileMap[self.masterDataset.datasetID].append(tmpFileSpec)
            datasetUsage['used'] += 1
            numMaster += 1
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            # sum
            inputNumFiles += 1
            fileSize += long(tmpFileSpec.fsize + sizeGradients * effectiveFsize)
            outSize += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                fileSize += long(effectiveFsize * sizeGradientsPerInSize)
                outSize += long(effectiveFsize * sizeGradientsPerInSize)
            # sum offset only for the first master
            if firstMaster:
                fileSize += sizeIntercepts
            firstMaster = False
            # walltime
            expWalltime += long(walltimeGradient * effectiveFsize)
            # the number of events
            if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                inputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # set next start event
                nextStartEvent = tmpFileSpec.endEvent + 1
                if nextStartEvent == tmpFileSpec.nEvents:
                    nextStartEvent = 0
            # boundaryID
            if splitWithBoundaryID:
                boundaryID = tmpFileSpec.boundaryID
        # get files from secondaries
        for datasetSpec in self.secondaryDatasetList:
            if datasetSpec.isNoSplit():
                # every job uses dataset without splitting
                if firstLoop:
                    datasetUsage = self.datasetMap[datasetSpec.datasetID]
                    for tmpFileSpec in datasetSpec.Files:
                        if not inputFileMap.has_key(datasetSpec.datasetID):
                            inputFileMap[datasetSpec.datasetID] = []
                        inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                        # sum
                        fileSize += tmpFileSpec.fsize
                        if sizeGradientsPerInSize != None:
                            fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                            outSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        datasetUsage['used'] += 1
            else:
                if not nSecFilesMap.has_key(datasetSpec.datasetID):
                    nSecFilesMap[datasetSpec.datasetID] = 0
                # get number of files to be used for the secondary
                nSecondary = datasetSpec.getNumFilesPerJob()
                if nSecondary != None and firstLoop == False:
                    # read files only in the first bunch when number of files per job is specified
                    continue
                if nSecondary == None:
                    nSecondary = datasetSpec.getNumMultByRatio(numMaster) - nSecFilesMap[datasetSpec.datasetID]
                    if splitWithBoundaryID and useBoundary['inSplit'] != 3:
                        # set large number to get all associated secondary files
                        nSecondary = 10000
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                        break
                    if not inputFileMap.has_key(datasetSpec.datasetID):
                        inputFileMap[datasetSpec.datasetID] = []
                    inputFileMap[datasetSpec.datasetID].append(tmpFileSpec)
                    # sum
                    fileSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        fileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                        outSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
                    datasetUsage['used'] += 1
                    nSecFilesMap[datasetSpec.datasetID] += 1
        # unset first loop flag
        firstLoop = False
        # check if there are unused files/events
        if not self.checkUnused():
            break
        # break if nFilesPerJob is used
        if nFilesPerJob != None:
            break
        # boundaryID is changed
        if newBoundaryID:
            break
        # check master in the next loop
        datasetUsage = self.datasetMap[self.masterDataset.datasetID]
        newInputNumFiles = inputNumFiles
        newInputNumEvents = inputNumEvents
        newFileSize = fileSize
        newExpWalltime = expWalltime
        newNextStartEvent = nextStartEvent
        newNumMaster = numMaster
        terminateFlag = False
        newOutSize = outSize
        for tmpFileSpec in self.masterDataset.Files[datasetUsage['used']:datasetUsage['used']+multiplicand]:
            # check continuity of event
            if maxNumEvents != None and tmpFileSpec.startEvent != None and tmpFileSpec.endEvent != None:
                newInputNumEvents += (tmpFileSpec.endEvent - tmpFileSpec.startEvent + 1)
                # continuity of event is broken
                if newNextStartEvent != None and newNextStartEvent != tmpFileSpec.startEvent:
                    # no files in the next loop
                    if newInputNumFiles == 0:
                        terminateFlag = True
                    break
                newNextStartEvent = tmpFileSpec.endEvent + 1
            # check boundary
            if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                # no files in the next loop
                if newInputNumFiles == 0:
                    terminateFlag = True
                break
            # get effective file size
            effectiveFsize = JediCoreUtils.getEffectiveFileSize(tmpFileSpec.fsize, tmpFileSpec.startEvent,
                                                                tmpFileSpec.endEvent, tmpFileSpec.nEvents)
            newInputNumFiles += 1
            newNumMaster += 1
            newFileSize += long(tmpFileSpec.fsize + sizeGradients * effectiveFsize)
            newOutSize += long(sizeGradients * effectiveFsize)
            if sizeGradientsPerInSize != None:
                newFileSize += long(effectiveFsize * sizeGradientsPerInSize)
                newOutSize += long(effectiveFsize * sizeGradientsPerInSize)
            newExpWalltime += long(walltimeGradient * effectiveFsize)
        # check secondaries
        for datasetSpec in self.secondaryDatasetList:
            if not datasetSpec.isNoSplit() and datasetSpec.getNumFilesPerJob() == None:
                # check boundaryID
                if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                    break
                newNumSecondary = datasetSpec.getNumMultByRatio(newNumMaster) - nSecFilesMap[datasetSpec.datasetID]
                datasetUsage = self.datasetMap[datasetSpec.datasetID]
                # note: the slice below reuses nSecondary carried over from the main loop above
                for tmpFileSpec in datasetSpec.Files[datasetUsage['used']:datasetUsage['used']+nSecondary]:
                    # check boundaryID
                    if splitWithBoundaryID and boundaryID != None and boundaryID != tmpFileSpec.boundaryID:
                        break
                    newFileSize += tmpFileSpec.fsize
                    if sizeGradientsPerInSize != None:
                        newFileSize += (tmpFileSpec.fsize * sizeGradientsPerInSize)
        # termination
        if terminateFlag:
            break
        # check
        if (maxNumFiles != None and newInputNumFiles > maxNumFiles) \
                or (maxSize != None and newFileSize > maxSize) \
                or (maxSize != None and newOutSize < minOutSize and maxSize - minOutSize < newFileSize - newOutSize) \
                or (maxWalltime > 0 and newExpWalltime > maxWalltime) \
                or (maxNumEvents != None and newInputNumEvents > maxNumEvents) \
                or (maxOutSize != None and newOutSize > maxOutSize):
            break
    # reset nUsed for repeated datasets
    for tmpDatasetID, datasetUsage in self.datasetMap.iteritems():
        tmpDatasetSpec = datasetUsage['datasetSpec']
        if tmpDatasetSpec.isRepeated():
            datasetUsage['used'] %= len(tmpDatasetSpec.Files)
    # make copy to return
    returnList = []
    for tmpDatasetID, inputFileList in inputFileMap.iteritems():
        tmpRetList = []
        for tmpFileSpec in inputFileList:
            # split per site or get atomic subchunk
            if siteName != None:
                # make copy to individually set locality
                newFileSpec = copy.copy(tmpFileSpec)
                # set locality
                newFileSpec.locality = siteCandidate.getFileLocality(tmpFileSpec)
                if newFileSpec.locality == 'remote':
                    newFileSpec.sourceName = siteCandidate.remoteSource
                # append
                tmpRetList.append(newFileSpec)
            else:
                # getting atomic subchunk
                tmpRetList.append(tmpFileSpec)
        # add to return map
        tmpDatasetSpec = self.getDatasetWithID(tmpDatasetID)
        returnList.append((tmpDatasetSpec, tmpRetList))
    # return
    return returnList
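# A worked example of the size model used above, with assumed numbers: one
# 1 GB master file with no event range (so effectiveFsize == fsize),
# sizeGradients = 0.5 output bytes per input byte, and a 100 MB
# sizeIntercepts offset charged once for the first master file:
fsize = 1024 * 1024 * 1024
effectiveFsize = fsize
fileSize = long(fsize + 0.5 * effectiveFsize)   # input + projected output
fileSize += 100 * 1024 * 1024                   # first-master offset
# fileSize is now roughly 1.6 GB, the figure compared against maxSize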