class RequestIterator:
    """
    _RequestIterator_

    Working from a Generic Workflow template, generate
    concrete jobs from it, keeping in-memory history

    """
    def __init__(self, workflowSpecFile, workingDir):
        self.workflow = workflowSpecFile
        self.workingDir = workingDir
        self.count = 0
        self.runIncrement = 1
        self.currentJob = None
        self.sitePref = None
        self.pileupDatasets = {}
        self.ownedJobSpecs = {}

        #  //
        # // Initially hard coded, should be extracted from Component Config
        #//
        self.eventsPerJob = 10

        self.workflowSpec = WorkflowSpec()
        try:
            self.workflowSpec.load(workflowSpecFile)
        except Exception, ex:
            logging.error("ERROR Loading Workflow: %s\n%s" % (
                workflowSpecFile, str(ex)))
            return

        if self.workflowSpec.parameters.get("RunIncrement", None) is not None:
            self.runIncrement = int(
                self.workflowSpec.parameters['RunIncrement']
                )

        self.generators = GeneratorMaker()
        self.workflowSpec.payload.operate(self.generators)

        #  //
        # // Cache Area for JobSpecs
        #//
        self.specCache = os.path.join(
            self.workingDir,
            "%s-Cache" % self.workflowSpec.workflowName())
        if not os.path.exists(self.specCache):
            os.makedirs(self.specCache)


    def loadPileupDatasets(self):
        """
        _loadPileupDatasets_

        Are we dealing with pileup? If so pull in the file list

        """
        puDatasets = self.workflowSpec.pileupDatasets()
        if len(puDatasets) > 0:
            logging.info("Found %s Pileup Datasets for Workflow: %s" % (
                len(puDatasets), self.workflowSpec.workflowName(),
                ))
            self.pileupDatasets = createPileupDatasets(self.workflowSpec)
        return


    def loadPileupSites(self):
        """
        _loadPileupSites_

        Are we dealing with pileup? If so pull in the site list

        """
        sites = []
        puDatasets = self.workflowSpec.pileupDatasets()
        if len(puDatasets) > 0:
            logging.info("Found %s Pileup Datasets for Workflow: %s" % (
                len(puDatasets), self.workflowSpec.workflowName(),
                ))
            sites = getPileupSites(self.workflowSpec)
        return sites


    def __call__(self):
        """
        _operator()_

        When called generate a new concrete job payload from the
        generic workflow and return it.

        """
        newJobSpec = self.createJobSpec()
        self.count += self.runIncrement
        return newJobSpec


    def createJobSpec(self):
        """
        _createJobSpec_

        Load the WorkflowSpec object and generate a JobSpec from it

        """
        jobSpec = self.workflowSpec.createJobSpec()
        jobName = "%s-%s" % (
            self.workflowSpec.workflowName(),
            self.count,
            )
        self.currentJob = jobName
        jobSpec.setJobName(jobName)
        jobSpec.setJobType("Processing")
        jobSpec.parameters['RunNumber'] = self.count

        jobSpec.payload.operate(DefaultLFNMaker(jobSpec))
        jobSpec.payload.operate(self.generateJobConfig)
        jobSpec.payload.operate(self.generateCmsGenConfig)

        #  //
        # // Bucket spec files in the cache, 1000 runs per subdirectory
        #//
        specCacheDir = os.path.join(
            self.specCache, str(self.count // 1000).zfill(4))
        if not os.path.exists(specCacheDir):
            os.makedirs(specCacheDir)
        jobSpecFile = os.path.join(specCacheDir,
                                   "%s-JobSpec.xml" % jobName)
        self.ownedJobSpecs[jobName] = jobSpecFile

        #  //
        # // Add site pref if set
        #//
        if self.sitePref is not None:
            #
            # AF: Allow site pref to be a comma separated list of sites,
            #     each one added in the Whitelist:
            #jobSpec.addWhitelistSite(self.sitePref)
            for siteWhite in self.sitePref.split(","):
                jobSpec.addWhitelistSite(siteWhite)

        jobSpec.save(jobSpecFile)

        return jobSpecFile


    def generateJobConfig(self, jobSpecNode):
        """
        _generateJobConfig_

        Operator to act on a JobSpecNode tree to convert the template
        config file into a JobSpecific Config File

        """
        if jobSpecNode.name not in self.generators.keys():
            return
        generator = self.generators[jobSpecNode.name]

        useOutputMaxEv = False
        if jobSpecNode.cfgInterface is not None:
            outMaxEv = jobSpecNode.cfgInterface.maxEvents['output']
            if outMaxEv is not None:
                useOutputMaxEv = True

        if useOutputMaxEv:
            jobCfg = generator(self.currentJob,
                               maxEventsWritten = self.eventsPerJob,
                               firstRun = self.count)
        else:
            jobCfg = generator(self.currentJob,
                               maxEvents = self.eventsPerJob,
                               firstRun = self.count)

        #  //
        # // Is there pileup for this node?
        #//
        if jobSpecNode.name in self.pileupDatasets:
            puDataset = self.pileupDatasets[jobSpecNode.name]
            logging.debug("Node: %s has a pileup dataset: %s" % (
                jobSpecNode.name, puDataset.dataset,
                ))
            fileList = puDataset.getPileupFiles()
            jobCfg.pileupFiles = fileList

        jobSpecNode.cfgInterface = jobCfg
        return


    def generateCmsGenConfig(self, jobSpecNode):
        """
        _generateCmsGenConfig_

        Process CmsGen type nodes to insert maxEvents and run numbers
        for cmsGen jobs

        """
        if jobSpecNode.type != "CmsGen":
            return

        jobSpecNode.applicationControls['firstRun'] = self.count
        jobSpecNode.applicationControls['maxEvents'] = self.eventsPerJob
        jobSpecNode.applicationControls['randomSeed'] = randomSeed()
        jobSpecNode.applicationControls['fileName'] = "%s-%s.root" % (
            self.currentJob, jobSpecNode.name)
        jobSpecNode.applicationControls['logicalFileName'] = "%s-%s.root" % (
            self.currentJob, jobSpecNode.name)
        return


    def removeSpec(self, jobSpecId):
        """
        _removeSpec_

        Remove a Spec file when it has been successfully injected

        """
        if jobSpecId not in self.ownedJobSpecs.keys():
            return

        logging.info("Removing JobSpec For: %s" % jobSpecId)
        filename = self.ownedJobSpecs[jobSpecId]
        if os.path.exists(filename):
            os.remove(filename)
        del self.ownedJobSpecs[jobSpecId]
        return


    def save(self, directory):
        """
        _save_

        Persist this objects state into an XML file and save
        it in the directory provided

        """
        doc = IMProvDoc("RequestIterator")
        node = IMProvNode(self.workflowSpec.workflowName())
        doc.addNode(node)

        node.addNode(IMProvNode("Run", None, Value = str(self.count)))
        node.addNode(
            IMProvNode("EventsPerJob", None, Value = str(self.eventsPerJob))
            )
        node.addNode(IMProvNode("SitePref", None, Value = str(self.sitePref)))

        pu = IMProvNode("Pileup")
        node.addNode(pu)
        for key, value in self.pileupDatasets.items():
            puNode = value.save()
            puNode.attrs['PayloadNode'] = key
            pu.addNode(puNode)

        specs = IMProvNode("JobSpecs")
        node.addNode(specs)
        for key, val in self.ownedJobSpecs.items():
            specs.addNode(IMProvNode("JobSpec", val, ID = key))

        fname = os.path.join(
            directory,
            "%s-Persist.xml" % self.workflowSpec.workflowName()
            )
        handle = open(fname, 'w')
        handle.write(doc.makeDOMDocument().toprettyxml())
        handle.close()
        return


    def load(self, directory):
        """
        _load_

        Load this instance given the workflow and directory containing
        the persistency file

        """
        fname = os.path.join(
            directory,
            "%s-Persist.xml" % self.workflowSpec.workflowName()
            )

        try:
            node = loadIMProvFile(fname)
        except Exception, ex:
            msg = "ERROR: Corrupted Persistency File:\n"
            msg += " => %s\n" % fname
            msg += "Cannot be read:\n => %s\n" % str(ex)
            logging.error(msg)
            return

        qbase = "/RequestIterator/%s" % self.workflowSpec.workflowName()
        runQ = IMProvQuery("%s/Run[attribute(\"Value\")]" % qbase)
        eventQ = IMProvQuery("%s/EventsPerJob[attribute(\"Value\")]" % qbase)
        siteQ = IMProvQuery("%s/SitePref[attribute(\"Value\")]" % qbase)

        runVal = int(runQ(node)[-1])
        eventVal = int(eventQ(node)[-1])
        siteVal = str(siteQ(node)[-1])
        if siteVal.lower() == "none":
            siteVal = None

        self.count = runVal
        self.eventsPerJob = eventVal
        self.sitePref = siteVal

        puQ = IMProvQuery("%s/Pileup/*" % qbase)
        puNodes = puQ(node)
        for puNode in puNodes:
            payloadNode = str(puNode.attrs.get("PayloadNode"))
            puDataset = PileupDataset("dummy", 1)
            puDataset.load(puNode)
            self.pileupDatasets[payloadNode] = puDataset

        specQ = IMProvQuery("%s/JobSpecs/*" % qbase)
        specNodes = specQ(node)
        for specNode in specNodes:
            specId = str(specNode.attrs['ID'])
            specFile = str(specNode.chardata).strip()
            self.ownedJobSpecs[specId] = specFile

        return
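
#  //
# // Usage sketch (illustrative only, not invoked by the component itself):
# // create a RequestIterator from a workflow spec file, pull in any pileup
# // configuration, generate a couple of JobSpecs and persist the run
# // counter.  The file paths below are hypothetical placeholders.
#//
def exampleRequestIteratorUsage():
    """
    Hedged example of driving a RequestIterator; paths are placeholders.
    """
    workingDir = "/tmp/prodagent-work"           # hypothetical path
    specFile = "/tmp/ExampleWorkflow-Spec.xml"   # hypothetical path
    iterator = RequestIterator(specFile, workingDir)
    iterator.loadPileupDatasets()
    for dummy in range(2):
        jobSpecFile = iterator()   # returns path of the saved JobSpec XML
        logging.info("Generated JobSpec: %s" % jobSpecFile)
    iterator.save(workingDir)      # persist Run count, site pref, pileup
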
class DatasetIterator:
    """
    _DatasetIterator_

    Working from a Generic Workflow template, generate
    concrete jobs from it, keeping in-memory history

    """
    def __init__(self, workflowSpecFile, workingDir):
        self.workflow = workflowSpecFile
        self.workingDir = workingDir
        self.currentJob = None
        self.workflowSpec = WorkflowSpec()
        self.workflowSpec.load(workflowSpecFile)
        self.currentJobDef = None
        self.count = 0
        self.onlyClosedBlocks = False
        if "OnlyClosedBlocks" in self.workflowSpec.parameters:
            onlyClosed = str(
                self.workflowSpec.parameters["OnlyClosedBlocks"]).lower()
            if onlyClosed == "true":
                self.onlyClosedBlocks = True
        self.ownedJobSpecs = {}
        self.allowedBlocks = []
        self.allowedSites = []
        self.dbsUrl = getLocalDBSURL()
        self.splitType = \
            self.workflowSpec.parameters.get("SplitType", "file").lower()
        self.splitSize = int(self.workflowSpec.parameters.get("SplitSize", 1))
        self.generators = GeneratorMaker()
        self.generators(self.workflowSpec.payload)
        self.pileupDatasets = {}

        #  //
        # // Does the workflow contain a block restriction??
        #//
        blockRestriction = \
            self.workflowSpec.parameters.get("OnlyBlocks", None)
        if blockRestriction is not None:
            #  //
            # // restriction on blocks present, populate allowedBlocks list
            #//
            msg = "Block restriction provided in Workflow Spec:\n"
            msg += "%s\n" % blockRestriction
            logging.info(msg)
            blockList = blockRestriction.split(",")
            for block in blockList:
                if len(block.strip()) > 0:
                    self.allowedBlocks.append(block.strip())

        #  //
        # // Does the workflow contain a site restriction??
        #//
        siteRestriction = \
            self.workflowSpec.parameters.get("OnlySites", None)
        if siteRestriction is not None:
            #  //
            # // restriction on sites present, populate allowedSites list
            #//
            msg = "Site restriction provided in Workflow Spec:\n"
            msg += "%s\n" % siteRestriction
            logging.info(msg)
            siteList = siteRestriction.split(",")
            for site in siteList:
                if len(site.strip()) > 0:
                    self.allowedSites.append(site.strip())

        #  //
        # // Is the DBSURL contact information provided??
        #//
        value = self.workflowSpec.parameters.get("DBSURL", None)
        if value is not None:
            self.dbsUrl = value

        if self.dbsUrl is None:
            msg = "Error: No DBSURL available for dataset:\n"
            msg += "Can't get local DBSURL and one not provided with workflow"
            raise RuntimeError, msg

        #  //
        # // Cache Area for JobSpecs
        #//
        self.specCache = os.path.join(
            self.workingDir,
            "%s-Cache" % self.workflowSpec.workflowName())
        if not os.path.exists(self.specCache):
            os.makedirs(self.specCache)


    def __call__(self, jobDef):
        """
        _operator()_

        When called generate a new concrete job payload from the
        generic workflow and return it.
        The JobDef should be a JobDefinition with the input details
        including LFNs and event ranges etc.

        """
        newJobSpec = self.createJobSpec(jobDef)
        self.count += 1
        return newJobSpec


    def loadPileupDatasets(self):
        """
        _loadPileupDatasets_

        Are we dealing with pileup? If so pull in the file list

        """
        puDatasets = self.workflowSpec.pileupDatasets()
        if len(puDatasets) > 0:
            logging.info("Found %s Pileup Datasets for Workflow: %s" % (
                len(puDatasets), self.workflowSpec.workflowName(),
                ))
            self.pileupDatasets = createPileupDatasets(self.workflowSpec)
        return


    def loadPileupSites(self):
        """
        _loadPileupSites_

        Are we dealing with pileup? If so pull in the site list

        """
        sites = []
        puDatasets = self.workflowSpec.pileupDatasets()
        if len(puDatasets) > 0:
            logging.info("Found %s Pileup Datasets for Workflow: %s" % (
                len(puDatasets), self.workflowSpec.workflowName(),
                ))
            sites = getPileupSites(self.workflowSpec)
        return sites


    def inputDataset(self):
        """
        _inputDataset_

        Extract the input Dataset from this workflow

        """
        topNode = self.workflowSpec.payload
        try:
            inputDataset = topNode._InputDatasets[-1]
        except StandardError, ex:
            msg = "Error extracting input dataset from Workflow:\n"
            msg += str(ex)
            logging.error(msg)
            return None

        return inputDataset.name()
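
#  //
# // Usage sketch (illustrative only, not invoked by the component itself):
# // a DatasetIterator is driven with JobDefinition objects produced by the
# // dataset splitting machinery, each call yielding a JobSpec file.  The
# // paths and the jobDefs argument here are hypothetical placeholders.
#//
def exampleDatasetIteratorUsage(jobDefs):
    """
    Hedged example: jobDefs is assumed to be an iterable of JobDefinition
    instances carrying input LFNs and event ranges.
    """
    workingDir = "/tmp/prodagent-work"           # hypothetical path
    specFile = "/tmp/ExampleWorkflow-Spec.xml"   # hypothetical path
    iterator = DatasetIterator(specFile, workingDir)
    iterator.loadPileupDatasets()
    logging.info("Input dataset: %s" % iterator.inputDataset())
    for jobDef in jobDefs:
        jobSpecFile = iterator(jobDef)   # delegates to createJobSpec(jobDef)
        logging.info("Generated JobSpec: %s" % jobSpecFile)
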