def createCleanupWorkflowSpec():
    """
    _createCleanupWorkflowSpec_

    Create a generic cleanup WorkflowSpec definition
    that can be used to generate a sandbox for cleanup jobs

    """
    timestamp = str(time.asctime(time.localtime(time.time())))
    timestamp = timestamp.replace(" ", "-")
    timestamp = timestamp.replace(":", "_")
    workflow = WorkflowSpec()
    workflow.setWorkflowName("CleanUp-%s" % timestamp)
    workflow.setActivity("CleanUp")
    workflow.setRequestCategory("mc-cleanup")
    workflow.setRequestTimestamp(timestamp)
    workflow.parameters['WorkflowType'] = "CleanUp"

    cleanUp = workflow.payload
    cleanUp.name = "cleanUp1"
    cleanUp.type = "CleanUp"
    cleanUp.application["Project"] = ""
    cleanUp.application["Version"] = ""
    cleanUp.application["Architecture"] = ""
    cleanUp.application["Executable"] = "RuntimeCleanUp.py"  # binary name
    cleanUp.configuration = ""
    cleanUp.cfgInterface = None

    return workflow
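# Example: building and persisting a cleanup spec. An illustrative sketch
# only -- the output filename is a placeholder, and save() is assumed to be
# the standard WorkflowSpec XML serializer:
#
#     cleanupSpec = createCleanupWorkflowSpec()
#     cleanupSpec.save("CleanUp-workflow.xml")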
def createProductionWorkflow(prodName, cmsswVersion, cfgFile=None,
                             category="mc", **args):
    """
    _createProductionWorkflow_

    Create a Production style workflow, i.e. generation of new events

    """
    timestamp = int(time.time())
    if args.get("PyCfg", None) == None:
        if cfgFile == None:
            msg = "Error: No cfg file or python cfg file provided to createProductionWorkflow"
            raise RuntimeError, msg
        pycfgFile = createPythonConfig(cfgFile)
        pycfgFileContent = file(pycfgFile).read()
    else:
        pycfgFileContent = args['PyCfg']

    if args.get("PSetHash", None) == None:
        realPSetHash = createPSetHash(cfgFile)
    else:
        realPSetHash = args['PSetHash']

    #  //
    # // Create a new WorkflowSpec and set its name
    #//
    spec = WorkflowSpec()
    workflowname = "%s__%s-%s-%s-%s" % (
        prodName, cmsswVersion,
        args.get("processingLabel", "Test07"),
        args.get("physicsGroup", "NoPhysicsGroup"),
        timestamp)
    spec.setWorkflowName(workflowname)
    spec.setRequestCategory(category)
    spec.setRequestTimestamp(timestamp)

    cmsRun = spec.payload
    populateCMSRunNode(cmsRun, "cmsRun1", cmsswVersion, pycfgFileContent,
                       realPSetHash, timestamp, prodName,
                       physicsGroup=args.get("physicsGroup", "NoPhysicsGroup"),
                       processingLabel=args.get("processingLabel", "Test07"),
                       fakeHash=args.get("FakeHash", False))

    addStageOutNode(cmsRun, "stageOut1")
    generateFilenames(spec)

    return spec
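# Example: creating a production workflow from an on-disk cfg file. A hedged
# sketch -- the cfg filename and keyword values are placeholders, and save()
# is assumed as in the cleanup example above:
#
#     spec = createProductionWorkflow("MinBiasProd", "CMSSW_2_0_0",
#                                     cfgFile="MinBias.cfg",
#                                     physicsGroup="QCD",
#                                     processingLabel="Test07")
#     spec.save("MinBiasProd-Workflow.xml")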
def createLogCollectorWorkflowSpec(wf):
    """
    _createLogCollectorWorkflowSpec_

    Create a generic LogArchive WorkflowSpec definition

    """
    timestamp = str(time.asctime(time.localtime(time.time())))
    timestamp = timestamp.replace(" ", "-")
    timestamp = timestamp.replace(":", "_")
    workflow = WorkflowSpec()
    workflow.setWorkflowName("LogCollect-%s" % timestamp)
    workflow.setActivity("LogCollect")
    workflow.setRequestCategory("logcollect")
    workflow.setRequestTimestamp(timestamp)
    workflow.parameters["WorkflowType"] = "LogCollect"

    logArchive = workflow.payload
    logArchive.name = "logCollect1"
    logArchive.type = "LogCollect"
    # TODO: remove this?
    # logArchive.workflow = wf
    logArchive.application["Project"] = ""
    logArchive.application["Version"] = ""
    logArchive.application["Architecture"] = ""
    logArchive.application["Executable"] = "RuntimeLogCollector.py"  # binary name
    logArchive.configuration = ""
    logArchive.cfgInterface = None

    # set stageOut override
    # cfg = IMProvNode("config")
    # stageOut = IMProvNode("StageOutParameters")
    # cfg.addNode()
    # WorkflowTools.addStageOutNode(logArchive, "StageOut1")
    # WorkflowTools.addStageOutOverride(logArchive, stageOutParams['command'],
    #                                   stageOutParams['option'],
    #                                   stageOutParams['se-name'],
    #                                   stageOutParams['lfnPrefix'])

    return workflow
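# Example: the log-collector spec is built the same way as the cleanup spec.
# Illustrative only; note the wf argument is currently unused by the function:
#
#     logSpec = createLogCollectorWorkflowSpec(None)
#     logSpec.save("LogCollect-workflow.xml")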
class WorkflowMaker:
    """
    _WorkflowMaker_

    Basic MC workflow maker for PR to use to create workflow spec files.

    """
    def __init__(self, requestId, channel, label):
        self.requestId = requestId
        self.group = None
        self.label = label
        self.timestamp = int(time.time())
        self.channel = channel
        self.cmsswVersions = []
        self.configurations = []
        self.psetHashes = {}
        self.origCfgs = {}
        self.acquisitionEra = None
        self.processingString = None
        self.processingVersion = None
        self.conditions = None

        # Turn on use of the proper naming convention for datasets.
        # This should be made the default soon; the old conventions
        # can then be deprecated.
        self.useProperNamingConventions = False

        self.options = {}
        self.options.setdefault('FakeHash', False)

        # Should we use another attribute for setting the output dataset
        # status in DBS?
        self.outputDatasetStatus = 'VALID'

        self.inputDataset = {}
        self.inputDataset['IsUsed'] = False
        self.inputDataset['DatasetName'] = None
        self.inputDataset['Primary'] = None
        self.inputDataset['Processed'] = None
        self.inputDataset['DataTier'] = None
        #  //
        # // Extra controls over input dataset if required
        #//
        self.inputDataset['SplitType'] = None
        self.inputDataset['SplitSize'] = None
        self.inputDataset['OnlySites'] = None
        self.inputDataset['OnlyBlocks'] = None
        self.inputDataset['OnlyClosedBlocks'] = True

        #  //
        # // Pileup Dataset controls
        #//
        self.pileupDatasets = []

        #  //
        # // Initialise basic workflow
        #//
        self.workflow = WorkflowSpec()
        self.workflowName = "%s-%s-%s" % (label, channel, requestId)
        self.workflow.setWorkflowName(self.workflowName)
        self.workflow.setRequestCategory("mc")
        self.workflow.setRequestTimestamp(self.timestamp)
        self.workflow.parameters['RequestLabel'] = self.label
        self.workflow.parameters['ProdRequestID'] = self.requestId

        self.cmsRunNode = self.workflow.payload
        self.cmsRunNode.name = "cmsRun1"
        self.cmsRunNode.type = "CMSSW"

        self.cmsRunNodes = [self.cmsRunNode]
        self.saveOutputFor = []

    def chainCmsRunNode(self, stageOutIntermediates=False, *outputModules):
        """
        Append a cmsRun config to the current cmsRun node and chain them
        """
        if stageOutIntermediates:  # Do we want to keep cmsRunNode's products?
            self.saveOutputFor.append(self.cmsRunNode.name)
        newnode = self.cmsRunNode.newNode("cmsRun%s" %
                                          (len(self.cmsRunNodes) + 1))
        newnode.type = "CMSSW"
        if not outputModules:
            outputModules = self.configurations[-1].outputModules.keys()
        for outmodule in outputModules:
            newnode.addInputLink(self.cmsRunNode.name, outmodule, 'source',
                                 AppearStandalone=not stageOutIntermediates)
        self.cmsRunNode = newnode
        self.cmsRunNodes.append(newnode)

    def changeCategory(self, newCategory):
        """
        _changeCategory_

        Change the workflow category from the default mc
        that appears in the LFNs

        """
        self.workflow.setRequestCategory(newCategory)
        return

    def setAcquisitionEra(self, era):
        """
        _setAcquisitionEra_

        Sets the AcquisitionEra in the workflow

        """
        self.workflow.setAcquisitionEra(era)
        self.acquisitionEra = era
        return

    def setNamingConventionParameters(self, era, procString, procVers):
        """
        _setNamingConventionParameters_

        Sets AcquisitionEra, ProcessingString and ProcessingVersion

        """
        self.workflow.setAcquisitionEra(era)
        self.workflow.parameters['ProcessingString'] = procString
        self.workflow.parameters['ProcessingVersion'] = procVers

        self.acquisitionEra = era
        self.processingString = procString
        self.processingVersion = procVers

        self.useProperNamingConventions = True

        return

    def setActivity(self, activity):
        """
        _setActivity_

        Set the workflow type, i.e. Simulation, Reconstruction,
        Reprocessing or Skimming

        """
        self.workflow.setActivity(activity)
        return

    def setCMSSWVersion(self, version):
        """
        _setCMSSWVersion_

        Set the version of CMSSW to be used

        """
        self.cmsswVersions.append(version)
        self.cmsRunNode.application['Version'] = version
        self.cmsRunNode.application['Executable'] = "cmsRun"
        self.cmsRunNode.application['Project'] = "CMSSW"
        self.cmsRunNode.application['Architecture'] = ""
        return

    def setUserSandbox(self, sandboxloc):
        """
        _setUserSandbox_

        Sets the location of the user sandbox

        """
        self.cmsRunNode.userSandbox = sandboxloc
        return

    def setPhysicsGroup(self, group):
        """
        _setPhysicsGroup_

        Physics Group owning the workflow

        """
        self.group = group
        self.workflow.parameters['PhysicsGroup'] = self.group
        return

    def setConfiguration(self, cfgFile, **args):
        """
        _setConfiguration_

        Provide the CMSSW configuration to be used.
        By default, assume that cfgFile is a python format string.

        The format & type can be specified using args:

        - Type : must be "file" or "string" or "instance"

        """
        cfgType = args.get("Type", "instance")

        if cfgType not in ("file", "string", "instance"):
            msg = "Illegal Type for cfg file: %s\n" % cfgType
            msg += "Should be \"file\" or \"string\" or \"instance\"\n"
            raise RuntimeError, msg

        cfgContent = cfgFile
        if cfgType == "file":
            cfgContent = file(cfgFile).read()
            cfgType = "string"

        if cfgType == "string":
            cfgData = cfgContent
            cfgContent = CMSSWConfig()
            cfgContent.unpack(cfgData)

        self.cmsRunNode.cfgInterface = cfgContent
        self.configurations.append(cfgContent)
        return

    def setOriginalCfg(self, honkingGreatString):
        """
        _setOriginalCfg_

        Set the original cfg file content that is to be recorded in DBS

        CALL THIS METHOD AFTER setConfiguration

        """
        sep = '\n\n### Next chained config file ###\n\n'
        cfg = ''
        for link in self.cmsRunNode._InputLinks:
            if link['AppearStandalone']:
                prev_config = self.origCfgs.get(link['InputNode'], '')
                if prev_config:
                    cfg = '%s%s%s' % (cfg, prev_config, sep)
        cfg = '%s%s' % (cfg, honkingGreatString)
        self.cmsRunNode.cfgInterface.originalCfg = cfg
        self.origCfgs[self.cmsRunNode.name] = cfg
        return

    def setPSetHash(self, hashValue):
        """
        _setPSetHash_

        Set the value for the PSetHash.
        If any InputLinks are present, their pset hashes are prepended.

        """
        hash = ''
        for link in self.cmsRunNode._InputLinks:
            if link['AppearStandalone']:
                prev_node_hash = self.psetHashes.get(link['InputNode'], None)
                if prev_node_hash:  # cmsGen nodes will be missing
                    hash = '%s%s_' % (hash, prev_node_hash)
        hash = '%s%s' % (hash, hashValue)
        self.psetHashes[self.cmsRunNode.name] = hash
        return

    def addInputDataset(self, datasetPath):
        """
        _addInputDataset_

        If this workflow processes a dataset, set that here.

        NOTE: It is possible to also specify:
          - Split Type (file or event)
          - Split Size (int)
          - input DBS

        Not sure how many of these we want to use.
        For now, they can be added to the inputDataset dictionary.

        """
        datasetBits = DatasetConventions.parseDatasetPath(datasetPath)
        self.inputDataset.update(datasetBits)
        self.inputDataset['IsUsed'] = True
        self.inputDataset['DatasetName'] = datasetPath
        return

    def addPileupDataset(self, datasetName, filesPerJob=10,
                         targetModule=None):
        """
        _addPileupDataset_

        Add a dataset to provide pileup overlap.
        filesPerJob should be 1 in 99.9 % of cases.

        """
        pileupDataset = {}
        pileupDataset['Primary'] = None
        pileupDataset['Processed'] = None
        pileupDataset['DataTier'] = None
        datasetBits = DatasetConventions.parseDatasetPath(datasetName)
        pileupDataset.update(datasetBits)
        pileupDataset['FilesPerJob'] = filesPerJob
        # Target module could be 'MixingModule' or 'DataMixingModule' for
        # the moment. If None, MixingModule will be used.
        pileupDataset['TargetModule'] = targetModule
        self.pileupDatasets.append(pileupDataset)
        return

    def addFinalDestination(self, *phedexNodeNames):
        """
        _addFinalDestination_

        Add a final destination that can be used to generate
        a PhEDEx subscription so that the data gets transferred to
        some final location.

        NOTE: Do we want to support a list of PhEDEx nodes? Eg CERN + FNAL

        """
        nameList = ""
        for nodeName in phedexNodeNames:
            nameList += "%s," % nodeName
        nameList = nameList[:-1]
        self.workflow.parameters['PhEDExDestination'] = nameList
        return

    def addSelectionEfficiency(self, selectionEff):
        """
        _addSelectionEfficiency_

        Do we have a selection efficiency?

        """
        self.cmsRunNode.applicationControls["SelectionEfficiency"] = \
            selectionEff
        return

    def setOutputDatasetDbsStatus(self, status):
        """
        _setOutputDatasetDbsStatus_

        The output datasets will have this status in the field
        dataset.status. This value will be used when registering the
        output dataset in DBS. Only two values are accepted:
          - VALID
          - PRODUCTION

        """
        if status in ('VALID', 'PRODUCTION'):
            self.outputDatasetStatus = status
        return

    def makeWorkflow(self):
        """
        _makeWorkflow_

        Call this method to create the workflow spec instance when done

        """
        self._Validate()

        #  //
        # // Add Stage Out node
        #//
        self.saveOutputFor.append(self.cmsRunNode.name)
        WorkflowTools.addStageOutNode(self.cmsRunNode,
                                      "stageOut1", *self.saveOutputFor)
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")

        #  //
        # // Input Dataset?
        #//
        if self.inputDataset['IsUsed']:
            inputDataset = self.cmsRunNodes[0].addInputDataset(
                self.inputDataset['Primary'],
                self.inputDataset['Processed'])
            inputDataset["DataTier"] = self.inputDataset['DataTier']
            for keyname in [
                'SplitType',
                'SplitSize',
                'OnlySites',
                'OnlyBlocks',
                'OnlyClosedBlocks',
                ]:
                if self.inputDataset[keyname] != None:
                    self.workflow.parameters[keyname] = self.inputDataset[keyname]

        #  //
        # // Pileup Datasets?
        #//
        for pileupDataset in self.pileupDatasets:
            puDataset = self.cmsRunNodes[0].addPileupDataset(
                pileupDataset['Primary'],
                pileupDataset['DataTier'],
                pileupDataset['Processed'])
            puDataset['FilesPerJob'] = pileupDataset['FilesPerJob']
            if pileupDataset['TargetModule'] is not None:
                puDataset['TargetModule'] = pileupDataset['TargetModule']

        #  //
        # // Extract dataset info from cfg
        #//
        datasets = {}
        datasetsToForward = {}
        for cmsRunNode, config in zip(self.cmsRunNodes, self.configurations):

            # Ignore nodes that don't save any output. But keep input dataset
            # in case we need to forward it.
            if cmsRunNode.name not in self.saveOutputFor:
                # Store parent dataset in case we need to forward it.
                if self.inputDataset['IsUsed'] and \
                       cmsRunNode == self.cmsRunNodes[0]:
                    datasetsToForward[cmsRunNode.name] = \
                        self.inputDataset['DatasetName']
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        # If the previous cmsRunNode stages out, pull down
                        # the dataset it produced.
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasets['%s:%s' % (inputLink['InputNode'],
                                                    inputLink['OutputModule'])]
                        # If the previous cmsRunNode does not stage out,
                        # then use its parent.
                        else:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasetsToForward[inputLink['InputNode']]
                continue

            for outModName in config.outputModules.keys():
                moduleInstance = config.getOutputModule(outModName)
                dataTier = moduleInstance['dataTier']
                filterName = moduleInstance["filterName"]
                primaryName = DatasetConventions.primaryDatasetName(
                    PhysicsChannel=self.channel)

                if self.useProperNamingConventions:
                    if self.processingString and filterName:
                        processingString = "_".join(
                            (self.processingString, filterName))
                    elif self.processingString:
                        processingString = self.processingString
                    elif filterName:
                        processingString = filterName
                    else:
                        processingString = None
                    processedName = DatasetConventions.properProcessedDatasetName(
                        AcquisitionEra=self.acquisitionEra,
                        ProcessingString=processingString,
                        ProcessingVersion=self.processingVersion,
                        Unmerged=True)
                elif self.acquisitionEra == None:
                    processedName = DatasetConventions.processedDatasetName(
                        Version=cmsRunNode.application['Version'],
                        Label=self.label,
                        Group=self.group,
                        FilterName=filterName,
                        RequestId=self.requestId,
                        Unmerged=True)
                else:
                    processedName = DatasetConventions.csa08ProcessedDatasetName(
                        AcquisitionEra=self.acquisitionEra,
                        Conditions=self.workflow.parameters['Conditions'],
                        ProcessingVersion=self.workflow.parameters['ProcessingVersion'],
                        FilterName=filterName,
                        Unmerged=True)

                dataTier = DatasetConventions.checkDataTier(dataTier)

                moduleInstance['primaryDataset'] = primaryName
                moduleInstance['processedDataset'] = processedName

                outDS = cmsRunNode.addOutputDataset(primaryName,
                                                    processedName,
                                                    outModName)

                outDS['Status'] = self.outputDatasetStatus
                outDS['DataTier'] = dataTier
                outDS["ApplicationName"] = \
                    cmsRunNode.application["Executable"]
                outDS["ApplicationFamily"] = outModName
                outDS["PhysicsGroup"] = self.group

                # check for input dataset for first node
                if self.inputDataset['IsUsed'] and cmsRunNode == self.cmsRunNodes[0]:
                    outDS['ParentDataset'] = self.inputDataset['DatasetName']
                # check for staged out intermediates
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            outDS['ParentDataset'] = \
                                datasets['%s:%s' % (inputLink['InputNode'],
                                                    inputLink['OutputModule'])]
                        elif datasetsToForward.get(
                                inputLink['InputNode']) is not None:
                            outDS['ParentDataset'] = \
                                datasetsToForward[inputLink['InputNode']]

                if self.options['FakeHash']:
                    guid = makeUUID()
                    outDS['PSetHash'] = "hash=%s;guid=%s" % \
                        (self.psetHashes[cmsRunNode.name], guid)
                else:
                    outDS['PSetHash'] = self.psetHashes[cmsRunNode.name]

                # record output in case used as input to a later node
                datasets['%s:%s' % (cmsRunNode.name, outModName)] = \
                    "/%s/%s/%s" % (outDS['PrimaryDataset'],
                                   outDS['ProcessedDataset'],
                                   outDS['DataTier'])

        # optionally remap sibling relationships to parent-child (i.e. HLTDEBUG)
        remapParentageForWorkflow(self.workflow)
        WorkflowTools.generateFilenames(self.workflow)

        return self.workflow

    def _Validate(self):
        """
        _Validate_

        Private method to test that all options are set.
        Throws a WorkflowMakerError if any problems are found.

        """
        notNoneAttrs = [
            "requestId",
            "label",
            "group",
            "channel",
            ]
        for attrName in notNoneAttrs:
            value = getattr(self, attrName, None)
            if value == None:
                msg = "Attribute Not Set: %s" % attrName
                raise WorkflowMakerError(msg)

        if not len(self.configurations):
            msg = "Attribute Not Set: configurations"
            raise WorkflowMakerError(msg)

        if len(self.configurations) != len(self.cmsswVersions):
            msg = "len(self.configurations) != len(self.cmsswVersions)"
            raise WorkflowMakerError(msg)

        return
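# Example: assembling a simple one-step MC workflow. A hedged sketch -- the
# request id, channel, label, hash and cfgPickleString are placeholders, and
# cfgPickleString is assumed to be in the packed CMSSWConfig format that
# setConfiguration's "string" type expects:
#
#     maker = WorkflowMaker("12345", "MinBias", "Test")
#     maker.setCMSSWVersion("CMSSW_2_0_0")
#     maker.setPhysicsGroup("QCD")
#     maker.setConfiguration(cfgPickleString, Type="string")
#     maker.setPSetHash("abcdef123456")
#     spec = maker.makeWorkflow()
#     spec.save("MinBias-Workflow.xml")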
class PromptRecoWorkflow(FactoryInterface):
    """
    _PromptRecoWorkflow_

    Factory to build workflows for PromptReco jobs.

    """
    def __init__(self, runNumber, version, cmsPath, scramArch):
        FactoryInterface.__init__(self, version, cmsPath, scramArch)
        self.run = runNumber
        self.workflow = None
        self.timestamp = None
        self.cmsRunNode = None
        self.workflowName = None
        self.configFile = None
        self.useLazyDownload = False
        self.primaryDataset = None
        self.processedDataset = None
        self.parentProcessedDataset = None
        self.acquisitionEra = None
        self.processingVersion = None

    def setConfigFile(self, configFile):
        """
        _setConfigFile_

        Set the config file that will be loaded into the workflow.

        """
        self.configFile = configFile

    def setPrimaryDataset(self, primaryDatasetName):
        """
        _setPrimaryDataset_

        Set the primary dataset that this workflow will run over.

        """
        self.primaryDataset = primaryDatasetName
        return

    def setProcessedDataset(self, processedDatasetName):
        """
        _setProcessedDataset_

        Set the processed dataset that this workflow will produce.

        """
        self.processedDataset = processedDatasetName
        return

    def setParentProcessedDataset(self, parentProcessedDatasetName):
        """
        _setParentProcessedDataset_

        Set the parent processed dataset for this workflow.

        """
        self.parentProcessedDataset = parentProcessedDatasetName
        return

    def setAcquisitionEra(self, acquisitionEra):
        """
        _setAcquisitionEra_

        Set the acquisition era.

        """
        self.acquisitionEra = acquisitionEra
        return

    def setProcessingVersion(self, processingVersion):
        """
        _setProcessingVersion_

        Set the processing version.

        """
        self.processingVersion = processingVersion
        return

    def setLazyDownload(self, useLazyDownload):
        """
        _setLazyDownload_

        This enables/disables lazy download mode in the framework.

        """
        self.useLazyDownload = useLazyDownload

    def setupOutputModule(self, outputModuleName, dataTier):
        """
        _setupOutputModule_

        Create the outputModule and outputDataset sections of the workflow.

        """
        outputDataset = self.cmsRunNode.addOutputDataset(self.primaryDataset,
                                                         self.processedDataset,
                                                         outputModuleName)
        outputDataset["NoMerge"] = "True"
        outputDataset["DataTier"] = dataTier
        outputDataset["ApplicationName"] = "cmsRun"
        outputDataset["ApplicationProject"] = "CMSSW"
        outputDataset["ApplicationVersion"] = self.cmssw["CMSSWVersion"]
        outputDataset["ApplicationFamily"] = outputModuleName
        outputDataset["ParentDataset"] = "/%s/%s/%s" % (self.primaryDataset,
                                                        self.parentProcessedDataset,
                                                        "RAW")

        cfgWrapper = self.workflow.payload.cfgInterface
        outputModule = cfgWrapper.getOutputModule(outputModuleName)
        outputModule["catalog"] = '%s-Catalog.xml' % outputModule['Name']
        outputModule["primaryDataset"] = self.primaryDataset
        outputModule["processedDataset"] = self.processedDataset
        outputModule["dataTier"] = dataTier
        outputModule["acquisitionEra"] = self.acquisitionEra
        outputModule["processingVersion"] = self.processingVersion

        outputDataset["LFNBase"] = getLFN(outputModule, self.run,
                                          Unmerged = True)
        outputDataset["MergedLFNBase"] = getLFN(outputModule, self.run)
        outputModule["LFNBase"] = outputDataset["LFNBase"]
        outputModule["MergedLFNBase"] = outputDataset["MergedLFNBase"]
        outputModule["fileName"] = "%s.root" % outputModule['Name']
        outputModule["logicalFileName"] = os.path.join(
            outputDataset["LFNBase"], "PromptReco.%s.root" % dataTier)
        return

    def makeWorkflow(self):
        """
        _makeWorkflow_

        Generate a workflow. If the self.configFile parameter has been set,
        this will attempt to load the config from file; otherwise it will
        create an empty process object which will get filled in by the
        runtime script.

        """
        self.timestamp = int(time.time())
        self.workflow = WorkflowSpec()
        self.workflowName = "PromptReco-Run%s-%s" % (self.run,
                                                     self.primaryDataset)
        self.workflow.setWorkflowName(self.workflowName)
        self.workflow.setRequestCategory("data")
        self.workflow.setRequestTimestamp(self.timestamp)
        self.workflow.parameters["WorkflowType"] = "Processing"
        self.workflow.parameters["ProdRequestID"] = self.run
        self.workflow.parameters["RunNumber"] = self.run
        self.workflow.parameters["CMSSWVersion"] = self.cmssw["CMSSWVersion"]
        self.workflow.parameters["ScramArch"] = self.cmssw["ScramArch"]
        self.workflow.parameters["CMSPath"] = self.cmssw["CMSPath"]

        self.cmsRunNode = self.workflow.payload
        self.cmsRunNode.name = "cmsRun1"
        self.cmsRunNode.type = "CMSSW"
        self.cmsRunNode.application["Version"] = self.cmssw["CMSSWVersion"]
        self.cmsRunNode.application["Executable"] = "cmsRun"
        self.cmsRunNode.application["Project"] = "CMSSW"
        self.cmsRunNode.application["Architecture"] = self.cmssw["ScramArch"]

        inputDataset = self.cmsRunNode.addInputDataset(self.primaryDataset,
                                                       self.parentProcessedDataset)
        inputDataset["DataTier"] = "RAW"

        if self.configFile == None:
            self.loadProcessFromFramework()
        else:
            self.loadProcessFromFile()

        WorkflowTools.addStageOutNode(self.cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")
        WorkflowTools.generateFilenames(self.workflow)

        return self.workflow

    def loadProcessFromFile(self):
        """
        _loadProcessFromFile_

        Load the config file into the workflow.

        """
        preExecScript = self.cmsRunNode.scriptControls["PreExe"]
        preExecScript.append("T0.PromptRecoInjector.RuntimePromptReco")

        cfgBaseName = os.path.basename(self.configFile).replace(".py", "")
        cfgDirName = os.path.dirname(self.configFile)
        modPath = imp.find_module(cfgBaseName, [cfgDirName])

        loader = CMSSWAPILoader(self.cmssw["ScramArch"],
                                self.cmssw["CMSSWVersion"],
                                self.cmssw["CMSPath"])
        try:
            loader.load()
        except Exception, ex:
            logging.error("Couldn't load CMSSW libraries: %s" % ex)
            return None

        try:
            modRef = imp.load_module(cfgBaseName, modPath[0],
                                     modPath[1], modPath[2])
        except Exception, ex:
            logging.error("Can't load config: %s" % ex)
            loader.unload()
            return None
class ExpressWorkflow(FactoryInterface):
    """
    _ExpressWorkflow_

    Util to build workflows for express processing jobs

    """
    def __init__(self, runNumber, version, globalTag,
                 cmsPath, scramArch, *outModuleInfo):
        FactoryInterface.__init__(self, version, cmsPath, scramArch)
        self.run = runNumber
        self.outputModules = list(outModuleInfo)
        self.globalTag = globalTag

    def buildConfiguration(self, configFile, enableLazyDownload):
        """
        _buildConfiguration_

        Collect the passed-in output module information into a details
        dictionary and build the configuration from it.

        """
        outputModuleDetails = {}
        for moduleInfo in self.outputModules:

            if moduleInfo.has_key("dataset"):
                moduleName = "write_%s_%s_%s" % (moduleInfo["stream"],
                                                 moduleInfo["dataset"],
                                                 moduleInfo["dataTier"])
            else:
                moduleName = "write_%s_%s" % (moduleInfo["stream"],
                                              moduleInfo["dataTier"])

            outputModuleDetails[moduleName] = {
                "Stream" : moduleInfo["stream"],
                "primaryDataset" : moduleInfo.get("dataset", None),
                "processedDataset" : moduleInfo.get("processedDataset", None),
                "dataTier" : moduleInfo["dataTier"],
                "acquisitionEra" : moduleInfo["acquisitionEra"],
                "processingVersion" : moduleInfo["processingVersion"],
##                "globalTag" : moduleInfo["globalTag"],
                "compressionLevel" : 3
                }

            if moduleInfo.has_key("triggerPaths"):
                selEvents = [ "%s:%s" % (x, moduleInfo["process"])
                              for x in moduleInfo["triggerPaths"] ]
                outputModuleDetails[moduleName]["SelectEvents"] = selEvents
            else:
                outputModuleDetails[moduleName]["SelectEvents"] = None

        cfgInterface = self.createConfiguration(
            sourceType = "NewEventStreamFileReader",
            configFile = configFile,
            enableLazyDownload = enableLazyDownload,
            outputModuleDetails = outputModuleDetails,
            setEventContentInOutput = True,
            compressionLevel = 3)

        return cfgInterface

    def makeWorkflowSpec(self, name, configFile, enableLazyDownload):
        """
        _makeWorkflowSpec_

        Create a workflow spec instance

        """
        #  //
        # // Initialise basic workflow
        #//
        self.workflow = WorkflowSpec()
        self.workflow.setWorkflowName(name)
        self.workflow.setRequestCategory("data")
        self.workflow.setRequestTimestamp(int(time.time()))
        self.workflow.parameters["WorkflowType"] = "Repack"
        self.workflow.parameters["RequestLabel"] = name
        self.workflow.parameters["ProdRequestID"] = self.run
        self.workflow.parameters["RunNumber"] = self.run
        self.workflow.parameters["CMSSWVersion"] = self.cmssw["CMSSWVersion"]
        self.workflow.parameters["ScramArch"] = self.cmssw["ScramArch"]
        self.workflow.parameters["CMSPath"] = self.cmssw["CMSPath"]

        # runtime support for StreamerJobEntity
        self.workflow.addPythonLibrary("T0.DataStructs")

        cmsRunNode = self.workflow.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Version"] = self.cmssw["CMSSWVersion"]
        cmsRunNode.application["Executable"] = "cmsRun"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Architecture"] = self.cmssw["ScramArch"]

        # runtime express script
        cmsRunNode.scriptControls["PreExe"].append(
            "T0.ExpressInjector.RuntimeExpress")

        # build the configuration template for the workflow
        cmsRunNode.cfgInterface = self.buildConfiguration(configFile,
                                                          enableLazyDownload)
        if cmsRunNode.cfgInterface == None:
            return None

        # override global tag
        cmsRunNode.cfgInterface.conditionsTag = self.globalTag

        # generate Dataset information for workflow from cfgInterface
        for outMod, moduleInstance in cmsRunNode.cfgInterface.outputModules.items():

            primaryName = moduleInstance["primaryDataset"]
            processedName = moduleInstance["processedDataset"]

            outDS = cmsRunNode.addOutputDataset(primaryName,
                                                processedName,
                                                outMod)
            outDS["DataTier"] = moduleInstance["dataTier"]
            outDS["ApplicationName"] = cmsRunNode.application["Executable"]
            outDS["ApplicationFamily"] = outMod
            outDS["PhysicsGroup"] = "Tier0"

            # generate just a single LFN stub (all output is unmerged)
            # and insert it into the output module and dataset info
            outDS["LFNBase"] = self.getLFN(moduleInstance,
                                           dataType = 'express',
                                           Unmerged = True)
            moduleInstance["LFNBase"] = outDS["LFNBase"]
            moduleInstance["logicalFileName"] = os.path.join(
                outDS["LFNBase"], "%s.root" % outMod)

        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        return self.workflow
    if not os.path.exists(cfgFile):
        msg = "Cfg File Not Found: %s" % cfgFile
        raise RuntimeError, msg

    #
    # create workflow
    #
    workflowName = "Tier0MCFeeder-%d" % int(time.time())
    scramArch = "slc4_ia32_gcc345"
    cmsPath = "/afs/cern.ch/cms/sw"

    workflow = WorkflowSpec()
    workflow.setWorkflowName(workflowName)
    workflow.setRequestCategory("mc")
    workflow.setRequestTimestamp(int(time.time()))
    workflow.parameters["WorkflowType"] = "Processing"
    workflow.parameters["CMSSWVersion"] = version
    workflow.parameters["ScramArch"] = scramArch
    workflow.parameters["CMSPath"] = cmsPath

    # needed for streamed index stageout
    workflow.parameters['StreamerIndexDir'] = indexdir

    cmsRunNode = workflow.payload
    cmsRunNode.name = "cmsRun1"
    cmsRunNode.type = "CMSSW"
    cmsRunNode.application["Version"] = version
    cmsRunNode.application["Executable"] = "cmsRun"
    cmsRunNode.application["Project"] = "CMSSW"
class MergePackWorkflow(FactoryInterface):
    """
    _MergePackWorkflow_

    Util to build workflows for mergepack jobs

    """
    def __init__(self, runNumber, version, cmsPath, scramArch, *outModuleInfo):
        FactoryInterface.__init__(self, version, cmsPath, scramArch)
        self.run = runNumber
        self.outputModules = list(outModuleInfo)

    def buildConfiguration(self, enableLazyDownload, configFile):
        """
        _buildConfiguration_

        Collect the passed-in output module information into a details
        dictionary and build the configuration from it.

        """
        outputModuleDetails = {}
        for moduleInfo in self.outputModules:

            moduleName = "write_%s_%s_%s" % (moduleInfo["stream"],
                                             moduleInfo["dataset"],
                                             moduleInfo["dataTier"])

            outputModuleDetails[moduleName] = {
                "Stream" : moduleInfo["stream"],
                "primaryDataset" : moduleInfo.get("dataset", None),
                "processedDataset" : moduleInfo.get("processedDataset", None),
                "dataTier" : moduleInfo["dataTier"],
                "acquisitionEra" : moduleInfo["acquisitionEra"],
                "processingVersion" : moduleInfo["processingVersion"],
                }

            if moduleInfo.has_key("triggerPaths"):
                selEvents = [ "%s:%s" % (x, moduleInfo["process"])
                              for x in moduleInfo["triggerPaths"] ]
                outputModuleDetails[moduleName]["SelectEvents"] = selEvents

            outputModuleDetails[moduleName]["compressionLevel"] = 3

        if configFile == None:
            cfgInterface = self.createConfiguration(
                sourceType = "PoolSource",
                processName = "MERGEPACKER",
                configName = "mergepacker-config",
                enableLazyDownload = enableLazyDownload,
                outputModuleDetails = outputModuleDetails,
                noEventSort = True)
        else:
            cfgInterface = self.createConfiguration(
                sourceType = "PoolSource",
                configFile = configFile,
                enableLazyDownload = enableLazyDownload,
                outputModuleTemplate = outputModuleDetails.values()[0],
                noEventSort = True)

        return cfgInterface

    def makeWorkflowSpec(self, name, enableLazyDownload, configFile = None):
        """
        _makeWorkflowSpec_

        Create a workflow spec instance

        """
        self.workflow = WorkflowSpec()
        self.workflow.setWorkflowName(name)
        self.workflow.setRequestCategory("data")
        self.workflow.setRequestTimestamp(int(time.time()))
        self.workflow.parameters["WorkflowType"] = "Repack"
        self.workflow.parameters["RequestLabel"] = name
        self.workflow.parameters["ProdRequestID"] = self.run
        self.workflow.parameters["RunNumber"] = self.run
        self.workflow.parameters["CMSSWVersion"] = self.cmssw["CMSSWVersion"]
        self.workflow.parameters["ScramArch"] = self.cmssw["ScramArch"]
        self.workflow.parameters["CMSPath"] = self.cmssw["CMSPath"]

        cmsRunNode = self.workflow.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Version"] = self.cmssw["CMSSWVersion"]
        cmsRunNode.application["Executable"] = "cmsRun"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Architecture"] = self.cmssw["ScramArch"]

        # runtime express merge script
        cmsRunNode.scriptControls["PreExe"].append(
            "T0.ExpressMerger.RuntimeExpressMerger")

        # build the configuration template for the workflow
        cmsRunNode.cfgInterface = self.buildConfiguration(enableLazyDownload,
                                                          configFile)
        if cmsRunNode.cfgInterface == None:
            return None

        # generate Dataset information for workflow from cfgInterface
        for outMod, moduleInstance in cmsRunNode.cfgInterface.outputModules.items():

            primaryName = moduleInstance["primaryDataset"]
            processedName = moduleInstance["processedDataset"]

            outDS = cmsRunNode.addOutputDataset(primaryName,
                                                processedName,
                                                outMod)
            outDS["DataTier"] = moduleInstance["dataTier"]
            outDS["ApplicationName"] = cmsRunNode.application["Executable"]
            outDS["ApplicationFamily"] = outMod
            outDS["PhysicsGroup"] = "Tier0"

            # generate just a single LFN stub (all output is merged)
            # and insert it into the output module and dataset info
            outDS["LFNBase"] = self.getLFN(moduleInstance, dataType = "express")
            moduleInstance["LFNBase"] = outDS["LFNBase"]
            moduleInstance["logicalFileName"] = os.path.join(
                outDS["LFNBase"], "%s.root" % outMod)

        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        # override stageout
        #
        # FIXME: This hardcodes the TFC LFN prefix !!!
##        if svcClass != None:
##            finder = NodeFinder("stageOut1")
##            self.workflow.payload.operate(finder)
##            node = finder.result
##            WorkflowTools.addStageOutOverride(node,
##                "rfcp",
##                "",
##                "srm-cms.cern.ch",
##                "rfio:///castor?svcClass=%s&path=/castor/cern.ch/cms" % svcClass)

        return self.workflow
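# Example: a mergepack spec built from the auto-generated MERGEPACKER config
# (configFile left at its None default). Illustrative placeholders throughout:
#
#     moduleInfo = {"stream" : "Express",
#                   "dataset" : "StreamExpress",
#                   "dataTier" : "FEVT",
#                   "acquisitionEra" : "Commissioning08",
#                   "processingVersion" : "v1"}
#     factory = MergePackWorkflow(122314, "CMSSW_2_0_0",
#                                 "/afs/cern.ch/cms/sw", "slc4_ia32_gcc345",
#                                 moduleInfo)
#     spec = factory.makeWorkflowSpec("MergePack-Run122314", False)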
class RepackWorkflow(FactoryInterface):
    """
    _RepackWorkflow_

    Util to build workflows for accumulator or merge repacker jobs

    """
    def __init__(self, runNumber, version, cmsPath, scramArch, *outModuleInfo):
        FactoryInterface.__init__(self, version, cmsPath, scramArch)
        self.run = runNumber
        self.outputModules = list(outModuleInfo)

    def buildConfiguration(self, enableLazyDownload):
        """
        _buildConfiguration_

        Using a RepackerConfigMaker instance, generate a template config file

        """
        outputModuleDetails = {}
        for moduleInfo in self.outputModules:

            moduleName = "write_%s_%s_%s" % (moduleInfo["stream"],
                                             moduleInfo["dataset"],
                                             moduleInfo["dataTier"])

            outputModuleDetails[moduleName] = {
                "Stream" : moduleInfo["stream"],
                "algorithm" : None,
                "primaryDataset" : moduleInfo.get("dataset", None),
                "processedDataset" : moduleInfo.get("processedDataset", None),
                "dataTier" : moduleInfo["dataTier"],
                "filterName" : None,
                "acquisitionEra" : moduleInfo["acquisitionEra"],
                "processingVersion" : moduleInfo["processingVersion"],
                "globalTag" : moduleInfo["globalTag"],
                "LFNBase" : None,
                "MergedLFNBase" : None,
                "compressionLevel" : 6
                }

            if moduleInfo.has_key("triggerPaths"):
                selEvents = [ "%s:%s" % (x, moduleInfo["process"])
                              for x in moduleInfo["triggerPaths"] ]
                outputModuleDetails[moduleName]["SelectEvents"] = selEvents
            else:
                outputModuleDetails[moduleName]["SelectEvents"] = None

        cfgInterface = self.createConfiguration(
            sourceType = "NewEventStreamFileReader",
            processName = "REPACKER",
            configName = "repack-config",
            enableLazyDownload = enableLazyDownload,
            outputModuleDetails = outputModuleDetails)

        return cfgInterface

    def makeWorkflow(self, name, enableLazyDownload):
        """
        _makeWorkflow_

        Create a workflow spec instance for the run provided

        """
        #  //
        # // Initialise basic workflow
        #//
        self.workflow = WorkflowSpec()
        self.workflow.setWorkflowName(name)
        self.workflow.setRequestCategory("data")
        self.workflow.setRequestTimestamp(int(time.time()))
        self.workflow.parameters["WorkflowType"] = "Repack"
        self.workflow.parameters["RequestLabel"] = name
        self.workflow.parameters["ProdRequestID"] = self.run
        self.workflow.parameters["RunNumber"] = self.run
        self.workflow.parameters["CMSSWVersion"] = self.cmssw["CMSSWVersion"]
        self.workflow.parameters["ScramArch"] = self.cmssw["ScramArch"]
        self.workflow.parameters["CMSPath"] = self.cmssw["CMSPath"]

        # runtime support for StreamerJobEntity
        self.workflow.addPythonLibrary("T0.DataStructs")

        cmsRunNode = self.workflow.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Version"] = self.cmssw["CMSSWVersion"]
        cmsRunNode.application["Executable"] = "cmsRun"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Architecture"] = self.cmssw["ScramArch"]

        # runtime repacker script
        cmsRunNode.scriptControls["PreExe"].append(
            "T0.RepackerInjector.RuntimeRepacker")

        # build the configuration template for the workflow
        cmsRunNode.cfgInterface = self.buildConfiguration(enableLazyDownload)
        if cmsRunNode.cfgInterface == None:
            return None

        # generate Dataset information for workflow from cfgInterface
        for outMod in cmsRunNode.cfgInterface.outputModules.keys():
            moduleInstance = cmsRunNode.cfgInterface.getOutputModule(outMod)

            primaryName = moduleInstance["primaryDataset"]
            processedName = moduleInstance["processedDataset"]

            outDS = cmsRunNode.addOutputDataset(primaryName,
                                                processedName,
                                                outMod)
            outDS["DataTier"] = moduleInstance["dataTier"]
            outDS["ApplicationName"] = cmsRunNode.application["Executable"]
            outDS["ApplicationFamily"] = outMod
            outDS["PhysicsGroup"] = "Tier0"

            # generate merged and unmerged LFN stubs
            # and insert them into the output module and dataset info
            outDS["LFNBase"] = self.getLFN(moduleInstance, Unmerged = True)
            outDS["MergedLFNBase"] = self.getLFN(moduleInstance)
            moduleInstance["LFNBase"] = outDS["LFNBase"]
            moduleInstance["MergedLFNBase"] = outDS["MergedLFNBase"]
            moduleInstance["logicalFileName"] = os.path.join(
                outDS["LFNBase"], "%s.root" % outMod)

        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        return self.workflow