Ejemplo n.º 1
0
def validateDataset(datasetPath, dbsUrl):
    """
    _validateDataset_

    Util method to check that the datasetPath provided
    exists in the dbsUrl provided

    The path is split with DatasetConventions.parseDatasetPath and must
    yield non-None 'Primary', 'DataTier' and 'Processed' entries. DBS is
    then queried through DBSReader.matchProcessedDatasets with those
    three values.

    Arguments:
      datasetPath - dataset path to validate
      dbsUrl      - value handed to the DBSReader constructor

    Raises:
      WorkflowMakerError - if one of the three path components is missing,
                           or if the DBS lookup raises any Exception (the
                           original error text is appended to the message)

    """
    datasetDetails = DatasetConventions.parseDatasetPath(datasetPath)
    for key in ['Primary', 'DataTier', 'Processed']:
        # Identity test: only a genuinely absent component is rejected
        if datasetDetails[key] is None:
            msg = "Invalid Dataset Name: \n ==> %s\n" % datasetPath
            msg += "Does not contain %s information" % key
            raise WorkflowMakerError(msg)

    # NOTE(review): the matches are collected but not inspected in the
    # code visible here - confirm whether an "empty result" check is
    # expected to follow.
    datasets = []
    try:
        reader = DBSReader(dbsUrl)
        datasets = reader.matchProcessedDatasets(
            datasetDetails['Primary'],
            datasetDetails['DataTier'],
            datasetDetails['Processed'])
    except Exception as ex:
        # Any DBS failure is reported through the workflow maker's own
        # exception type so callers only have one error type to handle.
        msg = "Error calling DBS to validate dataset:\n%s\n" % datasetPath
        msg += str(ex)
        raise WorkflowMakerError(msg)
Ejemplo n.º 2
0
    def addInputDataset(self, datasetPath):
        """
        _addInputDataset_

        Record datasetPath as the dataset processed by this workflow.

        The parsed pieces of the path are merged into self.inputDataset,
        which is then flagged as used and given the full path under
        'DatasetName'.

        NOTE: Split Type (file or event), Split Size (int) and the input
        DBS are not handled here; they can be set directly on the
        inputDataset dictionary.
        """
        parsedPath = DatasetConventions.parseDatasetPath(datasetPath)
        self.inputDataset.update(parsedPath)
        for field, value in (('IsUsed', True),
                             ('DatasetName', datasetPath)):
            self.inputDataset[field] = value

        return
Ejemplo n.º 3
0
    def addInputDataset(self, datasetPath):
        """
        _addInputDataset_

        Record datasetPath as the dataset processed by this workflow.

        The parsed pieces of the path are merged into self.inputDataset,
        which is then flagged as used and given the full path under
        'DatasetName'.

        NOTE: Split Type (file or event), Split Size (int) and the input
        DBS are not handled here; they can be set directly on the
        inputDataset dictionary.
        """
        parsedPath = DatasetConventions.parseDatasetPath(datasetPath)
        self.inputDataset.update(parsedPath)
        for field, value in (('IsUsed', True),
                             ('DatasetName', datasetPath)):
            self.inputDataset[field] = value

        return
Ejemplo n.º 4
0
    def addPileupDataset(self, datasetName, filesPerJob=10, targetModule=None):
        """
        _addPileupDataset_

        Append a pileup overlap dataset description to self.pileupDatasets.

        The entry starts with 'Primary', 'Processed' and 'DataTier' set to
        None, is then updated with the pieces parsed from datasetName and
        finally receives the filesPerJob and targetModule settings.

        filesPerJob should be 1 in 99.9 % of cases

        """
        puEntry = {'Primary': None, 'Processed': None, 'DataTier': None}
        puEntry.update(DatasetConventions.parseDatasetPath(datasetName))
        puEntry['FilesPerJob'] = filesPerJob
        # Target module could be 'MixingModule' or 'DataMixingModule' for
        # the moment. If None, MixingModule will be used.
        puEntry['TargetModule'] = targetModule
        self.pileupDatasets.append(puEntry)
        return
Ejemplo n.º 5
0
    def addPileupDataset(self, datasetName, filesPerJob = 10,
            targetModule=None):
        """
        _addPileupDataset_

        Append a pileup overlap dataset description to self.pileupDatasets.

        The entry starts with 'Primary', 'Processed' and 'DataTier' set to
        None, is then updated with the pieces parsed from datasetName and
        finally receives the filesPerJob and targetModule settings.

        filesPerJob should be 1 in 99.9 % of cases

        """
        puEntry = {'Primary': None, 'Processed': None, 'DataTier': None}
        puEntry.update(DatasetConventions.parseDatasetPath(datasetName))
        puEntry['FilesPerJob'] = filesPerJob
        # Target module could be 'MixingModule' or 'DataMixingModule' for
        # the moment. If None, MixingModule will be used.
        puEntry['TargetModule'] = targetModule
        self.pileupDatasets.append(puEntry)
        return
Ejemplo n.º 6
0
    def makeWorkflow(self):
        """
        _makeWorkflow_

        Call this method to create the workflow spec instance when
        done

        Steps performed, in order:
          - run self._Validate()
          - add self.cmsRunNode.name to self.saveOutputFor, then attach the
            stage out and log archive nodes to self.cmsRunNode
          - if an input dataset is in use, register it on the first cmsRun
            node and copy any non-None split/site/block settings into the
            workflow parameters
          - register every pileup dataset on the first cmsRun node
          - for each (cmsRun node, configuration) pair, register one output
            dataset per output module; nodes that are not listed in
            self.saveOutputFor only record which dataset should be
            forwarded as the parent for later nodes
          - remap parentage and generate filenames for the workflow

        Returns self.workflow

        """
        self._Validate()

        #  //
        # // Add Stage Out node
        #//
        # NOTE: saveOutputFor is appended to on every call of this method.
        self.saveOutputFor.append(self.cmsRunNode.name)
        WorkflowTools.addStageOutNode(self.cmsRunNode, "stageOut1",
                                      *self.saveOutputFor)
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")

        #  //
        # // Input Dataset?
        #//
        if self.inputDataset['IsUsed']:
            inputDataset = self.cmsRunNodes[0].addInputDataset(
                self.inputDataset['Primary'], self.inputDataset['Processed'])
            inputDataset["DataTier"] = self.inputDataset['DataTier']
            # Only settings that were actually provided are copied over
            for keyname in [
                    'SplitType',
                    'SplitSize',
                    'OnlySites',
                    'OnlyBlocks',
                    'OnlyClosedBlocks',
            ]:
                if self.inputDataset[keyname] != None:
                    self.workflow.parameters[keyname] = self.inputDataset[
                        keyname]

        #  //
        # // Pileup Datasets?
        #//
        for pileupDataset in self.pileupDatasets:
            puDataset = self.cmsRunNodes[0].addPileupDataset(
                pileupDataset['Primary'], pileupDataset['DataTier'],
                pileupDataset['Processed'])
            puDataset['FilesPerJob'] = pileupDataset['FilesPerJob']
            if pileupDataset['TargetModule'] is not None:
                puDataset['TargetModule'] = pileupDataset['TargetModule']

        #  //
        # // Extract dataset info from cfg
        #//
        # datasets: "<node name>:<output module>" -> "/primary/processed/tier"
        #           for every output dataset registered so far
        # datasetsToForward: node name -> dataset path to use as parent for
        #           nodes fed by a node that does not save its output
        datasets = {}
        datasetsToForward = {}
        for cmsRunNode, config in zip(self.cmsRunNodes, self.configurations):

            # Ignore nodes that don't save any output. But keep input dataset
            # in case we need to forward it.
            if cmsRunNode.name not in self.saveOutputFor:
                # Store parent dataset in case we need to forward it.
                if self.inputDataset['IsUsed'] and \
                                            cmsRunNode == self.cmsRunNodes[0]:
                    datasetsToForward[cmsRunNode.name] = \
                                            self.inputDataset['DatasetName']
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        # If the previous cmsRunNode stages out, pull down the
                        # dataset it produced.
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasets['%s:%s' % (inputLink['InputNode'],
                                inputLink['OutputModule'])]
                        # If the previous cmsRunNode does not stage out, then
                        # use its parent.
                        else:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasetsToForward[inputLink['InputNode']]
                continue

            for outModName in config.outputModules.keys():
                moduleInstance = config.getOutputModule(outModName)
                dataTier = moduleInstance['dataTier']
                filterName = moduleInstance["filterName"]
                primaryName = DatasetConventions.primaryDatasetName(
                    PhysicsChannel=self.channel, )

                # Pick the processed dataset name from one of three naming
                # conventions, depending on the maker's settings.
                if self.useProperNamingConventions:
                    # Processing string is "<processingString>_<filterName>"
                    # when both are set, whichever one is set otherwise,
                    # or None when neither is.
                    if self.processingString and filterName:
                        processingString = "_".join(
                            (self.processingString, filterName))
                    elif self.processingString:
                        processingString = self.processingString
                    elif filterName:
                        processingString = filterName
                    else:
                        processingString = None
                    processedName = DatasetConventions.properProcessedDatasetName(
                        AcquisitionEra=self.acquisitionEra,
                        ProcessingString=processingString,
                        ProcessingVersion=self.processingVersion,
                        Unmerged=True)
                elif self.acquisitionEra == None:
                    processedName = DatasetConventions.processedDatasetName(
                        Version=cmsRunNode.application['Version'],
                        Label=self.label,
                        Group=self.group,
                        FilterName=filterName,
                        RequestId=self.requestId,
                        Unmerged=True)
                else:
                    processedName = DatasetConventions.csa08ProcessedDatasetName(
                        AcquisitionEra=self.acquisitionEra,
                        Conditions=self.workflow.parameters['Conditions'],
                        ProcessingVersion=self.workflow.
                        parameters['ProcessingVersion'],
                        FilterName=filterName,
                        Unmerged=True)

                dataTier = DatasetConventions.checkDataTier(dataTier)

                # Write the chosen names back into the output module settings
                moduleInstance['primaryDataset'] = primaryName
                moduleInstance['processedDataset'] = processedName

                outDS = cmsRunNode.addOutputDataset(primaryName, processedName,
                                                    outModName)

                outDS['Status'] = self.outputDatasetStatus
                outDS['DataTier'] = dataTier
                outDS["ApplicationName"] = \
                                         cmsRunNode.application["Executable"]
                outDS["ApplicationFamily"] = outModName
                outDS["PhysicsGroup"] = self.group

                # check for input dataset for first node
                if self.inputDataset[
                        'IsUsed'] and cmsRunNode == self.cmsRunNodes[0]:
                    outDS['ParentDataset'] = self.inputDataset['DatasetName']
                # check for staged out intermediates
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            outDS['ParentDataset'] = datasets[
                                '%s:%s' % (inputLink['InputNode'],
                                           inputLink['OutputModule'])]
                        elif datasetsToForward.get(
                                inputLink['InputNode']) is not None:
                            outDS['ParentDataset'] = \
                                    datasetsToForward[inputLink['InputNode']]

                # With the FakeHash option a fresh UUID is embedded in the
                # stored PSet hash string.
                if self.options['FakeHash']:
                    guid = makeUUID()
                    outDS['PSetHash'] = "hash=%s;guid=%s" % \
                            (self.psetHashes[cmsRunNode.name], guid)
                else:
                    outDS['PSetHash'] = self.psetHashes[cmsRunNode.name]

                # record output in case used as input to a later node
                datasets['%s:%s' % (cmsRunNode.name, outModName)] = \
                                "/%s/%s/%s" % ( outDS['PrimaryDataset'],
                                                  outDS['ProcessedDataset'],
                                                  outDS['DataTier'])

        # optionally remap sibling relationships to parent-child (i.e HLTDEBUG)
        remapParentageForWorkflow(self.workflow)
        WorkflowTools.generateFilenames(self.workflow)

        return self.workflow
Ejemplo n.º 7
0
    msg = "--split-size option not provided: This is required"
    raise RuntimeError, msg

# The split size must be usable as an integer; convert it here and
# report a readable error when the conversion fails with ValueError.
try:
    splitSize = int(splitSize)
except ValueError, ex:
    msg = "--split-size argument is not an integer: %s\n" % splitSize
    raise RuntimeError, msg

#channel0 = DatasetConventions.parseDatasetPath(dataset)['Primary']

# No channel given: fall back to the primary dataset name parsed from
# the input dataset path.
if channel == None:
    #  //
    # // Assume same as input
    #//
    channel = DatasetConventions.parseDatasetPath(dataset)['Primary']
    



#  //
# // Checking arguments against naming conventions
#//
# The processing version must be a 'v' followed by one or more digits.
if not (re.findall("^v[0-9]+$", processingVersion)):
    msg = "processing_version '" + processingVersion + \
        "' violates naming conventions!\n" \
        "Processing version should match this regexp ^v[0-9]+$ " \
        "(see https://twiki.cern.ch/twiki/bin/view/CMS/DMWMPG_PrimaryDatasets)"
    raise RuntimeError, msg

if re.findall("[-]+", acquisitionEra):
Ejemplo n.º 8
0
def createMergeJobWorkflow(procSpec, isFastMerge = True, doCleanUp = True, littleE = False):
    """
    _createMergeJobWorkflow_

    Build the Merge Job workflows for a Processing Workflow. These are
    the specs used to generate actual merge jobs (as opposed to creating
    datasets like createMergeDatasetWorkflow).

    One workflow spec is created for every output dataset of the merge
    dataset workflow derived from procSpec. Each spec copies the
    processing workflow's parameters and name, is typed "Merge", and has
    a cmsRun node reading the parent dataset and writing the merged one,
    followed by stage out, optional clean up and log archive nodes.

    Arguments:
      procSpec    - processing workflow spec to derive merges from
      isFastMerge - forwarded to createMergeDatasetWorkflow
      doCleanUp   - a clean up node is added only when this equals True
      littleE     - not used by the active code

    returns a dictionary of (input, IE MergeSensor watched) dataset name
    to workflow spec instances

    """
    mergeOutputs = createMergeDatasetWorkflow(
        procSpec, isFastMerge).outputDatasets()
    workflowName = procSpec.workflowName()

    workflowsByInput = {}
    for mergeDS in mergeOutputs:
        parentPath = mergeDS['ParentDataset']

        #  //
        # // New spec inheriting the processing workflow settings
        #//
        mergeSpec = WorkflowSpec()
        mergeSpec.parameters.update(procSpec.parameters)
        mergeSpec.setWorkflowName(workflowName)
        mergeSpec.parameters['WorkflowType'] = "Merge"

        #  //
        # // cmsRun node: the fast merge executable selection is
        #//  disabled, so every merge runs cmsRun with a "Merged" module
        mergeNode = mergeSpec.payload
        mergeNode.name = "cmsRun1"
        mergeNode.type = "CMSSW"
        for appKey, appValue in (
                ("Project", "CMSSW"),
                ("Version", mergeDS['ApplicationVersion']),
                ("Architecture", "slc3_ia32_gcc323"),
                ("Executable", "cmsRun"),
                ):
            mergeNode.application[appKey] = appValue
        mergedModule = "Merged"

        #  //
        # // Hack to forward UserSandbox to Merge Jobs
        #//
        sandbox = mergeDS.get("UserSandbox", None)
        if sandbox != None:
            mergeNode.userSandbox = sandbox

        #  //
        # // Input Dataset
        #//
        parentBits = DatasetConventions.parseDatasetPath(parentPath)
        mergeInput = mergeNode.addInputDataset(parentBits['Primary'],
                                               parentBits['Processed'])
        mergeInput["DataTier"] = parentBits['DataTier']

        #  //
        # // Output Dataset
        #//
        mergeOutput = mergeNode.addOutputDataset(mergeDS['PrimaryDataset'],
                                                 mergeDS['ProcessedDataset'],
                                                 mergedModule)
        for copiedKey in ('DataTier', 'PSetHash'):
            mergeOutput[copiedKey] = mergeDS[copiedKey]
        for outKey, appKey in (
                ("ApplicationName", "Executable"),
                ("ApplicationProject", "Project"),
                ("ApplicationVersion", "Version"),
                ):
            mergeOutput[outKey] = mergeNode.application[appKey]
        mergeOutput["ApplicationFamily"] = mergedModule
        mergeOutput["PhysicsGroup"] = procSpec.parameters.get(
            'PhysicsGroup', None)
        mergeOutput['ParentDataset'] = parentPath

        #  //
        # // Stage out, optional clean up, then log archive
        #//
        WorkflowTools.addStageOutNode(mergeNode, "stageOut1")
        if doCleanUp == True:
            WorkflowTools.addCleanUpNode(mergeNode, "cleanUp1")
        WorkflowTools.addLogArchNode(mergeNode, "logArchive")

        WorkflowTools.generateFilenames(mergeSpec)

        workflowsByInput[parentPath] = mergeSpec

    return workflowsByInput
Ejemplo n.º 9
0
    def makeWorkflow(self):
        """
        _makeWorkflow_

        Call this method to create the workflow spec instance when
        done

        Steps performed, in order:
          - run self._Validate()
          - add self.cmsRunNode.name to self.saveOutputFor, then attach the
            stage out and log archive nodes to self.cmsRunNode
          - if an input dataset is in use, register it on the first cmsRun
            node and copy any non-None split/site/block settings into the
            workflow parameters
          - register every pileup dataset on the first cmsRun node
          - for each (cmsRun node, configuration) pair, register one output
            dataset per output module; nodes that are not listed in
            self.saveOutputFor only record which dataset should be
            forwarded as the parent for later nodes
          - remap parentage and generate filenames for the workflow

        Returns self.workflow

        """
        self._Validate()
        
        #  //
        # // Add Stage Out node
        #//
        # NOTE: saveOutputFor is appended to on every call of this method.
        self.saveOutputFor.append(self.cmsRunNode.name)
        WorkflowTools.addStageOutNode(self.cmsRunNode,
                        "stageOut1", *self.saveOutputFor)
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")

        #  //
        # // Input Dataset?
        #//
        if self.inputDataset['IsUsed']:
            inputDataset = self.cmsRunNodes[0].addInputDataset(
                self.inputDataset['Primary'],
                self.inputDataset['Processed']
                )
            inputDataset["DataTier"] = self.inputDataset['DataTier']
            # Only settings that were actually provided are copied over
            for keyname in [
                'SplitType',
                'SplitSize',
                'OnlySites',
                'OnlyBlocks',
                'OnlyClosedBlocks',
                ]:
                if self.inputDataset[keyname] != None:
                    self.workflow.parameters[keyname] = self.inputDataset[keyname]
                    
            
        #  //
        # // Pileup Datasets?
        #//
        for pileupDataset in self.pileupDatasets:
            puDataset = self.cmsRunNodes[0].addPileupDataset(
                pileupDataset['Primary'],
                pileupDataset['DataTier'],
                pileupDataset['Processed'])
            puDataset['FilesPerJob'] = pileupDataset['FilesPerJob']
            if pileupDataset['TargetModule'] is not None:
                puDataset['TargetModule'] = pileupDataset['TargetModule']
            
        
        #  //
        # // Extract dataset info from cfg
        #//
        # datasets: "<node name>:<output module>" -> "/primary/processed/tier"
        #           for every output dataset registered so far
        # datasetsToForward: node name -> dataset path to use as parent for
        #           nodes fed by a node that does not save its output
        datasets = {}
        datasetsToForward = {}
        for cmsRunNode, config in zip(self.cmsRunNodes, self.configurations):
            
            # Ignore nodes that don't save any output. But keep input dataset
            # in case we need to forward it.
            if cmsRunNode.name not in self.saveOutputFor:
                # Store parent dataset in case we need to forward it.
                if self.inputDataset['IsUsed'] and \
                                            cmsRunNode == self.cmsRunNodes[0]:
                    datasetsToForward[cmsRunNode.name] = \
                                            self.inputDataset['DatasetName']
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        # If the previous cmsRunNode stages out, pull down the
                        # dataset it produced.
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasets['%s:%s' % (inputLink['InputNode'],
                                inputLink['OutputModule'])]
                        # If the previous cmsRunNode does not stage out, then
                        # use its parent.
                        else:
                            # TODO: Won't work if more than one InputLink exists
                            datasetsToForward[cmsRunNode.name] = \
                                datasetsToForward[inputLink['InputNode']]
                continue
            
            for outModName in config.outputModules.keys():
                moduleInstance = config.getOutputModule(outModName)
                dataTier = moduleInstance['dataTier']
                filterName = moduleInstance["filterName"]
                primaryName = DatasetConventions.primaryDatasetName(
                                        PhysicsChannel = self.channel,
                                        )

                # Pick the processed dataset name from one of three naming
                # conventions, depending on the maker's settings.
                if self.useProperNamingConventions:
                    # Processing string is "<processingString>_<filterName>"
                    # when both are set, whichever one is set otherwise,
                    # or None when neither is.
                    if self.processingString and filterName:
                        processingString = "_".join((self.processingString, filterName))
                    elif self.processingString:
                        processingString = self.processingString
                    elif filterName:
                        processingString = filterName
                    else:
                        processingString = None
                    processedName = DatasetConventions.properProcessedDatasetName(
                        AcquisitionEra = self.acquisitionEra,
                        ProcessingString = processingString,
                        ProcessingVersion = self.processingVersion,
                        Unmerged = True
                        )
                elif self.acquisitionEra == None:
                    processedName = DatasetConventions.processedDatasetName(
                        Version = cmsRunNode.application['Version'],
                        Label = self.label,
                        Group = self.group,
                        FilterName = filterName,
                        RequestId = self.requestId,
                        Unmerged = True
                        )
                else:
                    processedName = DatasetConventions.csa08ProcessedDatasetName(
                        AcquisitionEra = self.acquisitionEra,
                        Conditions = self.workflow.parameters['Conditions'],
                        ProcessingVersion = self.workflow.parameters['ProcessingVersion'],
                        FilterName = filterName,
                        Unmerged = True
                        )
                  
                dataTier = DatasetConventions.checkDataTier(dataTier)

                # Write the chosen names back into the output module settings
                moduleInstance['primaryDataset'] = primaryName
                moduleInstance['processedDataset'] = processedName
    
                outDS = cmsRunNode.addOutputDataset(primaryName, 
                                                         processedName,
                                                         outModName)

                outDS['Status'] = self.outputDatasetStatus                
                outDS['DataTier'] = dataTier
                outDS["ApplicationName"] = \
                                         cmsRunNode.application["Executable"]
                outDS["ApplicationFamily"] = outModName
                outDS["PhysicsGroup"] = self.group
    
                # check for input dataset for first node
                if self.inputDataset['IsUsed'] and cmsRunNode == self.cmsRunNodes[0]:
                    outDS['ParentDataset'] = self.inputDataset['DatasetName']
                # check for staged out intermediates
                elif cmsRunNode != self.cmsRunNodes[0]:
                    for inputLink in cmsRunNode._InputLinks:
                        if not inputLink["AppearStandalone"]:
                            # TODO: Won't work if more than one InputLink exists
                            outDS['ParentDataset'] = datasets['%s:%s' % (inputLink['InputNode'],
                                                                    inputLink['OutputModule'])]
                        elif datasetsToForward.get(
                                inputLink['InputNode']) is not None:
                            outDS['ParentDataset'] = \
                                    datasetsToForward[inputLink['InputNode']]

                # With the FakeHash option a fresh UUID is embedded in the
                # stored PSet hash string.
                if self.options['FakeHash']:
                    guid = makeUUID()
                    outDS['PSetHash'] = "hash=%s;guid=%s" % \
                            (self.psetHashes[cmsRunNode.name], guid)
                else:
                    outDS['PSetHash'] = self.psetHashes[cmsRunNode.name]

                # record output in case used as input to a later node
                datasets['%s:%s' % (cmsRunNode.name, outModName)] = \
                                "/%s/%s/%s" % ( outDS['PrimaryDataset'],
                                                  outDS['ProcessedDataset'],
                                                  outDS['DataTier'])

        # optionally remap sibling relationships to parent-child (i.e HLTDEBUG)
        remapParentageForWorkflow(self.workflow)
        WorkflowTools.generateFilenames(self.workflow)

        return self.workflow
Ejemplo n.º 10
0
def createHarvestingWorkflow(dataset, site, cmsPath, scramArch,
                             cmsswVersion, globalTag, configFile = None,
                             DQMServer = None, proxyLocation = None, 
                             DQMCopyToCERN = None, runNumber = None,
                             doStageOut = None):
    """
    _createHarvestingWorkflow_

    Build a Harvesting workflow spec that extracts DQM information from
    a dataset.

    The configuration comes from configFile when one is given; otherwise
    it is generated on the fly and a pre-execution setup script is
    registered on the payload. The global tag is stored as the
    configuration's conditions tag.

    Naming depends on runNumber:
      - no run number: request id "OfflineDQM", label
        "<primary>-<processed>-<tier>", channel "DQMHarvest"
      - with run number: request id "<primary>-<tier>", label
        "DQMHarvesting", channel "Run<runNumber>"

    DQMServer, proxyLocation, DQMCopyToCERN and doStageOut are written to
    the spec parameters only when they are not None.

    Returns the workflow spec produced by WorkflowMaker.makeWorkflow()

    """
    pieces = DatasetConventions.parseDatasetPath(dataset)

    if runNumber == None:
        requestId = "OfflineDQM"
        label = "%s-%s-%s" % (pieces['Primary'], pieces['Processed'],
                              pieces['DataTier'])
        channel = "DQMHarvest"
    else:
        requestId = "%s-%s" % (pieces["Primary"], pieces["DataTier"])
        label = "DQMHarvesting"
        channel = "Run%s" % runNumber

    debugMsg = "path, arch, ver: %s, %s, %s" % (cmsPath, scramArch,
                                                 cmsswVersion)
    logging.debug(debugMsg)

    #  //
    # // Configuration: generated on the fly unless a file is provided
    #//
    onTheFly = configFile == None
    if onTheFly:
        cfgWrapper = configOnFly(cmsPath, scramArch, cmsswVersion)
    else:
        cfgWrapper = configFromFile(cmsPath, scramArch,
                                    cmsswVersion, configFile)

    #  //
    # // Pass in global tag
    #//
    cfgWrapper.conditionsTag = globalTag

    maker = WorkflowMaker(requestId, channel, label)
    maker.setCMSSWVersion(cmsswVersion)
    maker.setPhysicsGroup("OfflineDQM")
    maker.setConfiguration(cfgWrapper, Type = "instance")
    maker.changeCategory("DQM")
    maker.setPSetHash("NO_HASH")
    maker.addInputDataset(dataset)
    maker.setActivity('harvesting')

    spec = maker.makeWorkflow()
    params = spec.parameters
    params['WorkflowType'] = "Harvesting"
    params['DBSURL'] = "http://cmsdbsprod.cern.ch/cms_dbs_prod_global/servlet/DBSServlet"
    params['OnlySites'] = site

    #  //
    # // Optional settings are only stored when provided
    #//
    for paramName, paramValue in (
            ('DQMServer', DQMServer),
            ('proxyLocation', proxyLocation),
            ('DQMCopyToCERN', DQMCopyToCERN),
            ):
        if paramValue != None:
            spec.parameters[paramName] = paramValue
    if doStageOut is not None:
        spec.parameters['DoStageOut'] = doStageOut

    spec.payload.scriptControls['PostTask'].append(
        "JobCreator.RuntimeTools.RuntimeOfflineDQM")

    if onTheFly:
        spec.payload.scriptControls["PreExe"].append(
            "JobCreator.RuntimeTools.RuntimeOfflineDQMSetup")

    return spec
Ejemplo n.º 11
0
    def makeWorkflow(self):
        """
        _makeWorkflow_

        Call this method to create the workflow spec instance when
        done

        Steps performed, in order:
          - run self._Validate()
          - register the input dataset on self.cmsRunNode and copy any
            non-None split/site/block settings into the workflow parameters
          - register one output dataset per configured output module, using
            the input's primary dataset name and data tier and a processed
            name from DatasetConventions.tier0ProcessedDatasetName
          - attach the stage out and log archive nodes
          - set both the merged and unmerged LFN base parameters to the
            value returned by self.makeTier0LFN()

        Returns self.workflow

        """
        self._Validate()

        #  //
        # // Input Dataset required for Tier0
        #//
        inputDataset = self.cmsRunNode.addInputDataset(
            self.inputDataset['Primary'], self.inputDataset['Processed'])
        inputDataset["DataTier"] = self.inputDataset['DataTier']
        for keyname in [
                'SplitType',
                'SplitSize',
                'OnlySites',
                'OnlyBlocks',
                'OnlyClosedBlocks',
        ]:
            # Only settings that were actually provided are copied over
            value = self.inputDataset[keyname]
            if value is not None:
                self.workflow.parameters[keyname] = value

        #  //
        # // Extract dataset info from cfg
        #//
        for outModName in self.configuration.outputModules.keys():
            moduleInstance = self.configuration.getOutputModule(outModName)
            #  //
            # // Data Tier same as input
            #//
            dataTier = self.inputDataset['DataTier']
            #  //
            # // Output primary dataset same as input primary
            #//
            primaryName = self.inputDataset['Primary']

            #  //
            # // Output processed dataset
            #//  (Note we pass way more info than is used, since
            #  //conventions have a tendency to change in CMS...
            # //
            #//
            processedName = DatasetConventions.tier0ProcessedDatasetName(
                Version=self.cmsswVersion,
                InputPrimaryDataset=self.inputDataset['Primary'],
                InputProcessedDataset=self.inputDataset['Processed'],
                Label=self.label,
                Group=self.group,
                RequestId=self.requestId,
                Unmerged=self.unmergedDataset)

            dataTier = DatasetConventions.checkDataTier(dataTier)

            # Write the chosen names back into the output module settings
            moduleInstance['primaryDataset'] = primaryName
            moduleInstance['processedDataset'] = processedName

            outDS = self.cmsRunNode.addOutputDataset(primaryName,
                                                     processedName, outModName)

            outDS['DataTier'] = dataTier
            outDS["ApplicationName"] = \
                                     self.cmsRunNode.application["Executable"]
            outDS["ApplicationFamily"] = outModName
            outDS["PhysicsGroup"] = self.group

            if self.inputDataset['IsUsed']:
                outDS['ParentDataset'] = self.inputDataset['DatasetName']

            # With the FakeHash option a fresh UUID is embedded in the
            # stored PSet hash string.
            if self.options['FakeHash']:
                guid = makeUUID()
                outDS['PSetHash'] = "hash=%s;guid=%s" % (self.psetHash, guid)
            else:
                outDS['PSetHash'] = self.psetHash

        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(self.cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")

        #  //
        # // generate tier0 LFN bases for this workflow
        #//
        tier0LFN = self.makeTier0LFN()

        self.workflow.parameters['MergedLFNBase'] = tier0LFN
        self.workflow.parameters['UnmergedLFNBase'] = tier0LFN

        return self.workflow
Ejemplo n.º 12
0
def createMergeJobWorkflow(procSpec,
                           isFastMerge=True,
                           doCleanUp=True,
                           littleE=False):
    """
    _createMergeJobWorkflow_

    Given a Processing Workflow, generate a set of Merge Job
    workflows that can be used to generate actual merge jobs
    (as opposed to creating datasets like createMergeDatasetWorkflow)

    Arguments:
      procSpec    - processing workflow spec; its parameters and its
                    workflow name are copied into every merge workflow
      isFastMerge - forwarded unchanged to createMergeDatasetWorkflow
      doCleanUp   - when True, a "cleanUp1" node is added after the
                    stage out node
      littleE     - not read by this function any more; kept in the
                    signature so existing callers keep working

    returns a dictionary of (input, IE MergeSensor watched) dataset name
    to workflow spec instances

    """
    mergeDatasetWF = createMergeDatasetWorkflow(procSpec, isFastMerge)
    procSpecName = procSpec.workflowName()

    results = {}

    for dataset in mergeDatasetWF.outputDatasets():
        #  //
        # // The parent of each merge dataset is the dataset that
        #//  the merge job reads; it is also the key of the result
        inputDataset = dataset['ParentDataset']

        newWF = WorkflowSpec()
        newWF.parameters.update(procSpec.parameters)
        newWF.setWorkflowName(procSpecName)
        # set after the update so the copied value is overridden
        newWF.parameters['WorkflowType'] = "Merge"

        cmsRunNode = newWF.payload
        cmsRunNode.name = "cmsRun1"
        cmsRunNode.type = "CMSSW"
        cmsRunNode.application["Project"] = "CMSSW"
        cmsRunNode.application["Version"] = dataset['ApplicationVersion']
        cmsRunNode.application["Architecture"] = "slc3_ia32_gcc323"
        cmsRunNode.application["Executable"] = "cmsRun"
        outputModuleName = "Merged"

        #  //
        # // Hack to forward UserSandbox to Merge Jobs
        #//
        userSandbox = dataset.get("UserSandbox", None)
        if userSandbox is not None:
            cmsRunNode.userSandbox = userSandbox

        #  //
        # // Input Dataset
        #//
        datasetBits = DatasetConventions.parseDatasetPath(inputDataset)
        inDataset = cmsRunNode.addInputDataset(datasetBits['Primary'],
                                               datasetBits['Processed'])
        inDataset["DataTier"] = datasetBits['DataTier']

        #  //
        # // Output Dataset
        #//
        outputDataset = cmsRunNode.addOutputDataset(
            dataset['PrimaryDataset'], dataset['ProcessedDataset'],
            outputModuleName)

        outputDataset["DataTier"] = dataset['DataTier']
        outputDataset["PSetHash"] = dataset['PSetHash']

        outputDataset["ApplicationName"] = \
                    cmsRunNode.application["Executable"]
        outputDataset["ApplicationProject"] = \
                    cmsRunNode.application["Project"]
        outputDataset["ApplicationVersion"] = \
                    cmsRunNode.application["Version"]
        outputDataset["ApplicationFamily"] = outputModuleName
        outputDataset["PhysicsGroup"] = \
                      procSpec.parameters.get('PhysicsGroup', None)
        outputDataset['ParentDataset'] = inputDataset

        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(cmsRunNode, "stageOut1")
        # Kept as an equality test rather than plain truthiness so that
        # non-boolean truthy arguments (e.g. the string "False") still
        # do not enable the clean up node.
        if doCleanUp == True:
            WorkflowTools.addCleanUpNode(cmsRunNode, "cleanUp1")

        #  //
        # // Add log archive node
        #//
        WorkflowTools.addLogArchNode(cmsRunNode, "logArchive")

        WorkflowTools.generateFilenames(newWF)

        results[inputDataset] = newWF

    return results
Ejemplo n.º 13
0
    def makeWorkflow(self):
        """
        _makeWorkflow_

        Call this method to create the workflow spec instance when
        done

        Steps performed, in order:
          - run self._Validate()
          - register the input dataset on the cmsRun node and copy any
            split/restriction settings that are not None into the
            workflow parameters
          - add one output dataset to the cmsRun node for every output
            module in the configuration
          - add the stage out and log archive nodes
          - set the merged and unmerged LFN bases to the tier0 LFN

        returns self.workflow

        """
        self._Validate()

        #  //
        # // Input Dataset required for Tier0
        #//
        inputDataset = self.cmsRunNode.addInputDataset(
            self.inputDataset['Primary'],
            self.inputDataset['Processed']
            )
        inputDataset["DataTier"] = self.inputDataset['DataTier']

        #  //
        # // Only settings that have actually been given a value are
        #//  propagated to the workflow parameters
        for keyname in [
            'SplitType',
            'SplitSize',
            'OnlySites',
            'OnlyBlocks',
            'OnlyClosedBlocks',
            ]:
            if self.inputDataset[keyname] is not None:
                self.workflow.parameters[keyname] = self.inputDataset[keyname]

        #  //
        # // Extract dataset info from cfg
        #//
        for outModName in self.configuration.outputModules.keys():
            moduleInstance = self.configuration.getOutputModule(outModName)
            #  //
            # // Data Tier same as input
            #//
            dataTier = self.inputDataset['DataTier']
            #  //
            # // Output primary dataset same as input primary
            #//
            primaryName = self.inputDataset['Primary']

            #  //
            # // Output processed dataset
            #//  (Note we pass way more info than is used, since
            #  //conventions have a tendency to change in CMS...
            # //
            #//
            processedName = DatasetConventions.tier0ProcessedDatasetName(
                Version=self.cmsswVersion,
                InputPrimaryDataset=self.inputDataset['Primary'],
                InputProcessedDataset=self.inputDataset['Processed'],
                Label=self.label,
                Group=self.group,
                RequestId=self.requestId,
                Unmerged=self.unmergedDataset
                )

            dataTier = DatasetConventions.checkDataTier(dataTier)

            moduleInstance['primaryDataset'] = primaryName
            moduleInstance['processedDataset'] = processedName

            outDS = self.cmsRunNode.addOutputDataset(primaryName,
                                                     processedName,
                                                     outModName)

            outDS['DataTier'] = dataTier
            outDS["ApplicationName"] = \
                                     self.cmsRunNode.application["Executable"]
            # the output module name doubles as the application family
            outDS["ApplicationFamily"] = outModName
            outDS["PhysicsGroup"] = self.group

            if self.inputDataset['IsUsed']:
                outDS['ParentDataset'] = self.inputDataset['DatasetName']

            #  //
            # // With the FakeHash option a fresh UUID is appended to
            #//  the PSet hash for each output dataset
            if self.options['FakeHash']:
                guid = makeUUID()
                outDS['PSetHash'] = "hash=%s;guid=%s" % (self.psetHash,
                                                         guid)
            else:
                outDS['PSetHash'] = self.psetHash

        #  //
        # // Add Stage Out node
        #//
        WorkflowTools.addStageOutNode(self.cmsRunNode, "stageOut1")
        WorkflowTools.addLogArchNode(self.cmsRunNode, "logArchive")

        #  //
        # // generate tier0 LFN bases for this workflow
        #//  (merged and unmerged files share the same base)
        tier0LFN = self.makeTier0LFN()

        self.workflow.parameters['MergedLFNBase'] = tier0LFN
        self.workflow.parameters['UnmergedLFNBase'] = tier0LFN

        return self.workflow